Subregion analysis for GPS data #44

Merged: 18 commits, May 4, 2021
Changes from 4 commits
2 changes: 1 addition & 1 deletion README.md
@@ -81,7 +81,7 @@ geo_tag: Counts the number of tags from each region, as well as extracts AlexNet

geo_lng: Counts the languages that make up the image tags, and whether or not they are local to the country the image is from. Also extracts image-level features to compare whether locals and tourists portray a country differently

Note: Geography-Based analyses require a mapping from images to location. The 2 primary ways we've encountered these mappings in existing datasets are geography labels (ie. String formatted locations like 'Manhattan'), and GPS labels (latitude and longitude coordinate pairs). Our analyses supports both types of geography mappings. Namely, the user should specify in their dataset class the `geography_info_type` to be one of the following:
Note: Geography-Based analyses require a mapping from images to location. The 2 formats of geography annotations supported are geography labels (i.e., string-formatted locations like 'Manhattan') and GPS labels (latitude and longitude coordinate pairs). The user should specify in their dataset class the `geography_info_type` to be one of the following (see the sketch after this list):
- 'GPS_LABEL': datasets with mappings from image to GPS coordinates
- 'STRING_FORMATTED_LABEL', datasets with mappings from image to string-formatted labels

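For illustration, here is a minimal sketch of how a dataset class might declare these attributes; the `MyGeoDataset` class and everything in it beyond the attribute names and values listed above are hypothetical, not part of this repository's API:

```python
class MyGeoDataset:
    """Hypothetical dataset wrapper whose images carry location annotations."""

    def __init__(self):
        # Tell the geography-based analyses which annotation format is available:
        # "GPS_LABEL" for latitude/longitude coordinate pairs, or
        # "STRING_FORMATTED_LABEL" for text locations such as 'Manhattan'.
        self.geography_info_type = "GPS_LABEL"

        # For string-formatted labels, the analysis notebook additionally checks
        # geography_label_string_type, e.g. "COUNTRY_LABEL" or "REGION_LABEL".
        # self.geography_label_string_type = "REGION_LABEL"
```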
38 changes: 18 additions & 20 deletions analysis_notebooks/Geography Analysis.ipynb
@@ -724,8 +724,8 @@
"outputs": [],
"source": [
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/6\".format(folder_name))\n",
" if not os.path.exists(\"results/{0}/geo_tag\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/geo_tag\".format(folder_name))\n",
" info_stats = pickle.load(open(\"results/{}/geo_tag.pkl\".format(folder_name), \"rb\")) #20GB\n",
" country_tags = info_stats['country_tags']\n",
" tag_to_subregion_features = info_stats['tag_to_subregion_features']\n",
@@ -741,7 +741,7 @@
" subregion_tags[subregion] = np.add(subregion_tags[subregion], counts)\n",
" total_counts = total_counts.astype(int)\n",
" sum_total_counts = int(np.sum(total_counts))\n",
" if not os.path.exists('checkpoints/{}/6_a.pkl'.format(folder_name)):\n",
" if not os.path.exists('checkpoints/{}/geo_tag_a.pkl'.format(folder_name)):\n",
" pvalues_over = {} # pvalue : '[country]: [tag] (country num and total num info for now)'\n",
" pvalues_under = {} \n",
" for country, counts in country_tags.items():\n",
@@ -760,7 +760,7 @@
" else:\n",
" pvalues_under[p] = tag_info\n",
" else:\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/6_a.pkl'.format(folder_name), 'rb'))\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/geo_tag_a.pkl'.format(folder_name), 'rb'))\n",
" subregion_pvalues_over = {}\n",
" subregion_pvalues_under = {}\n",
" for subregion, counts in subregion_tags.items():\n",
@@ -778,9 +778,8 @@
" subregion_pvalues_under[p] = tag_info\n",
" \n",
"elif (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"REGION_LABEL\"):\n",
" print(\"Geo_tag work for region label formatted dataset\")\n",
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/6\".format(folder_name))\n",
" if not os.path.exists(\"results/{0}/geo_tag\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/geo_tag\".format(folder_name))\n",
" info_stats = pickle.load(open(\"results/{}/geo_tag_region.pkl\".format(folder_name), \"rb\")) #20GB\n",
" region_tags = info_stats['region_tags']\n",
" tag_to_region_features = info_stats['tag_to_region_features']\n",
@@ -794,7 +793,7 @@
" total_counts = total_counts.astype(int)\n",
" sum_total_counts = int(np.sum(total_counts))\n",
"\n",
" if not os.path.exists('checkpoints/{}/6_a.pkl'.format(folder_name)):\n",
" if not os.path.exists('checkpoints/{}/geo_tag_a.pkl'.format(folder_name)):\n",
" pvalues_over = {} # pvalue : '[region]: [tag] (region num and total num info for now)'\n",
" pvalues_under = {} \n",
" for region, counts in region_tags.items():\n",
@@ -813,12 +812,11 @@
" else:\n",
" pvalues_under[p] = tag_info\n",
" else:\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/6_a.pkl'.format(folder_name), 'rb'))\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/geo_tag_a.pkl'.format(folder_name), 'rb'))\n",
"\n",
"elif dataset.geography_info_type == \"GPS_LABEL\":\n",
" print(\"Geo_tag work for gps formatted dataset\")\n",
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/6\".format(folder_name))\n",
" if not os.path.exists(\"results/{0}/geo_tag\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/geo_tag\".format(folder_name))\n",
" info_stats = pickle.load(open(\"results/{}/geo_tag_gps.pkl\".format(folder_name), \"rb\")) #20GB\n",
" region_tags = info_stats['region_tags']\n",
" subregion_tags = info_stats.get('subregion_tags', None)\n",
@@ -833,7 +831,7 @@
" total_counts = total_counts.astype(int)\n",
" sum_total_counts = int(np.sum(total_counts))\n",
"\n",
" if not os.path.exists('checkpoints/{}/6_a.pkl'.format(folder_name)):\n",
" if not os.path.exists('checkpoints/{}/geo_tag_a.pkl'.format(folder_name)):\n",
" pvalues_over = {} # pvalue : '[region]: [tag] (region num and total num info for now)'\n",
" pvalues_under = {} \n",
" for region, counts in region_tags.items():\n",
@@ -852,7 +850,7 @@
" else:\n",
" pvalues_under[p] = tag_info\n",
" else:\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/6_a.pkl'.format(folder_name), 'rb'))\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/geo_tag_a.pkl'.format(folder_name), 'rb'))\n",
"\n",
" if subregion_tags is not None:\n",
" subregion_total_counts = np.zeros(len(categories))\n",
@@ -942,7 +940,7 @@
" import warnings\n",
" warnings.filterwarnings(\"ignore\")\n",
"\n",
" if not os.path.exists('checkpoints/{}/6_b.pkl'.format(folder_name)):\n",
" if not os.path.exists('checkpoints/{}/geo_tag_b.pkl'.format(folder_name)):\n",
" phrase_to_value = {}\n",
" # Look at appearance differences in how a tag is represented across subregions\n",
" for tag in tag_to_subregion_features.keys():\n",
@@ -988,7 +986,7 @@
" fontP = FontProperties()\n",
" fontP.set_size('small')\n",
" lgd = plt.legend(handles=handles, bbox_to_anchor=(1.04,1), loc=\"upper left\", prop=fontP)\n",
" plt.savefig('results/{0}/{1}/{2}_tsne.png'.format(folder_name, 6, dataset.categories[tag]), bbox_extra_artists=(lgd,), bbox_inches='tight')\n",
" plt.savefig('results/{0}/{1}/{2}_tsne.png'.format(folder_name, \"geo_tag\", dataset.categories[tag]), bbox_extra_artists=(lgd,), bbox_inches='tight')\n",
" plt.close()\n",
"\n",
" class_preds = clf.predict(all_features)\n",
@@ -1034,10 +1032,10 @@
" phrase = dataset.labels_to_names[dataset.categories[tag]]\n",
" phrase_to_value[phrase] = [value, all_subregions[diff_subregion], acc, p_value, num_features, j_to_acc]\n",
" \n",
" pickle.dump([original_labels, class_probs, class_preds, diff_subregion, all_filepaths], open('results/{0}/{1}/{2}_info.pkl'.format(folder_name, 6, dataset.labels_to_names[dataset.categories[tag]]), 'wb'))\n",
" pickle.dump(phrase_to_value, open('checkpoints/{}/6_b.pkl'.format(folder_name), 'wb'))\n",
" pickle.dump([original_labels, class_probs, class_preds, diff_subregion, all_filepaths], open('results/{0}/{1}/{2}_info.pkl'.format(folder_name, \"geo_tag\", dataset.labels_to_names[dataset.categories[tag]]), 'wb'))\n",
" pickle.dump(phrase_to_value, open('checkpoints/{}/geo_tag_b.pkl'.format(folder_name), 'wb'))\n",
" else:\n",
" phrase_to_value = pickle.load(open('checkpoints/{}/6_b.pkl'.format(folder_name), 'rb'))\n",
" phrase_to_value = pickle.load(open('checkpoints/{}/geo_tag_b.pkl'.format(folder_name), 'rb'))\n",
"\n",
" svm_options = []\n",
" best_tag = None\n",
@@ -1055,7 +1053,7 @@
" else:\n",
" if tag is None:\n",
" return\n",
" this_info = pickle.load(open('results/{0}/{1}/{2}_info.pkl'.format(folder_name, 6, tag), 'rb'))\n",
" this_info = pickle.load(open('results/{0}/{1}/{2}_info.pkl'.format(folder_name, \"geo_tag\", tag), 'rb'))\n",
" labels, class_probs, class_preds, diff_subregion, all_filepaths = this_info\n",
" value, region, acc, p_value, num_features, j_to_acc = phrase_to_value[tag]\n",
" if num is not None:\n",
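Throughout the notebook cells above, the same caching convention is used, now keyed on the geo_tag name instead of the numeric folder 6: the statistics are computed only when the checkpoint pickle is missing, and reloaded otherwise. Below is a minimal sketch of that pattern; the load_or_compute helper and the compute_pvalues callable are hypothetical, introduced only to illustrate the flow, not part of this PR:

```python
import os
import pickle

def load_or_compute(checkpoint_path, compute_fn):
    # Hypothetical helper mirroring the notebook's caching pattern:
    # recompute only when no cached pickle exists, otherwise reload it.
    if not os.path.exists(checkpoint_path):
        result = compute_fn()
        with open(checkpoint_path, "wb") as f:
            pickle.dump(result, f)
    else:
        with open(checkpoint_path, "rb") as f:
            result = pickle.load(f)
    return result

# Usage in the spirit of the notebook's 'checkpoints/{}/geo_tag_a.pkl' cache
# (folder_name and compute_pvalues would come from the surrounding cell):
# pvalues_under, pvalues_over = load_or_compute(
#     "checkpoints/{}/geo_tag_a.pkl".format(folder_name), compute_pvalues)
```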
2 changes: 1 addition & 1 deletion measurements/geography_based.py
@@ -173,7 +173,7 @@ def geo_tag(dataloader, args):
if (dataloader.dataset.geography_info_type == "GPS_LABEL"):
print("redirecting to geo_tag_gps()...")
return geo_tag_gps(dataloader, args)
if (dataloader.dataset.geography_info_type == "STRING_FORMATTED_LABEL" and dataloader.dataset.geography_label_string_type == "REGION_LABEL"):
elif (dataloader.dataset.geography_info_type == "STRING_FORMATTED_LABEL" and dataloader.dataset.geography_label_string_type == "REGION_LABEL"):
print("redirecting to geo_tag_region()...")
return geo_tag_region(dataloader, args)
country_tags = {}