New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Subregion analysis for GPS data #44
Changes from 12 commits
fc208c3
ac202e4
56b08f6
9505ad3
d6e5343
78cd536
b4da2db
ffd84fa
a873540
a12ef8a
69e9232
aaf6d58
9cee230
9c9c6c2
97eee74
08f1dd8
03f7131
e1618ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -311,10 +311,7 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# check if we use global country labels vs GPS label\n", | ||
"is_country_counts = dataset.geo_boundaries is None\n", | ||
"\n", | ||
"if is_country_counts: \n", | ||
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" counts = pickle.load(open(\"results/{}/geo_ctr.pkl\".format(folder_name), \"rb\"))\n", | ||
" iso3_to_subregion = pickle.load(open('util_files/iso3_to_subregion_mappings.pkl', 'rb'))\n", | ||
" gc = GeonamesCache()\n", | ||
|
@@ -358,12 +355,21 @@ | |
" for key in iso3_to_scaledpop.keys():\n", | ||
" iso3_to_scaledpop[key] /= min(iso3_to_scaledpop.values())\n", | ||
"\n", | ||
"\n", | ||
"else:\n", | ||
"elif (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"REGION_LABEL\"):\n", | ||
" ctr_dict = pickle.load(open(\"results/{}/geo_ctr_region.pkl\".format(folder_name), \"rb\"))\n", | ||
" counts = ctr_dict[\"region_to_id\"]\n", | ||
" total = sum(counts.values())\n", | ||
" region_count_phrases = []\n", | ||
" for region, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):\n", | ||
" region_count_phrases.append(\"{0}: {1} {2}%\".format(region, count, round(100.*count/total)))\n", | ||
" \n", | ||
"elif dataset.geography_info_type == \"GPS_LABEL\":\n", | ||
" counts_gps = pickle.load(open(\"results/{}/geo_ctr_gps.pkl\".format(folder_name), \"rb\"))\n", | ||
" region_to_id = counts_gps[\"region_to_id\"]\n", | ||
" id_to_gps = counts_gps[\"id_to_gps\"]\n", | ||
" subregion_to_id = counts_gps.get(\"subregion_to_id\", None)\n", | ||
" geo_boundaries = dataset.geo_boundaries\n", | ||
" subregion_boundaries = dataset.subregion_boundaries\n", | ||
" choro_data = pd.read_csv(dataset.choropleth_filepath)\n", | ||
"\n", | ||
" counts = {}\n", | ||
|
@@ -372,9 +378,15 @@ | |
" total = sum(counts.values())\n", | ||
" country_count_phrases = []\n", | ||
"\n", | ||
"\n", | ||
" for country, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):\n", | ||
" country_count_phrases.append(\"{0}: {1} {2}%\".format(country, count, round(100.*count/total)))\n", | ||
" \n", | ||
" \n", | ||
" subregion_counts = None\n", | ||
" if subregion_to_id is not None:\n", | ||
" subregion_counts = {}\n", | ||
" for subregion in subregion_to_id:\n", | ||
" subregion_counts[subregion] = len(subregion_to_id[subregion])\n", | ||
"\n", | ||
" colors = [\n", | ||
" 'red',\n", | ||
|
@@ -405,17 +417,21 @@ | |
" print(country_count_phrases[-1-i])\n", | ||
" \n", | ||
"def subregion_counts_num():\n", | ||
" if is_country_counts:\n", | ||
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" print(\"Subregion Counts\\n\")\n", | ||
" total_subregion = sum(subregion_counts.values())\n", | ||
" for subregion, count in sorted(subregion_counts.items(), key=lambda x: x[1], reverse=True):\n", | ||
" print(\"{0}: {1} {2}%\".format(subregion, count, round(100.*count/total_subregion)))\n", | ||
" elif dataset.geography_info_type == \"GPS_LABEL\" and subregion_to_id is not None:\n", | ||
" print(\"Subregion Counts\\n\")\n", | ||
" total_subregion = sum(subregion_counts.values())\n", | ||
" for subregion, count in sorted(subregion_counts.items(), key=lambda x: x[1], reverse=True):\n", | ||
" print(\"{0}: {1} {2}%\".format(subregion, count, round(100.*count/total_subregion)))\n", | ||
" else:\n", | ||
" print(\"Subregion counts for gps labels coming soon\")\n", | ||
"\n", | ||
"\n", | ||
" print(\"Subregion analysis not available\")\n", | ||
" \n", | ||
"def region_map():\n", | ||
" if is_country_counts:\n", | ||
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" fig = plt.figure(figsize=(16, 7))\n", | ||
"\n", | ||
" fontsize = 20\n", | ||
|
@@ -460,7 +476,8 @@ | |
" plt.show()\n", | ||
"\n", | ||
" print(\"Total countries: {}\".format(len(iso3_to_bin)))\n", | ||
" else:\n", | ||
" \n", | ||
" elif dataset.geography_info_type == \"GPS_LABEL\":\n", | ||
" m = folium.Map()\n", | ||
" folium.GeoJson(geo_boundaries, name=\"geojson\").add_to(m)\n", | ||
" count = 0\n", | ||
|
@@ -477,21 +494,50 @@ | |
" ).add_to(m)\n", | ||
" count += 1\n", | ||
" return m\n", | ||
" \n", | ||
" elif (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"REGION_LABEL\"):\n", | ||
" m = folium.Map()\n", | ||
" \n", | ||
" region_names = []\n", | ||
" region_counts = []\n", | ||
" for region in counts:\n", | ||
" region_names.append(region)\n", | ||
" region_counts.append(counts[region])\n", | ||
"\n", | ||
" counts_new = {\"region\": region_names, \"region_counts\": region_counts}\n", | ||
" counts_df = pd.DataFrame.from_dict(counts_new)\n", | ||
" \n", | ||
" # Change \"Data\" to column name of csv you want\n", | ||
" folium.Choropleth(\n", | ||
" geo_data=geo_boundaries,\n", | ||
" name=\"choropleth\",\n", | ||
" data=counts_df,\n", | ||
" columns=[\"region\", \"region_counts\"],\n", | ||
" key_on=\"feature.properties.{0}\".format(dataset.geo_boundaries_key_name),\n", | ||
" fill_color=\"BuPu\",\n", | ||
" fill_opacity=0.7,\n", | ||
" line_opacity=0.2,\n", | ||
" legend_name=\"Image Counts over subregion\",\n", | ||
" ).add_to(m)\n", | ||
"\n", | ||
" folium.LayerControl().add_to(m)\n", | ||
" return m\n", | ||
" \n", | ||
"def choropleth():\n", | ||
" if not is_country_counts:\n", | ||
" if dataset.geography_info_type == \"GPS_LABEL\":\n", | ||
" m = folium.Map()\n", | ||
"\n", | ||
" \n", | ||
" # Change \"Data\" to column name of csv you want\n", | ||
" folium.Choropleth(\n", | ||
" geo_data=geo_boundaries,\n", | ||
" name=\"choropleth\",\n", | ||
" data=choro_data,\n", | ||
" columns=[\"Region\", \"Data\"],\n", | ||
" key_on=\"feature.properties.name_1\",\n", | ||
" columns=[\"region\", \"population\"],\n", | ||
" key_on=\"feature.properties.{0}\".format(dataset.geo_boundaries_key_name),\n", | ||
" fill_color=\"BuPu\",\n", | ||
" fill_opacity=0.7,\n", | ||
" line_opacity=0.2,\n", | ||
" legend_name=\"My Data\",\n", | ||
" legend_name=\"Data\",\n", | ||
" ).add_to(m)\n", | ||
"\n", | ||
" folium.LayerControl().add_to(m)\n", | ||
|
@@ -511,9 +557,10 @@ | |
" count += 1\n", | ||
" return m\n", | ||
" else:\n", | ||
" print(\"No custom subregion data available\")\n", | ||
" print(\"No choropleth available\")\n", | ||
" \n", | ||
"def country_map_population():\n", | ||
" if is_country_counts:\n", | ||
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" fig = plt.figure(figsize=(16, 7))\n", | ||
"\n", | ||
" fontsize = 20\n", | ||
|
@@ -554,7 +601,7 @@ | |
" fig.suptitle('Dataset representation scaled by country population, logarithmic scale', fontsize=fontsize, y=.95)\n", | ||
" plt.show()\n", | ||
" else:\n", | ||
" print(\"No country information available\")" | ||
" print(\"No population information available, use choropleth instead\")" | ||
] | ||
}, | ||
{ | ||
|
@@ -601,7 +648,7 @@ | |
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Visualization of representation by country" | ||
"Visualization of representation by region" | ||
] | ||
}, | ||
{ | ||
|
@@ -676,9 +723,7 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# check if we use global country labels vs GPS label\n", | ||
"is_country = dataset.geo_boundaries is None\n", | ||
"if is_country:\n", | ||
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n", | ||
" os.mkdir(\"results/{0}/6\".format(folder_name))\n", | ||
" info_stats = pickle.load(open(\"results/{}/geo_tag.pkl\".format(folder_name), \"rb\")) #20GB\n", | ||
|
@@ -731,13 +776,52 @@ | |
" subregion_pvalues_over[p] = tag_info\n", | ||
" else:\n", | ||
" subregion_pvalues_under[p] = tag_info\n", | ||
" \n", | ||
"elif (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"REGION_LABEL\"):\n", | ||
" print(\"Geo_tag work for region label formatted dataset\")\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer: is this print necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Author: removed, thanks. |
||
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer: I think "6" has been deprecated for the new names of the analyses; please map accordingly. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Author: done, replaced "6" with "geo_tag". |
||
" os.mkdir(\"results/{0}/6\".format(folder_name))\n", | ||
" info_stats = pickle.load(open(\"results/{}/geo_tag_region.pkl\".format(folder_name), \"rb\")) #20GB\n", | ||
" region_tags = info_stats['region_tags']\n", | ||
" tag_to_region_features = info_stats['tag_to_region_features']\n", | ||
"\n", | ||
" categories = dataset.categories\n", | ||
" total_counts = np.zeros(len(categories))\n", | ||
"\n", | ||
" for region, counts in region_tags.items():\n", | ||
" total_counts = np.add(total_counts, counts)\n", | ||
"\n", | ||
"if not is_country:\n", | ||
" total_counts = total_counts.astype(int)\n", | ||
" sum_total_counts = int(np.sum(total_counts))\n", | ||
"\n", | ||
" if not os.path.exists('checkpoints/{}/6_a.pkl'.format(folder_name)):\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer: same here. If prerun_geo doesn't have these changes either, please add them in. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Author: changed 6_a to the new naming convention ("geo_tag_6a"). Also added the new changes to prerun_geo. |
||
" pvalues_over = {} # pvalue : '[region]: [tag] (region num and total num info for now)'\n", | ||
" pvalues_under = {} \n", | ||
" for region, counts in region_tags.items():\n", | ||
" tags_for_region = int(np.sum(counts))\n", | ||
" if tags_for_region < 50: # threshold for region to have at least 50 tags so there are enough samples for analysis\n", | ||
" continue\n", | ||
" for i, count in enumerate(counts):\n", | ||
" this_counts = np.zeros(tags_for_region)\n", | ||
" this_counts[:int(count)] = 1\n", | ||
" that_counts = np.zeros(sum_total_counts - tags_for_region)\n", | ||
" that_counts[:total_counts[i] - int(count)] = 1\n", | ||
" p = stats.ttest_ind(this_counts, that_counts)[1]\n", | ||
" tag_info = '{0}-{1} ({2}/{3} vs {4}/{5})'.format(region, categories[i], int(count), tags_for_region, int(total_counts[i] - count), sum_total_counts - tags_for_region)\n", | ||
" if np.mean(this_counts) > np.mean(that_counts):\n", | ||
" pvalues_over[p] = tag_info\n", | ||
" else:\n", | ||
" pvalues_under[p] = tag_info\n", | ||
" else:\n", | ||
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/6_a.pkl'.format(folder_name), 'rb'))\n", | ||
"\n", | ||
"elif dataset.geography_info_type == \"GPS_LABEL\":\n", | ||
" print(\"Geo_tag work for gps formatted dataset\")\n", | ||
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n", | ||
" os.mkdir(\"results/{0}/6\".format(folder_name))\n", | ||
" info_stats = pickle.load(open(\"results/{}/geo_tag_gps.pkl\".format(folder_name), \"rb\")) #20GB\n", | ||
" region_tags = info_stats['region_tags']\n", | ||
" subregion_tags = info_stats.get('subregion_tags', None)\n", | ||
" tag_to_region_features = info_stats['tag_to_region_features']\n", | ||
"\n", | ||
" categories = dataset.categories\n", | ||
|
@@ -770,8 +854,34 @@ | |
" else:\n", | ||
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/6_a.pkl'.format(folder_name), 'rb'))\n", | ||
"\n", | ||
"def tag_rep_by_country(topn):\n", | ||
" if is_country:\n", | ||
" if subregion_tags is not None:\n", | ||
" subregion_total_counts = np.zeros(len(categories))\n", | ||
" for region, counts in subregion_tags.items():\n", | ||
" subregion_total_counts = np.add(subregion_total_counts, counts)\n", | ||
"\n", | ||
" subregion_total_counts = subregion_total_counts.astype(int)\n", | ||
" sum_subregion_total_counts = int(np.sum(subregion_total_counts))\n", | ||
"\n", | ||
" subregion_pvalues_over = {} \n", | ||
" subregion_pvalues_under = {} \n", | ||
" for region, counts in subregion_tags.items():\n", | ||
" tags_for_region = int(np.sum(counts))\n", | ||
" if tags_for_region < 50: # threshold for subregion to have at least 50 tags so there are enough samples for analysis\n", | ||
" continue\n", | ||
" for i, count in enumerate(counts):\n", | ||
" this_counts = np.zeros(tags_for_region)\n", | ||
" this_counts[:int(count)] = 1\n", | ||
" that_counts = np.zeros(sum_subregion_total_counts - tags_for_region)\n", | ||
" that_counts[:subregion_total_counts[i] - int(count)] = 1\n", | ||
" p = stats.ttest_ind(this_counts, that_counts)[1]\n", | ||
" tag_info = '{0}-{1} ({2}/{3} vs {4}/{5})'.format(region, categories[i], int(count), tags_for_region, int(subregion_total_counts[i] - count), sum_subregion_total_counts - tags_for_region)\n", | ||
" if np.mean(this_counts) > np.mean(that_counts):\n", | ||
" subregion_pvalues_over[p] = tag_info\n", | ||
" else:\n", | ||
" subregion_pvalues_under[p] = tag_info\n", | ||
" \n", | ||
"def tag_rep_by_region(topn):\n", | ||
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" if first_pass:\n", | ||
" to_write[1] = [\"(geo_tag) Overrepresentations of tags by country (tag in country vs tag in rest of the countries):\"]\n", | ||
" for p, content in sorted(pvalues_over.items(), key=lambda x: x[0])[:4]:\n", | ||
|
@@ -808,7 +918,16 @@ | |
"\n", | ||
"\n", | ||
"def tag_rep_by_subregion(topn):\n", | ||
" if is_country:\n", | ||
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" print(\"By Subregion\\n\")\n", | ||
" print('Over represented\\n')\n", | ||
" for p, content in sorted(subregion_pvalues_over.items(), key=lambda x: x[0])[:topn]:\n", | ||
" print('{0}: {1}'.format(round(p, 4), content))\n", | ||
" print('\\nUnder represented\\n')\n", | ||
" for p, content in sorted(subregion_pvalues_under.items(), key=lambda x: x[0])[:topn]:\n", | ||
" print('{0}: {1}'.format(round(p, 4), content))\n", | ||
" \n", | ||
" elif dataset.geography_info_type == \"GPS_LABEL\" and subregion_tags is not None:\n", | ||
" print(\"By Subregion\\n\")\n", | ||
" print('Over represented\\n')\n", | ||
" for p, content in sorted(subregion_pvalues_over.items(), key=lambda x: x[0])[:topn]:\n", | ||
|
@@ -819,7 +938,7 @@ | |
" else:\n", | ||
" print(\"No subregion data for gps-formatted datasets\")\n", | ||
"\n", | ||
"if is_country:\n", | ||
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" import warnings\n", | ||
" warnings.filterwarnings(\"ignore\")\n", | ||
"\n", | ||
|
@@ -931,9 +1050,9 @@ | |
" svm_options.append(('{0} in {1}: {2}% and {3}x'.format(phrase, region, round(100.*acc, 3), round(value, 3)), phrase))\n", | ||
"\n", | ||
"def show_svm_tag(tag, num):\n", | ||
" if not is_country:\n", | ||
" if not (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" print(\"SVM metric not setup for gps datasets\")\n", | ||
" if is_country:\n", | ||
" else:\n", | ||
" if tag is None:\n", | ||
" return\n", | ||
" this_info = pickle.load(open('results/{0}/{1}/{2}_info.pkl'.format(folder_name, 6, tag), 'rb'))\n", | ||
|
@@ -1018,7 +1137,7 @@ | |
" print(\"Out: Correct\")\n", | ||
" else:\n", | ||
" to_write[2].append(\"Out: Correct\")\n", | ||
" display_chunk(False, False, to_save, 'd')\n" | ||
" display_chunk(False, False, to_save, 'd')" | ||
] | ||
}, | ||
{ | ||
|
@@ -1033,7 +1152,7 @@ | |
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Over- and under- representations of tags by country. The first fraction shows how many of this country's tags are made up of this one, and the second fraction shows how many of all of the country's tags are made up of this one." | ||
"Over- and under- representations of tags by region. The first fraction shows how many of this region's tags are made up of this one, and the second fraction shows how many of all of the region's tags are made up of this one." | ||
] | ||
}, | ||
{ | ||
|
@@ -1042,7 +1161,7 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"interact(tag_rep_by_country, topn=widgets.IntSlider(min=1, max=30, step=1, value=10));" | ||
"interact(tag_rep_by_region, topn=widgets.IntSlider(min=1, max=30, step=1, value=10));" | ||
] | ||
}, | ||
{ | ||
|
@@ -1075,7 +1194,7 @@ | |
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"if is_country:\n", | ||
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n", | ||
" num_widget = widgets.IntSlider(min=1, max=20, step=1, value=5)\n", | ||
" tag_widget = widgets.Dropdown(options=svm_options, layout=Layout(width='400px'))\n", | ||
" all_things = [widgets.Label('Tag, acc, acc/acc_random',layout=Layout(padding='0px 0px 0px 5px', width='170px')), tag_widget, widgets.Label('Num',layout=Layout(padding='0px 5px 0px 40px', width='80px')), num_widget]\n", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Reviewer: "The 2 primary ways we've encountered these mappings in existing datasets are geography labels" -> something like "The 2 formats of geography annotations supported are".
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Author: modified, thank you.