Subregion analysis for GPS data #44

Merged
merged 18 commits on May 4, 2021
Changes from 12 commits
8 changes: 6 additions & 2 deletions README.md
@@ -75,12 +75,16 @@ att_scn: Counts the types of scenes each gender occurs with

### Geography-Based

geo_ctr: Counts the number of images from each country
geo_ctr: Counts the number of images from each region

geo_tag: Counts the number of tags from each country, as well as extracts AlexNet features pretrained on ImageNet for each tag, grouping by subregion
geo_tag: Counts the number of tags from each region, as well as extracts AlexNet features pretrained on ImageNet for each tag, grouping by subregion

geo_lng: Counts the languages that make up the image tags, and whether or not they are local to the country the image is from. Also extracts image-level features to compare whether locals and tourists portray a country differently

Note: Geography-Based analyses require a mapping from images to locations. The 2 primary ways we've encountered these mappings in existing datasets are geography labels (i.e. string-formatted locations like 'Manhattan') and GPS labels (latitude and longitude coordinate pairs). Our analyses support both types of geography mappings. Namely, the user should specify in their dataset class the `geography_info_type` to be one of the following:
Collaborator: "The 2 primary ways we've encountered these mappings in existing datasets are geography labels" -> something like "The 2 formats of geography annotations supported are"

Contributor Author: modified, thank you

- 'GPS_LABEL': datasets with mappings from image to GPS coordinates
- 'STRING_FORMATTED_LABEL': datasets with mappings from image to string-formatted labels
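A minimal sketch of what declaring this in a dataset class might look like. The class names and docstrings here are illustrative assumptions; only the `geography_info_type` and `geography_label_string_type` attribute names come from this PR's notebook code.

```python
# Hypothetical dataset classes; attribute names follow the analysis notebook,
# everything else is an assumption for illustration.

class GPSDataset:
    """Dataset whose images map to (latitude, longitude) coordinate pairs."""
    def __init__(self):
        self.geography_info_type = "GPS_LABEL"

class StringLabelDataset:
    """Dataset whose images map to string-formatted locations like 'Manhattan'."""
    def __init__(self):
        self.geography_info_type = "STRING_FORMATTED_LABEL"
        # The notebook further distinguishes country vs. region string labels:
        self.geography_label_string_type = "REGION_LABEL"
```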

## Potential Environment Issues
- If `FileNotFoundError: [Errno 2] No such file or directory` appears when importing basemap at `epsgf = open(os.path.join(pyproj_datadir,'epsg'))`, change the `PROJ_LIB` variable as suggested [here](https://stackoverflow.com/questions/58683341/basemap-wont-import-because-epsg-file-or-directory-cant-be-found-macos-ana).
In the jupyter notebook, this may involve setting it in a cell like
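The example cell is collapsed in this diff view; a minimal sketch of such a cell, assuming a conda-based install where the proj data lives under `share/proj` (the exact path varies by environment, per the linked answer):

```python
import os

# Point PROJ_LIB at the directory containing the 'epsg' file.
# The conda-derived path below is an assumption; adjust to your install.
conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/anaconda3")
os.environ["PROJ_LIB"] = os.path.join(conda_prefix, "share", "proj")
```

This must run before `from mpl_toolkits.basemap import Basemap`, since basemap reads `PROJ_LIB` at import time.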
191 changes: 155 additions & 36 deletions analysis_notebooks/Geography Analysis.ipynb
@@ -311,10 +311,7 @@
"metadata": {},
"outputs": [],
"source": [
"# check if we use global country labels vs GPS label\n",
"is_country_counts = dataset.geo_boundaries is None\n",
"\n",
"if is_country_counts: \n",
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" counts = pickle.load(open(\"results/{}/geo_ctr.pkl\".format(folder_name), \"rb\"))\n",
" iso3_to_subregion = pickle.load(open('util_files/iso3_to_subregion_mappings.pkl', 'rb'))\n",
" gc = GeonamesCache()\n",
@@ -358,12 +355,21 @@
" for key in iso3_to_scaledpop.keys():\n",
" iso3_to_scaledpop[key] /= min(iso3_to_scaledpop.values())\n",
"\n",
"\n",
"else:\n",
"elif (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"REGION_LABEL\"):\n",
" ctr_dict = pickle.load(open(\"results/{}/geo_ctr_region.pkl\".format(folder_name), \"rb\"))\n",
" counts = ctr_dict[\"region_to_id\"]\n",
" total = sum(counts.values())\n",
" region_count_phrases = []\n",
" for region, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):\n",
" region_count_phrases.append(\"{0}: {1} {2}%\".format(region, count, round(100.*count/total)))\n",
" \n",
"elif dataset.geography_info_type == \"GPS_LABEL\":\n",
" counts_gps = pickle.load(open(\"results/{}/geo_ctr_gps.pkl\".format(folder_name), \"rb\"))\n",
" region_to_id = counts_gps[\"region_to_id\"]\n",
" id_to_gps = counts_gps[\"id_to_gps\"]\n",
" subregion_to_id = counts_gps.get(\"subregion_to_id\", None)\n",
" geo_boundaries = dataset.geo_boundaries\n",
" subregion_boundaries = dataset.subregion_boundaries\n",
" choro_data = pd.read_csv(dataset.choropleth_filepath)\n",
"\n",
" counts = {}\n",
@@ -372,9 +378,15 @@
" total = sum(counts.values())\n",
" country_count_phrases = []\n",
"\n",
"\n",
" for country, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):\n",
" country_count_phrases.append(\"{0}: {1} {2}%\".format(country, count, round(100.*count/total)))\n",
" \n",
" \n",
" subregion_counts = None\n",
" if subregion_to_id is not None:\n",
" subregion_counts = {}\n",
" for subregion in subregion_to_id:\n",
" subregion_counts[subregion] = len(subregion_to_id[subregion])\n",
"\n",
" colors = [\n",
" 'red',\n",
@@ -405,17 +417,21 @@
" print(country_count_phrases[-1-i])\n",
" \n",
"def subregion_counts_num():\n",
" if is_country_counts:\n",
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" print(\"Subregion Counts\\n\")\n",
" total_subregion = sum(subregion_counts.values())\n",
" for subregion, count in sorted(subregion_counts.items(), key=lambda x: x[1], reverse=True):\n",
" print(\"{0}: {1} {2}%\".format(subregion, count, round(100.*count/total_subregion)))\n",
" elif dataset.geography_info_type == \"GPS_LABEL\" and subregion_to_id is not None:\n",
" print(\"Subregion Counts\\n\")\n",
" total_subregion = sum(subregion_counts.values())\n",
" for subregion, count in sorted(subregion_counts.items(), key=lambda x: x[1], reverse=True):\n",
" print(\"{0}: {1} {2}%\".format(subregion, count, round(100.*count/total_subregion)))\n",
" else:\n",
" print(\"Subregion counts for gps labels coming soon\")\n",
"\n",
"\n",
" print(\"Subregion analysis not available\")\n",
" \n",
"def region_map():\n",
" if is_country_counts:\n",
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" fig = plt.figure(figsize=(16, 7))\n",
"\n",
" fontsize = 20\n",
@@ -460,7 +476,8 @@
" plt.show()\n",
"\n",
" print(\"Total countries: {}\".format(len(iso3_to_bin)))\n",
" else:\n",
" \n",
" elif dataset.geography_info_type == \"GPS_LABEL\":\n",
" m = folium.Map()\n",
" folium.GeoJson(geo_boundaries, name=\"geojson\").add_to(m)\n",
" count = 0\n",
@@ -477,21 +494,50 @@
" ).add_to(m)\n",
" count += 1\n",
" return m\n",
" \n",
" elif (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"REGION_LABEL\"):\n",
" m = folium.Map()\n",
" \n",
" region_names = []\n",
" region_counts = []\n",
" for region in counts:\n",
" region_names.append(region)\n",
" region_counts.append(counts[region])\n",
"\n",
" counts_new = {\"region\": region_names, \"region_counts\": region_counts}\n",
" counts_df = pd.DataFrame.from_dict(counts_new)\n",
" \n",
" # Change \"Data\" to column name of csv you want\n",
" folium.Choropleth(\n",
" geo_data=geo_boundaries,\n",
" name=\"choropleth\",\n",
" data=counts_df,\n",
" columns=[\"region\", \"region_counts\"],\n",
" key_on=\"feature.properties.{0}\".format(dataset.geo_boundaries_key_name),\n",
" fill_color=\"BuPu\",\n",
" fill_opacity=0.7,\n",
" line_opacity=0.2,\n",
" legend_name=\"Image Counts over subregion\",\n",
" ).add_to(m)\n",
"\n",
" folium.LayerControl().add_to(m)\n",
" return m\n",
" \n",
"def choropleth():\n",
" if not is_country_counts:\n",
" if dataset.geography_info_type == \"GPS_LABEL\":\n",
" m = folium.Map()\n",
"\n",
" \n",
" # Change \"Data\" to column name of csv you want\n",
" folium.Choropleth(\n",
" geo_data=geo_boundaries,\n",
" name=\"choropleth\",\n",
" data=choro_data,\n",
" columns=[\"Region\", \"Data\"],\n",
" key_on=\"feature.properties.name_1\",\n",
" columns=[\"region\", \"population\"],\n",
" key_on=\"feature.properties.{0}\".format(dataset.geo_boundaries_key_name),\n",
" fill_color=\"BuPu\",\n",
" fill_opacity=0.7,\n",
" line_opacity=0.2,\n",
" legend_name=\"My Data\",\n",
" legend_name=\"Data\",\n",
" ).add_to(m)\n",
"\n",
" folium.LayerControl().add_to(m)\n",
@@ -511,9 +557,10 @@
" count += 1\n",
" return m\n",
" else:\n",
" print(\"No custom subregion data available\")\n",
" print(\"No choropleth available\")\n",
" \n",
"def country_map_population():\n",
" if is_country_counts:\n",
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" fig = plt.figure(figsize=(16, 7))\n",
"\n",
" fontsize = 20\n",
@@ -554,7 +601,7 @@
" fig.suptitle('Dataset representation scaled by country population, logarithmic scale', fontsize=fontsize, y=.95)\n",
" plt.show()\n",
" else:\n",
" print(\"No country information available\")"
" print(\"No population information available, use choropleth instead\")"
]
},
{
@@ -601,7 +648,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Visualization of representation by country"
"Visualization of representation by region"
]
},
{
@@ -676,9 +723,7 @@
"metadata": {},
"outputs": [],
"source": [
"# check if we use global country labels vs GPS label\n",
"is_country = dataset.geo_boundaries is None\n",
"if is_country:\n",
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/6\".format(folder_name))\n",
" info_stats = pickle.load(open(\"results/{}/geo_tag.pkl\".format(folder_name), \"rb\")) #20GB\n",
@@ -731,13 +776,52 @@
" subregion_pvalues_over[p] = tag_info\n",
" else:\n",
" subregion_pvalues_under[p] = tag_info\n",
" \n",
"elif (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"REGION_LABEL\"):\n",
" print(\"Geo_tag work for region label formatted dataset\")\n",
Collaborator: is this print necessary?

Contributor Author: removed, thanks

" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n",
Collaborator: I think "6" has been deprecated for the new names of the analyses, please map accordingly

Contributor Author: done, replaced '6' with "geo_tag"

" os.mkdir(\"results/{0}/6\".format(folder_name))\n",
" info_stats = pickle.load(open(\"results/{}/geo_tag_region.pkl\".format(folder_name), \"rb\")) #20GB\n",
" region_tags = info_stats['region_tags']\n",
" tag_to_region_features = info_stats['tag_to_region_features']\n",
"\n",
" categories = dataset.categories\n",
" total_counts = np.zeros(len(categories))\n",
"\n",
" for region, counts in region_tags.items():\n",
" total_counts = np.add(total_counts, counts)\n",
"\n",
"if not is_country:\n",
" total_counts = total_counts.astype(int)\n",
" sum_total_counts = int(np.sum(total_counts))\n",
"\n",
" if not os.path.exists('checkpoints/{}/6_a.pkl'.format(folder_name)):\n",
Collaborator: same here. if prerun_geo doesn't have these changes either, please add in

Contributor Author: changed 6_a to new naming convention ("geo_tag_6a"). Also added new changes to prerun_geo

" pvalues_over = {} # pvalue : '[region]: [tag] (region num and total num info for now)'\n",
" pvalues_under = {} \n",
" for region, counts in region_tags.items():\n",
" tags_for_region = int(np.sum(counts))\n",
" if tags_for_region < 50: # threshold for region to have at least 50 tags so there are enough samples for analysis\n",
" continue\n",
" for i, count in enumerate(counts):\n",
" this_counts = np.zeros(tags_for_region)\n",
" this_counts[:int(count)] = 1\n",
" that_counts = np.zeros(sum_total_counts - tags_for_region)\n",
" that_counts[:total_counts[i] - int(count)] = 1\n",
" p = stats.ttest_ind(this_counts, that_counts)[1]\n",
" tag_info = '{0}-{1} ({2}/{3} vs {4}/{5})'.format(region, categories[i], int(count), tags_for_region, int(total_counts[i] - count), sum_total_counts - tags_for_region)\n",
" if np.mean(this_counts) > np.mean(that_counts):\n",
" pvalues_over[p] = tag_info\n",
" else:\n",
" pvalues_under[p] = tag_info\n",
" else:\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/6_a.pkl'.format(folder_name), 'rb'))\n",
"\n",
"elif dataset.geography_info_type == \"GPS_LABEL\":\n",
" print(\"Geo_tag work for gps formatted dataset\")\n",
" if not os.path.exists(\"results/{0}/6\".format(folder_name)):\n",
" os.mkdir(\"results/{0}/6\".format(folder_name))\n",
" info_stats = pickle.load(open(\"results/{}/geo_tag_gps.pkl\".format(folder_name), \"rb\")) #20GB\n",
" region_tags = info_stats['region_tags']\n",
" subregion_tags = info_stats.get('subregion_tags', None)\n",
" tag_to_region_features = info_stats['tag_to_region_features']\n",
"\n",
" categories = dataset.categories\n",
@@ -770,8 +854,34 @@
" else:\n",
" pvalues_under, pvalues_over = pickle.load(open('checkpoints/{}/6_a.pkl'.format(folder_name), 'rb'))\n",
"\n",
"def tag_rep_by_country(topn):\n",
" if is_country:\n",
" if subregion_tags is not None:\n",
" subregion_total_counts = np.zeros(len(categories))\n",
" for region, counts in subregion_tags.items():\n",
" subregion_total_counts = np.add(subregion_total_counts, counts)\n",
"\n",
" subregion_total_counts = subregion_total_counts.astype(int)\n",
" sum_subregion_total_counts = int(np.sum(subregion_total_counts))\n",
"\n",
" subregion_pvalues_over = {} \n",
" subregion_pvalues_under = {} \n",
" for region, counts in subregion_tags.items():\n",
" tags_for_region = int(np.sum(counts))\n",
" if tags_for_region < 50: # threshold for subregion to have at least 50 tags so there are enough samples for analysis\n",
" continue\n",
" for i, count in enumerate(counts):\n",
" this_counts = np.zeros(tags_for_region)\n",
" this_counts[:int(count)] = 1\n",
" that_counts = np.zeros(sum_subregion_total_counts - tags_for_region)\n",
" that_counts[:subregion_total_counts[i] - int(count)] = 1\n",
" p = stats.ttest_ind(this_counts, that_counts)[1]\n",
" tag_info = '{0}-{1} ({2}/{3} vs {4}/{5})'.format(region, categories[i], int(count), tags_for_region, int(subregion_total_counts[i] - count), sum_subregion_total_counts - tags_for_region)\n",
" if np.mean(this_counts) > np.mean(that_counts):\n",
" subregion_pvalues_over[p] = tag_info\n",
" else:\n",
" subregion_pvalues_under[p] = tag_info\n",
" \n",
"def tag_rep_by_region(topn):\n",
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" if first_pass:\n",
" to_write[1] = [\"(geo_tag) Overrepresentations of tags by country (tag in country vs tag in rest of the countries):\"]\n",
" for p, content in sorted(pvalues_over.items(), key=lambda x: x[0])[:4]:\n",
@@ -808,7 +918,16 @@
"\n",
"\n",
"def tag_rep_by_subregion(topn):\n",
" if is_country:\n",
" if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" print(\"By Subregion\\n\")\n",
" print('Over represented\\n')\n",
" for p, content in sorted(subregion_pvalues_over.items(), key=lambda x: x[0])[:topn]:\n",
" print('{0}: {1}'.format(round(p, 4), content))\n",
" print('\\nUnder represented\\n')\n",
" for p, content in sorted(subregion_pvalues_under.items(), key=lambda x: x[0])[:topn]:\n",
" print('{0}: {1}'.format(round(p, 4), content))\n",
" \n",
" elif dataset.geography_info_type == \"GPS_LABEL\" and subregion_tags is not None:\n",
" print(\"By Subregion\\n\")\n",
" print('Over represented\\n')\n",
" for p, content in sorted(subregion_pvalues_over.items(), key=lambda x: x[0])[:topn]:\n",
@@ -819,7 +938,7 @@
" else:\n",
" print(\"No subregion data for gps-formatted datasets\")\n",
"\n",
"if is_country:\n",
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" import warnings\n",
" warnings.filterwarnings(\"ignore\")\n",
"\n",
@@ -931,9 +1050,9 @@
" svm_options.append(('{0} in {1}: {2}% and {3}x'.format(phrase, region, round(100.*acc, 3), round(value, 3)), phrase))\n",
"\n",
"def show_svm_tag(tag, num):\n",
" if not is_country:\n",
" if not (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" print(\"SVM metric not setup for gps datasets\")\n",
" if is_country:\n",
" else:\n",
" if tag is None:\n",
" return\n",
" this_info = pickle.load(open('results/{0}/{1}/{2}_info.pkl'.format(folder_name, 6, tag), 'rb'))\n",
@@ -1018,7 +1137,7 @@
" print(\"Out: Correct\")\n",
" else:\n",
" to_write[2].append(\"Out: Correct\")\n",
" display_chunk(False, False, to_save, 'd')\n"
" display_chunk(False, False, to_save, 'd')"
]
},
{
@@ -1033,7 +1152,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Over- and under- representations of tags by country. The first fraction shows how many of this country's tags are made up of this one, and the second fraction shows how many of all of the country's tags are made up of this one."
"Over- and under- representations of tags by region. The first fraction shows how many of this region's tags are made up of this one, and the second fraction shows how many of all of the region's tags are made up of this one."
]
},
{
@@ -1042,7 +1161,7 @@
"metadata": {},
"outputs": [],
"source": [
"interact(tag_rep_by_country, topn=widgets.IntSlider(min=1, max=30, step=1, value=10));"
"interact(tag_rep_by_region, topn=widgets.IntSlider(min=1, max=30, step=1, value=10));"
]
},
{
@@ -1075,7 +1194,7 @@
"metadata": {},
"outputs": [],
"source": [
"if is_country:\n",
"if (dataset.geography_info_type == \"STRING_FORMATTED_LABEL\" and dataset.geography_label_string_type == \"COUNTRY_LABEL\"):\n",
" num_widget = widgets.IntSlider(min=1, max=20, step=1, value=5)\n",
" tag_widget = widgets.Dropdown(options=svm_options, layout=Layout(width='400px'))\n",
" all_things = [widgets.Label('Tag, acc, acc/acc_random',layout=Layout(padding='0px 0px 0px 5px', width='170px')), tag_widget, widgets.Label('Num',layout=Layout(padding='0px 5px 0px 40px', width='80px')), num_widget]\n",