# Map Creation

Merging the MOOC Metrics (`mooc-metrics.csv`) with a map. Merging on country ISO code, as provided by Google Analytics.

In [1]:
import pandas as pd
import geopandas as gpd

### Countries

Read in the countries and slim down to just the population (for adjusted rates), name, and ISO A2.

In [2]:
ne_50m = '../data/maps/ne_50m_admin_0_countries.geojson'
countries = gpd.read_file(ne_50m)

In [3]:
countries.columns

Index(['abbrev', 'abbrev_len', 'adm0_a3', 'adm0_a3_is', 'adm0_a3_un',
       'adm0_a3_us', 'adm0_a3_wb', 'adm0_dif', 'admin', 'brk_a3', 'brk_diff',
       'brk_group', 'brk_name', 'continent', 'economy', 'featurecla',
       'fips_10', 'formal_en', 'formal_fr', 'gdp_md_est', 'gdp_year',
       'geometry', 'geou_dif', 'geounit', 'gu_a3', 'homepart', 'income_grp',
       'iso_a2', 'iso_a3', 'iso_n3', 'labelrank', 'lastcensus', 'level',
       'long_len', 'mapcolor13', 'mapcolor7', 'mapcolor8', 'mapcolor9', 'name',
       'name_alt', 'name_len', 'name_long', 'name_sort', 'note_adm0',
       'note_brk', 'pop_est', 'pop_year', 'postal', 'region_un', 'region_wb',
       'scalerank', 'sov_a3', 'sovereignt', 'su_a3', 'su_dif', 'subregion',
       'subunit', 'tiny', 'type', 'un_a3', 'wb_a2', 'wb_a3', 'wikipedia',
       'woe_id'],
      dtype='object')

In [4]:
country_df = countries[['iso_a2', 'name_sort', 'scalerank', 'pop_est', 'geometry']]
country_df.head()

Unnamed: 0,iso_a2,name_sort,scalerank,pop_est,geometry
0,AW,Aruba,3,103065.0,"POLYGON ((-69.89912109375 12.45200195312499, -..."
1,AF,Afghanistan,1,28400000.0,"POLYGON ((74.89130859375001 37.231640625, 74.8..."
2,AO,Angola,1,12799293.0,"(POLYGON ((14.19082031250008 -5.8759765625, 14..."
3,AI,Anguilla,1,14436.0,"POLYGON ((-63.001220703125 18.22177734374999, ..."
4,AL,Albania,1,3639453.0,"POLYGON ((20.06396484375 42.547265625, 20.1035..."


### MOOC Metrics

In [5]:
metrics_f = '../data/canonical/mooc-metrics.csv'
metrics_df = pd.read_csv(metrics_f)

print ("{} rows".format(len(metrics_df)))
metrics_df.head()

1071 rows


Unnamed: 0,year,country,page,new_users,uniq_pg_views,pg_views
0,2017,BD,poetry-and-plays-2017,6975,8331,10151
1,2017,IN,poetry-and-plays-2017,3819,8556,12849
2,2017,NP,poetry-and-plays-2017,1866,2517,3236
3,2017,PK,poetry-and-plays-2017,1372,2171,2916
4,2017,IN,fiction-and-nonfiction-2017,1711,12900,20355


In [6]:
# sanity check to make sure we can merge
list_of_map_countries = country_df['iso_a2'].tolist()
test_country = lambda x: x in list_of_map_countries

for country in metrics_df['country'].unique().tolist():
    if (test_country(country) == False):
        print (country)

XK
ZZ
nan
GF
RE
BQ
YT
MQ


| ISO A-2 | Country |
|---|---|
| `XK` | Temporary for Kosovo |
| `ZZ` | Unknown |
| `nan` | Not available, pandas error |
| `GF` | [French Guiana](https://en.wikipedia.org/wiki/French_Guiana) |
| `RE` | [Reunion](https://en.wikipedia.org/wiki/R%C3%A9union)
| `BQ` | [Bonaire, Sint Eustatius and Saba](https://en.wikipedia.org/wiki/Caribbean_Netherlands) |
| `YT` | [Mayotte](https://en.wikipedia.org/wiki/Mayotte) |
| `MT` | [Martinique](https://en.wikipedia.org/wiki/Martinique) |


From [Wikipedia article on ISO 3166 A-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2)...

<img src="https://upload.wikimedia.org/wikipedia/commons/d/d2/Kingdom_of_the_Netherlands_location_tree.svg"></img>

In [7]:
country_df[country_df.name_sort == 'Kosovo']['iso_a2']

119    -99
Name: iso_a2, dtype: object

In [8]:
bad_isos = ['nan', 'ZZ', 'GF', 'RE', 'BQ', 'YT', 'MQ']

mask_bad_iso = lambda x: (metrics_df['country'].isin(bad_isos))

metrics_df[mask_bad_iso]

Unnamed: 0,year,country,page,new_users,uniq_pg_views,pg_views
162,2016,ZZ,how-writers-write-fiction-2016,64,87,118
227,2017,ZZ,fiction-and-nonfiction-2017,48,112,210
261,2017,ZZ,poetry-and-plays-2017,31,49,63
438,2016,ZZ,whitman-2016,18,44,56
469,2015,ZZ,how-writers-write-fiction-2015,35,239,315
545,2016,ZZ,how-writers-write-fiction-2015,7,9,10
584,2017,GF,poetry-and-plays-2017,6,6,6
745,2017,RE,poetry-and-plays-2017,4,5,5
788,2016,BQ,how-writers-write-fiction-2015,2,5,5
904,2017,YT,poetry-and-plays-2017,2,2,2


In [9]:
# reassign the ISO_A2 for kosovo

#mask_koso = (metrics_df['country'] == 'XK')
#metrics_df.loc[mask_koso, 'country'] = -99

# Change source to change it easier
mask_koso_src = (country_df['name_sort'] == 'Kosovo')
country_df.loc[mask_koso_src, 'iso_a2'] = 'XK'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [10]:
# Drop the rows from the dataframe which are unidentifiable
mask_zz = (metrics_df['country'] == 'ZZ')

metrics_df = metrics_df.loc[~mask_zz]

In [11]:
# keep only the identified countries
mask_good_iso = lambda x: (~metrics_df['country'].isin(bad_isos))

export_df = metrics_df[mask_good_iso]

print ("{} rows".format(len(export_df)))

1057 rows


### Merging

In [12]:
# get rid of antartica
country_df = country_df[country_df.iso_a2 != 'AQ']

# rename the columns
country_df.columns = ['iso', 'country', 'scalerank', 'population', 'geometry']

# drop the scale rank column
country_df.drop('scalerank', axis=1, inplace=True)

In [13]:
country_df

Unnamed: 0,iso,country,population,geometry
0,AW,Aruba,103065.0,"POLYGON ((-69.89912109375 12.45200195312499, -..."
1,AF,Afghanistan,28400000.0,"POLYGON ((74.89130859375001 37.231640625, 74.8..."
2,AO,Angola,12799293.0,"(POLYGON ((14.19082031250008 -5.8759765625, 14..."
3,AI,Anguilla,14436.0,"POLYGON ((-63.001220703125 18.22177734374999, ..."
4,AL,Albania,3639453.0,"POLYGON ((20.06396484375 42.547265625, 20.1035..."
5,AX,Aland,27153.0,"(POLYGON ((20.611328125 60.04067382812499, 20...."
6,AD,Andorra,83888.0,"POLYGON ((1.7060546875 42.50332031249999, 1.67..."
7,AE,United Arab Emirates,4798491.0,(POLYGON ((53.92783203125001 24.17719726562498...
8,AR,Argentina,40913584.0,(POLYGON ((-64.54916992187498 -54.716210937499...
9,AM,Armenia,2967004.0,(POLYGON ((45.55234375000006 40.61606445312506...


In [14]:
# rename before merge
export_df.rename(columns={'country': 'iso'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [15]:
rate_df = export_df.merge(country_df.drop('geometry', axis=1), on='iso')
rate_df.head(10)

Unnamed: 0,year,iso,page,new_users,uniq_pg_views,pg_views,country,population
0,2017,BD,poetry-and-plays-2017,6975,8331,10151,Bangladesh,156050900.0
1,2017,BD,fiction-and-nonfiction-2017,108,283,496,Bangladesh,156050900.0
2,2016,BD,how-writers-write-fiction-2016,52,80,115,Bangladesh,156050900.0
3,2016,BD,flash-write-2016,32,140,208,Bangladesh,156050900.0
4,2015,BD,how-writers-write-fiction-2015,29,165,232,Bangladesh,156050900.0
5,2016,BD,how-writers-write-fiction-2015,10,15,16,Bangladesh,156050900.0
6,2016,BD,whitman-2016,10,12,12,Bangladesh,156050900.0
7,2017,BD,how-writers-write-fiction-2016,2,2,2,Bangladesh,156050900.0
8,2017,IN,poetry-and-plays-2017,3819,8556,12849,India,1166079000.0
9,2017,IN,fiction-and-nonfiction-2017,1711,12900,20355,India,1166079000.0


In [16]:
# Calculate population adjusted rates
rate_df['new_users_rate'] = rate_df['new_users'] / rate_df['population']
rate_df['uniq_pg_views_rate'] = rate_df['uniq_pg_views'] / rate_df['population']
rate_df['pg_views_rate'] = rate_df['pg_views'] / rate_df['population']

In [17]:
rate_df

Unnamed: 0,year,iso,page,new_users,uniq_pg_views,pg_views,country,population,new_users_rate,uniq_pg_views_rate,pg_views_rate
0,2017,BD,poetry-and-plays-2017,6975,8331,10151,Bangladesh,1.560509e+08,4.469696e-05,5.338643e-05,6.504930e-05
1,2017,BD,fiction-and-nonfiction-2017,108,283,496,Bangladesh,1.560509e+08,6.920820e-07,1.813511e-06,3.178450e-06
2,2016,BD,how-writers-write-fiction-2016,52,80,115,Bangladesh,1.560509e+08,3.332246e-07,5.126533e-07,7.369391e-07
3,2016,BD,flash-write-2016,32,140,208,Bangladesh,1.560509e+08,2.050613e-07,8.971433e-07,1.332899e-06
4,2015,BD,how-writers-write-fiction-2015,29,165,232,Bangladesh,1.560509e+08,1.858368e-07,1.057347e-06,1.486695e-06
5,2016,BD,how-writers-write-fiction-2015,10,15,16,Bangladesh,1.560509e+08,6.408166e-08,9.612249e-08,1.025307e-07
6,2016,BD,whitman-2016,10,12,12,Bangladesh,1.560509e+08,6.408166e-08,7.689799e-08,7.689799e-08
7,2017,BD,how-writers-write-fiction-2016,2,2,2,Bangladesh,1.560509e+08,1.281633e-08,1.281633e-08,1.281633e-08
8,2017,IN,poetry-and-plays-2017,3819,8556,12849,India,1.166079e+09,3.275078e-06,7.337409e-06,1.101898e-05
9,2017,IN,fiction-and-nonfiction-2017,1711,12900,20355,India,1.166079e+09,1.467310e-06,1.106271e-05,1.745593e-05


In [18]:
rate_df.to_csv('../data/canonical/mooc-metrics-population.csv', index=False)

In [20]:
country_df.to_file('../data/maps/countries-pop.geojson', driver='GeoJSON')