# Map Creation

Merging the MOOC Metrics (`mooc-metrics.csv`) with a map. Merging on country ISO code, as provided by Google Analytics.

In [1]:
import pandas as pd
import geopandas as gpd
import lxml

In [20]:
cols_google = ['country', 'latitude', 'longitude', 'name']
html_scrape = pd.read_html("https://developers.google.com/public-data/docs/canonical/countries_csv", skiprows=1)

country_capitals = html_scrape[0]
country_capitals.columns = cols_google

country_capitals.describe()

Unnamed: 0,latitude,longitude
count,244.0,244.0
mean,16.253109,13.294814
std,27.031206,73.976477
min,-75.250973,-177.156097
25%,-0.30171,-38.092008
50%,16.869235,18.182149
75%,38.965238,49.046734
max,77.553604,179.414413


In [21]:
country_capitals.head()

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


### Countries

Read in the countries and slim down to just the population (for adjusted rates), name, and ISO A2.

In [22]:
ne_50m = '../data/maps/ne_50m_admin_0_countries.geojson'
countries = gpd.read_file(ne_50m)

In [23]:
countries.columns

Index(['abbrev', 'abbrev_len', 'adm0_a3', 'adm0_a3_is', 'adm0_a3_un',
       'adm0_a3_us', 'adm0_a3_wb', 'adm0_dif', 'admin', 'brk_a3', 'brk_diff',
       'brk_group', 'brk_name', 'continent', 'economy', 'featurecla',
       'fips_10', 'formal_en', 'formal_fr', 'gdp_md_est', 'gdp_year',
       'geometry', 'geou_dif', 'geounit', 'gu_a3', 'homepart', 'income_grp',
       'iso_a2', 'iso_a3', 'iso_n3', 'labelrank', 'lastcensus', 'level',
       'long_len', 'mapcolor13', 'mapcolor7', 'mapcolor8', 'mapcolor9', 'name',
       'name_alt', 'name_len', 'name_long', 'name_sort', 'note_adm0',
       'note_brk', 'pop_est', 'pop_year', 'postal', 'region_un', 'region_wb',
       'scalerank', 'sov_a3', 'sovereignt', 'su_a3', 'su_dif', 'subregion',
       'subunit', 'tiny', 'type', 'un_a3', 'wb_a2', 'wb_a3', 'wikipedia',
       'woe_id'],
      dtype='object')

In [24]:
pop_df = countries[['iso_a2', 'pop_est']]
pop_df.head()

Unnamed: 0,iso_a2,pop_est
0,AW,103065.0
1,AF,28400000.0
2,AO,12799293.0
3,AI,14436.0
4,AL,3639453.0


### MOOC Metrics

In [25]:
metrics_f = '../data/canonical/mooc-metrics.csv'
metrics_df = pd.read_csv(metrics_f)

print ("{} rows".format(len(metrics_df)))
metrics_df.head()

1071 rows


Unnamed: 0,year,country,page,new_users,uniq_pg_views,pg_views
0,2017,BD,poetry-and-plays-2017,6975,8331,10151
1,2017,IN,poetry-and-plays-2017,3819,8556,12849
2,2017,NP,poetry-and-plays-2017,1866,2517,3236
3,2017,PK,poetry-and-plays-2017,1372,2171,2916
4,2017,IN,fiction-and-nonfiction-2017,1711,12900,20355


In [26]:
# sanity check to make sure we can merge
list_of_map_countries = country_capitals['country'].tolist()
test_country = lambda x: x in list_of_map_countries

for country in metrics_df['country'].unique().tolist():
    if (test_country(country) == False):
        print (country)

ZZ
SS
BQ
CW
MF


In [7]:
country_df[country_df.name_sort == 'Kosovo']['iso_a2']

119    -99
Name: iso_a2, dtype: object

In [27]:
bad_isos = ['nan', 'ZZ', 'GF', 'RE', 'BQ', 'YT', 'MQ', 'SS', 'CW']

mask_bad_iso = lambda x: (metrics_df['country'].isin(bad_isos))

metrics_df[mask_bad_iso]

Unnamed: 0,year,country,page,new_users,uniq_pg_views,pg_views
162,2016,ZZ,how-writers-write-fiction-2016,64,87,118
227,2017,ZZ,fiction-and-nonfiction-2017,48,112,210
261,2017,ZZ,poetry-and-plays-2017,31,49,63
367,2017,SS,poetry-and-plays-2017,17,19,21
438,2016,ZZ,whitman-2016,18,44,56
469,2015,ZZ,how-writers-write-fiction-2015,35,239,315
498,2017,SS,fiction-and-nonfiction-2017,10,11,15
545,2016,ZZ,how-writers-write-fiction-2015,7,9,10
584,2017,GF,poetry-and-plays-2017,6,6,6
745,2017,RE,poetry-and-plays-2017,4,5,5


In [30]:
# reassign the ISO_A2 for kosovo
# Change source to change it easier

mask_koso_src = (pop_df['iso_a2'] == -99)
pop_df.loc[mask_koso_src, 'iso_a2'] = 'XK'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [31]:
# keep only the identified countries
mask_good_iso = lambda x: (~metrics_df['country'].isin(bad_isos))

export_df = metrics_df[mask_good_iso]

print ("{} rows".format(len(export_df)))

1054 rows


### Merging

In [35]:
# get rid of antartica
pop_df = pop_df[pop_df.iso_a2 != 'AQ']

# rename the columns
pop_df.columns = ['iso', 'population']

### Merging

1. Google's country capitals merge with Natural Earth populations.
... tbd

In [39]:
pop_df.head()

Unnamed: 0,iso,population
0,AW,103065.0
1,AF,28400000.0
2,AO,12799293.0
3,AI,14436.0
4,AL,3639453.0


In [38]:
export_df.head()

Unnamed: 0,year,country,page,new_users,uniq_pg_views,pg_views
0,2017,BD,poetry-and-plays-2017,6975,8331,10151
1,2017,IN,poetry-and-plays-2017,3819,8556,12849
2,2017,NP,poetry-and-plays-2017,1866,2517,3236
3,2017,PK,poetry-and-plays-2017,1372,2171,2916
4,2017,IN,fiction-and-nonfiction-2017,1711,12900,20355


In [40]:
country_capitals.head()

Unnamed: 0,country,latitude,longitude,name
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla


In [43]:
geo_df = country_capitals.merge(pop_df, left_on='country', right_on='iso', how='left')

print (geo_df.shape)
geo_df.head()

(245, 6)


Unnamed: 0,country,latitude,longitude,name,iso,population
0,AD,42.546245,1.601554,Andorra,AD,83888.0
1,AE,23.424076,53.847818,United Arab Emirates,AE,4798491.0
2,AF,33.93911,67.709953,Afghanistan,AF,28400000.0
3,AG,17.060816,-61.796428,Antigua and Barbuda,AG,85632.0
4,AI,18.220554,-63.068615,Anguilla,AI,14436.0


In [44]:
geo_df.drop('iso', axis=1, inplace=True)

In [63]:
rate_df = export_df.merge(geo_df[['country', 'population']], on='country')
rate_df.head(10)

Unnamed: 0,year,country,page,new_users,uniq_pg_views,pg_views,population
0,2017,BD,poetry-and-plays-2017,6975,8331,10151,156050900.0
1,2017,BD,fiction-and-nonfiction-2017,108,283,496,156050900.0
2,2016,BD,how-writers-write-fiction-2016,52,80,115,156050900.0
3,2016,BD,flash-write-2016,32,140,208,156050900.0
4,2015,BD,how-writers-write-fiction-2015,29,165,232,156050900.0
5,2016,BD,how-writers-write-fiction-2015,10,15,16,156050900.0
6,2016,BD,whitman-2016,10,12,12,156050900.0
7,2017,BD,how-writers-write-fiction-2016,2,2,2,156050900.0
8,2017,IN,poetry-and-plays-2017,3819,8556,12849,1166079000.0
9,2017,IN,fiction-and-nonfiction-2017,1711,12900,20355,1166079000.0


In [64]:
# Calculate population adjusted rates
rate_df['new_users_rate'] = rate_df['new_users'] / rate_df['population']
rate_df['uniq_pg_views_rate'] = rate_df['uniq_pg_views'] / rate_df['population']
rate_df['pg_views_rate'] = rate_df['pg_views'] / rate_df['population']

### Convert lat long to point using GeoPandas.

```
from geopandas import GeoDataFrame
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(df.Lon, df.Lat)]
df = df.drop(['Lon', 'Lat'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(df, crs=crs, geometry=geometry)
```

In [50]:
from shapely.geometry import Point

In [51]:
pts = [Point(xy) for xy in zip(geo_df.longitude, geo_df.latitude)]
geo_df = geo_df.drop(['latitude', 'longitude'], axis=1)

crs = {'init': 'epsg:4326'}

In [56]:
geo_export = gpd.GeoDataFrame(geo_df, crs=crs, geometry=pts)
geo_export.head()

Unnamed: 0,country,name,population,geometry
0,AD,Andorra,83888.0,POINT (1.601554 42.546245)
1,AE,United Arab Emirates,4798491.0,POINT (53.847818 23.424076)
2,AF,Afghanistan,28400000.0,POINT (67.709953 33.93911)
3,AG,Antigua and Barbuda,85632.0,POINT (-61.79642800000001 17.060816)
4,AI,Anguilla,14436.0,POINT (-63.068615 18.220554)


### Single File

Add the classes as a list into the raw GeoJSON.

In [57]:
import json
country_json_str = geo_export.to_json()

In [58]:
country_json = json.loads(country_json_str)
country_json.keys()

dict_keys(['type', 'features'])

In [59]:
country_json['features'][0]

{'geometry': {'coordinates': [1.6015540000000001, 42.546245], 'type': 'Point'},
 'id': '0',
 'properties': {'country': 'AD', 'name': 'Andorra', 'population': 83888.0},
 'type': 'Feature'}

In [22]:
# each feature is saved in a features list.
# we can use a simple list comprehension if we can use the ISO as a lookup key

# create the lookup for said key


In [66]:
rate_df.head()

Unnamed: 0,year,country,page,new_users,uniq_pg_views,pg_views,new_users_rate,uniq_pg_views_rate,pg_views_rate
0,2017,BD,poetry-and-plays-2017,6975,8331,10151,4.469696e-05,5.338643e-05,6.50493e-05
1,2017,BD,fiction-and-nonfiction-2017,108,283,496,6.92082e-07,1.813511e-06,3.17845e-06
2,2016,BD,how-writers-write-fiction-2016,52,80,115,3.332246e-07,5.126533e-07,7.369391e-07
3,2016,BD,flash-write-2016,32,140,208,2.050613e-07,8.971433e-07,1.332899e-06
4,2015,BD,how-writers-write-fiction-2015,29,165,232,1.858368e-07,1.057347e-06,1.486695e-06


In [65]:
rate_df.drop(['population'], 
             axis=1, inplace=True)

In [67]:
list_of_classes = rate_df.to_dict('records')

iso_lookup = {}

for record in list_of_classes:
    iso = record['country']
    if (iso in iso_lookup):
        iso_lookup[iso].append(record)
    else:
        iso_lookup[iso] = []
        iso_lookup[iso].append(record)

In [68]:
for country in country_json['features']:
    iso = country['properties']['country']
    
    if (iso in iso_lookup):
        #print (iso, len(iso_lookup[iso]))
        country['properties']['classes'] = iso_lookup[iso]
    else:
        country['properties']['classes'] = []

In [74]:
# filter out the countries with no classes
country_json['features'] = [feat for feat in country_json['features'] if (len(feat['properties']['classes']) > 0)]

In [75]:
with open('../data/canonical/capital-classes.geojson', 'w') as outfile:
    json.dump(country_json, outfile)