# Data viz

In [None]:
import folium
import pandas as pd
import numpy as np
from geopy import GeoNames
from helpers import *
import unicodedata


In [None]:
# Read the semicolon-separated grant export, then keep only the part of each
# University value that precedes the first ' -' separator.
grants = pd.read_csv('P3_GrantExport.csv', sep=";")
grants['University'] = (
    grants['University']
    .astype('str')
    .map(lambda name: name.partition(' -')[0])
)

Load data about Swiss localities and the cantons they're in.

Add some locality names by hand to this dataframe (e.g. Zürcher => ZH, Vaud => VD, Valais => VS)

In [None]:
# Load the locality spreadsheet: sheet index 1, columns 0 and 2, which are
# renamed to 'Canton' and 'Locality' below.
# NOTE(review): `sheetname` / `parse_cols` are the older pandas spellings; newer
# pandas releases name them `sheet_name` / `usecols` — switch when upgrading.
swiss_locs = pd.read_excel('be-b-00.04-osv-01.xls', sheetname=1, parse_cols=[0,2])
swiss_locs.columns = ['Canton', 'Locality']

# Hand-written extra "localities": adjectival / alternative canton names ...
add_data = pd.DataFrame({'Canton': ['AG', 'BE', 'BS', 'FR', 'FR', 'JU', 'JU', 'JU', 'ZH', 'VD', 'VD', 'VD', 'VS', 'VS', 'VS', 'VS'],
                       'Locality': ['Aargauer', 'Berner', 'Basler', 'Fribourgeois', 'Fribourgeoise', 'Jura', 'Jurassien', 'Jurassienne', 'Zürcher', 'Vaud', 'Vaudois', 'Vaudoise', 'Valais', 'Valaisan', 'Valaisanne', 'Walliser']})
# ... and institution acronyms mapped to their canton.
add_instit = pd.DataFrame({'Canton': ['VD', 'VD', 'VD', 'ZH', 'ZH'],
                    'Locality': ['EPFL', 'ETHL', 'CHUV', 'ETHZ', 'EPFZ']})
add = pd.concat([add_instit, add_data])
swiss_locs = pd.concat([add, swiss_locs])
# reset_index returns a new frame; assign it back, otherwise the concatenated
# frame keeps the duplicated index labels of its three source frames.
swiss_locs = swiss_locs.reset_index(drop=True)
swiss_locs.head(10)

In [None]:
# Turn the textual "missing value" spellings into real NaN values.
grants = (
    grants
    .replace('', np.nan)
    .replace('nan', np.nan)
    .replace('Nicht zuteilbar', np.nan)
)
grants.shape

According to the documentation, the 'University' field is empty when the research is carried out at a non-Swiss institution. So we can safely get rid of all those entries.

In [None]:
# Keep only the rows that have a University value.
grants = grants[grants['University'].notnull()]
grants.shape

In [None]:
# Distinct university names present in the grants table.
unis = grants['University'].unique()
# Distinct institution names, ignoring rows where Institution is missing.
institutions = grants['Institution'].dropna().unique()

### Google maps API geocoding

In [None]:
import googlemaps

# The API key is read from the first line of the local file 'key' so that it
# is not hardcoded in the notebook.
with open('key', 'r') as f:
    key = f.readline().strip()
gmaps = googlemaps.Client(key)
       
def get_geocodes(name_arr, dic):
    """Fill `dic` in place with a name -> region short-name entry for every new name.

    For each element of `name_arr` that is not already a key of `dic`, the
    name is sent to `gmaps.geocode`. The stored value is the `short_name` of
    the first result's address component typed 'administrative_area_level_1'
    (the last such component if there are several), or '' when the query
    returns nothing or no such component exists. Names already in `dic` are
    skipped, so no query is issued for them.
    """
    for name in name_arr:
        if name in dic:
            continue  # already resolved (or already known to be unresolvable)
        results = gmaps.geocode(name)
        region = ''
        components = results[0]['address_components'] if results else []
        for component in components:
            if 'administrative_area_level_1' in component['types']:
                region = component['short_name']
        dic[name] = region

In [None]:
# Build the university -> canton mapping: start from the mapping stored in
# 'unis_dict.json', geocode only the names that are not in it yet, then write
# the result back to the same file.
# NOTE(review): load_dict / save_dict are not defined in this notebook —
# presumably they come from `from helpers import *`; confirm how load_dict
# behaves when the JSON file does not exist yet.
unis_dict = {}
unis_dict = load_dict('unis_dict.json')
get_geocodes(unis, unis_dict) # populate unis_dict with uni->canton mapping
save_dict(unis_dict, 'unis_dict.json')

In [None]:
# Build the institution -> canton mapping: start from the mapping stored in
# 'inst_dict.json', geocode only the names that are not in it yet, then write
# the result back to the same file.
# NOTE(review): load_dict / save_dict are not defined in this notebook —
# presumably they come from `from helpers import *`; confirm.
inst_dict = {}
inst_dict = load_dict('inst_dict.json')
get_geocodes(institutions, inst_dict) # populate inst_dict with inst->canton mapping
save_dict(inst_dict, 'inst_dict.json')

In [None]:
def update_full():
    """Return a new dict that merges `unis_dict` and `inst_dict`.

    When a key exists in both, the value from `inst_dict` wins. Neither source
    dictionary is modified.
    """
    merged = dict(unis_dict)
    merged.update(inst_dict)
    return merged

def get_canton(df, full_dict=None):
    """Return the canton for one grants row, or '' when none is known.

    The University name is looked up first. If it has no non-empty canton,
    the Institution name is tried, provided it is not null.

    Parameters
    ----------
    df : mapping-like row (e.g. the Series passed by `DataFrame.apply(axis=1)`)
        Must provide the 'University' and 'Institution' fields.
    full_dict : dict, optional
        Name -> canton mapping to use. When omitted, the mapping is built with
        `update_full()`. Passing a prebuilt mapping avoids re-merging the
        dictionaries for every row.

    Returns
    -------
    The mapped canton value, or '' if neither field yields one.
    """
    if full_dict is None:
        full_dict = update_full()

    # .get instead of [] so a name missing from the mapping falls through to
    # the next candidate rather than raising KeyError.
    canton = full_dict.get(df['University'])
    if canton:
        return canton

    institution = df['Institution']
    if not pd.isnull(institution):
        return full_dict.get(institution, '')
    return ''

In [None]:
# First pass: call get_canton on every row and store the result in a new 'Canton' column.
grants['Canton'] = grants.apply(get_canton, axis=1)

In [None]:
# Shape of the rows whose canton is still empty.
# `.loc` is used because the `.ix` indexer no longer exists in current pandas.
grants.loc[grants.Canton == '', ['University','Institution','Canton']].shape

In [None]:
# Shape of the rows whose canton is not the empty string.
# `.loc` is used because the `.ix` indexer no longer exists in current pandas.
grants.loc[grants.Canton != '', ['University','Institution','Canton']].shape

### Complete university and institution to canton mappings with `swiss_locs` dataframe

In [None]:
def null_check_loc(loc_dict):
    """Try to resolve the entries of `loc_dict` that have an empty value.

    Every key whose value is falsy is compared against each
    (Canton, Locality) row of `swiss_locs`. When `caseless_contains` reports
    a hit, the row's canton is recorded for that key; a later matching row
    overwrites an earlier one.

    Returns a new dict holding only the previously empty keys: the canton
    found, or the original empty value when no row matched. `loc_dict` itself
    is not modified.
    """
    unresolved = {name: canton for name, canton in loc_dict.items() if not canton}
    for name in list(unresolved):
        for canton, locality in zip(swiss_locs['Canton'], swiss_locs['Locality']):
            # Both strings are padded with spaces so that only whole words match.
            # NOTE(review): caseless_contains is defined elsewhere; the argument
            # order (locality first, name second) is kept as written — confirm.
            if caseless_contains(" " + locality + " ", " " + name + " "):
                unresolved[name] = canton
    return unresolved
            

Compute the possible new mappings for the unis dictionary

In [None]:
# Look up the still-unmapped universities in swiss_locs and keep only the
# ones for which a canton was found.
unis_null = null_check_loc(unis_dict)
unis_new = {name: canton for name, canton in unis_null.items() if canton}

Check the changes 'by hand'. In our case there were two errors.

In [None]:
# Manual corrections for the two entries reviewed by hand.
unis_new.update({
    "Inst. universit. romand de Santé au Travail": 'VD',
    "Centre de rech. sur l'environnement alpin": 'VS',
})
unis_new

In [None]:
def update_dict(new_dict, full_dict):
    """Copy every key/value pair of `new_dict` into `full_dict`, in place.

    Existing keys in `full_dict` are overwritten; nothing is returned.
    """
    full_dict.update(new_dict.items())

In [None]:
# Merge the reviewed university mappings into unis_dict, then write it to 'unis_dict.json'.
update_dict(unis_new, unis_dict)
save_dict(unis_dict, 'unis_dict.json')

Compute the possible new mappings for the institutions dictionary

In [None]:
# Look up the still-unmapped institutions in swiss_locs and keep only the
# ones for which a canton was found.
inst_null = null_check_loc(inst_dict)
inst_new = {name: canton for name, canton in inst_null.items() if canton}

Manually correct the wrong or ambiguous matches found above. An empty string means no canton is assigned to that institution.

In [None]:
# Hand-reviewed overrides for the institution matches computed by null_check_loc.
# A two-letter value is the canton assigned by hand; '' clears the automatic
# match so that the institution ends up without a canton.
inst_new["Unité d'Oncologie-Hématologie- Immunologie Hôpital de la Tour"] = 'GE'
inst_new["Unité d'Allergologie-Immunologie Hôpital de la Tour"] = 'GE'
inst_new["UNI: Moscow State University Scientific Rese arch Computer Center  Moscow RUS"] = ''
inst_new["Stift. Pro Kloster St. Johann in Müstair"] = 'GR'
inst_new["Services généraux sécurité et santé au travail EPFL - SB - SB-SG - SB-SST"] = 'VD'
inst_new["Schweiz. Institut für Alternativen zu Tierversuchen SIAT Technopark"] = ''
inst_new["Schweiz. Fachstelle für behindertengerechtes Bauen"] = ''
inst_new["Regionalspital Biel Urologie"] = 'BE'
inst_new["Rapp Trans AG Verkehrs- und Transportberatung"] = ''
inst_new["PAN - Büro für Wald und Landschaft"] = ''
inst_new["Musée Forel"] = 'VD'
inst_new["Marie Meierhof Institut für das Kind"] = 'ZH'
inst_new["Laboratoire interdisciplinaire de performance intégrée au projet EPFL - ENAC - IA - LIPID"] = 'VD'
inst_new["Laboratoire de recherche sur les particules atmosphériques EPFL - ENAC - IIE - APRL"] = 'VD'
inst_new["Laboratoire de mécanique des roches EPFL - ENAC - IIC - LMR"] = 'VD'
inst_new["Laboratoire d'algorithmique pour l'information en réseaux EPFL - IC - IIF - ARNI"] = 'VD'
inst_new["Kantonsbibliothek Appenzell A.Rh."] = 'AR'
inst_new["Kantonsschule Appenzell A.Rh."] = 'AR'
inst_new["Institut universitaire romand de Santé au Travail"] = 'VD'
inst_new["Institut suisse de recherche expérimentale sur le cancer EPFL SV ISREC"] = 'VD'
inst_new["Institut des sols, roches et fondations Laboratoire de mécanique des roches"] = ''
inst_new["Institut Suisse de Recherche Expérimentale sur le Cancer EPFL SV ISREC"] = 'VD'
inst_new["Institut Suisse de Recherche Expérimentale sur le Cancer EPFL - SV - ISREC"] = 'VD'
inst_new["Institut Suisse de Recherche Expérimentale sur le Cancer (ISREC)"] = 'VD'
inst_new["Institut ERASM Etude et recherches appliquées à la sociologie et au marketing"] = ''
inst_new["FE Wirtschafts- und Sozialwissenschaften Eidg. Forschungsanstalt für Wald Schnee und Landschaft WSL"] = 'ZH'
# NOTE(review): Roche's headquarters are in Basel (BS) — confirm that 'AG' is the intended canton here.
inst_new["F. Hoffmann-La Roche AG PDRD"] = 'AG'
inst_new["Erziehungsdirektion des Kantons Appenzell A.Rh."] = 'AR'
inst_new["Division d'Immunologie Moléculaire Institut Ludwig de Recherche sur le Cancer"] = 'VD'
inst_new["Chaire de théorie ergodique et géométrique des groupes EPFL - SB - MATHGEOM - EGG"] = 'VD'
inst_new["Chaire de théorie ergodique et géométrique des groupes EPFL - SB - IMB - EGG"] = 'VD'
inst_new["Centre suisse de recherche et d'information sur le vitrail"] = 'FR'
inst_new["Centre de recherche sur l'Asie moderne (IUHEI/IUED)"] = ''
inst_new["Centre de Recherche sur le Phénomène Urbain (CREPU/EAUG)"] = ''
inst_new["Centre d'initiation au cinéma et aux communications"] = ''
inst_new["Centre Ludwig de l'Université de Lausanne pour la recherche sur le cancer"] = 'VD'
inst_new["Archäologie, Bauhütte Stiftung Pro Kloster St. Johann in Müstair"] = 'GR'
inst_new["Arbeitsgemeinschaft Swissaid / Fastenopfer / Brot für alle / Helvetas / Caritas"] = ''
inst_new["Angewandte Gewässerökologie Forschungszentrum für Limnologie EAWAG Kastanienbaum"] = ''
inst_new["Advokaturbüro Arn + Friedrich"] = ''



In [None]:
# Merge the reviewed institution mappings into inst_dict, then write it to 'inst_dict.json'.
update_dict(inst_new, inst_dict)
save_dict(inst_dict, 'inst_dict.json')

Update the rows in `grants` where we now have canton values, either by the University or Institution fields.

In [None]:
# Recompute the 'Canton' column so that it reflects the updated unis_dict / inst_dict mappings.
grants['Canton'] = grants.apply(get_canton, axis=1)