In [95]:
%matplotlib inline

import numpy as np
import requests
import re
import pickle
import pandas as pd
import matplotlib.pyplot as pl

In [168]:
GAPI_KEY = 'AIzaSyAKgQXny3tOfMtUj9PCHDL-lBqBMBdTd-k'

In [169]:
cols = ['Project Number', 'Institution', 'University', 'Approved Amount']
na_values = ['data not included in P3', 'Nicht zuteilbar - NA']

dtypes = {
        'Approved Amount': np.float64
}

raw = pd.read_csv('P3_GrantExport.csv', sep = ';', na_values=na_values, index_col='Project Number', dtype=dtypes, usecols=cols)
df = raw.dropna()

In [170]:
df.sample(10)

Unnamed: 0_level_0,Institution,University,Approved Amount
Project Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
130046,Hôpitaux Universitaires de Genève Service d' I...,HES de Suisse occidentale - HES-SO,334080.0
4943,Institut de chimie Université de Neuchâtel,Université de Neuchâtel - NE,165415.0
168023,"Institut für Lebensmittelwissenschaften, Ernäh...",ETH Zürich - ETHZ,600093.0
65092,Geologisch-Paläontologisches Institut Universi...,Universität Basel - BS,244700.0
42790,Parc Scientifique EPF - Lausanne,EPF Lausanne - EPFL,3000.0
25809,Laboratirio di Microbiologia Applicata DACD-SUPSI,SUP della Svizzera italiana - SUPSI,5000.0
27884,Kinderspital Zürich Universitäts-Kinderklinik,Universität Zürich - ZH,659832.0
938,Institut für Populäre Kulturen Universität Zürich,Universität Zürich - ZH,46400.0
103623,Département de patristique et d'histoire de l'...,Université de Fribourg - FR,158805.0
66113,Geologisches Institut ETH Zürich,ETH Zürich - ETHZ,1504384.0


In [171]:
df.index.is_unique

True

In [172]:
with_canton = df.copy()

word_to_canton = {
    'bern': 'BE',
    'lausanne': 'VD',
    'genève': 'GE',
    'geneva': 'GE',
    'luzern': 'LU',
    'zürich': 'ZH',
    'lugano': 'TI',
    'basel': 'BS',
    'vaud': 'VD',
    'fribourg': 'FR',
    'davos': 'GR'
}

cantons = ['ZH','BE','LU','UR','SZ','OW','NW','GL','ZG','FR','SO','BS','BL','SH','AR','AI','SG','GR','AG','TG','TI','VD','VS','NE','GE','JU']

def guess_canton(text):
    lower = text.lower()
    for word in word_to_canton:
        if word in lower:
            return word_to_canton[word]
        
    return ''

def ex_canton_str(s):
    m = re.search(r'\b([A-Z]+)\b$', s.strip())
    if m != None and m.group(1) in cantons:
        return m.group(1)
    else:
        return ''

def ex_canton(text, axis):
    guess = guess_canton(text)
    if guess:
        return guess
    
    res = text.split('-')
    
    if len(res) < 2: 
        return text.strip()
    else:
        return ex_canton_str(res[1])
        
    
def ex_uni(text, axis):
    res = text.split('-')
    
    if len(res) < 2 or ex_canton_str(res[1]) == '':
        return text.strip()
    else:
        return res[0].strip()

with_canton['Canton']     = with_canton['University'].apply(ex_canton, axis=1)
with_canton['University'] = with_canton['University'].apply(ex_uni, axis=1)

with_canton.sample(10)

Unnamed: 0_level_0,Institution,University,Approved Amount,Canton
Project Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
46862,Institut de Zoologie Faculté des Sciences Univ...,Université de Neuchâtel,247500.0,NE
26310,Institut für Geologie Universität Bern,Universität Bern,299679.0,BE
159945,Laboratoire d'optique biomédicale EPFL - STI -...,EPF Lausanne - EPFL,170418.0,VD
2563,Institut für Atmosphäre und Klima ETH Zürich,ETH Zürich - ETHZ,220238.0,ZH
120369,Département de Psychiatrie Hôpitaux Universita...,Université de Genève,210000.0,GE
163994,Institut für Geologie Universität Bern,Universität Bern,60000.0,BE
139021,Unité d'Investigations chirurgicales Départeme...,Université de Genève,1475939.0,GE
108246,Département des neurosciences fondamentales Fa...,Université de Lausanne - LA,260000.0,VD
100730,Institut für Physiologie der Universität Basel...,Universität Basel,107960.0,BS
112280,Institut für Biogeochemie und Schadstoffdynami...,ETH Zürich - ETHZ,41858.0,ZH


In [173]:
grouped = with_canton.groupby(['Canton', 'University']).sum().reset_index()
len(grouped)

76

In [174]:
def is_known_canton(x, axis):
    return x.strip() in cantons

wc = grouped.copy()
wc['IsCanton'] = wc['Canton'].apply(is_known_canton, axis=1)

In [175]:
len(wc[wc['IsCanton'] == False])

54

In [176]:
wc[wc['IsCanton'] == False]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
0,,AO Research Institute - AORI,3435621.0,False
1,,Allergie- und Asthmaforschung - SIAF,19169960.0,False
2,,Biotechnologie Institut Thurgau - BITG,2492535.0,False
3,,Centre de rech. sur l'environnement alpin - CR...,1567678.0,False
4,,Eidg. Anstalt für Wasserversorgung - EAWAG,73975850.0,False
5,,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...",48360390.0,False
6,,Eidg. Hochschulinstitut für Berufsbildung - EHB,2086572.0,False
7,,Eidg. Material und Prüfungsanstalt - EMPA,57930690.0,False
8,,Ente Ospedaliero Cantonale - EOC,5067172.0,False
9,,Fachhochschule Kalaidos - FHKD,1090280.0,False


In [177]:
wc[wc['IsCanton']]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
53,BE,Berner Fachhochschule - BFH,31028700.0,True
54,BE,Pädagogische Hochschule Bern - PHBern,1836136.0,True
55,BE,Robert Walser-Stiftung Bern - RWS,569579.0,True
56,BE,Universität Bern,1490646000.0,True
57,BS,Staatsunabh. Theologische Hochschule Basel - STHB,17300.0,True
58,BS,Universität Basel,1326427000.0,True
59,FR,Haute école pédagogique fribourgeoise - HEPFR,1547498.0,True
60,FR,Université de Fribourg,448092400.0,True
62,GE,Université de Genève,1810170000.0,True
63,GR,Physikal.-Meteorolog. Observatorium Davos - PMOD,12098440.0,True


In [124]:
'''
params = {
    'username': 'ada_drs3',
    'country': 'CH',
    'type': 'json'
}

def geoname_query(q):
    params['q'] = q
    # print('Searching for %s...' % q)
    return requests.get('http://api.geonames.org/search', params)

def search_by(col):
    for i in wc[wc['IsCanton'] == False].index:
        row = wc.iloc[i]
        res = geoname_query(row[col].strip())
        json = res.json()
        
        if json['totalResultsCount'] > 0:
            canton = json['geonames'][0]['adminCode1']
            print('=> Found ' + canton)
            wc.set_value(i,'Canton', canton)
            
#search_by('University')
#search_by('Canton')
'''

"\nparams = {\n    'username': 'ada_drs3',\n    'country': 'CH',\n    'type': 'json'\n}\n\ndef geoname_query(q):\n    params['q'] = q\n    # print('Searching for %s...' % q)\n    return requests.get('http://api.geonames.org/search', params)\n\ndef search_by(col):\n    for i in wc[wc['IsCanton'] == False].index:\n        row = wc.iloc[i]\n        res = geoname_query(row[col].strip())\n        json = res.json()\n        \n        if json['totalResultsCount'] > 0:\n            canton = json['geonames'][0]['adminCode1']\n            print('=> Found ' + canton)\n            wc.set_value(i,'Canton', canton)\n            \n#search_by('University')\n#search_by('Canton')\n"

In [125]:
#wc_geonames = wc.copy()
#wc_geonames['IsCanton'] = wc_geonames['Canton'].apply(is_known_canton, axis=1)

In [126]:
#len(wc_geonames[wc_geonames['IsCanton'] == False])

In [127]:
#wc_geonames[wc_geonames['IsCanton'] == False]

In [178]:
def get_placeId(uni):
    url = 'https://maps.googleapis.com/maps/api/place/textsearch/json?'
    params = {
        'query': uni,
         'key': GAPI_KEY
    }
    res = requests.get(url, params=params).json()
    if res['status']=='OK':
        return res['results'][0]['place_id']
    else:
        print(res)
        return None

In [179]:
# We retrieve the all list of 'address_components' b.c. the canton 'short_name' is indexed at 5 or 6 depending 
# on the JSON result.
def get_cantonCode(placeId):
    url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {
        'place_id': placeId,
        'key': GAPI_KEY
    }
    res = requests.get(url,params=params).json()
    if res['status']=='OK': 
        return res['results']
    else:
        print(res)
        return ''

In [180]:
# Get the canton 'short_name' for all the university' entries that did not have a result with geoname API request.

place_ids = {}
geocodes = {}

for i in wc.index:
    query = wc.iloc[i]['University']
    print('GMap request for %s' % query)
    place_ids[query] = get_placeId(query)
    if place_ids[query] != None:
        geocodes[query] = get_cantonCode(place_ids[query])
    else:
        geocodes[query]=None
    
pickle.dump(place_ids, open('place_ids.p','wb'))
pickle.dump(geocodes, open('geocodes.p','wb'))

GMap request for AO Research Institute - AORI
{'status': 'ZERO_RESULTS', 'results': [], 'html_attributions': []}
GMap request for Allergie- und Asthmaforschung - SIAF
GMap request for Biotechnologie Institut Thurgau - BITG
GMap request for Centre de rech. sur l'environnement alpin - CREALP
GMap request for Eidg. Anstalt für Wasserversorgung - EAWAG
GMap request for Eidg. Forschungsanstalt für Wald,Schnee,Land - WSL
{'status': 'ZERO_RESULTS', 'results': [], 'html_attributions': []}
GMap request for Eidg. Hochschulinstitut für Berufsbildung - EHB
GMap request for Eidg. Material und Prüfungsanstalt - EMPA
GMap request for Ente Ospedaliero Cantonale - EOC
GMap request for Fachhochschule Kalaidos - FHKD
GMap request for Fachhochschule Nordwestschweiz (ohne PH) - FHNW
GMap request for Fachhochschule Ostschweiz - FHO
GMap request for Fernfachhochschule Schweiz (Mitglied SUPSI) - FFHS
GMap request for Firmen/Privatwirtschaft - FP
{'status': 'ZERO_RESULTS', 'results': [], 'html_attributions': [

In [181]:
def get_short_name(geocode):
    short_names = [
        comp['short_name']
        for comp in geocode['address_components']
        if 'administrative_area_level_1' in comp['types']
    ]
    
    if len(short_names) > 0:
        return short_names[0]
    else:
        return None

def get_locality(geocode):
    localities = [
        comp['long_name']
        for comp in geocode['address_components']
        if 'locality' in comp['types']
    ]
    
    if len(localities) > 0:
        return localities[0]
    else:
        return None
    
def get_location(geocode):
    return geocode['geometry']['location']
    
def get_geo_info(geocode):
    if geocode == None or geocode[0] == None:
        return None

    return {
        'canton':   get_short_name(geocode[0]),
        'locality': get_locality(geocode[0]),
        'location': get_location(geocode[0])
    }

uni_geo_infos = {}

for uni in geocodes:
    uni_geo_infos[uni] = get_geo_info(geocodes[uni])
    
pickle.dump(uni_geo_infos, open('uni_geo_infos.p','wb'))
    
[i for i in uni_geo_infos if uni_geo_infos[i] == None]

['Pädagogische Hochschule Wallis - PHVS',
 'Forschungsanstalten Agroscope - AGS',
 'Swiss Center for Electronics and Microtech. - CSEM',
 'Eidg. Forschungsanstalt für Wald,Schnee,Land - WSL',
 'Firmen/Privatwirtschaft - FP',
 'NPO (Biblioth., Museen, Verwalt.) - NPO',
 'Weitere Spitäler - ASPIT',
 'Forschungsinstitut für Opthalmologie - IRO',
 'Weitere Institute - FINST',
 'Schweizer Kompetenzzentrum Sozialwissensch. - FORS',
 'Zürcher Fachhochschule (ohne PH) - ZFH',
 'Pädag. Hochschule Tessin (Teilschule SUPSI) - ASP',
 'AO Research Institute - AORI',
 'Physikal.-Meteorolog. Observatorium Davos - PMOD',
 'Staatsunabh. Theologische Hochschule Basel - STHB',
 'Forschungskommission SAGW']

In [324]:
# We reapply to see if the results the we got are legit.
wc['IsCanton'] = wc['Canton'].apply(is_known_canton, axis=1)
wc[wc['IsCanton'] == False]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
0,,Forschungsanstalten Agroscope,33115720.0,False
2,,Pädag. Hochschule Tessin (Teilschule SUPSI),159317.0,False
3,,Weitere Spitäler,10749810.0,False
12,,Swiss Center for Electronics and Microtech.,18068250.0,False
15,,Eidg. Material und Prüfungsanstalt,57930690.0,False
19,,Fachhochschule Nordwestschweiz (ohne PH),42305970.0,False
21,HE,Forschungsinstitut für biologischen Landbau,7442410.0,False
22,,Weitere Institute,9256736.0,False
24,,Schweizer Kompetenzzentrum Sozialwissensch.,34732820.0,False
25,,Firmen/Privatwirtschaft,109180100.0,False


In [325]:
# Output the results of our last requests
from __future__ import division
print ('Total length: ' +  repr(len(wc)))
print ('Entries with known canton: ' + repr(len(wc)-len(wc[wc['IsCanton'] == False])))
print ('ratio of missing canton to total length: ' + repr((len(wc)-len(wc[wc['IsCanton'] == False]))/len(wc)))

Total length: 76
Entries with known canton: 57
ratio of missing canton to total length: 0.75


In [326]:
pickle.dump(wc,open('p3_data.p','wb'))
#imp_wc = pickle.load(open('p3_data.p','rb'))

In [327]:
grouped_ = imp_wc.groupby(['Canton', 'University']).sum().reset_index()


In [335]:
def strip_spaces(x, axis=None):
    return x.strip()

grouped_wc = grouped_.copy()
grouped_wc['Canton'] = grouped_wc['Canton'].apply(strip_spaces, axis=1)

In [336]:
grouped_wc = grouped_wc.groupby(['Canton']).sum().reset_index()

In [337]:
grouped_wc['IsCanton'] = grouped_wc['Canton'].apply(is_known_canton, axis=1)

In [338]:
grouped_wc[grouped_wc['IsCanton']==True]

Unnamed: 0,Canton,Approved Amount,IsCanton
1,AG,115269000.0,True
2,BE,1526167000.0,True
3,BS,1366673000.0,True
5,FR,449639900.0,True
6,GE,1846063000.0,True
7,GR,5269915.0,True
9,JU,34162970.0,True
10,LU,48820480.0,True
13,NE,380547500.0,True
14,SG,88134300.0,True


In [339]:
pickle.dump(imp_wc,open('p4_data.p','wb'))

In [340]:
l = ['ZH','BE','LU','UR','SZ','OW','NW','GL','ZG','FR','SO','BS','BL','SH','AR','AI','SG','GR','AG','TG','TI','VD','VS','NE','GE','JU']
len(l)

26

In [354]:
#grouped_wc = grouped_wc.reset_index()
#grouped_wc.loc[24]

ll= list(i in grouped_wc['Canton'].values for i in l)

for j in range(len(ll)):
    if ll[j] == False:
        grouped_wc.loc[len(grouped_wc)+1] = [l[j],0,True]
    #grouped_wc.loc[len(grouped_wc)+1] = ['l[i]',0,True]

In [355]:
grouped_wc.set_value(16,['Canton'], 'GR')
grouped_wc = grouped_wc.groupby('Canton').sum().reset_index()
grouped_wc

Unnamed: 0,Canton,Approved Amount,IsCanton
0,,718336400.0,0.0
1,AG,115269000.0,1.0
2,AI,0.0,1.0
3,AR,0.0,1.0
4,BE,1526167000.0,1.0
5,BL,0.0,1.0
6,BS,1366673000.0,1.0
7,BW,3478469.0,0.0
8,FR,449639900.0,1.0
9,GE,1846063000.0,1.0


In [356]:
grouped_wc['IsCanton'] = grouped_wc['Canton'].apply(is_known_canton, axis=1)

In [357]:
grouped_wc[grouped_wc['IsCanton']==True]

Unnamed: 0,Canton,Approved Amount,IsCanton
1,AG,115269000.0,True
2,AI,0.0,True
3,AR,0.0,True
4,BE,1526167000.0,True
5,BL,0.0,True
6,BS,1366673000.0,True
8,FR,449639900.0,True
9,GE,1846063000.0,True
10,GL,0.0,True
11,GR,17368350.0,True


In [358]:
pickle.dump(grouped_wc[grouped_wc['IsCanton']==True],open('p5_data.p','wb'))

In [359]:
len(grouped_wc[grouped_wc['IsCanton']==True])

26

In [396]:
# Coordinate DataFrame
df_coord = df.copy()


In [397]:
df_coord = df_coord.reset_index().drop(['Institution'],1)
df_coord = df_coord.drop(['Project Number'],1)

In [440]:
df_coord = df_coord.groupby(['University']).sum().reset_index()
df_coord['lng']= ''
df_coord['lat']= ''

In [441]:
df_coord



Unnamed: 0,University,Approved Amount,lng,lat
0,AO Research Institute - AORI,3.435621e+06,,
1,Allergie- und Asthmaforschung - SIAF,1.916996e+07,,
2,Berner Fachhochschule - BFH,3.102870e+07,,
3,Biotechnologie Institut Thurgau - BITG,2.492535e+06,,
4,Centre de rech. sur l'environnement alpin - CR...,1.567678e+06,,
5,EPF Lausanne - EPFL,1.162778e+09,,
6,ETH Zürich - ETHZ,1.618742e+09,,
7,Eidg. Anstalt für Wasserversorgung - EAWAG,7.397585e+07,,
8,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...",4.836039e+07,,
9,Eidg. Hochschulinstitut für Berufsbildung - EHB,2.086572e+06,,


In [449]:
for i in df_coord.index:
    row = df_coord.iloc[i]['University']
    res = geoname_query(ex_uni(row,1))
    json = res.json()
    print(json)
    if json['totalResultsCount'] > 0:
        lng = json['geonames'][0]['lng']
        lat = json['geonames'][0]['lat']
        df_coord.set_value(i,'lng', lng)
        df_coord.set_value(i,'lat', lat)
  

{'totalResultsCount': 0, 'geonames': []}
{'totalResultsCount': 0, 'geonames': []}
{'totalResultsCount': 0, 'geonames': []}
{'totalResultsCount': 0, 'geonames': []}
{'totalResultsCount': 0, 'geonames': []}
{'totalResultsCount': 0, 'geonames': []}
{'totalResultsCount': 5, 'geonames': [{'population': 0, 'geonameId': 6930856, 'lat': '47.3763', 'fcode': 'UNIV', 'countryCode': 'CH', 'fclName': 'spot, building, farm', 'name': 'Eidgenössische Technische Hochschule Zürich', 'fcodeName': 'university', 'lng': '8.54805', 'adminName1': 'Zurich', 'adminCode1': 'ZH', 'countryId': '2658434', 'fcl': 'S', 'countryName': 'Switzerland', 'toponymName': 'Eidgenössische Technische Hochschule Zürich'}, {'population': 5, 'geonameId': 6295476, 'lat': '47.40951', 'fcode': 'PPLX', 'countryCode': 'CH', 'fclName': 'city, village,...', 'name': 'Zürich (Kreis 10) / ETH-Hönggerberg', 'fcodeName': 'section of populated place', 'lng': '8.50769', 'adminName1': 'Zurich', 'adminCode1': 'ZH', 'countryId': '2658434', 'fcl': 

In [450]:
df_coord

Unnamed: 0,University,Approved Amount,lng,lat
0,AO Research Institute - AORI,3.435621e+06,,
1,Allergie- und Asthmaforschung - SIAF,1.916996e+07,,
2,Berner Fachhochschule - BFH,3.102870e+07,,
3,Biotechnologie Institut Thurgau - BITG,2.492535e+06,,
4,Centre de rech. sur l'environnement alpin - CR...,1.567678e+06,,
5,EPF Lausanne - EPFL,1.162778e+09,,
6,ETH Zürich - ETHZ,1.618742e+09,8.54805,47.3763
7,Eidg. Anstalt für Wasserversorgung - EAWAG,7.397585e+07,,
8,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...",4.836039e+07,,
9,Eidg. Hochschulinstitut für Berufsbildung - EHB,2.086572e+06,,


In [455]:
wc                                     

Unnamed: 0,Canton,University,Approved Amount,IsCanton
0,AGS,Forschungsanstalten Agroscope,3.311572e+07,False
1,AORI,AO Research Institute,3.435621e+06,False
2,AG,Pädag. Hochschule Tessin (Teilschule SUPSI),1.593170e+05,False
3,ASPIT,Weitere Spitäler,1.074981e+07,False
4,BE,Berner Fachhochschule,3.102870e+07,True
5,BE,Pädagogische Hochschule Bern,1.836136e+06,True
6,BE,Robert Walser,5.695790e+05,True
7,BE,Universität Bern,1.490646e+09,True
8,BITG,Biotechnologie Institut Thurgau,2.492535e+06,False
9,BS,Staatsunabh. Theologische Hochschule Basel,1.730000e+04,True
