In [102]:
%matplotlib inline

import os
import pickle
import re

import matplotlib.pyplot as pl
import numpy as np
import pandas as pd
import requests

In [103]:
# Google Maps API key, read from the environment so the secret is not stored in the notebook.
# The key that used to be hardcoded here should be treated as leaked and revoked.
# Falls back to '' when the variable is unset; the API is only called when `query_api` is True.
GAPI_KEY = os.environ.get('GAPI_KEY', '')

In [143]:
# Columns read from the export; 'Project Number' is used as the index.
cols = [
    'Project Number',
    'Institution',
    'University',
    'Approved Amount',
]

# Cell contents that are parsed as missing values.
na_values = [
    'data not included in P3',
    'Nicht zuteilbar - NA',
]

# Parse the amounts as 64-bit floats.
dtypes = {'Approved Amount': np.float64}

read_options = dict(
    sep=';',
    usecols=cols,
    index_col='Project Number',
    na_values=na_values,
    dtype=dtypes,
)
raw = pd.read_csv('P3_GrantExport.csv', **read_options)

# Keep only the rows in which every loaded column has a value.
df = raw.dropna()

In [144]:
# Preview 10 randomly chosen rows of the cleaned frame (the selection differs on every run).
df.sample(10)

Unnamed: 0_level_0,Institution,University,Approved Amount
Project Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
147706,Theologisches Seminar Universität Basel,Universität Basel - BS,44203.0
140516,Institut de Géographie Université de Lausanne,Université de Lausanne - LA,396257.0
150492,Klinische Psychologie und Psychotherapie Insti...,Universität Bern - BE,1579664.0
54960,Division de Physiopathologie Clinique Départem...,Université de Lausanne - LA,360042.0
44935,Ethnologisches Seminar Universität Zürich,Universität Zürich - ZH,228754.0
41888,Géologie et Paléontologie Département des Géos...,Université de Fribourg - FR,74302.0
10777,Ciba-Geigy AG Forschungszentrum Marly,Firmen/Privatwirtschaft - FP,117762.0
122134,Laboratoire de microsystèmes 1 EPFL - STI - IM...,EPF Lausanne - EPFL,266533.0
66742,Institut d'informatique Université de Neuchâtel,Université de Neuchâtel - NE,273667.0
141254,Istituto di Ricerca in Biomedicina (IRB),Università della Svizzera italiana - USI,782474.0


In [145]:
# Sanity check: the index ('Project Number') should not contain duplicates.
df.index.is_unique

True

In [107]:
# Work on a copy so that the cleaned frame `df` is left unmodified.
with_canton = df.copy()

# Lowercase keywords mapped to the canton code they imply.
# guess_canton tries them in this order, so the order is significant.
word_to_canton = {
    'bern':     'BE',
    'lausanne': 'VD',
    'genève':   'GE',
    'geneva':   'GE',
    'luzern':   'LU',
    'zürich':   'ZH',
    'lugano':   'TI',
    'basel':    'BS',
    'vaud':     'VD',
    'fribourg': 'FR',
    'davos':    'GR',
    'sagw':     'BE',
}

# The canton codes accepted as valid.
cantons = [
    'ZH', 'BE', 'LU', 'UR', 'SZ', 'OW', 'NW', 'GL', 'ZG',
    'FR', 'SO', 'BS', 'BL', 'SH', 'AR', 'AI', 'SG', 'GR',
    'AG', 'TG', 'TI', 'VD', 'VS', 'NE', 'GE', 'JU',
]

def guess_canton(text):
    """Return the canton code of the first `word_to_canton` keyword found in `text`.

    The text is lowercased and each keyword is tested as a plain substring,
    in the dictionary's iteration order. Returns '' when no keyword occurs.
    """
    haystack = text.lower()
    hits = (code for keyword, code in word_to_canton.items() if keyword in haystack)
    return next(hits, '')

def ex_canton_str(s):
    """Return the all-uppercase word that ends `s` if it is listed in `cantons`, else ''.

    Surrounding whitespace is stripped before matching.
    """
    match = re.search(r'\b([A-Z]+)\b$', s.strip())
    if match is None:
        return ''
    code = match.group(1)
    return code if code in cantons else ''

def ex_canton(text, axis=None):
    """Derive a canton code from a 'University' cell.

    Resolution order:
      1. a keyword match from `guess_canton`;
      2. if the text contains no '-', the stripped text itself is returned
         (this is not a canton code; `is_known_canton` rejects it later);
      3. otherwise the part after the LAST '-' is passed to `ex_canton_str`,
         which yields a valid canton code or ''.

    `axis` is ignored. It exists only because callers use
    `Series.apply(ex_canton, axis=1)`, which forwards `axis` as a keyword.
    """
    guess = guess_canton(text)
    if guess:
        return guess

    # Split on the last hyphen so that hyphens inside the name
    # (e.g. 'Foo-Bar Institut - ZH') cannot hide the trailing code.
    _name, sep, suffix = text.rpartition('-')
    if not sep:
        return text.strip()
    return ex_canton_str(suffix)
        
    
def ex_uni(text, axis=None):
    """Return the university name with a trailing ' - <canton code>' suffix removed.

    The suffix is removed only when the part after the LAST '-' is a valid
    canton code according to `ex_canton_str`; otherwise the stripped text is
    returned unchanged.

    `axis` is ignored. It exists only because callers use
    `Series.apply(ex_uni, axis=1)`, which forwards `axis` as a keyword.
    """
    # Split on the last hyphen so that hyphenated names keep their full name
    # and their suffix is still recognised.
    name, sep, suffix = text.rpartition('-')
    if not sep or ex_canton_str(suffix) == '':
        return text.strip()
    return name.strip()

# Both derived columns are computed from the ORIGINAL 'University' text,
# so capture it once before the column is overwritten.
university_raw = with_canton['University']
with_canton['Canton'] = university_raw.apply(ex_canton, axis=1)
with_canton['University'] = university_raw.apply(ex_uni, axis=1)

with_canton.sample(10)

Unnamed: 0_level_0,Institution,University,Approved Amount,Canton
Project Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
121998,"Abteilung für Pädiatrische Endokrinologie, Dia...",Universität Bern,468000.0,BE
32484,Neurologische Klinik Universitätsspital Zürich,Universität Zürich,281211.0,ZH
110973,Département des Géosciences Université de Frib...,Université de Fribourg,72360.0,FR
144277,Klinik für Infektionskrankheiten und Spitalhyg...,Universität Zürich,358883.13,ZH
9366,Physiologisches Institut Universität Zürich,Universität Bern,99932.0,BE
7680,Klinik für Angiologie Departement Innere Mediz...,Universität Zürich,153912.0,ZH
58475,Lamprecht und Stamm Sozialforschung und Beratu...,Firmen/Privatwirtschaft - FP,120000.0,
896,Fondation Archives Jean Piaget Université de G...,Université de Genève,160649.0,GE
164032,Institut des dynamiques de la surface terrestr...,Université de Lausanne - LA,266000.0,VD
56913,Klinik für Kardiologie Departement Innere Medi...,Universität Zürich,156700.0,ZH


In [108]:
# Total approved amount per (canton, university) pair.
# Only 'Approved Amount' is summed: summing every column would also try to
# aggregate the text column 'Institution', which newer pandas versions
# concatenate instead of dropping.
grouped = (
    with_canton
    .groupby(['Canton', 'University'])[['Approved Amount']]
    .sum()
    .reset_index()
)
len(grouped)

76

In [109]:
def is_known_canton(x, axis=None):
    """Return True when `x`, stripped of surrounding whitespace, is listed in `cantons`.

    Values without a `strip` method (e.g. None or NaN) are reported as
    unknown instead of raising AttributeError.

    `axis` is ignored. It exists only because callers use
    `Series.apply(is_known_canton, axis=1)`, which forwards `axis` as a keyword.
    """
    strip = getattr(x, 'strip', None)
    if strip is None:
        return False
    return strip() in cantons

# Copy of `grouped` with a boolean flag telling whether 'Canton' holds a valid canton code.
wc = grouped.assign(IsCanton=grouped['Canton'].apply(is_known_canton, axis=1))

In [110]:
# Number of groups whose canton is not (yet) a valid code.
len(wc[~wc['IsCanton']])

53

In [111]:
# The groups without a valid canton code.
wc[~wc['IsCanton']]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
0,,AO Research Institute - AORI,3435621.0,False
1,,Allergie- und Asthmaforschung - SIAF,19169960.0,False
2,,Biotechnologie Institut Thurgau - BITG,2492535.0,False
3,,Centre de rech. sur l'environnement alpin - CR...,1567678.0,False
4,,Eidg. Anstalt für Wasserversorgung - EAWAG,73975850.0,False
5,,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...",48360390.0,False
6,,Eidg. Hochschulinstitut für Berufsbildung - EHB,2086572.0,False
7,,Eidg. Material und Prüfungsanstalt - EMPA,57930690.0,False
8,,Ente Ospedaliero Cantonale - EOC,5067172.0,False
9,,Fachhochschule Kalaidos - FHKD,1090280.0,False


In [112]:
# The groups that already have a valid canton code.
wc[wc['IsCanton']]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
53,BE,Berner Fachhochschule - BFH,31028700.0,True
54,BE,Forschungskommission SAGW,100000.0,True
55,BE,Pädagogische Hochschule Bern - PHBern,1836136.0,True
56,BE,Robert Walser-Stiftung Bern - RWS,569579.0,True
57,BE,Universität Bern,1490646000.0,True
58,BS,Staatsunabh. Theologische Hochschule Basel - STHB,17300.0,True
59,BS,Universität Basel,1326427000.0,True
60,FR,Haute école pédagogique fribourgeoise - HEPFR,1547498.0,True
61,FR,Université de Fribourg,448092400.0,True
62,GE,Université de Genève,1810170000.0,True


In [113]:
# NOTE: load_geo() does nothing when called. Its entire body is a single
# triple-quoted string literal that holds an earlier, now disabled, lookup
# against the GeoNames search API (restricted to country 'CH'). That code
# would have written the first hit's 'adminCode1' into wc['Canton'] for rows
# where 'IsCanton' is False; none of it is executed.
def load_geo():
    '''
    params = {
        'username': 'ada_drs3',
        'country': 'CH',
        'type': 'json'
    }

    def geoname_query(q):
        params['q'] = q
        # print('Searching for %s...' % q)
        return requests.get('http://api.geonames.org/search', params)

    def search_by(col):
        for i in wc[wc['IsCanton'] == False].index:
            row = wc.iloc[i]
            res = geoname_query(row[col].strip())
            json = res.json()

            if json['totalResultsCount'] > 0:
                canton = json['geonames'][0]['adminCode1']
                print('=> Found ' + canton)
                wc.set_value(i,'Canton', canton)

    #search_by('University')
    #search_by('Canton')
    '''

In [114]:
def get_placeId(uni, timeout=10):
    """Return the `place_id` of the first Google Places text-search result for `uni`.

    Parameters:
        uni: free-text query (here: a university / institution name).
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The `place_id` string of the first result, or None when the response
        status is not 'OK' or carries no results. In that case the raw
        response is printed for inspection.
    """
    url = 'https://maps.googleapis.com/maps/api/place/textsearch/json?'
    params = {
        'query': uni,
        'key': GAPI_KEY
    }
    # A timeout keeps one stalled request from hanging the whole notebook.
    res = requests.get(url, params=params, timeout=timeout).json()
    results = res.get('results') or []
    if res.get('status') == 'OK' and results:
        return results[0]['place_id']

    print(res)
    return None

In [115]:
# The whole 'results' list is returned (not just the canton code) because the
# position of the canton's 'short_name' inside 'address_components' varies
# (index 5 or 6) from one JSON response to another.
def get_cantonCode(placeId, timeout=10):
    """Geocode a Google `place_id` and return the response's 'results' list.

    Parameters:
        placeId: place identifier as returned by `get_placeId`.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The 'results' list when the response status is 'OK'; otherwise the
        raw response is printed and '' is returned.
    """
    url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {
        'place_id': placeId,
        'key': GAPI_KEY
    }
    # A timeout keeps one stalled request from hanging the whole notebook.
    res = requests.get(url, params=params, timeout=timeout).json()
    if res.get('status') == 'OK' and 'results' in res:
        return res['results']

    print(res)
    return ''

In [116]:
# Fetch the Google place id and geocode result for every university in `wc`,
# or reload the answers cached by a previous run.

# Set to True to call the Google APIs again and refresh the cache files.
query_api = False

place_ids = {}
geocodes = {}

if query_api:

    # Iterate over the names directly instead of using index labels as positions.
    for query in wc['University']:
        print('GMap request for %s' % query)
        place_id = get_placeId(query)
        place_ids[query] = place_id
        geocodes[query] = get_cantonCode(place_id) if place_id is not None else None

    # `with` closes each file even if pickling fails.
    with open('place_ids.p', 'wb') as fh:
        pickle.dump(place_ids, fh)
    with open('geocodes.p', 'wb') as fh:
        pickle.dump(geocodes, fh)

else:
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # that this notebook wrote itself.
    with open('place_ids.p', 'rb') as fh:
        place_ids = pickle.load(fh)
    with open('geocodes.p', 'rb') as fh:
        geocodes = pickle.load(fh)

In [117]:
def get_short_name(geocode):
    """Return the 'short_name' of the first address component whose 'types'
    include 'administrative_area_level_1', or None when there is none."""
    found = []
    for component in geocode['address_components']:
        if 'administrative_area_level_1' in component['types']:
            found.append(component['short_name'])

    return found[0] if found else None

def get_locality(geocode):
    """Return the 'long_name' of the first address component whose 'types'
    include 'locality', or None when there is none."""
    found = []
    for component in geocode['address_components']:
        if 'locality' in component['types']:
            found.append(component['long_name'])

    return found[0] if found else None
    
def get_location(geocode):
    """Return the 'location' entry stored under the geocode's 'geometry' key."""
    geometry = geocode['geometry']
    return geometry['location']
    
def get_geo_info(geocode):
    """Summarise the first entry of a geocode result list.

    Parameters:
        geocode: list of geocode results, or a missing value (None, '' or []).

    Returns:
        A dict with keys 'canton', 'locality' and 'location' built from the
        first result, or None when there is no usable result.
    """
    # `not geocode` also covers '' (what get_cantonCode returns on failure)
    # and [], both of which would raise IndexError at geocode[0].
    if not geocode or geocode[0] is None:
        return None

    first = geocode[0]
    return {
        'canton':   get_short_name(first),
        'locality': get_locality(first),
        'location': get_location(first)
    }

# Geo summary per university name, built from the stored geocode results.
uni_geo_infos = {
    uni: get_geo_info(geocode)
    for uni, geocode in geocodes.items()
}

In [118]:
from uni_geo_infos_manual import uni_geo_infos_manual

# Manually curated entries replace the API-derived ones for the same name.
uni_geo_infos.update(uni_geo_infos_manual)

uni_geo_infos

{'AO Research Institute - AORI': {'canton': 'GR',
  'locality': 'Davos',
  'location': {'lat': 46.777395, 'lng': 9.813876}},
 'Allergie- und Asthmaforschung - SIAF': {'canton': 'GR',
  'locality': 'Davos Platz',
  'location': {'lat': 46.7954192, 'lng': 9.8200409}},
 'Berner Fachhochschule - BFH': {'canton': 'BE',
  'locality': 'Burgdorf',
  'location': {'lat': 47.0574225, 'lng': 7.621387700000001}},
 'Biotechnologie Institut Thurgau - BITG': {'canton': 'TG',
  'locality': 'Kreuzlingen',
  'location': {'lat': 47.6483273, 'lng': 9.161303}},
 "Centre de rech. sur l'environnement alpin - CREALP": {'canton': 'VS',
  'locality': 'Sion',
  'location': {'lat': 46.2285589, 'lng': 7.367433499999999}},
 'EPF Lausanne - EPFL': {'canton': 'VD',
  'locality': 'Lausanne',
  'location': {'lat': 46.5190557, 'lng': 6.5667576}},
 'ETH Zürich - ETHZ': {'canton': 'ZH',
  'locality': 'Zürich',
  'location': {'lat': 47.376313, 'lng': 8.547669899999999}},
 'Eidg. Anstalt für Wasserversorgung - EAWAG': {'canto

In [119]:
def load_from_uni_geo_info(uni, axis=None):
    """Return the canton stored for `uni` in `uni_geo_infos`, or '' when unavailable.

    '' is returned when `uni` has no entry, when its entry is None, or when
    the entry has no canton value, so callers always receive a string-like
    result they can strip.

    `axis` is ignored. It exists only because callers use
    `Series.apply(load_from_uni_geo_info, axis=1)`, which forwards `axis` as a keyword.
    """
    info = uni_geo_infos.get(uni)
    if info is None:
        return ''
    # An entry can exist with canton=None (no matching address component).
    return info.get('canton') or ''

# Replace the guessed cantons with the geocoded ones, recompute the validity
# flag, and show the rows that still have no valid canton.
canton_lookup = wc['University'].apply(load_from_uni_geo_info, axis=1)
wc['Canton'] = canton_lookup
wc['IsCanton'] = canton_lookup.apply(is_known_canton, axis=1)
wc[~wc['IsCanton']]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
13,,Firmen/Privatwirtschaft - FP,109180100.0,False
16,HE,Forschungsinstitut für biologischen Landbau - ...,7442410.0,False
29,Lazio,Istituto Svizzero di Roma - ISR,141000.0,False
31,,"NPO (Biblioth., Museen, Verwalt.) - NPO",322996000.0,False
45,,Schweizer Kompetenzzentrum Sozialwissensch. - ...,34732820.0,False
50,,Weitere Institute - FINST,9256736.0,False
51,,Weitere Spitäler - ASPIT,10749810.0,False


In [120]:
# Summarise how many groups ended up with a valid canton code.
from __future__ import division
n_total = len(wc)
n_known = len(wc[wc['IsCanton']])
print ('Total length: ' + repr(n_total))
print ('Entries with known canton: ' + repr(n_known))
# The value printed is known / total, so the label says "known" (it used to say "missing").
print ('ratio of known canton to total length: ' + repr(n_known / n_total))

Total length: 76
Entries with known canton: 69
ratio of missing canton to total length: 0.9078947368421053


In [128]:
# Keep only the rows with a valid canton; the helper flag column is then dropped.
known_rows = wc[wc['IsCanton']]
final_wc = known_rows.drop('IsCanton', axis=1)
final_wc

Unnamed: 0,Canton,University,Approved Amount
0,GR,AO Research Institute - AORI,3.435621e+06
1,GR,Allergie- und Asthmaforschung - SIAF,1.916996e+07
2,TG,Biotechnologie Institut Thurgau - BITG,2.492535e+06
3,VS,Centre de rech. sur l'environnement alpin - CR...,1.567678e+06
4,ZH,Eidg. Anstalt für Wasserversorgung - EAWAG,7.397585e+07
5,ZH,"Eidg. Forschungsanstalt für Wald,Schnee,Land -...",4.836039e+07
6,BE,Eidg. Hochschulinstitut für Berufsbildung - EHB,2.086572e+06
7,ZH,Eidg. Material und Prüfungsanstalt - EMPA,5.793069e+07
8,TI,Ente Ospedaliero Cantonale - EOC,5.067172e+06
9,ZH,Fachhochschule Kalaidos - FHKD,1.090280e+06


In [134]:
# Total approved amount per canton.
# Only 'Approved Amount' is summed: summing every column would also try to
# aggregate the text column 'University', which newer pandas versions
# concatenate instead of dropping.
grouped_wc = final_wc.groupby('Canton')[['Approved Amount']].sum().reset_index()
grouped_wc

Unnamed: 0,Canton,Approved Amount
0,AG,115269000.0
1,BE,1526267000.0
2,BL,3476142.0
3,BS,1366673000.0
4,FR,449639900.0
5,GE,1857647000.0
6,GR,36538320.0
7,JU,34790350.0
8,LU,48820480.0
9,NE,398615800.0


In [137]:
# Cantons without any row in grouped_wc are added with an amount of 0 so that
# every canton in `cantons` has exactly one row.
present_cantons = set(grouped_wc['Canton'].values)
missing_cantons = [canton for canton in cantons if canton not in present_cantons]

if missing_cantons:
    # Build all missing rows at once and concatenate a single time
    # (DataFrame.append is gone from recent pandas). The temporary frame is
    # deliberately NOT called `df`, which holds the cleaned grant table.
    missing_rows = pd.DataFrame(
        {
            'Canton': missing_cantons,
            'Approved Amount': [0.0] * len(missing_cantons),
        },
        columns=['Canton', 'Approved Amount'],
    )
    with_all_cantons = pd.concat([grouped_wc, missing_rows], ignore_index=True)
else:
    with_all_cantons = grouped_wc.copy()

with_all_cantons

Unnamed: 0,Approved Amount,Canton
0,115269000.0,AG
1,1526267000.0,BE
2,3476142.0,BL
3,1366673000.0,BS
4,449639900.0,FR
5,1857647000.0,GE
6,36538320.0,GR
7,34790350.0,JU
8,48820480.0,LU
9,398615800.0,NE


In [141]:
from math import pow
# Express the amounts in millions (divide by 10**6) on a copy,
# leaving with_all_cantons unchanged.
scaled_cantons = with_all_cantons.copy()
scaled_cantons['Approved Amount'] = scaled_cantons['Approved Amount'] / pow(10, 6)
scaled_cantons

Unnamed: 0,Approved Amount,Canton
0,115.268969,AG
1,1526.266616,BE
2,3.476142,BL
3,1366.673453,BS
4,449.639858,FR
5,1857.646558,GE
6,36.538316,GR
7,34.790345,JU
8,48.820483,LU
9,398.61578,NE


In [142]:
# Persist the per-canton totals; `with` makes sure the file is flushed and closed.
# NOTE(review): the unscaled frame (with_all_cantons) is saved, not scaled_cantons; confirm this is intended.
with open('all_cantons.p', 'wb') as fh:
    pickle.dump(with_all_cantons, fh)