In [799]:
%matplotlib inline

import numpy as np
import requests
import re
import pickle
import pandas as pd
import matplotlib.pyplot as pl

In [800]:
cols = ['Project Number', 'Institution', 'University', 'Approved Amount']
na_values = ['data not included in P3', 'Nicht zuteilbar - NA']

dtypes = {
        'Approved Amount': np.float64
}

raw = pd.read_csv('P3_GrantExport.csv', sep = ';', na_values=na_values, index_col='Project Number', dtype=dtypes, usecols=cols)
df = raw.dropna()

In [801]:
df.sample(10)

Unnamed: 0_level_0,Institution,University,Approved Amount
Project Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
120000,Unité d'histoire médiévale Département d'Histo...,Université de Genève - GE,130532.0
47255,Organisch-chemisches Institut Universität Zürich,Universität Zürich - ZH,425744.0
2690,Organisch-chemisches Institut Universität Zürich,Universität Zürich - ZH,417334.0
154099,Institute of Microeconomics Faculty of Economi...,Università della Svizzera italiana - USI,7000.0
138333,Institut d'analyse financière Université de Ne...,Université de Neuchâtel - NE,165658.0
62084,Institute of Sociology Bulgarian Academy of Sc...,Université de Fribourg - FR,60000.0
138243,Geologisches Institut ETH Zürich,ETH Zürich - ETHZ,52977.0
139421,Département d'Ecologie et d'Evolution Faculté ...,Université de Lausanne - LA,274748.0
65310,Institut für Informatik Universität Zürich,Universität Zürich - ZH,159975.0
102849,Abteilung für Allgemeine Innere Medizin Medizi...,Universität Basel - BS,1297880.0


In [802]:
df.index.is_unique

True

In [803]:
with_canton = df.copy()

word_to_canton = {
    'bern': 'BE',
    'lausanne': 'VD',
    'genève': 'GE',
    'geneva': 'GE',
    'luzern': 'LU',
    'zürich': 'ZH',
    'lugano': 'TI',
    'basel': 'BS',
    'vaud': 'VD',
    'fribourg': 'FR'
}

def guess_canton(text):
    lower = text.lower()
    for word in word_to_canton:
        if word in lower:
            return word_to_canton[word]
        
    return None


def ex_canton(text, axis):
    guess = guess_canton(text)
    if guess:
        return guess
    
    res = text.split('-')
    if len(res) < 2: 
        return text
    else:
        return res[1]
    
def ex_uni(text, axis):
    res = text.split('-')
    if len(res) < 2:
        return text.strip()
    else:
        return res[0].strip()

with_canton['Canton'] = with_canton['University'].apply(ex_canton, axis=1)
with_canton['University'] = with_canton['University'].apply(ex_uni, axis=1)

with_canton.sample(10)

Unnamed: 0_level_0,Institution,University,Approved Amount,Canton
Project Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8712,Dépt Microbiologie et Médecine Moléculaire Fac...,Université de Genève,560000.0,GE
144416,Laboratoire des matériaux organiques et macrom...,EPF Lausanne,128199.0,VD
121902,Labor für Ionenstrahlphysik ETH Zürich,ETH Zürich,431925.0,ZH
163542,Librairie Droz S.A.,Universität Basel,20780.0,BS
143179,Maison d'analyse des processus sociaux MAPS Un...,Université de Neuchâtel,527358.0,NE
138662,Département de Génétique et Evolution Faculté ...,Université de Genève,925788.0,GE
120866,Département de Biologie Végétale Faculté des S...,Université de Genève,667053.0,GE
134863,Abteilung Forschung Pädagogische Hochschule Th...,Pädagogische Hochschule Thurgau,291870.0,PHTG
51215,Dep. of Light-Induced Surface Pheno General Ph...,Universität Bern,86450.0,BE
10911,Institut für Agrarwirtschaft,ETH Zürich,124775.0,ZH


In [804]:
grouped = with_canton.groupby(['Canton', 'University']).sum().reset_index()
len(grouped)

76

In [805]:
cantons = pd.read_csv('cantons.csv', header=None)
cantons = cantons[0].values.tolist()



In [806]:
def is_known_canton(x, axis):
    return x.strip() in cantons

wc = grouped.copy()
wc['IsCanton'] = wc['Canton'].apply(is_known_canton, axis=1)

In [807]:
len(wc[wc['IsCanton'] == False])

55

In [808]:
wc[wc['IsCanton'] == False]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
0,AGS,Forschungsanstalten Agroscope,33115720.0,False
1,AORI,AO Research Institute,3435621.0,False
2,ASP,Pädag. Hochschule Tessin (Teilschule SUPSI),159317.0,False
3,ASPIT,Weitere Spitäler,10749810.0,False
4,BITG,Biotechnologie Institut Thurgau,2492535.0,False
5,CREALP,Centre de rech. sur l'environnement alpin,1567678.0,False
6,CSEM,Swiss Center for Electronics and Microtech.,18068250.0,False
7,EAWAG,Eidg. Anstalt für Wasserversorgung,73975850.0,False
8,EHB,Eidg. Hochschulinstitut für Berufsbildung,2086572.0,False
9,EMPA,Eidg. Material und Prüfungsanstalt,57930690.0,False


In [809]:
len(wc[wc['IsCanton'] == False])

55

In [810]:
params = {
    'username': 'ada_drs',
    'country': 'CH',
    'type': 'json'
}

def geoname_query(q):
    params['q'] = q
    # print('Searching for %s...' % q)
    return requests.get('http://api.geonames.org/search', params)

def search_by(col):
    for i in wc[wc['IsCanton'] == False].index:
        row = wc.iloc[i]
        res = geoname_query(row[col].strip())
        json = res.json()

        if json['totalResultsCount'] > 0:
            canton = json['geonames'][0]['adminCode1']
            print('=> Found ' + canton)
            wc.set_value(i,'Canton', canton)
            
search_by('University')
search_by('Canton')

=> Found SG
=> Found AG
=> Found ZH
=> Found ZH
=> Found BL
=> Found SG
=> Found AG
=> Found LU
=> Found TI
=> Found ZH


In [811]:
wc_geonames = wc.copy()
wc_geonames['IsCanton'] = wc_geonames['Canton'].apply(is_known_canton, axis=1)

In [812]:
len(wc_geonames[wc_geonames['IsCanton'] == False])

46

In [813]:
wc_geonames[wc_geonames['IsCanton'] == False]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
0,AGS,Forschungsanstalten Agroscope,33115720.0,False
1,AORI,AO Research Institute,3435621.0,False
3,ASPIT,Weitere Spitäler,10749810.0,False
4,BITG,Biotechnologie Institut Thurgau,2492535.0,False
5,CREALP,Centre de rech. sur l'environnement alpin,1567678.0,False
6,CSEM,Swiss Center for Electronics and Microtech.,18068250.0,False
8,EHB,Eidg. Hochschulinstitut für Berufsbildung,2086572.0,False
10,EOC,Ente Ospedaliero Cantonale,5067172.0,False
11,FFHS,Fernfachhochschule Schweiz (Mitglied SUPSI),12000.0,False
12,FHKD,Fachhochschule Kalaidos,1090280.0,False


In [814]:
def get_placeId(Uni):
    url = 'https://maps.googleapis.com/maps/api/place/textsearch/json?'
    params = {'query':Uni,
              'key':'AIzaSyD-jGLgkg-udefa8OkA1MMU6pXxSQzqFV4'}
    res = requests.get(url,params=params).json()
    if res['status']=='OK':
        res = res['results'][0]['place_id']
        return res
    return ''

In [815]:
# We retrieve the all list of 'address_components' b.c. the canton 'short_name' is indexed at 5 or 6 depending 
# on the JSON result.
def get_cantonCode(Uni):
    url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {'place_id':get_placeId(Uni),
              'key':'AIzaSyD-jGLgkg-udefa8OkA1MMU6pXxSQzqFV4'}
    res = requests.get(url,params=params).json()
    if res['status']=='OK': 
        ll = list(res['results'][0]['address_components'][i]['types'][0] =='administrative_area_level_1'
                  for i in range(0,len(res['results'][0]['address_components'])))
        return res['results'][0]['address_components'][ll.index(True)]['short_name']
    return ''

In [816]:
# Get the canton 'short_name' for all the university' entries that did not have a result with geoname API request.
for i in wc[wc['IsCanton'] == False][0:-1].index:
    canton = get_cantonCode(wc.iloc[i]['University'])
    wc.set_value(i,'Canton', canton)


In [817]:
# We reapply to see if the results the we got are legit.
wc['IsCanton'] = wc['Canton'].apply(is_known_canton, axis=1)
wc[wc['IsCanton'] == False]

Unnamed: 0,Canton,University,Approved Amount,IsCanton
0,,Forschungsanstalten Agroscope,33115720.0,False
2,,Pädag. Hochschule Tessin (Teilschule SUPSI),159317.0,False
3,,Weitere Spitäler,10749810.0,False
6,,Swiss Center for Electronics and Microtech.,18068250.0,False
9,,Eidg. Material und Prüfungsanstalt,57930690.0,False
13,,Fachhochschule Nordwestschweiz (ohne PH),42305970.0,False
15,HE,Forschungsinstitut für biologischen Landbau,7442410.0,False
16,,Weitere Institute,9256736.0,False
18,,Schweizer Kompetenzzentrum Sozialwissensch.,34732820.0,False
19,,Firmen/Privatwirtschaft,109180100.0,False


In [818]:
# Output the results of our last requests
from __future__ import division
print 'Total length: ' +  repr(len(wc))
print 'Entries with known canton: ' + repr(len(wc)-len(wc[wc['IsCanton'] == False])) 
print 'ratio of missing canton to total length: ' + repr((len(wc)-len(wc[wc['IsCanton'] == False]))/len(wc)) 

Total length: 76
Entries with known canton: 57
ratio of missing canton to total length: 0.75
