In [1]:
import pandas as pd

Here we load the wide-formatted data on the Gemeinden (communes), summarize all Gemeinden into Bezirke (districts) and Kantone and save the results back into new datatables

## Load the necessary resources

In [2]:
#Load the correspondence-table for communes
df_ids = pd.read_excel('OriginalData/Gemeindestand-2015.xls')
df_ids.shape

(2324, 7)

In [3]:
df_ids.head(2)

Unnamed: 0,Hist.-Nummer,Kanton,Bezirksnummer,Bezirksname,BFS Gde-nummer,Gemeindename,Datum der Aufnahme
0,13256,ZH,101,Affoltern,1,Aeugst am Albis,1976-11-15
1,11742,ZH,101,Affoltern,2,Affoltern am Albis,1960-01-01


In [9]:
#Load the numbers and abbreviations file for the cantons
df_ktnr = pd.read_csv('OriginalData/Kantone-ABK-ID.csv')
df_ktnr.shape

(25, 3)

In [10]:
df_ktnr.head(2)

Unnamed: 0,KTABK,KTID,KTName
0,ZH,1,Zürich
1,BE,2,Bern


In [6]:
#Load the Einbürgerungsdata for all Communes
df_einb = pd.read_csv('CleanData/Gem-Einb-Wide-1991-2015.csv')
df_einb.shape

(2847, 26)

In [7]:
df_einb.head(2)

Unnamed: 0,GemeindeID,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,1,0.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,2.0,...,3.0,7.0,10.0,8.0,4.0,15.0,6.0,11.0,8.0,10.0
1,2,6.0,24.0,16.0,15.0,31.0,24.0,24.0,34.0,33.0,...,137.0,122.0,150.0,109.0,106.0,49.0,55.0,83.0,49.0,55.0


In [8]:
#Load the Bevölkerungsdata for all Communes
df_bev = pd.read_csv('CleanData/Gem-Bev-Wide-1991-2015.csv')
df_bev.shape

(2847, 26)

In [11]:
df_bev.head(2)

Unnamed: 0,GemeindeID,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,1,1135.0,1168.0,1226.0,1242.0,1269.0,1302.0,1346.0,1362.0,1362.0,...,1478.0,1505.0,1538.0,1536.0,1538.0,1599.0,1651.0,1678.0,1689.0,1699.0
1,2,7268.0,7244.0,7224.0,7193.0,7156.0,7190.0,7246.0,7238.0,7287.0,...,7531.0,7683.0,7760.0,7954.0,8022.0,8259.0,8260.0,8340.0,8423.0,8463.0


A little preparation

In [13]:
#Add the missing kanton-info to the districts
df_ids = df_ids.merge(df_ktnr, how='inner', left_on='Kanton', right_on="KTABK")
df_ids.head(2)

In [31]:
#Create a short-version of the districts table that we can use later
df_ids_short = df_ids.drop_duplicates(subset='Bezirksnummer')[['Bezirksnummer', 'Bezirksname', 'Kanton']]
df_ids_short.head(2)

Unnamed: 0,Bezirksnummer,Bezirksname,Kanton
0,101,Affoltern,ZH
14,102,Andelfingen,ZH


## Merge District-info into Gemeinde-data

Einbürgerungen

In [27]:
#Next, add the combined info on districts and cantons to the Gemeindedata
df_einb2 = df_einb.merge(df_ids, how='inner', left_on='GemeindeID', right_on='BFS Gde-nummer')

In [28]:
#We group by Districts
df_einb_bez = df_einb2.groupby('Bezirksnummer').sum()
df_einb_bez.head(2)

Unnamed: 0_level_0,GemeindeID,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2009,2010,2011,2012,2013,2014,2015,Hist.-Nummer,BFS Gde-nummer,KTID
Bezirksnummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101,105,25.0,61.0,55.0,54.0,83.0,85.0,90.0,97.0,79.0,...,262.0,244.0,209.0,214.0,242.0,195.0,267.0,173898,105,14
102,780,8.0,21.0,22.0,43.0,30.0,46.0,23.0,40.0,20.0,...,115.0,75.0,95.0,89.0,50.0,64.0,86.0,288862,780,24


In [29]:
#Get rid of some stuff we dont need
df_einb_bez.pop('GemeindeID')
df_einb_bez.pop('Hist.-Nummer')
df_einb_bez.pop('BFS Gde-nummer')
df_einb_bez.pop('KTID')
df_einb_bez.reset_index()
df_einb_bez.head(2)

Unnamed: 0_level_0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Bezirksnummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101,25.0,61.0,55.0,54.0,83.0,85.0,90.0,97.0,79.0,164.0,...,282.0,273.0,319.0,262.0,244.0,209.0,214.0,242.0,195.0,267.0
102,8.0,21.0,22.0,43.0,30.0,46.0,23.0,40.0,20.0,64.0,...,74.0,84.0,126.0,115.0,75.0,95.0,89.0,50.0,64.0,86.0


In [34]:
#Add back the district-info that went missing
df_einb_bez2 = df_einb_bez.merge(df_ids_short, how='inner', left_index=True, right_on='Bezirksnummer')
df_einb_bez2.head(2)

Unnamed: 0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2009,2010,2011,2012,2013,2014,2015,Bezirksnummer,Bezirksname,Kanton
0,25.0,61.0,55.0,54.0,83.0,85.0,90.0,97.0,79.0,164.0,...,262.0,244.0,209.0,214.0,242.0,195.0,267.0,101,Affoltern,ZH
14,8.0,21.0,22.0,43.0,30.0,46.0,23.0,40.0,20.0,64.0,...,115.0,75.0,95.0,89.0,50.0,64.0,86.0,102,Andelfingen,ZH


In [36]:
#Save it into a file
df_einb_bez2.to_csv('CleanData/Bez-Einb-Wide-1991-2015', index=False)

Bevölkerung

In [37]:
#Next, add the combined info on districts and cantons to the Gemeindedata
df_bev2 = df_bev.merge(df_ids, how='inner', left_on='GemeindeID', right_on='BFS Gde-nummer')

In [38]:
#We group by Districts
df_bev_bez = df_bev2.groupby('Bezirksnummer').sum()
df_bev_bez.head(2)

Unnamed: 0_level_0,GemeindeID,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2009,2010,2011,2012,2013,2014,2015,Hist.-Nummer,BFS Gde-nummer,KTID
Bezirksnummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101,105,30874.0,31242.0,31774.0,32249.0,32774.0,33268.0,33817.0,34067.0,34493.0,...,39176.0,39729.0,40432.0,40914.0,41168.0,41428.0,41720.0,173898,105,14
102,780,21269.0,21436.0,21739.0,22056.0,22247.0,22581.0,22775.0,23099.0,23304.0,...,25894.0,26189.0,26210.0,26398.0,26521.0,26581.0,26749.0,288862,780,24


In [39]:
#Get rid of some stuff we dont need
df_bev_bez.pop('GemeindeID')
df_bev_bez.pop('Hist.-Nummer')
df_bev_bez.pop('BFS Gde-nummer')
df_bev_bez.pop('KTID')
df_bev_bez.reset_index()
df_bev_bez.head(2)

Unnamed: 0_level_0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Bezirksnummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101,30874.0,31242.0,31774.0,32249.0,32774.0,33268.0,33817.0,34067.0,34493.0,35058.0,...,37602.0,38222.0,38631.0,39176.0,39729.0,40432.0,40914.0,41168.0,41428.0,41720.0
102,21269.0,21436.0,21739.0,22056.0,22247.0,22581.0,22775.0,23099.0,23304.0,23676.0,...,25294.0,25485.0,25675.0,25894.0,26189.0,26210.0,26398.0,26521.0,26581.0,26749.0


In [40]:
#Add back the district-info that went missing
df_bev_bez2 = df_einb_bez.merge(df_ids_short, how='inner', left_index=True, right_on='Bezirksnummer')
df_bev_bez2.head(2)

Unnamed: 0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2009,2010,2011,2012,2013,2014,2015,Bezirksnummer,Bezirksname,Kanton
0,25.0,61.0,55.0,54.0,83.0,85.0,90.0,97.0,79.0,164.0,...,262.0,244.0,209.0,214.0,242.0,195.0,267.0,101,Affoltern,ZH
14,8.0,21.0,22.0,43.0,30.0,46.0,23.0,40.0,20.0,64.0,...,115.0,75.0,95.0,89.0,50.0,64.0,86.0,102,Andelfingen,ZH


In [41]:
#Save it into a file
df_bev_bez2.to_csv('CleanData/Bez-Bev-Wide-1991-2015', index=False)

## Merge canton-data with Gemeinden