In [1]:
import pandas as pd

In [2]:
%matplotlib inline

## Read data and separate Switzerland, Cantons, Districts, Communes

### Load the Excel with all the population data in it

In [3]:
df = pd.read_excel('CleanData/Bev-Wide-1981-2015.xls')

In [4]:
df.head(2)

Unnamed: 0,Einheit,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Schweiz,6335243.0,6372904.0,6409713.0,6427833.0,6455896.0,6484834.0,6523413.0,6566799.0,6619973.0,...,7459128.0,7508739.0,7593494.0,7701856.0,7785806.0,7870134.0,7954662.0,8039060.0,8139631.0,8237666.0
1,- Zürich,1120815.0,1123840.0,1127659.0,1124690.0,1126848.0,1128248.0,1131484.0,1136566.0,1141494.0,...,1272590.0,1284052.0,1307567.0,1332727.0,1351297.0,1373068.0,1392396.0,1408575.0,1425538.0,1446354.0


In [5]:
#Make sure we don't have any NaN in there
df = df[df['Einheit'].notnull()]

### Load some helper files

In [6]:
#Load the numbers and abbreviations file for the cantons
df_kant_ids = pd.read_csv('OriginalData/Kantone-ABK-ID.csv')
df_kant_ids.head(2)

Unnamed: 0,Kanton_ABK,Kanton_ID,Kanton_Name,Kanton_Name2
0,ZH,1,Zürich,Zürich
1,BE,2,Bern,Bern / Berne


In [7]:
#Load the canton/district infos for the communes
df_gem_ids = pd.read_excel('OriginalData/Gemeindestand-2015.xls')
df_gem_ids.head(2)

Unnamed: 0,Hist.-Nummer,Kanton,Bezirksnummer,Bezirksname,BFS Gde-nummer,Gemeindename,Datum der Aufnahme
0,13256,ZH,101,Affoltern,1,Aeugst am Albis,1976-11-15
1,11742,ZH,101,Affoltern,2,Affoltern am Albis,1960-01-01


In [8]:
df_gem_ids.pop('Hist.-Nummer')
df_gem_ids.pop('Gemeindename')
df_gem_ids.pop('Datum der Aufnahme')
df_gem_ids.rename(columns={'Bezirksnummer': 'Bezirk_ID', 'Bezirksname': 'Bezirk_Name', 'BFS Gde-nummer': 'Gemeinde_ID', 'Kanton': 'Kanton_ABK'}, inplace=True)
df_gem_ids.head(2)

Unnamed: 0,Kanton_ABK,Bezirk_ID,Bezirk_Name,Gemeinde_ID
0,ZH,101,Affoltern,1
1,ZH,101,Affoltern,2


In [9]:
#Construct a set of district ids
df_bez_ids = df_gem_ids.drop_duplicates(subset='Bezirk_ID')
df_bez_ids.pop('Gemeinde_ID')
df_bez_ids.head(2)

Unnamed: 0,Kanton_ABK,Bezirk_ID,Bezirk_Name
0,ZH,101,Affoltern
14,ZH,102,Andelfingen


## Get a dataframe for each geopraphic level

### Separate Switzerland

In [10]:
#Switzerland
df_ch = df[df['Einheit'] == 'Schweiz']
df_ch.pop('Einheit')
df_ch.head(2)

Unnamed: 0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,6335243.0,6372904.0,6409713.0,6427833.0,6455896.0,6484834.0,6523413.0,6566799.0,6619973.0,6673850.0,...,7459128.0,7508739.0,7593494.0,7701856.0,7785806.0,7870134.0,7954662.0,8039060.0,8139631.0,8237666.0


In [11]:
df_ch.shape

(1, 35)

### Separate the cantons

In [12]:
#Cantons - there should be 26 of them
df_kant = df[df['Einheit'].str.contains('^- ')]
df_kant['Kanton_Name'] = df_kant['Einheit'].str.extract('^- (.*)', expand=True)
df_kant.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Einheit,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,Kanton_Name
1,- Zürich,1120815.0,1123840.0,1127659.0,1124690.0,1126848.0,1128248.0,1131484.0,1136566.0,1141494.0,...,1284052.0,1307567.0,1332727.0,1351297.0,1373068.0,1392396.0,1408575.0,1425538.0,1446354.0,Zürich
183,- Bern / Berne,897296.0,900987.0,904420.0,906070.0,907551.0,908799.0,911266.0,914516.0,918201.0,...,958897.0,962982.0,969299.0,974235.0,979802.0,985046.0,992617.0,1001281.0,1009418.0,Bern / Berne


In [13]:
#Add the ID and Abbreviations
df_kant = df_kant.merge(df_kant_ids, how='inner', left_on='Kanton_Name', right_on='Kanton_Name2')
df_kant.pop('Kanton_Name2')
df_kant.pop('Einheit')
df_kant.pop('Kanton_Name_y')
df_kant.rename(columns={'Kanton_Name_x': 'Kanton_Name'}, inplace=True)
df_kant.head(2)

Unnamed: 0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,...,2009,2010,2011,2012,2013,2014,2015,Kanton_Name,Kanton_ABK,Kanton_ID
0,1120815.0,1123840.0,1127659.0,1124690.0,1126848.0,1128248.0,1131484.0,1136566.0,1141494.0,1144899.0,...,1332727.0,1351297.0,1373068.0,1392396.0,1408575.0,1425538.0,1446354.0,Zürich,ZH,1
1,897296.0,900987.0,904420.0,906070.0,907551.0,908799.0,911266.0,914516.0,918201.0,922884.0,...,969299.0,974235.0,979802.0,985046.0,992617.0,1001281.0,1009418.0,Bern / Berne,BE,2


In [14]:
df_kant.shape

(26, 38)

### Separate the communes

In [15]:
#Gemeinden - there should be 2324 of them
df_gem = df[df['Einheit'].str.contains('^\.\.\.\.\.\.')]
df_gem['Gemeinde_ID'] = df_gem['Einheit'].str.extract('^\.\.\.\.\.\.(\d\d\d\d)', expand=True).astype(int)
df_gem['Gemeinde_Name'] = df_gem['Einheit'].str.extract('^\.\.\.\.\.\.\d\d\d\d (.*)', expand=True)
df_gem.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Einheit,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2008,2009,2010,2011,2012,2013,2014,2015,Gemeinde_ID,Gemeinde_Name
3,......0001 Aeugst am Albis,867.0,905.0,945.0,958.0,969.0,1008.0,1000.0,1043.0,1104.0,...,1704.0,1700.0,1740.0,1824.0,1910.0,1955.0,1972.0,1959.0,1,Aeugst am Albis
4,......0002 Affoltern am Albis,8041.0,8223.0,8281.0,8422.0,8629.0,8699.0,8827.0,8958.0,9101.0,...,10410.0,10630.0,10735.0,11091.0,11160.0,11276.0,11363.0,11552.0,2,Affoltern am Albis


In [16]:
#Add the district and canton IDs and names
df_gem = df_gem.merge(df_gem_ids, how='inner', on='Gemeinde_ID')
df_gem.pop('Einheit')
df_gem.head(2)

Unnamed: 0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,...,2011,2012,2013,2014,2015,Gemeinde_ID,Gemeinde_Name,Kanton_ABK,Bezirk_ID,Bezirk_Name
0,867.0,905.0,945.0,958.0,969.0,1008.0,1000.0,1043.0,1104.0,1144.0,...,1824.0,1910.0,1955.0,1972.0,1959.0,1,Aeugst am Albis,ZH,101,Affoltern
1,8041.0,8223.0,8281.0,8422.0,8629.0,8699.0,8827.0,8958.0,9101.0,9314.0,...,11091.0,11160.0,11276.0,11363.0,11552.0,2,Affoltern am Albis,ZH,101,Affoltern


In [17]:
df_gem = df_gem.merge(df_kant_ids, how='inner', on='Kanton_ABK')
df_gem.pop('Kanton_Name2')
df_gem.head(2)

Unnamed: 0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,...,2013,2014,2015,Gemeinde_ID,Gemeinde_Name,Kanton_ABK,Bezirk_ID,Bezirk_Name,Kanton_ID,Kanton_Name
0,867.0,905.0,945.0,958.0,969.0,1008.0,1000.0,1043.0,1104.0,1144.0,...,1955.0,1972.0,1959.0,1,Aeugst am Albis,ZH,101,Affoltern,1,Zürich
1,8041.0,8223.0,8281.0,8422.0,8629.0,8699.0,8827.0,8958.0,9101.0,9314.0,...,11276.0,11363.0,11552.0,2,Affoltern am Albis,ZH,101,Affoltern,1,Zürich


In [18]:
df_gem.shape

(2324, 42)

### Separate the districts

In [19]:
#We have to do this one differently
df_bez = df_gem.groupby('Bezirk_ID').sum().reset_index()
df_bez.pop('Gemeinde_ID')
df_bez.pop('Kanton_ID')
df_bez.head(2)

Unnamed: 0,Bezirk_ID,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,101,29374.0,30248.0,30869.0,31644.0,32253.0,32512.0,32832.0,33438.0,34062.0,...,44064.0,44635.0,45277.0,46117.0,47063.0,48096.0,48958.0,49446.0,49965.0,50591.0
1,102,21095.0,21227.0,21307.0,21444.0,21644.0,21831.0,22000.0,22269.0,22576.0,...,27833.0,28027.0,28450.0,28832.0,29247.0,29511.0,29847.0,30038.0,30346.0,30654.0


In [20]:
df_bez = df_bez.merge(df_bez_ids, how='inner', on='Bezirk_ID')
df_bez.head(2)

Unnamed: 0,Bezirk_ID,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2008,2009,2010,2011,2012,2013,2014,2015,Kanton_ABK,Bezirk_Name
0,101,29374.0,30248.0,30869.0,31644.0,32253.0,32512.0,32832.0,33438.0,34062.0,...,45277.0,46117.0,47063.0,48096.0,48958.0,49446.0,49965.0,50591.0,ZH,Affoltern
1,102,21095.0,21227.0,21307.0,21444.0,21644.0,21831.0,22000.0,22269.0,22576.0,...,28450.0,28832.0,29247.0,29511.0,29847.0,30038.0,30346.0,30654.0,ZH,Andelfingen


In [21]:
df_bez = df_bez.merge(df_kant_ids, how='inner', on='Kanton_ABK')
df_bez.pop('Kanton_Name2')
df_bez.head(2)

Unnamed: 0,Bezirk_ID,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,Kanton_ABK,Bezirk_Name,Kanton_ID,Kanton_Name
0,101,29374.0,30248.0,30869.0,31644.0,32253.0,32512.0,32832.0,33438.0,34062.0,...,47063.0,48096.0,48958.0,49446.0,49965.0,50591.0,ZH,Affoltern,1,Zürich
1,102,21095.0,21227.0,21307.0,21444.0,21644.0,21831.0,22000.0,22269.0,22576.0,...,29247.0,29511.0,29847.0,30038.0,30346.0,30654.0,ZH,Andelfingen,1,Zürich


## Export the data

In [22]:
#Switzerland
df_ch.to_csv('CleanerData/CH-Bev-Wide-1981-2015.csv', index=False)
df_temp = pd.read_csv('CleanerData/CH-Bev-Wide-1981-2015.csv')
df_temp.head(2)

Unnamed: 0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,6335243.0,6372904.0,6409713.0,6427833.0,6455896.0,6484834.0,6523413.0,6566799.0,6619973.0,6673850.0,...,7459128.0,7508739.0,7593494.0,7701856.0,7785806.0,7870134.0,7954662.0,8039060.0,8139631.0,8237666.0


In [23]:
#Cantons
df_kant.to_csv('CleanerData/Kant-Bev-Wide-1981-2015.csv', index=False)
df_temp = pd.read_csv('CleanerData/Kant-Bev-Wide-1981-2015.csv')
df_temp.head(2)

Unnamed: 0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,...,2009,2010,2011,2012,2013,2014,2015,Kanton_Name,Kanton_ABK,Kanton_ID
0,1120815.0,1123840.0,1127659.0,1124690.0,1126848.0,1128248.0,1131484.0,1136566.0,1141494.0,1144899.0,...,1332727.0,1351297.0,1373068.0,1392396.0,1408575.0,1425538.0,1446354.0,Zürich,ZH,1
1,897296.0,900987.0,904420.0,906070.0,907551.0,908799.0,911266.0,914516.0,918201.0,922884.0,...,969299.0,974235.0,979802.0,985046.0,992617.0,1001281.0,1009418.0,Bern / Berne,BE,2


In [24]:
#Districts
df_bez.to_csv('CleanerData/Bez-Bev-Wide-1981-2015.csv', index=False)
df_temp = pd.read_csv('CleanerData/Bez-Bev-Wide-1981-2015.csv')
df_temp.head(2)

Unnamed: 0,Bezirk_ID,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,Kanton_ABK,Bezirk_Name,Kanton_ID,Kanton_Name
0,101,29374.0,30248.0,30869.0,31644.0,32253.0,32512.0,32832.0,33438.0,34062.0,...,47063.0,48096.0,48958.0,49446.0,49965.0,50591.0,ZH,Affoltern,1,Zürich
1,102,21095.0,21227.0,21307.0,21444.0,21644.0,21831.0,22000.0,22269.0,22576.0,...,29247.0,29511.0,29847.0,30038.0,30346.0,30654.0,ZH,Andelfingen,1,Zürich


In [25]:
#Communes
df_gem.to_csv('CleanerData/Gem-Bev-Wide-1981-2015.csv', index=False)
df_temp = pd.read_csv('CleanerData/Gem-Bev-Wide-1981-2015.csv')
df_temp.head(2)

Unnamed: 0,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,...,2013,2014,2015,Gemeinde_ID,Gemeinde_Name,Kanton_ABK,Bezirk_ID,Bezirk_Name,Kanton_ID,Kanton_Name
0,867.0,905.0,945.0,958.0,969.0,1008.0,1000.0,1043.0,1104.0,1144.0,...,1955.0,1972.0,1959.0,1,Aeugst am Albis,ZH,101,Affoltern,1,Zürich
1,8041.0,8223.0,8281.0,8422.0,8629.0,8699.0,8827.0,8958.0,9101.0,9314.0,...,11276.0,11363.0,11552.0,2,Affoltern am Albis,ZH,101,Affoltern,1,Zürich
