In [41]:
import pandas as pd

In [42]:
%matplotlib inline

## Read data and separate Switzerland, Cantons, Districts, Communes

### Load the Excel with all the data in it

In [43]:
df = pd.read_excel('ReadinData/px-x-0102010000_104 - Nationalitaet.xlsx', sheetname='relrel')

In [44]:
df.head(2)

Unnamed: 0,Nummer,Einheit,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,...,Vatikanstadt,Venezuela,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe
0,8100,Schweiz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ZH,- Zürich,-2.014882,22.516411,3.666642,-2.14365,-27.162818,-78.968424,-17.191365,-12.63807,...,-100.0,13.753983,-85.056512,29.2169,25.545908,-30.837647,-100.0,-43.214746,54.743273,7.609412


In [45]:
#Make sure we don't have any NaN in there
df = df[df['Einheit'].notnull()]

### Load some helper files

In [46]:
#Load the numbers and abbreviations file for the cantons
df_kant_ids = pd.read_csv('HelperData/Kantone-ABK-ID.csv')
df_kant_ids.head(2)

Unnamed: 0,Kanton_ABK,Kanton_ID,Kanton_Name,Kanton_Name2
0,ZH,1,Zürich,Zürich
1,BE,2,Bern,Bern / Berne


In [47]:
#Load the canton/district infos for the communes
df_gem_ids = pd.read_excel('HelperData/Gemeindestand-2015.xls')
df_gem_ids.head(2)

Unnamed: 0,Hist.-Nummer,Kanton,Bezirksnummer,Bezirksname,BFS Gde-nummer,Gemeindename,Datum der Aufnahme
0,13256,ZH,101,Affoltern,1,Aeugst am Albis,1976-11-15
1,11742,ZH,101,Affoltern,2,Affoltern am Albis,1960-01-01


In [48]:
df_gem_ids.pop('Hist.-Nummer')
df_gem_ids.pop('Gemeindename')
df_gem_ids.pop('Datum der Aufnahme')
df_gem_ids.rename(columns={'Bezirksnummer': 'Bezirk_ID', 'Bezirksname': 'Bezirk_Name', 'BFS Gde-nummer': 'Gemeinde_ID', 'Kanton': 'Kanton_ABK'}, inplace=True)
df_gem_ids.head(2)

Unnamed: 0,Kanton_ABK,Bezirk_ID,Bezirk_Name,Gemeinde_ID
0,ZH,101,Affoltern,1
1,ZH,101,Affoltern,2


In [49]:
#Construct a set of district ids
df_bez_ids = df_gem_ids.drop_duplicates(subset='Bezirk_ID')
df_bez_ids.pop('Gemeinde_ID')
df_bez_ids.head(2)

Unnamed: 0,Kanton_ABK,Bezirk_ID,Bezirk_Name
0,ZH,101,Affoltern
14,ZH,102,Andelfingen


## Get a dataframe for each geopraphic level

### Separate Switzerland

In [50]:
#Switzerland
df_ch = df[df['Einheit'] == 'Schweiz']
df_ch.pop('Einheit')
df_ch.pop('Nummer')
df_ch.head(2)

Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vatikanstadt,Venezuela,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
df_ch.shape

(1, 200)

### Separate the cantons

In [52]:
#Cantons - there should be 26 of them
df_kant = df[df['Einheit'].str.contains('^- ')]
df_kant['Kanton_Name'] = df_kant['Einheit'].str.extract('^- (.*)', expand=True)
df_kant.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Nummer,Einheit,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,...,Venezuela,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe,Kanton_Name
1,ZH,- Zürich,-2.014882,22.516411,3.666642,-2.14365,-27.162818,-78.968424,-17.191365,-12.63807,...,13.753983,-85.056512,29.2169,25.545908,-30.837647,-100.0,-43.214746,54.743273,7.609412,Zürich
183,BE,- Bern / Berne,12.104217,15.049524,-2.952041,-14.556518,3.57525,-100.0,6.748405,88.862562,...,-29.265529,-67.694562,-58.988405,-67.335148,9.887826,-100.0,-34.527645,-66.148722,85.772786,Bern / Berne


In [53]:
#Add the ID and Abbreviations
df_kant = df_kant.merge(df_kant_ids, how='inner', left_on='Kanton_Name', right_on='Kanton_Name2')
df_kant.pop('Kanton_Name2')
df_kant.pop('Einheit')
df_kant.pop('Kanton_Name_y')
df_kant.pop('Nummer')
df_kant.rename(columns={'Kanton_Name_x': 'Kanton_Name'}, inplace=True)
df_kant.head(2)

Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe,Kanton_Name,Kanton_ABK,Kanton_ID
0,-2.014882,22.516411,3.666642,-2.14365,-27.162818,-78.968424,-17.191365,-12.63807,-100.0,27.758798,...,29.2169,25.545908,-30.837647,-100.0,-43.214746,54.743273,7.609412,Zürich,ZH,1
1,12.104217,15.049524,-2.952041,-14.556518,3.57525,-100.0,6.748405,88.862562,-61.02836,-40.782494,...,-58.988405,-67.335148,9.887826,-100.0,-34.527645,-66.148722,85.772786,Bern / Berne,BE,2


In [54]:
df_kant.shape

(26, 203)

### Separate the districts

In [55]:
#Districts - there should be 148 of them
df_bez = df[df['Einheit'].str.contains('^>> ')]
df_bez['Bezirk_Name'] = df_bez['Einheit'].str.extract('^>> (.*)', expand=True)
df_bez['Bezirk_ID'] = df_bez['Nummer'].str.extract('00(.*)', expand=True).astype(int)
df_bez.pop('Nummer')
df_bez.pop('Einheit')
df_bez.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe,Bezirk_Name,Bezirk_ID
2,9.073742,-32.732607,-21.08941,-49.254039,-56.630987,-100.0,-65.13699,-100.0,-100.0,-36.144427,...,-100.0,-26.256687,22.479944,-96.339937,-100.0,-100.0,-100.0,21.001617,Bezirk Affoltern,101
17,15.506005,8.502907,-67.042668,-100.0,-93.413365,-100.0,-100.0,-100.0,-100.0,-69.520523,...,-100.0,-69.200803,-38.679446,-63.312637,-100.0,-100.0,-100.0,83.770262,Bezirk Andelfingen,102


In [56]:
df_bez.shape

(148, 202)

### Separate the communes

In [57]:
#Gemeinden - there should be 2324 of them
df_gem = df[df['Einheit'].str.contains('^\.\.\.\.\.\.')]
df_gem['Gemeinde_ID'] = df_gem['Einheit'].str.extract('^\.\.\.\.\.\.(\d\d\d\d)', expand=True).astype(int)
df_gem['Gemeinde_Name'] = df_gem['Einheit'].str.extract('^\.\.\.\.\.\.\d\d\d\d (.*)', expand=True)
df_gem.pop('Nummer')
df_gem.pop('Einheit')
df_gem.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe,Gemeinde_ID,Gemeinde_Name
3,15.691469,-100.0,242.583229,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,118.284064,151.609935,-100.0,-100.0,-100.0,-100.0,-100.0,1,Aeugst am Albis
4,-3.246189,32.90269,44.913174,-10.536622,-47.869495,-100.0,-80.793037,-100.0,-100.0,-19.589163,...,-100.0,-63.066217,-23.369431,-100.0,-100.0,-100.0,-100.0,-3.035602,2,Affoltern am Albis


In [58]:
df_gem.shape

(2324, 202)

## Export the data

In [59]:
#Switzerland
df_ch.to_csv('StructuredData/CH-Nationalitaet.csv', index=False)
df_temp = pd.read_csv('StructuredData/CH-Nationalitaet.csv')
df_temp.head(2)

Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vatikanstadt,Venezuela,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
#Cantons
df_kant.to_csv('StructuredData/kant-Nationalitaet.csv', index=False)
df_temp = pd.read_csv('StructuredData/kant-Nationalitaet.csv')
df_temp.head(2)

Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe,Kanton_Name,Kanton_ABK,Kanton_ID
0,-2.014882,22.516411,3.666642,-2.14365,-27.162818,-78.968424,-17.191365,-12.63807,-100.0,27.758798,...,29.2169,25.545908,-30.837647,-100.0,-43.214746,54.743273,7.609412,Zürich,ZH,1
1,12.104217,15.049524,-2.952041,-14.556518,3.57525,-100.0,6.748405,88.862562,-61.02836,-40.782494,...,-58.988405,-67.335148,9.887826,-100.0,-34.527645,-66.148722,85.772786,Bern / Berne,BE,2


In [61]:
#Districts
df_bez.to_csv('StructuredData/bez-Nationalitaet.csv', index=False)
df_temp = pd.read_csv('StructuredData/bez-Nationalitaet.csv')
df_temp.head(2)

Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe,Bezirk_Name,Bezirk_ID
0,9.073742,-32.732607,-21.08941,-49.254039,-56.630987,-100.0,-65.13699,-100.0,-100.0,-36.144427,...,-100.0,-26.256687,22.479944,-96.339937,-100.0,-100.0,-100.0,21.001617,Bezirk Affoltern,101
1,15.506005,8.502907,-67.042668,-100.0,-93.413365,-100.0,-100.0,-100.0,-100.0,-69.520523,...,-100.0,-69.200803,-38.679446,-63.312637,-100.0,-100.0,-100.0,83.770262,Bezirk Andelfingen,102


In [62]:
#Communes
df_gem.to_csv('StructuredData/gem-Nationalitaet.csv', index=False)
df_temp = pd.read_csv('StructuredData/gem-Nationalitaet.csv')
df_temp.head(2)

Unnamed: 0,Schweiz,Afghanistan,Ägypten,Albanien,Algerien,Andorra,Angola,Antigua und Barbuda,Äquatorialguinea,Argentinien,...,Vereinigte Arabische Emirate,Vereinigte Staaten,Vereinigtes Königreich,Vietnam,Westsahara,Zentralafrikanische Republik,Zypern,Ohne Angabe,Gemeinde_ID,Gemeinde_Name
0,15.691469,-100.0,242.583229,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,118.284064,151.609935,-100.0,-100.0,-100.0,-100.0,-100.0,1,Aeugst am Albis
1,-3.246189,32.90269,44.913174,-10.536622,-47.869495,-100.0,-80.793037,-100.0,-100.0,-19.589163,...,-100.0,-63.066217,-23.369431,-100.0,-100.0,-100.0,-100.0,-3.035602,2,Affoltern am Albis
