In [1]:
import subprocess

# Absolute path of the git repository root; all data/temp paths below are built from it.
# check=True makes a failing git call (e.g. not inside a repository) raise
# CalledProcessError right here instead of silently yielding an empty repo_dir.
repo_dir = subprocess.run(
    ['git', 'rev-parse', '--show-toplevel'],
    stdout=subprocess.PIPE,
    check=True,
).stdout.rstrip().decode('utf-8')

In [2]:
import sys 
# Put the repository root on the module search path so that project-local
# modules can be imported (presumably where statbot_helpers lives — TODO confirm).
sys.path.append(repo_dir)
import statbot_helpers as sbh
from pyaxis import pyaxis
import pandas as pd
import numpy as np
import os
import requests
import json

In [3]:
# BFS identifier of the px cube to fetch
BFS_NR = 'px-x-0102020204_102'

# suffix appended to every year to turn it into a full date (-%m-%d)
STICHTAG =  '-12-31'

# resolve the asset number of the cube and build its download URL
asset_nr = sbh.get_bfs_asset_nr(BFS_NR)
BFS_URL = "https://www.bfs.admin.ch/bfsstatic/dam/assets/" + str(asset_nr) + "/master"

file_name="bfs_1_01_001_CH.px"

# download cube
r = requests.get(BFS_URL)
# stop on an HTTP error instead of storing the error response body as a .px file
r.raise_for_status()

# make sure the target directory exists, then write and close the file explicitly
temp_dir = os.path.join(repo_dir, "temp")
os.makedirs(temp_dir, exist_ok=True)
with open(os.path.join(temp_dir, file_name), 'wb') as px_file:
    px_file.write(r.content)

17093511

In [4]:
# Parse the downloaded px cube; the result is indexed with 'DATA' below.
# NOTE(review): ISO-8859-2 is the Central European code page — confirm it matches
# the cube's declared charset, otherwise accented municipality names may be mangled.
px_path = os.path.join(repo_dir, "temp/" + file_name)
px = pyaxis.parse(px_path, encoding='ISO-8859-2')

In [5]:
# Inspect the raw data table of the parsed cube (one row per combination of
# year, spatial unit, sex, nationality category and age class of the mother).
px['DATA']

Unnamed: 0,Jahr,Kanton (-) / Bezirk (>>) / Gemeinde (......),Geschlecht des Kindes,Staatsangehörigkeit (Kategorie) des Kindes,Altersklasse der Mutter,DATA
0,1969,Schweiz,Geschlecht des Kindes - Total,Staatsangehörigkeit des Kindes - Total,Altersklasse der Mutter - Total,102520
1,1969,Schweiz,Geschlecht des Kindes - Total,Staatsangehörigkeit des Kindes - Total,Unter 25 Jahren,33884
2,1969,Schweiz,Geschlecht des Kindes - Total,Staatsangehörigkeit des Kindes - Total,25-29 Jahre,36206
3,1969,Schweiz,Geschlecht des Kindes - Total,Staatsangehörigkeit des Kindes - Total,30-34 Jahre,20479
4,1969,Schweiz,Geschlecht des Kindes - Total,Staatsangehörigkeit des Kindes - Total,35-39 Jahre,9077
...,...,...,...,...,...,...
6649339,2020,......6810 La Baroche,Mädchen,Ausland,Unter 25 Jahren,0
6649340,2020,......6810 La Baroche,Mädchen,Ausland,25-29 Jahre,0
6649341,2020,......6810 La Baroche,Mädchen,Ausland,30-34 Jahre,0
6649342,2020,......6810 La Baroche,Mädchen,Ausland,35-39 Jahre,0


In [7]:
# Clean the raw cube and bring it into the statbot column layout.
#
# Produces:
#   df       municipality-level rows only (ontology 'A.ADM3'), recoded dimensions
#   maximum  latest year present in the data, taken before STICHTAG is appended


def _recode(frame, column, codes):
    """Replace every label of `column` by its code, in place.

    Labels that are not a key of `codes` are left untouched.
    """
    for label, code in codes.items():
        frame.loc[frame[column] == label, column] = code


SEX_COL = 'Geschlecht des Kindes'
ORIGIN_COL = 'Staatsangehörigkeit (Kategorie) des Kindes'
AGE_COL = 'Altersklasse der Mutter'

df = px['DATA'].rename(columns={"Kanton (-) / Bezirk (>>) / Gemeinde (......)": "name"})

# Municipalities are the rows whose name is prefixed with six dots. Districts (">>"),
# cantons ("-") and the national total ("Schweiz") get no ontology and are dropped here;
# presumably they are rebuilt later by sbh.add_granularity_levels_up — TODO confirm.
df.loc[df['name'].str.startswith("......"), "spatialunit_ontology"] = "A.ADM3"

# .copy() yields an independent frame so the column assignments below neither touch
# the parsed cube nor raise SettingWithCopyWarning
df = df.dropna(subset=['spatialunit_ontology']).copy()
df['spatialunit_ontology'] = df['spatialunit_ontology'].astype('category')  # reduce memory footprint

# A municipality label looks like "......6810 La Baroche": characters 6-10 hold the
# BFS number plus the separating blank, everything after that is the name.
is_adm3 = df['spatialunit_ontology'] == 'A.ADM3'
df.loc[is_adm3, 'bfs_nr'] = df['name'].str.slice(6, 11)
df.loc[is_adm3, 'name'] = df['name'].str.slice(11)
df['bfs_nr'] = df['bfs_nr'].astype('int16')  # assumes BFS numbers stay below 32768

# convert origin
_recode(df, ORIGIN_COL, {
    'Staatsangehörigkeit des Kindes - Total': '-1',
    'Schweiz': '1',
    'Ausland': '2',
})

# convert sex
_recode(df, SEX_COL, {
    'Geschlecht des Kindes - Total': '-1',
    'Knabe': '1',
    'Mädchen': '2',
})
df[SEX_COL] = df[SEX_COL].astype('category')  # reduce memory footprint

# convert age: only the first word of the label is needed to identify the class
df[AGE_COL] = df[AGE_COL].str.split(" ", n=1, expand=True)[0]
_recode(df, AGE_COL, {
    'Altersklasse': '-1',  # "Altersklasse der Mutter - Total"
    'Unter': '1',          # "Unter 25 Jahren"
    '25-29': '2',
    '30-34': '3',
    '35-39': '4',
    '40': '5',
})

df['DATA'] = df['DATA'].astype('int32')
df[AGE_COL] = df[AGE_COL].astype('int32')

# remember the latest year before it is turned into a date string
maximum = df['Jahr'].max()
df['Jahr'] = df['Jahr'] + STICHTAG

# np.nan instead of np.NAN: the upper-case alias was removed in NumPy 2.0
df['period_value'] = np.nan

# rename by name rather than by position, so a changed column order in the
# source cube cannot silently mislabel the data
df = df.rename(columns={
    'Jahr': 'time_value',
    'name': 'spatialunit_name',
    SEX_COL: 'gender',
    ORIGIN_COL: 'origin',
    AGE_COL: 'age_classes_of_mother',
    'DATA': 'value',
    'bfs_nr': 'spatialunit_current_id',
})

df = df[['spatialunit_ontology', 'spatialunit_name',
         'time_value', 'period_value', 'value', 'origin', 'gender', 'age_classes_of_mother', 'spatialunit_current_id']]


df

Unnamed: 0,spatialunit_ontology,spatialunit_name,time_value,period_value,value,origin,gender,age_classes_of_mother,spatialunit_current_id
162,A.ADM3,Aeugst am Albis,1969-12-31,,13,-1,-1,-1,1
163,A.ADM3,Aeugst am Albis,1969-12-31,,2,-1,-1,1,1
164,A.ADM3,Aeugst am Albis,1969-12-31,,6,-1,-1,2,1
165,A.ADM3,Aeugst am Albis,1969-12-31,,1,-1,-1,3,1
166,A.ADM3,Aeugst am Albis,1969-12-31,,3,-1,-1,4,1
...,...,...,...,...,...,...,...,...,...
6649339,A.ADM3,La Baroche,2020-12-31,,0,2,2,1,6810
6649340,A.ADM3,La Baroche,2020-12-31,,0,2,2,2,6810
6649341,A.ADM3,La Baroche,2020-12-31,,0,2,2,3,6810
6649342,A.ADM3,La Baroche,2020-12-31,,0,2,2,4,6810


In [19]:
# Sanity check: number of rows without a BFS municipality number
df['spatialunit_current_id'].isna().sum()

0

In [20]:
# Date handed to the id conversion: 18 October of the latest year in the data
# (presumably the reference state of the municipality register — TODO confirm)
spatial_reference = f"{maximum}-10-18"

# translate the current BFS municipality numbers via the spatial units table
spatialunits_csv = os.path.join(repo_dir, "data/spatialunits.csv")
df_clean = sbh.convert_current_to_hist_id(df, spatial_reference, spatialunits_csv)

# work on a copy so df_clean stays available unchanged
df_try = df_clean.copy()

# levels to add on top of the municipality rows, and the dimensions passed along
list_ontologies = ['A.ADM2', 'A.ADM1', 'CH']
list_dimensions = ['origin', 'gender', 'age_classes_of_mother']

df_new = sbh.add_granularity_levels_up(df_try, list_ontologies, repo_dir, list_dimensions)
df_new



Unnamed: 0,spatialunit_ontology,spatialunit_name,time_value,period_value,value,origin,gender,age_classes_of_mother,spatialunit_hist_id
0,A.ADM3,Aeugst am Albis,1969-12-31,,13,-1,-1,-1,13256
1,A.ADM3,Aeugst am Albis,1969-12-31,,2,-1,-1,1,13256
2,A.ADM3,Aeugst am Albis,1969-12-31,,6,-1,-1,2,13256
3,A.ADM3,Aeugst am Albis,1969-12-31,,1,-1,-1,3,13256
4,A.ADM3,Aeugst am Albis,1969-12-31,,3,-1,-1,4,13256
...,...,...,...,...,...,...,...,...,...
6576331,CH,Schweiz,2020-12-31,,874,2,2,1,0
6576332,CH,Schweiz,2020-12-31,,2733,2,2,2,0
6576333,CH,Schweiz,2020-12-31,,4320,2,2,3,0
6576334,CH,Schweiz,2020-12-31,,3291,2,2,4,0


In [27]:
# Sanity check: number of rows without an age class after adding the upper levels
df_new['age_classes_of_mother'].isna().sum()

0

In [28]:
# Historical ids of all rows that ended up without a spatial unit name
df_new.loc[df_new['spatialunit_name'].isna(), 'spatialunit_hist_id'].unique()

array([10061, 10070, 10093, 10108, 10109, 10110, 10187, 10252, 10320])

In [29]:
# Write the final table; missing values are written as the literal string 'NA'
out_csv = os.path.join(repo_dir, "data/11_11004.csv")
df_new.to_csv(out_csv, index=False, na_rep='NA')

# end the run here with exit status 0
sys.exit(0)


KeyboardInterrupt: 