In [2]:
import subprocess
repo_dir = subprocess.Popen(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE).communicate()[0].rstrip().decode('utf-8')

In [34]:
import sys 
sys.path.append(repo_dir) # go to parent dir from customFunctions import *
import statbot_helpers as sbh
from pyaxis import pyaxis
import pandas as pd
import numpy as np
import os
import requests
import json

In [4]:
# bfs_nr of dataset
BFS_NR = 'px-x-0102010000_101'

STICHTAG =  '-12-31' # -%m-%d

In [6]:
# get asset_nr
asset_nr = sbh.get_bfs_asset_nr(BFS_NR)
BFS_URL = "https://www.bfs.admin.ch/bfsstatic/dam/assets/" + str(asset_nr) + "/master"

file_name="bfs_1_01_001_CH.px"

In [3]:
# download cube
r = requests.get(BFS_URL)
open(os.path.join(repo_dir,"temp/"+file_name), 'wb').write(r.content)

108324271

In [7]:
# load data from bfs
px = pyaxis.parse(os.path.join(repo_dir,"temp/"+file_name), encoding='ISO-8859-2')

In [8]:


# clean df
df = px['DATA'].loc[px['DATA']['Bevölkerungstyp'] == 'Ständige Wohnbevölkerung']
df = df.drop(columns=['Bevölkerungstyp'])
df = df.rename(columns={"Kanton (-) / Bezirk (>>) / Gemeinde (......)": "name"})


# add column with spatialunit_ontology
df.loc[df['name'].str.startswith("......"), "spatialunit_ontology"] = "A.ADM3"
#df.loc[df['name'].str.startswith(">>"), "spatialunit_ontology"] = "A.ADM2"
#df.loc[df['name'].str.startswith("-"), "spatialunit_ontology"] = "A.ADM1"
#df.loc[df['name'].str.startswith("Schweiz"), "spatialunit_ontology"] = "CH"

df = df.dropna(subset=['spatialunit_ontology'])

df['spatialunit_ontology'] = df['spatialunit_ontology'].astype('category') # reduce memory footprint


# extract bfs_nr and name for gemeinde
df.loc[df['spatialunit_ontology'] == 'A.ADM3', 'bfs_nr'] = df['name'].str.slice(6,11)
df.loc[df['spatialunit_ontology'] == 'A.ADM3', 'name'] = df['name'].str.slice(11)

df['bfs_nr'] = df['bfs_nr'].astype('int16')

# extract name for bezirk
#df.loc[df['spatialunit_ontology'] == 'A.ADM2', 'name'] = df['name'].str.slice(3)

# extract name for kanton
#df.loc[df['spatialunit_ontology'] == 'A.ADM1', 'name'] = df['name'].str.slice(2)

# convert origin
df.loc[df['Staatsangehörigkeit (Kategorie)'] == 'Staatsangehörigkeit (Kategorie) - Total', 'Staatsangehörigkeit (Kategorie)'] = '-1'
df.loc[df['Staatsangehörigkeit (Kategorie)'] == 'Schweiz', 'Staatsangehörigkeit (Kategorie)'] = '1'
df.loc[df['Staatsangehörigkeit (Kategorie)'] == 'Ausland', 'Staatsangehörigkeit (Kategorie)'] = '2'

# convert sex
df.loc[df['Geschlecht'] == 'Geschlecht - Total', 'Geschlecht'] = '-1'
df.loc[df['Geschlecht'] == 'Mann', 'Geschlecht'] = '1'
df.loc[df['Geschlecht'] == 'Frau', 'Geschlecht'] = '2'

df['Geschlecht'] = df['Geschlecht'].astype('category') # reduce memory footprint


# convert age
df['Alter'] = df['Alter'].str.split(" ", n=1, expand=True)[0]
df.loc[df['Alter'] == 'Alter', 'Alter'] = '-1'

df['DATA'] = df['DATA'].astype('int32')
df['Alter'] = df['Alter'].astype('int16')

maximum=max(df['Jahr'])
df['Jahr'] += STICHTAG

df['period_value'] = np.NAN

df.columns = ['time_value', 'spatialunit_name', 'origin', 'sex', 'age', 
              'value', 'spatialunit_ontology', 'spatialunit_current_id', 'period_value']

df = df[['spatialunit_ontology', 'spatialunit_name', 
         'time_value', 'period_value', 'value', 'origin', 'sex', 'age', 'spatialunit_current_id']]


In [10]:
df.to_csv('df.csv')

In [11]:
test = df.head(100000).copy()

In [12]:
spatial_reference=str(maximum)+"-10-18"

In [13]:
df_clean = sbh.convert_current_to_hist_id(test, spatial_reference,os.path.join(repo_dir,"data/spatialunits.csv"))

In [14]:
df_try = df_clean.copy()
df_try

Unnamed: 0,spatialunit_ontology,spatialunit_name,time_value,period_value,value,origin,sex,age,spatialunit_current_id,spatialunit_hist_id
0,A.ADM3,Aeugst am Albis,2010-12-31,,1824,-1,-1,-1,1,13256
1,A.ADM3,Aeugst am Albis,2010-12-31,,14,-1,-1,0,1,13256
2,A.ADM3,Aeugst am Albis,2010-12-31,,18,-1,-1,1,1,13256
3,A.ADM3,Aeugst am Albis,2010-12-31,,13,-1,-1,2,1,13256
4,A.ADM3,Aeugst am Albis,2010-12-31,,20,-1,-1,3,1,13256
...,...,...,...,...,...,...,...,...,...,...
99995,A.ADM3,Uetikon am See,2010-12-31,,12,2,2,34,159,13259
99996,A.ADM3,Uetikon am See,2010-12-31,,16,2,2,35,159,13259
99997,A.ADM3,Uetikon am See,2010-12-31,,8,2,2,36,159,13259
99998,A.ADM3,Uetikon am See,2010-12-31,,13,2,2,37,159,13259


In [15]:
test.shape

(100000, 9)

In [16]:
df_try.shape

(100000, 10)

In [17]:
df_try['spatialunit_hist_id'].isnull().sum()

0

In [132]:
list_ontologies = ['A.ADM2', 'A.ADM1', 'CH']

list_dimensions = ['origin', 'sex', 'age']

#df_adm2 = sbh.add_granularity_levels_up(df_try,list_ontologies, repo_dir, list_dimensions)



In [133]:
df_try = df_clean.copy()
del df_try['spatialunit_current_id']

In [134]:
spatial_unit_table = pd.read_csv(os.path.join(repo_dir,"data/spatialunits.csv"), 
                                 usecols=["spatialunit_ontology","spatialunit_hist_id","canton_hist_id","district_hist_id"])

spatial_unit_table[['canton_hist_id', 'district_hist_id']] = spatial_unit_table[['canton_hist_id', 'district_hist_id']].astype('Int64')


df_out = df_try.copy()

df_try = df_try[df_try['spatialunit_ontology'] == 'A.ADM3']

df_try = pd.merge(df_try, spatial_unit_table, how='left', on=['spatialunit_ontology', 'spatialunit_hist_id'])

with open(os.path.join(repo_dir,"constants/"+"constants.json")) as f:
    GLOBAL_TOTAL_LIST = json.load(f)

allcols = list(GLOBAL_TOTAL_LIST['GLOBAL_TOTAL_LIST']) + list_dimensions
allcols


['spatialunit_ontology',
 'spatialunit_hist_id',
 'spatialunit_name',
 'time_value',
 'period_value',
 'value',
 'origin',
 'sex',
 'age']

In [135]:
df_out

Unnamed: 0,spatialunit_ontology,spatialunit_name,time_value,period_value,value,origin,sex,age,spatialunit_hist_id
0,A.ADM3,Aeugst am Albis,2010-12-31,,1824,-1,-1,-1,13256
1,A.ADM3,Aeugst am Albis,2010-12-31,,14,-1,-1,0,13256
2,A.ADM3,Aeugst am Albis,2010-12-31,,18,-1,-1,1,13256
3,A.ADM3,Aeugst am Albis,2010-12-31,,13,-1,-1,2,13256
4,A.ADM3,Aeugst am Albis,2010-12-31,,20,-1,-1,3,13256
...,...,...,...,...,...,...,...,...,...
99995,A.ADM3,Uetikon am See,2010-12-31,,12,2,2,34,13259
99996,A.ADM3,Uetikon am See,2010-12-31,,16,2,2,35,13259
99997,A.ADM3,Uetikon am See,2010-12-31,,8,2,2,36,13259
99998,A.ADM3,Uetikon am See,2010-12-31,,13,2,2,37,13259


In [125]:
if 'A.ADM2' in list_ontologies:
    list_to_group = ["district_hist_id","time_value"] + list_dimensions
    df_adm2 = df_try.groupby(by=list_to_group).agg({'value': ['sum']}).reset_index()
    df_adm2['spatialunit_ontology'] = 'A.ADM2'
    df_adm2['spatialunit_hist_id'] = spatial_unit_table['district_hist_id']
    df_adm2['spatialunit_name'] = np.NAN  #translate_to_spatial_unit_name(df_asdm2, "de")
    df_adm2['period_value'] = np.NAN
    df_out =  pd.concat([df_out, df_adm2[allcols]], ignore_index=True)

In [129]:
type(df_adm2[allcols])
type(df_out)

pandas.core.frame.DataFrame

In [130]:
df_out

Unnamed: 0,"(age, )","(origin, )","(period_value, )","(sex, )","(spatialunit_hist_id, )","(spatialunit_name, )","(spatialunit_ontology, )","(time_value, )","(value, sum)",age,origin,period_value,sex,spatialunit_hist_id,spatialunit_name,spatialunit_ontology,time_value,value
0,,,,,,,,,,-1.0,-1,,-1,13256.0,Aeugst am Albis,A.ADM3,2010-12-31,1824.0
1,,,,,,,,,,0.0,-1,,-1,13256.0,Aeugst am Albis,A.ADM3,2010-12-31,14.0
2,,,,,,,,,,1.0,-1,,-1,13256.0,Aeugst am Albis,A.ADM3,2010-12-31,18.0
3,,,,,,,,,,2.0,-1,,-1,13256.0,Aeugst am Albis,A.ADM3,2010-12-31,13.0
4,,,,,,,,,,3.0,-1,,-1,13256.0,Aeugst am Albis,A.ADM3,2010-12-31,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106421,96.0,2,,2,,,A.ADM2,2010-12-31,0.0,,,,,,,,,
106422,97.0,2,,2,,,A.ADM2,2010-12-31,0.0,,,,,,,,,
106423,98.0,2,,2,,,A.ADM2,2010-12-31,0.0,,,,,,,,,
106424,99.0,2,,2,,,A.ADM2,2010-12-31,0.0,,,,,,,,,
