In [57]:
import sys 
sys.path.append("../../") # go to parent dir from customFunctions import *
import statbot_helpers as sbh
from pyaxis import pyaxis
import pandas as pd

In [2]:
# bfs_nr of dataset
BFS_NR = 'px-x-0102010000_101'

STICHTAG =  '2020-10-18'

# get asset_nr
asset_nr = sbh.get_bfs_asset_nr(BFS_NR)
BFS_URL = "https://www.bfs.admin.ch/bfsstatic/dam/assets/" + str(asset_nr) + "/master"

# load data from bfs
px = pyaxis.parse(BFS_URL, encoding='ISO-8859-2')

# clean df
df = px['DATA'].loc[px['DATA']['Bevölkerungstyp'] == 'Ständige Wohnbevölkerung']
df = df.drop(columns=['Bevölkerungstyp'])
df = df.rename(columns={"Kanton (-) / Bezirk (>>) / Gemeinde (......)": "name"})


# add column with spatialunit_ontology
df.loc[df['name'].str.startswith("......"), "spatialunit_ontology"] = "A.ADM3"
#df.loc[df['name'].str.startswith(">>"), "spatialunit_ontology"] = "A.ADM2"
#df.loc[df['name'].str.startswith("-"), "spatialunit_ontology"] = "A.ADM1"
#df.loc[df['name'].str.startswith("Schweiz"), "spatialunit_ontology"] = "CH"

df = df.dropna(subset=['spatialunit_ontology'])

df['spatialunit_ontology'] = df['spatialunit_ontology'].astype('category') # reduce memory footprint


# extract bfs_nr and name for gemeinde
df.loc[df['spatialunit_ontology'] == 'A.ADM3', 'bfs_nr'] = df['name'].str.slice(6,11)
df.loc[df['spatialunit_ontology'] == 'A.ADM3', 'name'] = df['name'].str.slice(11)

df['bfs_nr'] = df['bfs_nr'].astype('int16').astype(str)

# extract name for bezirk
#df.loc[df['spatialunit_ontology'] == 'A.ADM2', 'name'] = df['name'].str.slice(3)

# extract name for kanton
#df.loc[df['spatialunit_ontology'] == 'A.ADM1', 'name'] = df['name'].str.slice(2)

# convert origin
df.loc[df['Staatsangehörigkeit (Kategorie)'] == 'Staatsangehörigkeit (Kategorie) - Total', 'Staatsangehörigkeit (Kategorie)'] = '-1'
df.loc[df['Staatsangehörigkeit (Kategorie)'] == 'Schweiz', 'Staatsangehörigkeit (Kategorie)'] = '1'
df.loc[df['Staatsangehörigkeit (Kategorie)'] == 'Ausland', 'Staatsangehörigkeit (Kategorie)'] = '2'

# convert sex
df.loc[df['Geschlecht'] == 'Geschlecht - Total', 'Geschlecht'] = '-1'
df.loc[df['Geschlecht'] == 'Mann', 'Geschlecht'] = '1'
df.loc[df['Geschlecht'] == 'Frau', 'Geschlecht'] = '2'

df['Geschlecht'] = df['Geschlecht'].astype('category') # reduce memory footprint


# convert age
df['Alter'] = df['Alter'].str.split(" ", n=1, expand=True)[0]
df.loc[df['Alter'] == 'Alter', 'Alter'] = '-1'

df['DATA'] = df['DATA'].astype('int32')
df['Alter'] = df['Alter'].astype('int16')

df['Jahr'] += '-12-31'

df['period_value'] = ''
df['placeholder_dims'] = ''

df.columns = ['time_value', 'spatialunit_name', 'origin', 'sex', 'age', 
              'value', 'spatialunit_ontology', 'spatialunit_current_id', 'period_value', 'placeholder_dims']

df = df[['spatialunit_ontology', 'spatialunit_name', 
         'time_value', 'period_value', 'value', 'placeholder_dims', 'origin', 'sex', 'age', 'spatialunit_current_id']]


In [16]:
import pandas as pd


def convert_current_to_hist_id(df, reference_point):
  spatial_unit_table = pd.read_csv("/home/b105p02@ji.ktzh.ch/gitrepos/statbot/data/spatialunits.csv", usecols=["spatialunit_ontology","spatialunit_hist_id","spatialunit_current_id","valid_from","valid_until"])
  
  spatial_unit_table['valid_from'] = pd.to_datetime(spatial_unit_table['valid_from'], format = '%Y-%m-%d')
  
  try:
    spatial_unit_table['valid_until'] = pd.to_datetime(spatial_unit_table['valid_until'], format = '%Y-%m-%d')
  except:
    spatial_unit_table['valid_until'] = pd.to_datetime('2262-04-11')
  
  
  reference_point = pd.to_datetime(reference_point, format = '%Y-%m-%d')
                                   
  spatial_unit_table = spatial_unit_table.loc[(spatial_unit_table['valid_from'] <= reference_point) & (spatial_unit_table['valid_until'] >= reference_point)]

  
  #df<-merge(df,spatial_unit_table,by=c("spatialunit_ontology","spatialunit_current_id"),all.x=T,sort=F)

  df['spatialunit_ontology'] = df['spatialunit_ontology'].astype(str)
  spatial_unit_table['spatialunit_ontology'] = spatial_unit_table['spatialunit_ontology'].astype(str)

  df['spatialunit_current_id'] = df['spatialunit_current_id'].astype(str)
  spatial_unit_table['spatialunit_current_id'] = spatial_unit_table['spatialunit_current_id'].astype(str)

  df = pd.merge(df, spatial_unit_table, how='left', on=['spatialunit_ontology', 'spatialunit_current_id'])
  
  df = df.drop(columns=['valid_from', 'valid_until'])
  
  return df



In [None]:
df_clean = convert_current_to_hist_id(df, '2021-12-31')

In [30]:
df_clean.head()

Unnamed: 0,spatialunit_ontology,spatialunit_name,time_value,period_value,value,placeholder_dims,origin,sex,age,spatialunit_current_id,spatialunit_hist_id
0,A.ADM3,Aeugst am Albis,2010-12-31,,1824,,-1,-1,-1,1,11348
1,A.ADM3,Aeugst am Albis,2010-12-31,,1824,,-1,-1,-1,1,13256
2,A.ADM3,Aeugst am Albis,2010-12-31,,14,,-1,-1,0,1,11348
3,A.ADM3,Aeugst am Albis,2010-12-31,,14,,-1,-1,0,1,13256
4,A.ADM3,Aeugst am Albis,2010-12-31,,18,,-1,-1,1,1,11348


In [51]:
df_try = df_clean.head(50000)
df_try

Unnamed: 0,spatialunit_ontology,spatialunit_name,time_value,period_value,value,placeholder_dims,origin,sex,age,spatialunit_current_id,spatialunit_hist_id
0,A.ADM3,Aeugst am Albis,2010-12-31,,1824,,-1,-1,-1,1,11348
1,A.ADM3,Aeugst am Albis,2010-12-31,,1824,,-1,-1,-1,1,13256
2,A.ADM3,Aeugst am Albis,2010-12-31,,14,,-1,-1,0,1,11348
3,A.ADM3,Aeugst am Albis,2010-12-31,,14,,-1,-1,0,1,13256
4,A.ADM3,Aeugst am Albis,2010-12-31,,18,,-1,-1,1,1,11348
...,...,...,...,...,...,...,...,...,...,...,...
49995,A.ADM3,Oberembrach,2010-12-31,,4,,1,1,14,65,12524
49996,A.ADM3,Oberembrach,2010-12-31,,5,,1,1,15,65,12524
49997,A.ADM3,Oberembrach,2010-12-31,,5,,1,1,16,65,12524
49998,A.ADM3,Oberembrach,2010-12-31,,9,,1,1,17,65,12524


In [56]:
# def add_granularity_levels_up(df,list_ontologies):

list_ontologies = ['A.ADM2', 'A.ADM1', 'CH']

spatial_unit_table = pd.read_csv("/home/b105p02@ji.ktzh.ch/gitrepos/statbot/data/spatialunits.csv", 
                                 usecols=["spatialunit_ontology","spatialunit_hist_id","canton_hist_id","district_hist_id"])

spatial_unit_table[['canton_hist_id', 'district_hist_id']] = spatial_unit_table[['canton_hist_id', 'district_hist_id']].astype('Int64')


df_try_out = df_try.copy()

df_try = df_try[df_try['spatialunit_ontology'] == 'A.ADM3']

df_try = pd.merge(df_try, spatial_unit_table, how='left', on=['spatialunit_ontology', 'spatialunit_hist_id'])


if 'A.ADM2' in list_ontologies:
    list_to_group = ["district_hist_id","time_value"]
    adm2 = df_try.groupby(by=list_to_group)['value'].sum()



  df_try = pd.merge(df_try, spatial_unit_table, how='left', on=['spatialunit_ontology', 'spatialunit_hist_id'])


KeyError: 'district_hist_id'

In [32]:
spatial_unit_table

Unnamed: 0,spatialunit_ontology,spatialunit_hist_id,canton_hist_id,district_hist_id
0,CH,0,,
1,A.ADM1,1,1.0,
2,A.ADM1,2,2.0,
3,A.ADM1,3,3.0,
4,A.ADM1,4,4.0,
...,...,...,...,...
6185,A.ADM4,1212,12.0,
6186,A.ADM4,1214,12.0,
6187,A.ADM4,1216,12.0,
6188,A.ADM4,1217,12.0,
