In [1]:
import pandas as pd
import numpy as np
import os

from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file

In [2]:
# Relative path (resolved against the working directory) of the Excel workbook
# that the rest of the notebook reads.
source = 'source/kilm04.xlsx'

In [3]:
# Load the workbook into a DataFrame. skiprows=2 drops the first two sheet
# rows, so the third row supplies the column labels (presumably the first two
# are title/banner rows -- confirm against the workbook).
data = pd.read_excel(source, skiprows=2)

In [4]:
# Preview only the first rows: a bare `data` asks the notebook to render the
# whole table, which is more output than is needed to check the load.
data.head(10)

Unnamed: 0,Country (code),Country,Region,Sub-region (broad),Sub-region (detailed),Income group (code),Income group,B,Year,Sex (code),...,ISIC,Coverage (code),Coverage,Age,Reference period,Geographic limitation,Coverage limitation,Classification remark,Survey limitation,Notes
0,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2000,MF,...,International Standard Industrial Classificati...,T,Total,15+,,,,,,
1,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2001,MF,...,International Standard Industrial Classificati...,T,Total,15+,,,,,,
2,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2002,MF,...,International Standard Industrial Classificati...,T,Total,15+,,,,,,
3,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2003,MF,...,International Standard Industrial Classificati...,T,Total,15+,,,,,,
4,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2004,MF,...,International Standard Industrial Classificati...,T,Total,15+,,,,,,
5,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2005,MF,...,International Standard Industrial Classificati...,T,Total,15+,,,,,,
6,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2006,MF,...,International Standard Industrial Classificati...,T,Total,15+,,,,,,
7,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,b,2007,MF,...,International Standard Industrial Classificati...,T,Total,15-64,,,,,,
8,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,b,2009,MF,...,International Standard Industrial Classificati...,C,Civilian,15+,Noncalendar year,,Excluding institutional population,,,
9,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,b,2009,M,...,International Standard Industrial Classificati...,C,Civilian,15+,Noncalendar year,,Excluding institutional population,,,


In [5]:
# List every column label so the key and measure columns can be identified.
data.columns

Index(['Country (code)', 'Country', 'Region', 'Sub-region (broad)',
       'Sub-region (detailed)', 'Income group (code)', 'Income group', 'B',
       'Year', 'Sex (code)', 'Sex', 'Total employment ('000)',
       'Agriculture ('000)', 'Agriculture (%)', 'Industry ('000)',
       'Industry (%)', 'Services ('000)', 'Services (%)',
       'Not adequately defined ('000)', 'Not adequately defined (%)',
       'Repository (code)', 'Repository', 'Type of source (code)',
       'Type of source', 'ISIC (code)', 'ISIC', 'Coverage (code)', 'Coverage',
       'Age', 'Reference period', 'Geographic limitation',
       'Coverage limitation', 'Classification remark', 'Survey limitation',
       'Notes'],
      dtype='object')

In [None]:
# Country entities: unique (code, name) pairs -> ddf--entities--country.csv

In [7]:
# One row per distinct (country code, country name) pair. The explicit copy
# yields an independent frame that the following cells can edit freely.
country = data.loc[:, ['Country (code)', 'Country']].drop_duplicates().copy()

In [8]:
# Switch to the DDF column names: the code column becomes the entity key
# `country`, the label column becomes `name`.
country = country.rename(columns={'Country (code)': 'country', 'Country': 'name'})

In [9]:
# Run every country code through to_concept_id so the entity key is stored in
# concept-id form.
country = country.assign(country=country['country'].map(to_concept_id))

In [10]:
# Write the country entity table into the parent of the working directory;
# index=False keeps the pandas row index out of the CSV.
country.to_csv('../ddf--entities--country.csv', index=False)

In [12]:
# Sex entities: unique (code, name) pairs -> ddf--entities--sex.csv

In [13]:
# Sex entity table: distinct (code, name) pairs under the DDF column names,
# with the code passed through to_concept_id.
sex = (
    data.loc[:, ['Sex (code)', 'Sex']]
    .drop_duplicates()
    .rename(columns={'Sex (code)': 'sex', 'Sex': 'name'})
    .assign(sex=lambda frame: frame['sex'].map(to_concept_id))
)

# index=False keeps the pandas row index out of the CSV.
sex.to_csv('../ddf--entities--sex.csv', index=False)

In [15]:
# Concepts: descriptor/key concepts plus every measure column -> ddf--concepts.csv

In [38]:
# Non-measure concepts: the `name` descriptor plus the columns later used as
# the datapoint key (year, country, sex).
discs = ['Name', 'Year', 'Country', 'Sex']

# Measure columns: the contiguous run from the first to the last measure label
# (label slices on .loc include both end points). Selecting by label instead of
# by the positions 11:20 keeps the selection correct when columns are added or
# removed ahead of this run, and raises KeyError rather than silently picking
# the wrong columns if either boundary label is missing.
conc = data.loc[:, "Total employment ('000)":"Not adequately defined (%)"].columns

In [39]:
# Display the selected measure columns as a sanity check.
conc

Index(['Total employment ('000)', 'Agriculture ('000)', 'Agriculture (%)',
       'Industry ('000)', 'Industry (%)', 'Services ('000)', 'Services (%)',
       'Not adequately defined ('000)', 'Not adequately defined (%)'],
      dtype='object')

In [40]:
# Concept table, one row per concept: `name` holds the original label and
# `concept` the id derived from it. `concept_type` is left unset in this cell.
cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
cdf['name'] = list(discs) + list(conc)
cdf['concept'] = [to_concept_id(label) for label in cdf['name']]

In [41]:
# Assign concept types by concept name rather than by row position, so the
# result does not depend on the order or number of the non-measure rows.
# Any name without an explicit type in the mapping is treated as a measure.
cdf['concept_type'] = (
    cdf['name']
    .map({
        'Name': 'string',
        'Year': 'time',
        'Country': 'entity_domain',
        'Sex': 'entity_domain',
    })
    .fillna('measure')
)

In [42]:
# Inspect the finished concept table before it is written out.
cdf

Unnamed: 0,concept,name,concept_type
0,name,Name,string
1,year,Year,time
2,country,Country,entity_domain
3,sex,Sex,entity_domain
4,total_employment_000,Total employment ('000),measure
5,agriculture_000,Agriculture ('000),measure
6,agriculture,Agriculture (%),measure
7,industry_000,Industry ('000),measure
8,industry,Industry (%),measure
9,services_000,Services ('000),measure


In [43]:
# Write the concept table into the parent of the working directory;
# index=False keeps the pandas row index out of the CSV.
cdf.to_csv('../ddf--concepts.csv', index=False)

In [14]:
# Datapoints: one CSV per measure, keyed by country, sex and year

In [44]:
# Datapoint source: the three key columns followed by every measure column,
# copied so that later edits leave `data` untouched.
dps = data.loc[:, ['Country (code)', 'Sex (code)', 'Year'] + list(conc)].copy()

In [45]:
# Rename to DDF ids: fixed names for the key columns, to_concept_id applied to
# each measure label.
dps.columns = ['country', 'sex', 'year'] + list(map(to_concept_id, conc))

In [46]:
# Pass both entity keys through to_concept_id, the same transformation applied
# to the keys in the country and sex entity tables.
dps = dps.assign(
    country=dps['country'].map(to_concept_id),
    sex=dps['sex'].map(to_concept_id),
)

In [47]:
# Move the key columns into the index so only measure columns remain as columns.
# NOTE(review): re-running this cell on its own raises KeyError, because the key
# columns are no longer regular columns after the first run.
dps = dps.set_index(['country', 'sex', 'year'])

In [48]:
# Write one datapoint CSV per measure: each column of `dps` is a Series indexed
# by (country, sex, year); resetting the index turns the key back into columns.
# NOTE(review): rows with a missing value are written as-is (nothing is dropped)
# -- confirm that is intended.
for measure, values in dps.items():
    out_path = '../ddf--datapoints--{}--by--country--sex--year.csv'.format(measure)
    values.reset_index().to_csv(out_path, index=False)

In [49]:
# Build the index for the dataset in the parent directory. The returned table
# (key, value, file) is shown as the cell output.
# NOTE(review): presumably this also writes an index file into that directory
# -- confirm against the ddf_utils documentation.
create_index_file('../')

Unnamed: 0,key,value,file
0,concept,name,ddf--concepts.csv
1,concept,concept_type,ddf--concepts.csv
0,"country,sex,year",agriculture,ddf--datapoints--agriculture--by--country--sex...
0,"country,sex,year",agriculture_000,ddf--datapoints--agriculture_000--by--country-...
0,"country,sex,year",industry,ddf--datapoints--industry--by--country--sex--y...
0,"country,sex,year",industry_000,ddf--datapoints--industry_000--by--country--se...
0,"country,sex,year",not_adequately_defined,ddf--datapoints--not_adequately_defined--by--c...
0,"country,sex,year",not_adequately_defined_000,ddf--datapoints--not_adequately_defined_000--b...
0,"country,sex,year",services,ddf--datapoints--services--by--country--sex--y...
0,"country,sex,year",services_000,ddf--datapoints--services_000--by--country--se...
