In [1]:
import pandas as pd
import numpy as np
import os

from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file

In [2]:
# Relative path to the source Excel workbook that the rest of the notebook reads.
source = 'source/kilm03.xlsx'

In [3]:
# Load the workbook into a DataFrame. skiprows=2 skips the first two rows of
# the sheet, so the column header is taken from the third row.
data = pd.read_excel(source, skiprows=2)

In [4]:
# Display the first five rows as a quick look at the parsed columns.
data.head()

Unnamed: 0,Country (code),Country,Region,Sub-region (broad),Sub-region (detailed),Income group (code),Income group,B,Year,Sex (code),...,Survey limitation,Type of source (code),Type of source,Coverage (code),Coverage,Reference period,Geographic limitation,Coverage limitation,Classification remark,Notes
0,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2007,MF,...,,LFS,Labour force survey,T,Total,,,,,
1,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2007,M,...,,LFS,Labour force survey,T,Total,,,,,
2,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2007,F,...,,LFS,Labour force survey,T,Total,,,,,
3,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2008,MF,...,,LFS,Labour force survey,T,Total,,,,,
4,ALB,Albania,Europe and Central Asia,"Northern, Southern and Western Europe",Southern Europe,III,Upper-middle income,,2008,M,...,,LFS,Labour force survey,T,Total,,,,,


In [5]:
# List every column label in order; `conc` is later selected from this Index by position.
data.columns

Index(['Country (code)', 'Country', 'Region', 'Sub-region (broad)',
       'Sub-region (detailed)', 'Income group (code)', 'Income group', 'B',
       'Year', 'Sex (code)', 'Sex', 'Total employment ('000)',
       'Wage & salaried workers (employees) ('000)',
       'Wage & salaried workers (employees) (%)',
       'Total self-employed workers (a + b + c + d) ('000)',
       'Total self-employed workers (a + b + c + d) (%)',
       'Employers (a) ('000)', 'Employers (a) (%)',
       'Own-account workers (b) ('000)', 'Own-account workers (b) (%)',
       'Members of producers' cooperatives (c) ('000)',
       'Members of producers' cooperatives (c) (%)',
       'Contributing family workers (d) ('000)',
       'Contributing family workers (d) (%)', 'Not classified ('000)',
       'Not classified (%)', 'Persons in vulnerable employment (b + d) ('000)',
       'Share of vulnerable employment in total employment (b + d) (%)',
       'Repository (code)', 'Repository', 'Age', 'Survey limitati

In [6]:
# country: entity table built from the distinct country codes and names

In [7]:
# Distinct (code, name) pairs form the country entity table. The code column
# is passed through to_concept_id and becomes the `country` id column.
country = (
    data.loc[:, ['Country (code)', 'Country']]
    .drop_duplicates()
    .rename(columns={'Country (code)': 'country', 'Country': 'name'})
    .assign(country=lambda pairs: pairs['country'].map(to_concept_id))
)

In [9]:
# Write the country entities to the parent directory, without the index column.
country.to_csv('../ddf--entities--country.csv', index=False)

In [None]:
# sex: entity table built from the distinct sex codes and names

In [10]:
# Distinct (code, name) pairs form the sex entity table. The code column is
# passed through to_concept_id and becomes the `sex` id column.
sex = (
    data.loc[:, ['Sex (code)', 'Sex']]
    .drop_duplicates()
    .rename(columns={'Sex (code)': 'sex', 'Sex': 'name'})
    .assign(sex=lambda pairs: pairs['sex'].map(to_concept_id))
)

# Write the sex entities to the parent directory, without the index column.
sex.to_csv('../ddf--entities--sex.csv', index=False)

In [12]:
# concepts: table of every concept id, its display name and its concept type

In [13]:
# Labels of the non-measure concepts: a string property, the time dimension
# and the two entity domains.
discs = ['Name', 'Year', 'Country', 'Sex']

# Indicator columns, selected by position (columns 12 through 27 of `data`).
# NOTE(review): column 11, "Total employment ('000)", falls outside this
# slice and is therefore not exported — confirm that this is intentional.
conc = data.columns[12:28]

In [14]:
# Show the selected indicator column labels.
conc

Index(['Wage & salaried workers (employees) ('000)',
       'Wage & salaried workers (employees) (%)',
       'Total self-employed workers (a + b + c + d) ('000)',
       'Total self-employed workers (a + b + c + d) (%)',
       'Employers (a) ('000)', 'Employers (a) (%)',
       'Own-account workers (b) ('000)', 'Own-account workers (b) (%)',
       'Members of producers' cooperatives (c) ('000)',
       'Members of producers' cooperatives (c) (%)',
       'Contributing family workers (d) ('000)',
       'Contributing family workers (d) (%)', 'Not classified ('000)',
       'Not classified (%)', 'Persons in vulnerable employment (b + d) ('000)',
       'Share of vulnerable employment in total employment (b + d) (%)'],
      dtype='object')

In [15]:
# Concept table: one row per label (non-measure concepts first, then the
# indicator columns). The id is derived from the label with to_concept_id;
# concept_type starts out empty.
concept_labels = list(discs) + list(conc)

cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
cdf['name'] = concept_labels
cdf['concept'] = cdf['name'].map(to_concept_id)

In [16]:
# Assign a concept type to every row by concept id rather than by row
# position, so the types stay correct if the non-measure labels are
# reordered or new ones are inserted before the indicators.
non_measure_types = {
    'name': 'string',
    'year': 'time',
    'country': 'entity_domain',
    'sex': 'entity_domain',
}

# Ids missing from the mapping come back as NaN from .map(); those rows are
# the indicator columns, so they are typed as measures.
cdf['concept_type'] = cdf['concept'].map(non_measure_types).fillna('measure')

In [17]:
# Inspect the concept table after the concept types have been filled in.
cdf

Unnamed: 0,concept,name,concept_type
0,name,Name,string
1,year,Year,time
2,country,Country,entity_domain
3,sex,Sex,entity_domain
4,wage_salaried_workers_employees_000,Wage & salaried workers (employees) ('000),measure
5,wage_salaried_workers_employees,Wage & salaried workers (employees) (%),measure
6,total_self_employed_workers_a_b_c_d_000,Total self-employed workers (a + b + c + d) ('...,measure
7,total_self_employed_workers_a_b_c_d,Total self-employed workers (a + b + c + d) (%),measure
8,employers_a_000,Employers (a) ('000),measure
9,employers_a,Employers (a) (%),measure


In [18]:
# Write the concept table to the parent directory, without the index column.
cdf.to_csv('../ddf--concepts.csv', index=False)

In [None]:
# datapoints: one CSV per indicator, keyed by country, sex and year

In [19]:
# Keep the three key columns plus the indicator columns; the copy keeps later
# edits to `dps` from touching `data`.
dps = data.loc[:, ['Country (code)', 'Sex (code)', 'Year'] + list(conc)].copy()


In [20]:
# Rename the columns: fixed names for the three keys, to_concept_id applied to
# each indicator label.
dps.columns = ['country', 'sex', 'year'] + list(map(to_concept_id, conc))

In [21]:
# Run both entity key columns through to_concept_id, the same transformation
# applied to the ids in the country and sex entity tables.
for key in ('country', 'sex'):
    dps[key] = dps[key].map(to_concept_id)

In [22]:
# Move the three key columns into the index so every remaining column is one indicator.
# Running this cell a second time fails, because the keys are then no longer columns.
dps = dps.set_index(['country', 'sex', 'year'])

In [25]:
# Write one CSV per indicator column: the keys are turned back into ordinary
# columns and rows with a missing value are dropped before saving.
datapoint_path = '../ddf--datapoints--{}--by--country--sex--year.csv'
for indicator, series in dps.items():
    series.reset_index().dropna().to_csv(datapoint_path.format(indicator), index=False)

In [24]:
# Create the index for the dataset in the parent directory (presumably by
# scanning the ddf-- files there; see the ddf_utils documentation).
# NOTE(review): the execution counts show this cell ran as In [24], before the
# datapoint export cell (In [25]). Re-run it after the export so the index
# reflects the final set of files.
create_index_file('../')

Unnamed: 0,key,value,file
0,concept,name,ddf--concepts.csv
1,concept,concept_type,ddf--concepts.csv
0,"country,sex,year",contributing_family_workers_d,ddf--datapoints--contributing_family_workers_d...
0,"country,sex,year",contributing_family_workers_d_000,ddf--datapoints--contributing_family_workers_d...
0,"country,sex,year",employers_a,ddf--datapoints--employers_a--by--country--sex...
0,"country,sex,year",employers_a_000,ddf--datapoints--employers_a_000--by--country-...
0,"country,sex,year",members_of_producers_cooperatives_c,ddf--datapoints--members_of_producers_cooperat...
0,"country,sex,year",members_of_producers_cooperatives_c_000,ddf--datapoints--members_of_producers_cooperat...
0,"country,sex,year",not_classified,ddf--datapoints--not_classified--by--country--...
0,"country,sex,year",not_classified_000,ddf--datapoints--not_classified_000--by--count...
