In [1]:
import pandas as pd
import numpy as np
import os

from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file

In [2]:
# Relative path of the Excel workbook that this notebook reads with pd.read_excel.
source = 'source/kilm02.xlsx'

In [3]:
# skiprows=2 drops the first two rows of the sheet, so the header is taken
# from the third row of the workbook.
data = pd.read_excel(source, skiprows=2)

In [4]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,Country (code),Country,Region,Sub-region (broad),Sub-region (detailed),Income group (code),Income group,Year,Sex (code),Sex,Age group (code),Age group,Employment ('000),Population ('000),Employment- to-population ratio,Repository (code),Repository
0,AFG,Afghanistan,Asia and the Pacific,Southern Asia,,I,Low income,1991,MF,Male and female,Total,15+,3047.871094,6646.708008,45.85535,TRENDS,Trends Econometric Models
1,AFG,Afghanistan,Asia and the Pacific,Southern Asia,,I,Low income,1991,MF,Male and female,Youth,15-24,861.978271,2439.979004,35.327282,TRENDS,Trends Econometric Models
2,AFG,Afghanistan,Asia and the Pacific,Southern Asia,,I,Low income,1991,MF,Male and female,Adult,25+,2185.893066,4206.729004,51.961823,TRENDS,Trends Econometric Models
3,AFG,Afghanistan,Asia and the Pacific,Southern Asia,,I,Low income,1991,M,Male,Total,15+,2590.974609,3398.583984,76.236885,TRENDS,Trends Econometric Models
4,AFG,Afghanistan,Asia and the Pacific,Southern Asia,,I,Low income,1991,M,Male,Youth,15-24,712.678101,1268.593994,56.178581,TRENDS,Trends Econometric Models


In [6]:
# country

In [7]:
# Unique (code, label) pairs form the country entity table.
country_columns = ['Country (code)', 'Country']
country = data[country_columns].drop_duplicates().copy()

In [8]:
# Rename positionally: first column is the entity id, second its display name.
country.columns = ['country', 'name']

In [9]:
# Turn each source country code into its concept-id form, element by element.
country['country'] = country['country'].apply(to_concept_id)

In [12]:
# Write the country entity table one directory up; index=False keeps the
# DataFrame's row index out of the file.
country.to_csv('../ddf--entities--country.csv', index=False)

In [13]:
# age group

In [18]:
# Unique (code, label) pairs form the age_group entity table.
age = (
    data[['Age group (code)', 'Age group']]
    .drop_duplicates()
    .rename(columns={'Age group (code)': 'age_group', 'Age group': 'name'})
)
# Prefix the label so that e.g. "15+" is written as "Age 15+".
age['name'] = 'Age ' + age['name']
# Entity ids are the concept-id form of the source codes.
age['age_group'] = age['age_group'].map(to_concept_id)
age.to_csv('../ddf--entities--age_group.csv', index=False)

In [19]:
# Display the age_group entity table for a visual check.
age

Unnamed: 0,age_group,name
0,total,Age 15+
1,youth,Age 15-24
2,adult,Age 25+


In [20]:
# sex

In [21]:
# Unique (code, label) pairs form the sex entity table.
sex = (
    data[['Sex (code)', 'Sex']]
    .drop_duplicates()
    .rename(columns={'Sex (code)': 'sex', 'Sex': 'name'})
)
# Entity ids are the concept-id form of the source codes.
sex['sex'] = sex['sex'].map(to_concept_id)
sex.to_csv('../ddf--entities--sex.csv', index=False)

In [22]:
# Display the sex entity table for a visual check.
sex

Unnamed: 0,sex,name
0,mf,Male and female
3,m,Male
6,f,Female


In [23]:
# List the raw column labels of the loaded sheet.
data.columns

Index(['Country (code)', 'Country', 'Region', 'Sub-region (broad)',
       'Sub-region (detailed)', 'Income group (code)', 'Income group', 'Year',
       'Sex (code)', 'Sex', 'Age group (code)', 'Age group',
       'Employment ('000)', 'Population ('000)',
       'Employment- to-population ratio', 'Repository (code)', 'Repository'],
      dtype='object')

In [24]:
# Non-measure concepts, in the order the concept table is built from.
discs = [
    'Name',
    'Year',
    'Country',
    'Sex',
    'Age Group',
]

# Measure columns, spelled exactly as they appear in the source sheet.
conc = [
    "Employment ('000)",
    "Population ('000)",
    "Employment- to-population ratio",
]

In [25]:
# Concept table: one row per concept, non-measure concepts first, then measures.
# 'concept' and 'concept_type' start out empty because only 'name' is supplied.
concept_names = discs + conc
cdf = pd.DataFrame({'name': concept_names}, columns=['concept', 'name', 'concept_type'])

# The concept id is derived from the human-readable name.
cdf['concept'] = cdf['name'].map(to_concept_id)

In [26]:
# Concept types are assigned by row label, which follows the order in which
# the concept table was filled (non-measure names first, then the measures).
# .loc slices are label-based and include both end points.
row_types = [
    (0, 'string'),
    (1, 'time'),
    (slice(2, 4), 'entity_domain'),
    (slice(5, None), 'measure'),
]
for rows, concept_type in row_types:
    cdf.loc[rows, 'concept_type'] = concept_type

In [28]:
# Write the concept table one directory up, without the DataFrame's row index.
cdf.to_csv('../ddf--concepts.csv', index=False)

In [29]:
# datapoints

In [37]:
# Keep only the key columns and the measure columns; copy so that later
# edits do not touch the loaded sheet.
key_columns = ['Country (code)', 'Sex (code)', 'Age group (code)', 'Year']
dps = data[key_columns + conc].copy()

In [38]:
# Preview the selected key and measure columns.
dps.head()

Unnamed: 0,Country (code),Sex (code),Age group (code),Year,Employment ('000),Population ('000),Employment- to-population ratio
0,AFG,MF,Total,1991,3047.871094,6646.708008,45.85535
1,AFG,MF,Youth,1991,861.978271,2439.979004,35.327282
2,AFG,MF,Adult,1991,2185.893066,4206.729004,51.961823
3,AFG,M,Total,1991,2590.974609,3398.583984,76.236885
4,AFG,M,Youth,1991,712.678101,1268.593994,56.178581


In [39]:
# Rename positionally: four key columns, then the measures under their concept ids.
key_names = ['country', 'sex', 'age_group', 'year']
dps.columns = key_names + list(map(to_concept_id, conc))


In [40]:
# Convert the key columns to concept ids with the same to_concept_id mapping
# that is applied to the entity tables.
dps['country'] = dps['country'].map(to_concept_id)
dps['sex'] = dps['sex'].map(to_concept_id)

# '+' is a regex quantifier and the default of `regex` in Series.str.replace
# has differed between pandas versions. regex=False makes this an explicit
# literal substring replacement and avoids the FutureWarning that some
# versions emit for single-character patterns.
dps['age_group'] = (
    dps['age_group']
    .str.replace('+', '_plus', regex=False)
    .map(to_concept_id)
)

In [41]:
# Move the four key columns into the index so that only measure columns remain
# as columns. NOTE: this cell is not idempotent; once it has run, the key
# columns no longer exist as columns, so running it again raises a KeyError.
dps = dps.set_index(['country', 'sex', 'age_group', 'year'])


In [42]:
# One CSV per indicator: every remaining column of dps is a measure, and the
# (country, sex, age_group, year) key lives in the index.
path_template = '../ddf--datapoints--{}--by--country--sex--age_group--year.csv'
for indicator in dps.columns:
    # reset_index() turns the key levels back into ordinary columns.
    dps[indicator].reset_index().to_csv(path_template.format(indicator), index=False)

In [43]:
# Build the index for the dataset rooted one directory up. The table shown
# as this cell's result has one row per (key, value, file) combination.
# NOTE(review): presumably this also writes an index file into that directory;
# confirm against the ddf_utils documentation.
create_index_file('../')

Unnamed: 0,key,value,file
0,concept,name,ddf--concepts.csv
1,concept,concept_type,ddf--concepts.csv
0,"country,sex,age_group,year",employment_000,ddf--datapoints--employment_000--by--country--...
0,"country,sex,age_group,year",employment_to_population_ratio,ddf--datapoints--employment_to_population_rati...
0,"country,sex,age_group,year",population_000,ddf--datapoints--population_000--by--country--...
0,age_group,name,ddf--entities--age_group.csv
0,country,name,ddf--entities--country.csv
0,sex,name,ddf--entities--sex.csv
