## Structure WB data to HuruMap

In [2]:
import pandas as pd

# Load the WB data and takwimu indicators 

In [5]:
# Generate a dict from the indicators file.
# NOTE: ``read_csv(squeeze=True)`` was deprecated in pandas 1.4 and removed in
# pandas 2.0. Squeezing the parsed frame along its columns is the documented
# replacement: a single remaining data column still collapses to a Series.
takwimu_indicators = (
    pd.read_csv('../data/takwimu_indicators.csv', index_col=0)
    .squeeze('columns')
    .to_dict()
)
# Gather indicator data on the selected country.
# Exactly one of the country sections below should be left uncommented; it
# sets the source frame, the output path prefix and the country code.

# #  Kenya
# data = pd.read_csv('../data/Kenya/WB_KEN.csv')
# structured = '../huru/ke/ke_'
# country_code = 'KE'

#  Ethiopia
# data = pd.read_csv('../data/Ethiopia/WB_ET.csv')
# structured = '../huru/et/et_'
# country_code = 'ET'

# Nigeria
data = pd.read_csv('../data/Nigeria/WB_NG.csv')
structured = '../huru/ng/ng_'
country_code = 'NG'




### Wrangle by Indicator

In [49]:
# population total
def population_total(frame=None, geo_code=None):
    """Return female and male population totals in long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``sex`` and
        ``total`` columns; ``geo_version`` holds the original value-column
        labels.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SP.POP.TOTL.FE.IN', 'SP.POP.TOTL.MA.IN']
    renames = {"Indicator": 'geo_version',
               "Population, male": 'male',
               "Population, female": 'female'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['female', 'male'],
                     var_name='sex', value_name='total')
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()

# Prevalence of HIV (% ages 15-24)

def hiv_prevalence(frame=None, geo_code=None):
    """Return HIV prevalence (% ages 15-24) by sex in long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``sex`` and
        ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SH.HIV.1524.MA.ZS', 'SH.HIV.1524.FE.ZS']
    renames = {"Indicator": 'geo_version',
               "Prevalence of HIV, male (% ages 15-24)": 'male',
               "Prevalence of HIV, female (% ages 15-24)": 'female'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['female', 'male'],
                     var_name='sex', value_name='total')
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()
    
# Infant and Under 5 Mortality Rate 

def infant_under_5_mortality(frame=None, geo_code=None):
    """Return infant and under-5 mortality rates in long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``mortality``
        (``'infant'`` or ``'under_5'``) and ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SP.DYN.IMRT.IN', 'SH.DYN.MORT']
    renames = {"Indicator": 'geo_version',
               "Mortality rate, infant (per 1,000 live births)": 'infant',
               "Mortality rate, under-5 (per 1,000 live births)": 'under_5'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['infant', 'under_5'],
                     var_name='mortality', value_name='total')
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()

# Adult Literacy rate

def adult_literacy_rate(frame=None, geo_code=None):
    """Return adult literacy rates (ages 15 and above) by sex in long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``sex`` and
        ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SE.ADT.LITR.FE.ZS', 'SE.ADT.LITR.MA.ZS']
    renames = {
        "Indicator": 'geo_version',
        "Literacy rate, adult male (% of males ages 15 and above)": 'male',
        "Literacy rate, adult female (% of females ages 15 and above)": 'female',
    }

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['female', 'male'],
                     var_name='sex', value_name='total')
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()

# Employment in agriculture, (% of employment) (modeled ILO estimate)

def agr_employment(frame=None, geo_code=None):
    """Return employment in agriculture (% of employment) by sex, long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``sex`` and
        ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SL.AGR.EMPL.FE.ZS', 'SL.AGR.EMPL.MA.ZS']
    renames = {
        "Indicator": 'geo_version',
        "Employment in agriculture, female (% of female employment) (modeled ILO estimate)": 'female',
        "Employment in agriculture, male (% of male employment) (modeled ILO estimate)": 'male',
    }

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['female', 'male'],
                     var_name='sex', value_name='total')
    # Returned directly: the original bound the result to a local that
    # shadowed this function's own name.
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()


# School enrollment, primary (% gross)


def primary_ed_enrollment(frame=None, geo_code=None):
    """Return gross primary school enrollment (%) by sex in long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``sex`` and
        ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SE.PRM.ENRR.MA', 'SE.PRM.ENRR.FE']
    renames = {"Indicator": 'geo_version',
               "School enrollment, primary, male (% gross)": 'male',
               "School enrollment, primary, female (% gross)": 'female'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['female', 'male'],
                     var_name='sex', value_name='total')
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()

# School enrollment, secondary (% gross)


def secondary_ed_enrollment(frame=None, geo_code=None):
    """Return gross secondary school enrollment (%) by sex in long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``sex`` and
        ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SE.SEC.ENRR.FE', 'SE.SEC.ENRR.MA']
    renames = {"Indicator": 'geo_version',
               "School enrollment, secondary, male (% gross)": 'male',
               "School enrollment, secondary, female (% gross)": 'female'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['female', 'male'],
                     var_name='sex', value_name='total')
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()

# Mobile cellular subscriptions (per 100 people)


def cellular_subcription(frame=None, geo_code=None):
    """Return mobile cellular subscriptions (per 100 people).

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with a single
        ``total`` column.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['IT.CEL.SETS.P2']
    renames = {"Indicator": 'geo_version',
               "Mobile cellular subscriptions (per 100 people)": 'total'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator label: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    # Returned directly: the original bound the result to a local that
    # shadowed this function's own name.
    return wide.set_index(['geo_code', 'geo_version']).sort_index()

# Life expectancy at birth (years)

def life_expectancy(frame=None, geo_code=None):
    """Return life expectancy at birth (years) by sex in long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with ``sex`` and
        ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SP.DYN.LE00.FE.IN', 'SP.DYN.LE00.MA.IN']
    renames = {"Indicator": 'geo_version',
               "Life expectancy at birth, male (years)": 'male',
               "Life expectancy at birth, female (years)": 'female'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['female', 'male'],
                     var_name='sex', value_name='total')
    # Returned directly: the original bound the result to a local that
    # shadowed this function's own name.
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()

#  Women in national parliaments (%)

def women_in_parliament(frame=None, geo_code=None):
    """Return the share of national parliament seats held by women (%).

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with a single
        ``total`` column.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SG.GEN.PARL.ZS']
    renames = {
        "Indicator": 'geo_version',
        "Proportion of seats held by women in national parliaments (%)": 'total',
    }

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator label: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    # Returned directly: the original bound the result to a local that
    # shadowed this function's own name.
    return wide.set_index(['geo_code', 'geo_version']).sort_index()

#  Maternal mortality ratio (modeled estimate, per 100,000 live births)

def maternal_mortality(frame=None, geo_code=None):
    """Return the maternal mortality ratio (per 100,000 live births).

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with a single
        ``total`` column.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SH.STA.MMRT']
    renames = {
        "Indicator": 'geo_version',
        "Maternal mortality ratio (modeled estimate, per 100,000 live births)": 'total',
    }

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator label: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    # Returned directly: the original bound the result to a local that
    # shadowed this function's own name.
    return wide.set_index(['geo_code', 'geo_version']).sort_index()

#  Medical Staff - Physicians, Nurses and Midwives (per 1,000 people)

def physicians_nurses(frame=None, geo_code=None):
    """Return physician and nurse/midwife density (per 1,000 people), long format.

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with
        ``medical_staff`` (``'nurses_midwives'`` or ``'physicians'``) and
        ``total`` columns.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SH.MED.PHYS.ZS', 'SH.MED.NUMW.P3']
    renames = {"Indicator": 'geo_version',
               "Physicians (per 1,000 people)": 'physicians',
               "Nurses and midwives (per 1,000 people)": 'nurses_midwives'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator labels: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    tidy = wide.melt(id_vars=['geo_code', 'geo_version'],
                     value_vars=['nurses_midwives', 'physicians'],
                     var_name='medical_staff', value_name='total')
    # Returned directly: the original bound the result to a local that
    # shadowed this function's own name.
    return tidy.set_index(['geo_code', 'geo_version']).sort_index()

# Incidence of Malaria 

def malaria_incidence(frame=None, geo_code=None):
    """Return the incidence of malaria (per 1,000 population at risk).

    Args:
        frame: Table with an ``Indicator`` label column, an ``I_code``
            column and further value columns (presumably one per year).
            Defaults to the notebook-level ``data`` frame.
        geo_code: Value stored in the ``geo_code`` index level. Defaults to
            the notebook-level ``country_code``.

    Returns:
        DataFrame indexed by ``(geo_code, geo_version)`` with a single
        ``total`` column.
    """
    # Fall back to the notebook globals so zero-argument calls keep working.
    if frame is None:
        frame = data
    if geo_code is None:
        geo_code = country_code

    i_code = ['SH.MLR.INCD.P3']
    renames = {"Indicator": 'geo_version',
               "Incidence of malaria (per 1,000 population at risk)": 'total'}

    selected = frame.loc[frame['I_code'].isin(i_code)].drop(columns=['I_code'])
    # After transposing, row 0 carries the indicator label: promote it to
    # the header, then drop it from the data rows.
    wide = selected.T.reset_index()
    wide.columns = wide.iloc[0]
    wide = wide.iloc[1:].rename(columns=renames)
    wide['geo_code'] = geo_code

    # Returned directly: the original bound the result to a local that
    # shadowed this function's own name.
    return wide.set_index(['geo_code', 'geo_version']).sort_index()


In [50]:
# Save to location

def save_to_location():
    """Build every indicator table and write each one to its own CSV file.

    Each target path is the module-level ``structured`` prefix followed by
    the indicator-specific file name listed below. Tables are built and
    written one at a time, in the listed order.
    """
    exports = (
        (population_total, 'population.csv'),
        (hiv_prevalence, 'hiv_prevalence.csv'),
        (infant_under_5_mortality, 'infant_under_5_mortality.csv'),
        (adult_literacy_rate, 'adult_literacy_rate.csv'),
        (agr_employment, 'agr_employment.csv'),
        (primary_ed_enrollment, 'primary_enrollment.csv'),
        (secondary_ed_enrollment, 'secondary_enrollment.csv'),
        (cellular_subcription, 'cellular subscriptions_per_100.csv'),
        (life_expectancy, 'life_expectancy.csv'),
        (women_in_parliament, 'women_in_parliament.csv'),
        (maternal_mortality, 'maternal_mortality.csv'),
        (physicians_nurses, 'physicians_nurses.csv'),
        (malaria_incidence, 'malaria_incidence.csv'),
    )

    for build_table, file_name in exports:
        build_table().to_csv(structured + file_name)

    return

In [51]:
# Run the export: writes one CSV per indicator under the `structured` prefix.
save_to_location()