## Loading libraries

In [1]:
import requests
from requests.auth import HTTPBasicAuth
from lxml import html
from cssselect import GenericTranslator
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import json
import os
# Notebook display settings: show up to 1000 rows and never truncate cell text.
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

## 1. Collecting the **Data Catalog** from the UNDP (Human Development Report Office Statistical Data API)

In [2]:
# Locate the saved HDRO catalog page relative to the notebook's working directory
# (the same base the export cells use) instead of a user-specific absolute path.
my_data_catalog_path = os.path.join(os.path.abspath('.'), 'UNDP_data_catalog', 'HDRO.html')

In [3]:
# Indicator catalog: first table of the saved HDRO page, without its two leading rows.
indi_col_names = ['ID', 'Indicator_Name']
indi_UNDP_df = (
    pd.read_html(my_data_catalog_path)[0]
    .iloc[2:]                  # drop the two leading rows
    .reset_index(drop=True)    # renumber rows from 0
)
indi_UNDP_df.columns = indi_col_names  # apply the readable column names
indi_UNDP_df.head(6)

Unnamed: 0,ID,Indicator_Name
0,164406,Adjusted net savings (% of GNI)
1,36806,"Adolescent birth rate (births per 1,000 women ages 15-19)"
2,185106,"Age-standardized mortality rate attributed to noncommunicable diseases, female"
3,185206,"Age-standardized mortality rate attributed to noncommunicable diseases, male"
4,175206,"Antenatal care coverage, at least one visit (%)"
5,186806,Average annual change in the share of bottom 40 percent (%)


In [19]:
# Case-insensitive search for indicators whose name contains "shar"
indi_UNDP_df.loc[indi_UNDP_df['Indicator_Name'].str.contains("shar", case=False)]

Unnamed: 0,ID,Indicator_Name
5,186806,Average annual change in the share of bottom 40 percent (%)
31,175006,Female share of employment in senior and middle management (%)
59,186906,Income share held by poorest 40%
60,186106,Income share held by richest 1%
61,187006,Income share held by richest 10 %
79,183906,"Labour share of GDP, comprising wages and social protection transfers (%)"
145,175106,"Share of employment in nonagriculture, female (% of total employment in nonagriculture)"
146,183506,"Share of graduates from science, technology, engineering and mathematics programmes in tertiary education who are female (%)"
147,183706,"Share of graduates from science, technology, engineering and mathematics programmes in tertiary education who are male (%)"
148,175906,"Share of graduates in science, technology, engineering and mathematics programmes at tertiary level, female (%)"


In [5]:
# Country catalog: second table of the saved HDRO page, without its two leading rows.
country_col_names = ['Country_Code', 'Country_Name']
country_UNDP_df = (
    pd.read_html(my_data_catalog_path)[1]
    .iloc[2:]                  # drop the two leading rows
    .reset_index(drop=True)    # renumber rows from 0
)
country_UNDP_df.columns = country_col_names  # apply the readable column names
country_UNDP_df.head()

Unnamed: 0,Country_Code,Country_Name
0,AFG,Afghanistan
1,ALB,Albania
2,DZA,Algeria
3,AND,Andorra
4,AGO,Angola


## 2. Define a function that calls the API for a specific indicator and returns a DataFrame

In [6]:
# Default values used by the API request function

url_seed = "http://ec2-54-174-131-205.compute-1.amazonaws.com/API/HDRO_API.php/"
# Years 2010-2019 joined into one comma-separated string for the URL
years = ','.join(str(year) for year in np.arange(2010, 2020))
indicator_id = "195606"
# Candidate values for the URL's "structure=" segment
struct = ["ciy", "yic", "yci", "iyc", "icy"]

In [7]:
# Show the comma-separated year string that is sent to the API
years

'2010,2011,2012,2013,2014,2015,2016,2017,2018,2019'

In [24]:
def API_UNDP_call_to_df(indicator_id, url_seed = url_seed, years = years):
    """Query the UNDP HDRO API for one indicator and return it in long format.

    Parameters
    ----------
    indicator_id : str or int
        Indicator ID from the data catalog, e.g. "137506".
    url_seed : str
        Base URL of the API endpoint (defaults to the module-level ``url_seed``).
    years : str
        Comma-separated years to request (defaults to the module-level ``years``).

    Returns
    -------
    pandas.DataFrame
        One row per (country, year) pair with the columns ``country_name``,
        ``country_code``, ``year`` and one value column named after the
        indicator name found in the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    requests.Timeout
        If the API does not answer within the timeout.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36"
    }

    # Build the parameterized query. The f-string also accepts an integer indicator_id.
    # The parsing below relies on the layout requested via struct[3].
    my_url = f"{url_seed}indicator_id={indicator_id}/year={years}/structure={struct[3]}"
    print(my_url)

    # Send the request through the session and close it afterwards; fail loudly
    # on HTTP errors instead of trying to parse an error page as JSON.
    with requests.Session() as session:
        response = session.get(my_url, headers=headers, timeout=60)
        response.raise_for_status()
        json_objet = response.json()

    # NOTE(review): the positional indexing assumes the response's top-level entries
    # are, in order: the nested values, a country-code -> country-name mapping and an
    # indicator-id -> indicator-name mapping — confirm against the API documentation.
    sections = list(json_objet.values())
    nested_values = list(sections[0].values())[0]  # third nesting level of the values
    country_names = sections[1]
    indicator_name = list(sections[2].values())[0]

    # stack() turns the wide table into narrow format, one row per (country, year)
    df = pd.DataFrame(nested_values).stack().to_frame().reset_index()
    df.columns = ['country_code', 'year', indicator_name]

    # Attach the readable country name to every country code
    country_map = pd.DataFrame({
        'country_code': list(country_names.keys()),
        'country_name': list(country_names.values()),
    })
    df = df.merge(country_map, how="left", on='country_code')

    # Put the country name first, followed by code, year and the indicator value
    return df[['country_name', 'country_code', 'year', indicator_name]]

## 3. Extracting the HDI indicator

In [9]:
# Fetch the Human Development Index (indicator 137506) and display the resulting frame
HDI_df = API_UNDP_call_to_df(indicator_id = "137506")
HDI_df

Unnamed: 0,country_name,country_code,year,Human Development Index (HDI)
0,Afghanistan,AFG,2010,0.472
1,Afghanistan,AFG,2011,0.477
2,Afghanistan,AFG,2012,0.489
3,Afghanistan,AFG,2013,0.496
4,Afghanistan,AFG,2014,0.500
...,...,...,...,...
1878,Zimbabwe,ZWE,2018,0.569
1879,Zimbabwe,ZWE,2019,0.571
1880,Marshall Islands,MHL,2017,0.699
1881,Marshall Islands,MHL,2018,0.702


In [10]:
# Number of distinct country names in the HDI dataset
# (dropna=False keeps a missing name counted as one value, like len(unique()))

HDI_df['country_name'].nunique(dropna=False)

189

In [None]:
# List the distinct country names present in the HDI dataset
HDI_df['country_name'].unique()

In [12]:
# Print the current working directory when it contains no 'data' entry.
# NOTE(review): the export cells write to 'curated_datasets', not 'data' —
# confirm which folder this check is meant for.
if not os.path.exists('data'):
    print(os.path.abspath('.'))

/Users/rubencito/CAS_datascience/ADS_CAS_Bern_2020/Projects/M3


### Export the semicurated dataset to csv format

In [13]:
# Create the export folder if it is missing (to_csv does not create directories),
# then write the HDI frame without the index column.
curated_dir = os.path.join(os.path.abspath('.'), 'curated_datasets')
os.makedirs(curated_dir, exist_ok=True)
HDI_df.to_csv(os.path.join(curated_dir, 'HDI_df.csv'), index=False)

## 4. Extracting the Gender Development Index (GDI) indicator

In [15]:
# Fetch the Gender Development Index (indicator 137906) and display the resulting frame
GDI_df = API_UNDP_call_to_df(indicator_id = "137906")
GDI_df

Unnamed: 0,country_name,country_code,year,Gender Development Index (GDI)
0,Afghanistan,AFG,2010,0.595
1,Afghanistan,AFG,2011,0.609
2,Afghanistan,AFG,2012,0.618
3,Afghanistan,AFG,2013,0.627
4,Afghanistan,AFG,2014,0.634
...,...,...,...,...
1644,Angola,AGO,2019,0.903
1645,Myanmar,MMR,2017,0.956
1646,Myanmar,MMR,2018,0.953
1647,Myanmar,MMR,2019,0.954


### Export the semicurated dataset to csv format

In [16]:
# Create the export folder if it is missing (to_csv does not create directories),
# then write the GDI frame without the index column.
curated_dir = os.path.join(os.path.abspath('.'), 'curated_datasets')
os.makedirs(curated_dir, exist_ok=True)
GDI_df.to_csv(os.path.join(curated_dir, 'GDI_df.csv'), index=False)

## 5. Extracting the Gender Inequality Index (GII) indicator

In [17]:
# Fetch the Gender Inequality Index (indicator 68606) and display the resulting frame
GII_df = API_UNDP_call_to_df(indicator_id = "68606")
GII_df

Unnamed: 0,country_name,country_code,year,Gender Inequality Index (GII)
0,Afghanistan,AFG,2010,0.751
1,Afghanistan,AFG,2011,0.743
2,Afghanistan,AFG,2012,0.734
3,Afghanistan,AFG,2013,0.724
4,Afghanistan,AFG,2014,0.714
...,...,...,...,...
1554,Chad,TCD,2018,0.709
1555,Brunei Darussalam,BRN,2016,0.299
1556,Brunei Darussalam,BRN,2017,0.269
1557,Brunei Darussalam,BRN,2019,0.255


### Export the semicurated dataset to csv format

In [18]:
# Create the export folder if it is missing (to_csv does not create directories),
# then write the GII frame without the index column.
curated_dir = os.path.join(os.path.abspath('.'), 'curated_datasets')
os.makedirs(curated_dir, exist_ok=True)
GII_df.to_csv(os.path.join(curated_dir, 'GII_df.csv'), index=False)

In [19]:
# Case-insensitive search of the indicator catalog for names containing "shar"
indi_UNDP_df.loc[indi_UNDP_df['Indicator_Name'].str.contains("shar", case=False)]

Unnamed: 0,ID,Indicator_Name
5,186806,Average annual change in the share of bottom 40 percent (%)
31,175006,Female share of employment in senior and middle management (%)
59,186906,Income share held by poorest 40%
60,186106,Income share held by richest 1%
61,187006,Income share held by richest 10 %
79,183906,"Labour share of GDP, comprising wages and social protection transfers (%)"
145,175106,"Share of employment in nonagriculture, female (% of total employment in nonagriculture)"
146,183506,"Share of graduates from science, technology, engineering and mathematics programmes in tertiary education who are female (%)"
147,183706,"Share of graduates from science, technology, engineering and mathematics programmes in tertiary education who are male (%)"
148,175906,"Share of graduates in science, technology, engineering and mathematics programmes at tertiary level, female (%)"


## 6. Extracting the Share of seats in parliament (% held by women) "Share_parl_seat_Fem" indicator

In [26]:
# Fetch the share of parliament seats held by women (indicator 31706) and display the resulting frame
Share_parl_seat_Fem_df = API_UNDP_call_to_df(indicator_id = "31706")
Share_parl_seat_Fem_df

http://ec2-54-174-131-205.compute-1.amazonaws.com/API/HDRO_API.php/indicator_id=31706/year=2010,2011,2012,2013,2014,2015,2016,2017,2018,2019/structure=iyc


Unnamed: 0,country_name,country_code,year,Share of seats in parliament (% held by women)
0,Afghanistan,AFG,2010,27.635
1,Afghanistan,AFG,2011,27.635
2,Afghanistan,AFG,2012,27.635
3,Afghanistan,AFG,2013,27.635
4,Afghanistan,AFG,2014,27.635
...,...,...,...,...
1887,Guinea,GIN,2018,21.930
1888,Brunei Darussalam,BRN,2016,6.452
1889,Brunei Darussalam,BRN,2017,9.091
1890,Brunei Darussalam,BRN,2019,9.091


### Export the semicurated dataset to csv format

In [27]:
# Create the export folder if it is missing (to_csv does not create directories),
# then write the parliament-seat-share frame without the index column.
curated_dir = os.path.join(os.path.abspath('.'), 'curated_datasets')
os.makedirs(curated_dir, exist_ok=True)
Share_parl_seat_Fem_df.to_csv(os.path.join(curated_dir, 'Share_parl_seat_Fem_df.csv'), index=False)

Unnamed: 0,country_name,country_code,year,Gender Inequality Index (GII),Share of seats in parliament (% held by women)
0,Afghanistan,AFG,2010,0.751,27.635
1,Afghanistan,AFG,2011,0.743,27.635
2,Afghanistan,AFG,2012,0.734,27.635
3,Afghanistan,AFG,2013,0.724,27.635
4,Afghanistan,AFG,2014,0.714,27.635
...,...,...,...,...,...
1887,Guinea,GIN,2015,,21.930
1888,Guinea,GIN,2016,,21.930
1889,Guinea,GIN,2017,,21.930
1890,Guinea,GIN,2019,,22.807
