## Loading libraries

In [1]:
import requests
from requests.auth import HTTPBasicAuth
from lxml import html
from cssselect import GenericTranslator
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import json
import os
# Notebook-wide pandas display settings: show up to 1000 rows and never
# truncate the text inside a column.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# if not os.path.exists('data'):
#     print(os.path.abspath('.'))

/Users/rubencito/CAS_datascience/ADS_CAS_Bern_2020/Projects/M3


## 1. Collecting the **Data Catalog** from the UNDP (Human Development Report Office Statistical Data API)

In [4]:
# Saved copy of the UNDP data-catalog page. The path is relative to the
# notebook's working directory (the project folder) so that the notebook does
# not depend on one user's home directory.
my_data_catalog_path = os.path.join('UNDP_data_catalog', 'HDRO.html')

In [5]:
# Indicator catalog: the first table of the saved catalog page.
# The first two rows are skipped and the index is renumbered from 0.
indi_col_names = ['ID', 'Indicator_Name']
indi_UNDP_df = (
    pd.read_html(my_data_catalog_path)[0]
    .iloc[2:]
    .reset_index(drop=True)
)
indi_UNDP_df.columns = indi_col_names  # replace the parsed headers with readable names
indi_UNDP_df.head(6)

Unnamed: 0,ID,Indicator_Name
0,164406,Adjusted net savings (% of GNI)
1,36806,"Adolescent birth rate (births per 1,000 women ages 15-19)"
2,185106,"Age-standardized mortality rate attributed to noncommunicable diseases, female"
3,185206,"Age-standardized mortality rate attributed to noncommunicable diseases, male"
4,175206,"Antenatal care coverage, at least one visit (%)"
5,186806,Average annual change in the share of bottom 40 percent (%)


In [6]:
# Indicators whose name contains "multi" (case-insensitive).
name_has_multi = indi_UNDP_df['Indicator_Name'].str.contains("multi", case = False)
indi_UNDP_df.loc[name_has_multi]

Unnamed: 0,ID,Indicator_Name
15,117806,Contribution of deprivation in education to the Multidimensional Poverty Index
16,117906,Contribution of deprivation in health to the Multidimensional Poverty Index
17,118006,Contribution of deprivation in standard of living to the Multidimensional Poverty Index
99,38406,Multidimensional poverty index (MPI)
112,38606,"Population in multidimensional poverty, headcount (%)"
113,102006,"Population in multidimensional poverty, headcount (thousands) (for the year of the survey)"
114,183406,"Population in multidimensional poverty, headcount (thousands) (projection for 2018)"
115,38506,"Population in multidimensional poverty, intensity of deprivation (%)"
116,101006,Population in severe multidimensional poverty (%)
122,142506,Population vulnerable to multidimensional poverty (%)


In [None]:
# Country catalog: the second table of the saved catalog page.
# The first two rows are skipped and the index is renumbered from 0.
country_col_names = ['Country_Code', 'Country_Name']
country_UNDP_df = (
    pd.read_html(my_data_catalog_path)[1]
    .iloc[2:]
    .reset_index(drop=True)
)
country_UNDP_df.columns = country_col_names  # replace the parsed headers with readable names
country_UNDP_df.head()

## 2. Define a function that calls the API for a specific indicator and returns a DataFrame

In [312]:
# Define the default values

# Base URL of the API endpoint; query parameters are appended to it.
url_seed = "http://ec2-54-174-131-205.compute-1.amazonaws.com/API/HDRO_API.php/"
# Comma-separated list of the years 2010 through 2019.
years = ','.join(str(year) for year in range(2010, 2020))
indicator_id = "195606"
# Values accepted for the "structure" query parameter.
struct = ["ciy", "yic", "yci", "iyc", "icy"]

In [341]:
# Show the comma-separated year string that is placed in the query URL.
years

'2010,2011,2012,2013,2014,2015,2016,2017,2018,2019'

In [361]:
def API_UNDP_call_to_df(indicator_id, url_seed = url_seed, years = years):
    """Query the API for one indicator and return the values in long format.

    Parameters
    ----------
    indicator_id : str or int
        Indicator identifier inserted into the query URL.
    url_seed : str
        Base URL of the API; the query parameters are appended directly to it.
    years : str or iterable
        Comma-separated years (e.g. "2010,2011"), or a non-string iterable of
        years, which is joined with commas.

    Returns
    -------
    pandas.DataFrame
        One row per (country, year) pair, with the columns, in order:
        ``country_name``, ``country_code``, ``year`` and a value column named
        after the indicator.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36"
    }

    # Accept an iterable of years as well as the pre-joined string.
    if not isinstance(years, str):
        years = ','.join(map(str, years))

    # The parsing below relies on the indicator -> year -> country nesting,
    # so the structure is fixed here instead of being read from a global list.
    structure = "iyc"
    my_url = f"{url_seed}indicator_id={indicator_id}/year={years}/structure={structure}"

    # The session is closed when the block exits; the timeout prevents the cell
    # from hanging forever, and HTTP errors surface here rather than as an
    # obscure JSON/parsing failure further down.
    with requests.Session() as session:
        response = session.get(my_url, headers=headers, timeout=30)
        response.raise_for_status()
        json_objet = response.json()

    # NOTE(review): the top-level sections are accessed by position, as before:
    # [0] indicator values, [1] country code -> name, [2] indicator id -> name.
    # Confirm the key names against the API and switch to key access.
    sections = list(json_objet.values())
    indicator_values, country_names, indicator_names = sections[0], sections[1], sections[2]
    indicator_label = list(indicator_names.values())[0]

    # First (only requested) indicator: outer keys become columns, inner keys the index.
    df = pd.DataFrame(list(indicator_values.values())[0])
    # stack() turns the wide table into narrow format; reset_index() exposes
    # the two index levels as ordinary columns.
    df = df.stack().to_frame().reset_index()
    df.columns = ['country_code', 'year', indicator_label]

    # Look-up table used to attach the country name to every country code.
    country_map = pd.DataFrame({
        'country_code': list(country_names.keys()),
        'country_name': list(country_names.values()),
    })
    df = df.merge(country_map, how="left", on='country_code')

    # Put the country name first, followed by code, year and value.
    return df[['country_name', 'country_code', 'year', indicator_label]]

## 3. Extracting the HDI indicator

In [360]:
# Indicator 137506 is returned with the value column
# "Human Development Index (HDI)"; the default year list is used.
HDI_df = API_UNDP_call_to_df(indicator_id = "137506")
HDI_df

http://ec2-54-174-131-205.compute-1.amazonaws.com/API/HDRO_API.php/indicator_id=137506/year=2010,2011,2012,2013,2014,2015,2016,2017,2018,2019/structure=iyc


Unnamed: 0,country_name,country_code,year,Human Development Index (HDI)
0,Afghanistan,AFG,2010,0.472
1,Afghanistan,AFG,2011,0.477
2,Afghanistan,AFG,2012,0.489
3,Afghanistan,AFG,2013,0.496
4,Afghanistan,AFG,2014,0.500
...,...,...,...,...
1878,Zimbabwe,ZWE,2018,0.569
1879,Zimbabwe,ZWE,2019,0.571
1880,Marshall Islands,MHL,2017,0.699
1881,Marshall Islands,MHL,2018,0.702


In [364]:
# Number of distinct countries present in this dataset
HDI_df['country_name'].unique().size

189

In [365]:
# Distinct country names, in order of first appearance.
HDI_df['country_name'].unique()

array(['Afghanistan', 'Angola', 'Albania', 'Andorra',
       'United Arab Emirates', 'Argentina', 'Armenia',
       'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan',
       'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh',
       'Bulgaria', 'Bahrain', 'Bahamas', 'Bosnia and Herzegovina',
       'Belarus', 'Belize', 'Bolivia (Plurinational State of)', 'Brazil',
       'Barbados', 'Brunei Darussalam', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada', 'Switzerland', 'Chile',
       'China', "Cote d'Ivoire", 'Cameroon',
       'Congo (Democratic Republic of the)', 'Congo', 'Colombia',
       'Comoros', 'Cabo Verde', 'Costa Rica', 'Cuba', 'Cyprus', 'Czechia',
       'Germany', 'Djibouti', 'Dominica', 'Denmark', 'Dominican Republic',
       'Algeria', 'Ecuador', 'Egypt', 'Eritrea', 'Spain', 'Estonia',
       'Ethiopia', 'Finland', 'Fiji', 'France',
       'Micronesia (Federated States of)', 'Gabon', 'United Kingdom',
       'Georgia', 'Ghana', 'Guinea'