## Loading libraries

In [25]:
import requests
from lxml import html
import pandas as pd
import numpy as np

from time import sleep
import os

# Notebook-wide display settings: list up to 1000 rows and never truncate cell text.
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

## 1. Collecting the **Data Catalog** from the UNDP (Human Development Report Office Statistical Data API)

In [4]:
# Working directory of the running kernel; the catalog and output paths below are built from it.
path = os.path.abspath('.')
path

'/Users/rubencito/CAS_datascience/ADS_CAS_Bern_2020/Projects/M3'

In [11]:
# Location of the HDRO.html catalog file inside the project folder.
my_data_catalog_path = "/".join((path, "data_catalogs", "UNDP_data_catalog", "HDRO.html"))
my_data_catalog_path

'/Users/rubencito/CAS_datascience/ADS_CAS_Bern_2020/Projects/M3/data_catalogs/UNDP_data_catalog/HDRO.html'

In [14]:
# Load the indicator catalog: first table found in the HDRO file, skipping its first two rows.
indi_col_names = ['ID', 'Indicator_Name']
indi_UNDP_df = (
    pd.read_html(my_data_catalog_path)[0]
    .iloc[2:]
    .reset_index(drop = True)  # restart the index at 0 after dropping the leading rows
)
indi_UNDP_df.columns = indi_col_names  # replace the parsed headers with readable names
indi_UNDP_df.head(6)

Unnamed: 0,ID,Indicator_Name
0,164406,Adjusted net savings (% of GNI)
1,36806,"Adolescent birth rate (births per 1,000 women ages 15-19)"
2,185106,"Age-standardized mortality rate attributed to noncommunicable diseases, female"
3,185206,"Age-standardized mortality rate attributed to noncommunicable diseases, male"
4,175206,"Antenatal care coverage, at least one visit (%)"
5,186806,Average annual change in the share of bottom 40 percent (%)


### Loading the country codes and saving them for later mapping of countries by code

In [55]:
# Don't run
# Country table: second table found in the HDRO file, skipping its first two rows.
country_col_names = ['Country_Code', 'Country_Name']
country_UNDP_df = pd.read_html(my_data_catalog_path)[1].iloc[2:].reset_index(drop = True)
country_UNDP_df.columns = country_col_names  # replace the parsed headers with readable names

# Save the table next to the notebook (current working directory) for later look-ups.
country_csv_path = f"{os.path.abspath('.')}/Country_code.csv"
country_UNDP_df.to_csv(country_csv_path, index=False)

country_UNDP_df.shape

(195, 2)

## 2. Define a function that calls the API for a specific indicator and returns a df

In [15]:
# Default values picked up by the API helper defined below.

url_seed = "http://ec2-54-174-131-205.compute-1.amazonaws.com/API/HDRO_API.php/"
years = ','.join(str(year) for year in range(2010, 2020))  # comma-separated string, 2010 through 2019
indicator_id = "195606"
struct = ["ciy", "yic", "yci", "iyc", "icy"]  # candidate values for the `structure=` URL segment

In [12]:
# Show the comma-separated year string that is inserted into the request URL.
years

'2010,2011,2012,2013,2014,2015,2016,2017,2018,2019'

In [56]:
def _undp_payload_to_df(json_objet):
    """Reshape a decoded HDRO payload into a long DataFrame.

    The payload is read positionally, exactly as the notebook has always done:
    the first top-level entry holds the values, the second maps country codes
    to country names, and the third maps indicator ids to indicator names.
    NOTE(review): this layout is what the API returned for ``structure=iyc``
    when the notebook was written - confirm before using another structure.

    Returns a DataFrame with the columns
    ``country_name, country_code, year, <indicator name>``.
    """
    sections = list(json_objet.values())

    # The first nested entry becomes a wide table; stack() turns it into one row per
    # (country, year) pair and reset_index() exposes both index levels as columns.
    df = pd.DataFrame(list(sections[0].values())[0]).stack().to_frame().reset_index()
    df.columns = ['country_code', 'year', list(sections[2].values())[0]]

    # Code -> name lookup, joined on the shared `country_code` column.
    country_map = pd.DataFrame({'country_code': list(sections[1].keys()),
                                'country_name': list(sections[1].values())})
    df = df.merge(country_map, how = "left")

    # Put the readable country name first, followed by the three original columns.
    columns = df.columns.tolist()
    return df[columns[-1:] + columns[:3]]


def API_UNDP_call_to_df(indicator_id, url_seed = url_seed, years = years, as_frame = False, timeout = 30):
    """Request one indicator from the HDRO API.

    The URL is built as
    ``<url_seed>indicator_id=<indicator_id>/year=<years>/structure=<struct[3]>``,
    where ``struct`` is the notebook-level list of structure codes.

    Parameters
    ----------
    indicator_id : str
        Indicator code, e.g. "103706". It is concatenated into the URL as-is.
    url_seed : str
        Base URL of the API. Defaults to the notebook-level ``url_seed``.
    years : str
        Comma-separated years. Defaults to the notebook-level ``years``.
    as_frame : bool, default False
        With the default the decoded JSON is returned unchanged. This is what the
        function returned before, and the exploratory cells below index into it.
        Pass ``True`` to receive the tidy DataFrame instead (one row per country
        and year, with the country name attached).
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    dict or pandas.DataFrame
        The decoded JSON payload, or the reshaped DataFrame when ``as_frame`` is True.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If no answer arrives within ``timeout`` seconds.
    """
    # Browser-like user agent sent with the request.
    headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36"
    }

    # Parameterized query to the API.
    my_url = url_seed + "indicator_id" + "=" + indicator_id + "/" + "year" + "=" + years + "/" + "structure" + "=" + struct[3]

    # The timeout keeps a stalled server from hanging the notebook, and
    # raise_for_status() fails loudly here instead of deep inside the reshaping code.
    response = requests.get(my_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    json_objet = response.json()

    if as_frame:
        return _undp_payload_to_df(json_objet)
    return json_objet

In [None]:
# Try the helper on a single indicator (103706 is listed as "Education index" in the catalog).
example = API_UNDP_call_to_df(indicator_id = "103706")
example

In [66]:
# Step by step: take the first nested entry of the payload as a wide table,
# then stack it so that every row is one pair of index/column labels plus its value.
example_values = list(example.values())[0]
example_wide = pd.DataFrame(list(example_values.values())[0])
example_df = example_wide.stack().to_frame().reset_index()
example_df.head()

Unnamed: 0,level_0,level_1,0
0,AFG,2010,0.372
1,AFG,2011,0.374
2,AFG,2012,0.39
3,AFG,2013,0.398
4,AFG,2014,0.403


In [67]:
# Name the stacked columns; the value column takes the first indicator name
# found in the third top-level entry of the payload.
indicator_label = list(list(example.values())[2].values())[0]
example_df.columns = ['country_code', 'year', indicator_label]
example_df.head()

Unnamed: 0,country_code,year,Education index
0,AFG,2010,0.372
1,AFG,2011,0.374
2,AFG,2012,0.39
3,AFG,2013,0.398
4,AFG,2014,0.403


In [72]:
# Country code -> country name lookup, taken from the second top-level entry of the
# payload. The key column reuses the first column name of example_df so both frames can be merged.
country_lookup = list(example.values())[1]
country_map = pd.DataFrame({
    example_df.columns[0]: list(country_lookup.keys()),
    "country_name": list(country_lookup.values()),
})
country_map.head()

Unnamed: 0,country_code,country_name
0,AFG,Afghanistan
1,AGO,Angola
2,ALB,Albania
3,AND,Andorra
4,ARE,United Arab Emirates


In [73]:
# Attach the country names; how="right" keeps every code present in country_map.
pd.merge(example_df, country_map, how = "right")

Unnamed: 0,country_code,year,Education index,country_name
0,AFG,2010,0.372,Afghanistan
1,AFG,2011,0.374,Afghanistan
2,AFG,2012,0.390,Afghanistan
3,AFG,2013,0.398,Afghanistan
4,AFG,2014,0.403,Afghanistan
...,...,...,...,...
1878,ZWE,2018,0.570,Zimbabwe
1879,ZWE,2019,0.587,Zimbabwe
1880,MHL,2017,0.707,Marshall Islands
1881,MHL,2018,0.707,Marshall Islands


In [74]:
# Read back the country-code table saved earlier as Country_code.csv (path relative to the working directory).
pd.read_csv("Country_code.csv")

Unnamed: 0,Country_Code,Country_Name
0,AFG,Afghanistan
1,ALB,Albania
2,DZA,Algeria
3,AND,Andorra
4,AGO,Angola
5,ATG,Antigua and Barbuda
6,ARG,Argentina
7,ARM,Armenia
8,AUS,Australia
9,AUT,Austria


## 3. Loop over a set of indicator codes: collect each one, transform it to a df and save a formatted dataframe for the given indicator

In [53]:
# check of find for an specific indicator
my_indi = "govern"
indi_UNDP_df[indi_UNDP_df['Indicator_Name'].str.contains(my_indi, case = False)]

Unnamed: 0,ID,Indicator_Name
40,149206,Government expenditure on education (% of GDP)
150,194306,Share of seats held by women in local government (%)


In [51]:
# check of find for an specific code
my_code = "103706"
indi_UNDP_df[indi_UNDP_df['ID'].str.contains(my_code, case = False)]

Unnamed: 0,ID,Indicator_Name
21,103706,Education index


In [54]:
# make a dictionary of indi codes and name for the output csv file, whihc will be saved in the curated folder

# Indicator code -> short label used in the name of the CSV file saved in the curated folder.
my_UNDP_indicator_dic = dict([
    ("137506", "HDI"),            # Human Development Index (HDI)
    ("137906", "GDI"),            # Gender Development Index (GDI)
    ("68606", "GII"),             # Gender Inequality Index (GII)
    ("31706", "FemParlmt"),       # Share of seats in parliament (% held by women)
    ("103206", "LifeExpecIndx"),  # Life expectancy index
    ("181806", "HealthExpend"),   # Current health expenditure (% of GDP)
    ("44206", "TotPopulMill"),    # Total population (millions)
    ("103706", "EducaIndx"),      # Education index
])
my_UNDP_indicator_dic



{'137506': 'HDI',
 '137906': 'GDI',
 '68606': 'GII',
 '31706': 'FemParlmt',
 '103206': 'LifeExpecIndx',
 '181806': 'HealthExpend',
 '44206': 'TotPopulMill',
 '103706': 'EducaIndx'}

In [48]:
# loop over the indicator list and 
for value in my_UNDP_indicator_dic:
    my_df = API_UNDP_call_to_df(indicator_id = value)
    my_df.to_csv(path + '/' + 'curated_datasets' + '/' + 'UNDP_' + my_UNDP_indicator_dic[value] +'_df' + ".csv", index=False )
    sleep(1) # make a time sleep between calls to ot overload the API
    