## Loading libraries

In [None]:
import requests
from lxml import html
import pandas as pd
import os

# Notebook-wide display settings: show up to 1000 rows and never truncate
# the text inside a cell (long indicator names stay fully readable).
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

## 1. Collecting the **Data Catalog** from the UNDP (Human Development Report Office Statistical Data API)

In [None]:
# Location of the saved HDRO data-catalog HTML page.
# The default is the original author's local path; set the environment variable
# UNDP_DATA_CATALOG_PATH to point at your own copy without editing the notebook.
my_data_catalog_path = os.environ.get(
    'UNDP_DATA_CATALOG_PATH',
    '/Users/rubencito/CAS_datascience/ADS_CAS_Bern_2020/Projects/M3/UNDP_data_catalog/HDRO.html',
)

In [None]:
# Load the indicator catalogue: first HTML table of the saved catalog page.
indi_col_names = ['ID', 'Indicator_Name']
indi_UNDP_df = (
    pd.read_html(my_data_catalog_path)[0]
    .iloc[2:]                    # skip the first two rows of the parsed table
    .reset_index(drop=True)      # restart the row index at 0
)
indi_UNDP_df.columns = indi_col_names  # apply readable column names
indi_UNDP_df.head(6)

In [None]:
# Case-insensitive search for indicators whose name mentions "civil".
indi_UNDP_df.loc[indi_UNDP_df['Indicator_Name'].str.contains("civil", case=False)]

In [None]:
# Load the country codes: second HTML table of the saved catalog page.
country_col_names = ['Country_Code', 'Country_Name']
country_UNDP_df = (
    pd.read_html(my_data_catalog_path)[1]
    .iloc[2:]                    # skip the first two rows of the parsed table
    .reset_index(drop=True)      # restart the row index at 0
)
country_UNDP_df.columns = country_col_names  # apply readable column names
country_UNDP_df.head()

## 2. Define a function that calls the API for a specific indicator and returns a DataFrame

In [None]:
# Define the default values used by the API helper below.

url_seed = "http://ec2-54-174-131-205.compute-1.amazonaws.com/API/HDRO_API.php/"
# Years 2010..2019 as one comma-separated string, the form the query URL expects.
# Built with the built-in range(): numpy is not imported in this notebook, so the
# previous np.arange() call raised NameError on a fresh kernel.
years = ','.join(str(year) for year in range(2010, 2020))
indicator_id = "195606"
# Structure codes accepted in the query URL; the helper uses "iyc".
struct = ["ciy", "yic", "yci", "iyc", "icy"]

In [None]:
# Display the comma-separated year string that is sent to the API.
years

In [None]:
def _undp_payload_to_df(payload):
    """Reshape a decoded API payload into a long-format DataFrame.

    The payload sections are read by position: the first holds the nested
    indicator values, the second maps country code -> country name and the
    third maps indicator id -> indicator name.
    NOTE(review): presumably these are the "indicator_value", "country_name"
    and "indicator_name" keys of the response -- confirm against the API docs.

    Returns a DataFrame with the columns
    [country_name, country_code, year, <indicator name>].
    """
    sections = list(payload.values())
    values_by_indicator, country_names, indicator_names = sections[0], sections[1], sections[2]

    # Only the first indicator contained in the payload is used.
    nested_values = list(values_by_indicator.values())[0]
    indicator_label = list(indicator_names.values())[0]

    # DataFrame(dict of dicts): outer keys become columns, inner keys the index.
    # stack() turns that into one row per (inner key, outer key) pair, which are
    # labelled country code and year here (assumes the "iyc" structure nests
    # year -> country -- TODO confirm).
    long_df = pd.DataFrame(nested_values).stack().to_frame().reset_index()
    long_df.columns = ['country_code', 'year', indicator_label]

    # Lookup table used to attach the readable country name to each code.
    country_map = pd.DataFrame({
        'country_code': list(country_names.keys()),
        'country_name': list(country_names.values()),
    })
    merged = long_df.merge(country_map, how="left", on='country_code')

    # Put the country name first, followed by code, year and value.
    return merged[['country_name', 'country_code', 'year', indicator_label]]


def API_UNDP_call_to_df(indicator_id, url_seed = url_seed, years = years, timeout = 30):
    """Query the UNDP HDRO API for one indicator and return it as a DataFrame.

    Parameters
    ----------
    indicator_id : str or int
        Indicator identifier from the data catalog (e.g. "137506").
    url_seed : str
        Base URL of the API, ending with "/".
    years : str
        Comma-separated list of years to request.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        Long-format table with the columns
        [country_name, country_code, year, <indicator name>].

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36"
    }

    # The reshaping in _undp_payload_to_df depends on this response layout,
    # so it is fixed here instead of being read from a notebook global.
    structure = "iyc"

    # Build the parameterized query URL.
    my_url = f"{url_seed}indicator_id={indicator_id}/year={years}/structure={structure}"
    print(my_url)

    # Use the session for the request and close it afterwards; fail loudly on
    # HTTP errors instead of trying to parse an error page as JSON.
    with requests.Session() as session:
        response = session.get(my_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    return _undp_payload_to_df(response.json())

## 3. Extracting the Human Development Index (HDI) indicator

In [None]:
# Fetch the Human Development Index (indicator 137506) and display the table.
HDI_df = API_UNDP_call_to_df("137506")
HDI_df

In [None]:
# Number of distinct country names in the HDI table
# (missing names, if any, count as one value, exactly as with len(unique())).
HDI_df['country_name'].nunique(dropna=False)

In [None]:
# List the distinct country names present in the HDI table.
HDI_df['country_name'].unique()

In [None]:
# Print the current working directory when no 'data' folder is found in it.
# NOTE(review): the exports in this notebook write to 'curated_datasets',
# not 'data' -- confirm which folder this check is meant for.
if not os.path.exists('data'):
    print(os.path.abspath(os.curdir))

### Export the semicurated dataset to csv format

In [None]:
# Write the HDI table to <cwd>/curated_datasets/HDI_df.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
HDI_df.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'HDI_df.csv'), index=False)

## 4. Extracting the Gender Development Index (GDI) indicator

In [None]:
# Fetch the Gender Development Index (indicator 137906) and display the table.
GDI_df = API_UNDP_call_to_df("137906")
GDI_df

### Export the semicurated dataset to csv format

In [None]:
# Write the GDI table to <cwd>/curated_datasets/GDI_df.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
GDI_df.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'GDI_df.csv'), index=False)

## 5. Extracting the Gender Inequality Index (GII) indicator

In [None]:
# Fetch the Gender Inequality Index (indicator 68606) and display the table.
GII_df = API_UNDP_call_to_df("68606")
GII_df

### Export the semicurated dataset to csv format

In [None]:
# Write the GII table to <cwd>/curated_datasets/GII_df.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
GII_df.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'GII_df.csv'), index=False)

In [None]:
# Case-insensitive search for indicators whose name contains "shar" (e.g. "share").
indi_UNDP_df.loc[indi_UNDP_df['Indicator_Name'].str.contains("shar", case=False)]

## 6. Extracting the Share of seats in parliament (% held by women) "Share_parl_seat_Fem" indicator

In [None]:
# Fetch the share of parliament seats held by women (indicator 31706) and display the table.
Share_parl_seat_Fem_df = API_UNDP_call_to_df("31706")
Share_parl_seat_Fem_df

### Export the semicurated dataset to csv format

In [None]:
# Write the table to <cwd>/curated_datasets/Share_parl_seat_Fem_df.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
Share_parl_seat_Fem_df.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'Share_parl_seat_Fem_df.csv'), index=False)

## 7. Extracting the Life expectancy index "LEI" indicator

In [None]:
# Fetch the Life expectancy index (indicator 103206) and display the table.
LEI_df = API_UNDP_call_to_df("103206")
LEI_df


### Export the semicurated dataset to csv format

In [None]:
# Write the LEI table to <cwd>/curated_datasets/LEI_df.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
LEI_df.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'LEI_df.csv'), index=False)

## 8. Extracting the Current health expenditure (% of GDP) "health_exp" indicator

In [None]:
# Fetch the current health expenditure, % of GDP (indicator 181806) and display the table.
health_exp = API_UNDP_call_to_df("181806")
health_exp


### Export the semicurated dataset to csv format

In [None]:
# Write the table to <cwd>/curated_datasets/health_exp.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
health_exp.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'health_exp.csv'), index=False)

## 9. Extracting the Total population (millions) "Tot_pop_mill" indicator

In [None]:
# Fetch the total population in millions (indicator 44206) and display the table.
Tot_pop_mill = API_UNDP_call_to_df("44206")
Tot_pop_mill


### Export the semicurated dataset to csv format

In [None]:
# Write the table to <cwd>/curated_datasets/Tot_pop_mill.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
Tot_pop_mill.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'Tot_pop_mill.csv'), index=False)

In [None]:
# Preview the raw result for indicator 103706 without storing it.
API_UNDP_call_to_df(indicator_id="103706")

## 10. Extracting the Education index "Edu_indx" indicator

In [None]:
# Fetch the Education index (indicator 103706) and display the table.
Edu_indx = API_UNDP_call_to_df("103706")
Edu_indx


### Export the semicurated dataset to csv format

In [None]:
# Write the table to <cwd>/curated_datasets/Edu_indx.csv.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(os.path.join(os.path.abspath('.'), 'curated_datasets'), exist_ok=True)
Edu_indx.to_csv(os.path.join(os.path.abspath('.'), 'curated_datasets', 'Edu_indx.csv'), index=False)