In [1]:
import pandas as pd
import requests
from lxml import html
import wget
from zipfile import ZipFile
import fnmatch
import os
from time import sleep

In [2]:
# Notebook-wide display settings: show up to 1000 rows and never truncate cell text.
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

In [3]:
# Base directory for every relative location used below: the kernel's current working directory.
path = os.path.abspath(os.curdir)
path

'/Users/rubencito/CAS_datascience/ADS_CAS_Bern_2020/Projects/M3'

## 1. Importing the World Bank data catalog
The catalog contains more than 1,500 series.

In [5]:
# Read the second sheet (sheet_name=1) of the series catalog workbook stored under data_catalogs/.
full_db_catalog = pd.read_excel(
    "/".join([path, "data_catalogs", "WB_fullseries_catalog.xls"]),
    sheet_name=1,
)

full_db_catalog.head()

Unnamed: 0,Series Code,Series Name,Topic,SubTopic1,SubTopic2,SubTopic3
0,AG.AGR.TRAC.NO,"Agricultural machinery, tractors",Environment,Agricultural production,,
1,AG.CON.FERT.PT.ZS,Fertilizer consumption (% of fertilizer production),Environment,Agricultural production,,
2,AG.CON.FERT.ZS,Fertilizer consumption (kilograms per hectare of arable land),Environment,Agricultural production,,
3,AG.LND.AGRI.K2,Agricultural land (sq. km),Environment,Land use,,
4,AG.LND.AGRI.ZS,Agricultural land (% of land area),Environment,Land use,,


## 2. Create a helper function to easily find a series in the catalog by code or by name

In [7]:
def find_your_serie(in_your_serie, full_wb_catalog = None,  by_code = False, by_name = False):
    """Search a series catalog by series code or by series name.

    Parameters
    ----------
    in_your_serie : str
        Text to look for. It is matched case-insensitively and, as is the
        pandas default for ``str.contains``, interpreted as a regular expression.
    full_wb_catalog : pandas.DataFrame, optional
        Catalog to search; it must have the columns 'Series Code' and
        'Series Name'. When omitted, the notebook-level ``full_db_catalog``
        is looked up at call time.
    by_code : bool
        Search the 'Series Code' column. Takes precedence over ``by_name``.
    by_name : bool
        Search the 'Series Name' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, or the whole catalog when neither flag is set.
    """
    if full_wb_catalog is None:
        # Resolve the default lazily so the function always sees the catalog
        # currently loaded in the notebook.
        full_wb_catalog = full_db_catalog

    if by_code:
        search_column = 'Series Code'
    elif by_name:
        search_column = 'Series Name'
    else:
        return full_wb_catalog

    # na=False: rows with a missing code/name count as "no match" instead of
    # producing NaN entries that cannot be used as a boolean mask.
    is_match = full_wb_catalog[search_column].str.contains(in_your_serie, case = False, na = False)
    return full_wb_catalog[is_match]
        

## 3. Extract socioeconomic data from the World Bank

### Create a function that calls the API and returns a data frame

In [38]:
def API_call_to_df(indic_code, 
                   year_s = '2000:2020', 
                   destination = path + "/" + "datasets" + "/" + "WB_datasets"):
    """Download one World Bank indicator and return it as a long-format DataFrame.

    Parameters
    ----------
    indic_code : str
        Indicator code appended to the World Bank API URL.
    year_s : str
        Year range as 'first:last' (inclusive); a single year such as '2010'
        is also accepted.
    destination : str
        Directory that receives the downloaded zip archive. It is created if
        it does not exist.

    Returns
    -------
    pandas.DataFrame
        Columns 'country_name', 'country_code', 'year' and one value column
        named after the indicator.

    Raises
    ------
    ValueError
        If the downloaded archive contains no member whose name starts with 'API_'.

    Notes
    -----
    The downloaded zip archive is left in ``destination``; one earlier file
    matching 'API_<INDIC_CODE>*' is deleted before the new download. The CSV
    extracted from the archive is deleted again once it has been read.
    """
    WB_url_seed = 'http://api.worldbank.org/v2/country/all/indicator/' + indic_code
    my_params = {'date' : year_s,
                'incomelevel' :'',
                'downloadformat' : 'csv',
                'per_page' : '304'}

    # Positions of the CSV columns to read: four leading columns plus one per
    # requested year. Assumes the CSV holds only the requested years, starting
    # at the fifth column -- TODO confirm against the API output.
    my_years_int = [int(year) for year in year_s.split(":")]
    my_colums_indx = list(range(0, my_years_int[-1] - my_years_int[0] + 5))

    os.makedirs(destination, exist_ok = True)

    # Remove the first earlier download of this indicator, if there is one.
    for file in os.listdir(destination):
        if fnmatch.fnmatch(file, "API_" + indic_code.upper() + "*"):
            os.remove(destination + "/" + file)
            print("file exists and will be replaced")
            break

    # requests issues the call and exposes the final URL (query string
    # included); wget then saves that URL into the destination directory.
    response = requests.get(WB_url_seed, params = my_params)
    my_zip_file = wget.download(response.url, out = destination)

    my_df = None
    with ZipFile(my_zip_file, 'r') as zipObj:
        for content in zipObj.namelist():
            # Only members named API_* are read; other members are ignored.
            if fnmatch.fnmatch(content, 'API_*'):
                extracted_csv = zipObj.extract(content, path = destination)
                try:
                    my_df = pd.read_csv(extracted_csv,
                                        header = 2,
                                        usecols = my_colums_indx)
                finally:
                    # Never leave the extracted CSV behind, even if parsing fails.
                    os.remove(extracted_csv)

    if my_df is None:
        raise ValueError("No 'API_*' file found in downloaded archive " + str(my_zip_file))

    my_columns = list(my_df.columns)
    my_df = my_df.melt( # change the df to long format
        id_vars = my_columns[:2],
        value_vars = my_columns[4:],
        var_name = 'year',
        value_name = my_df['Indicator Name'][1]) # value column is named after the indicator (read from row label 1)

    # Rename the two identifier columns.
    my_df = my_df.rename(columns = {my_columns[0]: 'country_name', my_columns[1]: 'country_code'})

    return my_df


### Filtering the catalog down to series expressed as a percentage (%)
This leaves only 755 series, far fewer than the full catalog.

In [12]:
# Keep only the catalog rows whose series name contains a "%" sign.
raw_percentage_db_catalog = full_db_catalog.loc[
    full_db_catalog['Series Name'].str.contains("%")
]
raw_percentage_db_catalog.head()

Unnamed: 0,Series Code,Series Name,Topic,SubTopic1,SubTopic2,SubTopic3
1,AG.CON.FERT.PT.ZS,Fertilizer consumption (% of fertilizer production),Environment,Agricultural production,,
4,AG.LND.AGRI.ZS,Agricultural land (% of land area),Environment,Land use,,
7,AG.LND.ARBL.ZS,Arable land (% of land area),Environment,Land use,,
9,AG.LND.CROP.ZS,Permanent cropland (% of land area),Environment,Land use,,
11,AG.LND.EL5M.RU.ZS,Rural land area where elevation is below 5 meters (% of total land area),Environment,Land use,,


### Display the number of data series per topic

In [13]:
# Count how many percentage-based series fall under each topic.
(
    raw_percentage_db_catalog
    .sort_values(by="Topic")
    .groupby(['Topic'], as_index=False)
    .size()
)

Topic
Economic Policy & Debt       105
Education                    124
Environment                   68
Financial Sector              35
Gender                         9
Health                       131
Infrastructure                 6
Poverty                       20
Private Sector & Trade        91
Public Sector                 27
Social Protection & Labor    139
dtype: int64

#### The topics that contain the most series are:

1. **Social Protection & Labor**
2. **Health**
3. **Education**
4. **Economic Policy & Debt**

## 4. Explore the various topics to assess the quality of the data

This section is experimental and needs to be revisited.

In [254]:
# First ten percentage-based series filed under the "Social Protection & Labor" topic.
raw_percentage_db_catalog.loc[
    raw_percentage_db_catalog['Topic'].eq("Social Protection & Labor")
].head(10)

Unnamed: 0,Series Code,Series Name,Topic,SubTopic1,SubTopic2,SubTopic3
869,per_allsp.adq_pop_tot,Adequacy of social protection and labor programs (% of total welfare of beneficiary households),Social Protection & Labor,Performance,,
870,per_allsp.ben_q1_tot,Benefit incidence of social protection and labor programs to poorest quintile (% of total SPL benefits),Social Protection & Labor,Performance,,
871,per_allsp.cov_pop_tot,Coverage of social protection and labor programs (% of population),Social Protection & Labor,Performance,,
872,per_lm_alllm.adq_pop_tot,Adequacy of unemployment benefits and ALMP (% of total welfare of beneficiary households),Social Protection & Labor,Performance,,
873,per_lm_alllm.ben_q1_tot,Benefit incidence of unemployment benefits and ALMP to poorest quintile (% of total U/ALMP benefits),Social Protection & Labor,Performance,,
874,per_lm_alllm.cov_pop_tot,Coverage of unemployment benefits and ALMP (% of population),Social Protection & Labor,Performance,,
875,per_lm_alllm.cov_q1_tot,Coverage of unemployment benefits and ALMP in poorest quintile (% of population),Social Protection & Labor,Performance,,
876,per_lm_alllm.cov_q2_tot,Coverage of unemployment benefits and ALMP in 2nd quintile (% of population),Social Protection & Labor,Performance,,
877,per_lm_alllm.cov_q3_tot,Coverage of unemployment benefits and ALMP in 3rd quintile (% of population),Social Protection & Labor,Performance,,
878,per_lm_alllm.cov_q4_tot,Coverage of unemployment benefits and ALMP in 4th quintile (% of population),Social Protection & Labor,Performance,,


In [317]:
# Tally the missing values in every column of `wer_indicator`, then skip the
# first four rows of the tally so only the remaining columns are listed.
# NOTE(review): `wer_indicator` is not assigned anywhere in the visible notebook;
# presumably a wide World Bank frame with four leading metadata columns -- confirm.
years = wer_indicator.isnull().sum().to_frame().reset_index().iloc[4:]
years.columns = ("year", "NaNs")
years.head(6)

Unnamed: 0,year,NaNs
4,2000,263
5,2001,263
6,2002,261
7,2003,261
8,2004,259
9,2005,251


### Make a function that takes a dataframe and returns the number of NaNs per year across all countries



In [13]:
def data_quality_asses(dataframe):
    """Summarise the missing values of one indicator as a single wide row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with an 'Indicator Code' column. The first four columns are
        skipped and every remaining column is treated as a year
        (assumes the wide World Bank layout -- confirm against the caller).

    Returns
    -------
    pandas.DataFrame
        One row indexed by the indicator code (index name 'Indicator Code'),
        one column per year (columns name 'year'), holding NaN counts.

    Raises
    ------
    ValueError
        If ``dataframe`` has no rows, so no indicator code can be read.
    """
    if len(dataframe) == 0:
        raise ValueError("dataframe has no rows; cannot read its indicator code")

    # NaN count per column, minus the four leading columns.
    nan_counts = dataframe.isnull().sum().iloc[4:]

    # Build a fresh frame rather than assigning onto a slice, and read the code
    # by position so a filtered or re-indexed input frame works as well.
    df_years = pd.DataFrame({
        "Indicator Code": dataframe["Indicator Code"].iloc[0],
        "year": nan_counts.index,
        "NaNs": nan_counts.to_numpy(),
    })

    return df_years.pivot(index = "Indicator Code", columns = "year", values = "NaNs")

In [None]:
# Show the NaN summary produced by data_quality_asses for `wer_indicator`; the trailing [:] selects every row.
# NOTE(review): this cell has no execution count and `wer_indicator` is not assigned
# in the visible notebook -- confirm the cell still runs on a fresh kernel.
data_quality_asses(wer_indicator)[:]

In [15]:
# Preview the first five rows of the indicator frame.
wer_indicator.head(n=5)

Unnamed: 0,country_name,country_code,year,"GNI, PPP (current international $)"
0,Aruba,ABW,2010,3122230000.0
1,Afghanistan,AFG,2010,50011960000.0
2,Angola,AGO,2010,139021700000.0
3,Albania,ALB,2010,27788450000.0
4,Andorra,AND,2010,


In [None]:
# Missing-value count per column of the indicator frame, shown as a one-column table.
pd.DataFrame(wer_indicator.isna().sum())

## 5. Keeping only the countries included in the UNDP dataset

Before assessing the quality of the data (namely the ratio of NaNs to total entries), it is better to base the analysis on countries only and to exclude geographical groups and organizations.

### We make a function that cleans the data and exports it
This function keeps only the countries that are present in the UNDP country/code file.

In [71]:
def clean_and_save_my_df(df, output_file_name, destination_dir = path + "/" + "curated_datasets"):
    """Filter a long-format indicator frame to known countries, sort it and save it as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'country_code', 'country_name' and 'year'.
    output_file_name : str
        Label used in the output file name: 'WB_<output_file_name>_df.csv'.
    destination_dir : str
        Directory that receives the CSV file. It is created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, sorted by country name and year, with a fresh index.
    """
    # Country codes to keep, read from the "Country_Code" column of Country_code.csv.
    UNDP_country_code_list = pd.read_csv(path + "/" + "Country_code.csv")["Country_Code"]

    clean_df = (
        df[df['country_code'].isin(UNDP_country_code_list)] # drop rows with unknown country codes
        .sort_values(by=['country_name', 'year'])
        .reset_index(drop=True)
    )

    # Make sure the output directory exists so the save cannot fail for that reason.
    os.makedirs(destination_dir, exist_ok=True)
    clean_df.to_csv(destination_dir + '/' + "WB_" + output_file_name + '_df' + '.csv', index=False)

    return clean_df
    
    
    
    

## 6. Loop over a set of indicator codes: collect, transform, and save a data frame for each indicator

In [77]:
# World Bank indicator code -> short label used in the name of the saved CSV file.
my_WB_indicator_dic = dict([
    ("NY.GNP.MKTP.PP.CD", "GNI_PPP"),
    ("IQ.CPA.TRAN.XQ", "GEI_indx"),
    ("IQ.CPA.BREG.XQ", "Env_1"),
    ("IQ.CPA.ENVR.XQ", "Env_2"),
])

In [78]:
# Download, clean and save every indicator listed in my_WB_indicator_dic.
# Iterating over .items() yields each code together with its file label, so the
# loop depends only on the dictionary defined in this notebook.
for indicator_code, output_label in my_WB_indicator_dic.items():
    my_df = API_call_to_df(indic_code = indicator_code)
    clean_and_save_my_df(df = my_df, output_file_name = output_label)
    sleep(1) # pause between consecutive API calls
    

file exists and will be replaced
file exists and will be replaced
file exists and will be replaced
file exists and will be replaced
