#### bulk download World Bank Governance data

In [1]:
import pandas as pd
import numpy as np
import os
import time
import logging
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import openpyxl
import json
import io

# pd.set_option('display.max_colwidth', None)
#logger
logging.basicConfig(filename='WB governance.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

#### Getting country list

In [2]:
#for WGI
df_countries=pd.read_excel('Countries.xlsx')
df_countries=df_countries[(df_countries['Filter']==1)&(df_countries['dataset']=='Governance_dataset')]
countries_wgi=dict(zip(df_countries['iso3'],df_countries['Country']))

#for ISPAR
df_countries=pd.read_excel('Countries.xlsx')
df_countries_ispar=df_countries[(df_countries['Filter']==1)&(df_countries['dataset']=='ISPAR')]

#for TC360
df_countries=pd.read_excel('Countries.xlsx')
df_countries=df_countries[(df_countries['Filter']==1)&(df_countries['dataset']=='TCdata360')]
countries_tc360=dict(zip(df_countries['iso3'],df_countries['Country']))

#### Bulk download from WGI

In [3]:
logging.info('#############getting data by downloading from https://info.worldbank.org/governance/wgi/#home ##############')

url='https://info.worldbank.org/governance/wgi/#home'

#get the url for download
response=requests.get(url)
logging.info(f'request status code for the main url {response.status_code}')
soup=BeautifulSoup(response.content, 'html.parser')

#getting the href for Download full dataset (Excel) link in the webpage
s1=soup.find_all("ul", {"class": "listItems"})

url=s1[1].find_all("a", {"class": "file-download"})[2]['href']
domain='https://info.worldbank.org'
#construct the url
bulk_download_url=domain+url
#to download the url
try:
    r=requests.get(bulk_download_url)
    logging.info(f'Download request is status code is {r.status_code}')
except Exception as e:
    logging.warning(e)

#write the raw data to excel
with open('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/WB governance.xlsx','wb') as f:
    try:
        f.write(r.content)
        logging.info('successfuly downloaded bulk excel file')
    except Exception as e:
        logging.warning(e)
logging.info('---------------------------------------------------')

try:
    df_WGI=pd.DataFrame()

    #get the sheet names to loop over
    wb=openpyxl.load_workbook("C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/WB governance.xlsx")
    sheetnames=wb.sheetnames[1:]

    #counter to count rows of newly added dataframes
    sheet_counter={}

    for sh in sheetnames:
        logging.info(f'processing sheet {sh}')    
        df=pd.read_excel('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/WB governance.xlsx', sheet_name=sh)

        #EXTRACT THE DATAFRAME FROM UNSTRCUTURED EXCEL SHEET
        #get the dataframe filtered on idx at value = "Country/Territory"
        idx=df[df.iloc[:,0]=="Country/Territory"].index[0]
        df1=df.iloc[idx-1:, :].reset_index(drop=True)

        #zip the 1st 2 rows to make columns. output example as below
        '''((nan, 'Country/Territory'),
        (nan, 'Code'),
        (1996, 'Estimate'),
        (1996, 'StdErr'),
        (1996, 'NumSrc'),'''
        cols=tuple(zip(df1.iloc[0,:],df1.iloc[1,:]))
        #drop the 2 rows
        df1.drop(axis=0, index=[0,1], inplace=True)
        #add the cols as columns
        df1.columns=cols
        
        #add the sheet name as a column
        df1[sh]=sh

        #filter the dataframe on ['Country/Territory','Code','Estimate', 'Rank']
        cols_to_filter=[(a,b) for (a,b) in list(df1.columns) if (b in ['Country/Territory','Code','Estimate', 'Rank']) | (a==sh)]
        #cols_to_filter will look like
        '''[(nan, 'Country/Territory'),
        (nan, 'Code'),
        (1996, 'Estimate'),
        (1996, 'Rank'),'''
        df1=df1.loc[:,cols_to_filter]

        #'Estimate', 'Rank' will be melted, so the melted dataframe must have double the row numbers of the unmelted dataframe. the number of  'Estimate', 'Rank' for timeseries = len(cols_to_filter) - 3 (remove 'Country/Territory','Code', sh)
        wide_df_rows= len(df1)
        n_wide=len(cols_to_filter)-3

        #RESHAPE THE DATAFRAME
        #choose 'Country/Territory','Code' and the sheetname col added at the end of the dataframe as id_vars (vars not to be reshaped) and the rest as value_vars (variables to reshape)
        # id_variables such as: [(nan, 'Country/Territory'), (nan, 'Code'), (2021, 'Rank')]
        id_variables=df1.columns.to_list()[:2]
        id_variables.append(df1.columns.to_list()[-1])

        df_melt=pd.melt(df1,
                id_vars=id_variables,
                value_vars=df1.columns.to_list()[2:-1],
                var_name=['Year','Value'])
                
        df_melt.columns=['Country','Code','Indicator','Year','Type','Value']
        
        #check if melting went ok by checking the number of rows pre and post melting
        if len(df_melt)==wide_df_rows*n_wide:
            logging.info(f'sheet {sh} succesfully reshaped to long with {len(df_melt)} rows')
        else:
            logging.warning(f'WARNING!!!!!!!!! problem with melting {sh}, which has {len(df_melt)} rows whereas before melting it had {wide_df_rows} rows and n_wide was {n_wide}')

        #APPEND TO Governance_total DATAFRAME
        df_WGI=df_WGI.append(df_melt)
        logging.info(f'sheet {sh} is appended successfully')
        logging.info(f'Governance_total has {len(df_WGI)} rows')
        logging.info('---------------------------------------------------------')

    #adding the world average
    df_WGI['Value']=pd.to_numeric(df_WGI['Value'])
    world_avg=df_WGI.groupby(['Indicator', 'Year']).agg({'Value':'mean'}).reset_index()
    world_avg['Country']='World'
    world_avg
    Governance_dataset=pd.concat([df_WGI,world_avg], axis=0)
    logging.info('World averages added to the Governance_dataset')

    #transforming indicator names
    ind_dct={'VoiceandAccountability':'Voice and Accountability',
             'Political StabilityNoViolence' : 'Political Stability No Violence',
             'GovernmentEffectivenes' : 'Government Effectiveness',
             'RegulatoryQuality' : 'Regulatory Quality',
             'RuleofLaw' : 'Rule of Law',
             'ControlofCorruption' : 'Control of Corruption'}
    
    df_WGI['Indicator']=df_WGI['Indicator'].map(ind_dct)

    df_WGI.to_excel('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/df_WGI.xlsx', index=False)
    logging.info(f'Governance_dataset saved with {len(df_WGI)} rows')

    #filter on Arab countries and world
    df_WGI_Arab=df_WGI[df_WGI['Country'].isin(countries_wgi.values())].copy()
        
    #change WBG to PSE for State of Palestine
    df_WGI_Arab.loc[df_WGI_Arab['Code']=='WBG', 'Code']='PSE'
    logging.info('Code changed from WBG to PSE for State of Palestine')
    df_WGI_Arab.loc[df_WGI_Arab['Code']=='PSE', 'Country']='State of Palestine'
    logging.info('Country/Territory changed from WBG to PSE for State of Palestine')

    #add Note regarding State of Palestine
    df_WGI_Arab.loc[df_WGI_Arab['Code']=='PSE', 'Note']='Only for West Bank and Gaza'
    logging.info('West bank and Gaza note is added')
    
    df_WGI_Arab.to_excel('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/df_WGI_Arab.xlsx', index=False)
    logging.info(f'Governance_dataset_Arab saved with {len(df_WGI_Arab)} rows')
    logging.info('///////////////////////////////////////////////////////\n')
except Exception as e:
    logging.warning(f'!!! WARNING !!! {e}')

#### ISPAR

In [4]:
logging.info('#############getting data from ISPAR##############')

id={3:'Global Cybersecurity Index',
        4:'Global Gender Gap Index',
        5:'ICT Development Index',
        6:'Network Readiness Index',
        7:'Global Innovation Index',
        9:'E-Government Development Index',
        11:'Global Competitiveness Index',
        12:'E-Participation index',
        14:'AI Readiness Index',
        16:'Open Data Inventory Index'}

url='https://datacatalog.unescwa.org/datastore/dump_v2?resource_id=e0d88222-f90c-4a92-8de4-397f6529c402&format=csv'

#get the url for download
response=requests.get(url)

try:
    if response.status_code!=200:
        logging.warning(f'!!!!!!WARNING!!!!!!status code is {response.status_code}')

    #read the response into pandas dataframe
    df = pd.read_csv(io.BytesIO(response.content))
    logging.info(f'dataframe returned with {len(df)} rows')

    #remove the 0.colname from column names
    df.columns=[i.replace("0.", "") for i in df.columns]
    #save ISPAR_all countries
    df.to_excel('ISPAR_all countries.xlsx', index=False)

    #filter on Index ID and 'Country Name'
    df_ispar_arab=df[(df['Index ID'].isin(id)) & (df['Country Name'].isin(df_countries_ispar['Country']))]

    #change PSE to State of Palestine
    df_ispar_arab.loc[df_ispar_arab['ISO']=='PSE', 'Country']='State of Palestine'
    logging.info('Country changed from West Bank and Gaza to State of Palestine')

    #add Note regarding State of Palestine
    df_ispar_arab.loc[df_ispar_arab['ISO']=='PSE', 'Note']='Only for West Bank and Gaza'
    logging.info('West bank and Gaza note is added')

    df_ispar_arab.to_excel('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/ISPAR_Arab.xlsx', index=False)
    logging.info(f'dataframe for Arab countries returned with {len(df_ispar_arab)} rows')

    #goupby and aggregate as mean
    ispar_aggregated=df_ispar_arab.groupby(['Index Name', 'Year']).aggregate({'Score':'mean'})
    ispar_aggregated.to_excel('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/ISPAR_AGGREGATED.xlsx')
    logging.info('groupby.aggregate() successful and saved as ISPAR_AGGREGATED.xlsx')

    # #to download to a csv
    # with open('ISPAR.csv', 'wb') as f:
    #     f.write(response.content)

except Exception as e:
    logging.warning(f'!!!WARNING!!!  {e}')

logging.info('///////////////////////////////////////////////////////\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


### Worldbank  TCdata360

In [5]:
try:

    logging.info('#############creating the url for API request for TCdata360##############')

    ind_url='https://tcdata360-backend.worldbank.org/api/v1/indicators/'
    #get the url for download
    response=requests.get(ind_url)
    logging.info(f'status code for response ~/api/v1/indicators/ is {response.status_code}')

    #get indicator codes to add to the url
    df_ind=pd.read_excel('TCdata360_all indicators.xlsx')
    df_ind=df_ind[df_ind['Filter']==1]
    ind=dict(zip(df_ind['id'],df_ind['name']))

    main_data_url='https://tcdata360-backend.worldbank.org/api/v1/data?'

    # m_data_url='https://tcdata360-backend.worldbank.org/api/v1/data?countries=ARE%2CBHR%2CCOM%2CDJI%2CDZA%2CEGY%2CIRQ%2CJOR%2CKWT%2CLBN%2CLBY%2CMAR%2CMRT%2COMN%2CQAT%2CSAU%2CSDN%2CSOM%2CSYR%2CTUN%2CYEM%2CPSE&indicators=45410%2C45283%2C3294%2C1776%2C799%2C3500%2C960%2C3421%2C922%2C2073%2C27959%2C27962%2C40712%2C40711'


    cntry_url=''
    first_element=True
    for k,v in countries_tc360.items():
        if first_element:
            cntry_url=cntry_url+k
            first_element=False
        else:
            cntry_url=cntry_url+'%2C'+k


    ind_url=''
    first_element=True
    for k,v in ind.items():
        if first_element:
            ind_url=ind_url+str(k)
            first_element=False
        else:
            ind_url=ind_url+'%2C'+str(k)

    data_url=main_data_url+'countries='+cntry_url+'&'+'indicators='+ind_url
    logging.info(f'data_url constructed as {data_url}')

except Exception as e:
    logging.warning(f' !!! WARNING !!! {e}')

############################################################
################get time series data #######################
try:
    logging.info('#############getting timeseries data for TCdata360##############')
    response=requests.get(data_url)
    logging.info(f'status code for response ~/api/v1/indicators/ is {response.status_code}')
    r=json.loads(response.content)

    '''r['data'] is a list of nested dictionaries of the below structure
    [{'id': 'DZA',
    'indicators': [{'id': 799,
        'values': {'1988': 6.3, '1995': 5.6, '2011': 0.4},
        'estimated': []}, ...'''

    final_data=[]
    #looping over each country data
    for c in r['data']:
        country=c['id']
        #looping over data to get metadata
        for m in c['indicators']:
            metadata=[m['id'],m['estimated']]
            metadata.append(country)
            #looping over data to get time series values and append to eat list above metadata
            country_data=[]
            for v in m['values'].items():
                datapoint=list(v)
                datapoint.extend(metadata)
                country_data.append(datapoint)

            final_data.extend(country_data)
                
    df_tc360=pd.DataFrame(final_data,columns=['Year','Value','indicator_id','estimated','Country'])
    logging.info(f'dataframe for time series created successfuly with  {len(df_tc360)} rows')

    # create a column for indicator name
    df_tc360['Indicator_name']=df_tc360['indicator_id'].map(ind)
    logging.info('Indicator_name created')
    df_tc360['country_name']=df_tc360['Country'].map(countries_tc360)
    logging.info('country_name created')
    df_tc360.to_excel('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/TC360_dataset.xlsx',index=False)
    logging.info('TC360_data.xlsx saved')

except Exception as e:
    logging.warning(f' !!! WARNING !!! {e}')

logging.info('///////////////////////////////////////////////////////\n')
#######################################################################


#### WorldBank WDI

In [6]:
try:
    logging.info('Getting WDI data')
    #get countries codes to add to the url
    df_countries=pd.read_csv('C:/Users/511232/Desktop/Worldwide Governance Indicators/WDI metadata/WDICountry.csv')
    df_countries=df_countries[df_countries['Filter']==1].copy()
    countries=dict(zip(df_countries['Country Code'],df_countries['Short Name']))

    #get indicator codes to add to the url
    df_ind=pd.read_csv('C:/Users/511232/Desktop/Worldwide Governance Indicators/WDI metadata/WDISeries.csv')
    df_ind=df_ind[df_ind['Filter']==1]
    ind=dict(zip(df_ind['Series Code'],df_ind['Indicator Name']))

    ########################################
    #concatenate the country and series codes
    country_codes=''
    i=0
    for k in countries.keys():
        if i==0:
            country_codes=country_codes+k
            i=1
        else:
            country_codes=country_codes+';'+k

    #############################################
    main_url='http://api.worldbank.org/v2/country/'+country_codes+'/indicator/'
    # df_WDI=pd.DataFrame()
    df_list=[]

    for series in ind.keys():
        logging.info(f'getting data for series {ind[series]}')
        page=1
        #loop through pages for each series code
        while page:
            url=main_url+series+f'?format=json&page={page}'

            #get the url for download
            response=requests.get(url)

            #check if valid data was returned
            if len(json.loads(response.content)[1])>0:
                #convert to dataframe and append to df_list
                df=pd.DataFrame.from_dict(json.loads(response.content)[1], orient='columns')
                df_list.append(df)
                page+=1
                total_pages=page
            else:
                page=False
                logging.info(f'{ind[series]} downloaded with total {total_pages} pages')
                logging.info('--------------------------------------')
    
    #concatenate the dataframes
    df_WDI=pd.concat(df_list)
    #extract Indicator, Type and Country
    df_WDI['Indicator']=df_WDI['indicator'].map(lambda x: x['value'].split(':')[0])

    def extract(x):
        if len(x['value'].split(':'))>1:
            return(x['value'].split(':')[1])
        else:
            return('')

    df_WDI['Type']=df_WDI['indicator'].map(extract)
    df_WDI['Country']=df_WDI['country'].map(lambda x: x['value'])
    df_WDI.drop(['country', 'indicator'], axis=1, inplace=True)
    logging.info('Indicator, Type and Country successfully extracted')

    #change PSE to State of Palestine
    df_WDI.loc[df_WDI['countryiso3code']=='PSE', 'Country']='State of Palestine'
    logging.info('Country changed from West Bank and Gaza to State of Palestine')

    #add Note regarding State of Palestine
    df_WDI.loc[df_WDI['countryiso3code']=='PSE', 'Note']='Only for West Bank and Gaza'
    logging.info('West bank and Gaza note is added')

    df_WDI.to_excel('C:/Users/511232/Desktop/Worldwide Governance Indicators/CODES/Datasets/WDI.xlsx')
    logging.info('WDI.xlsx saved')

except Exception as e:
    logging.warning(f'!!! WARNING !!!! {e}')
    raise e
    


### Add all the dataframes to a consolidated dataframe

In [None]:
#add the columns to cols_for_consolidated_df
cols_for_consolidated_df=[]

cols_for_consolidated_df.append(df_WGI_Arab.columns)



In [None]:
#adding the world average
Governance_dataset['Value']=pd.to_numeric(Governance_dataset['Value'])
world_avg=Governance_dataset.groupby(['Indicator', 'Year']).agg({'Value':'mean'}).reset_index()
world_avg['Country']='World'
world_avg
Governance_dataset=pd.concat([Governance_dataset,world_avg], axis=0)
logging.info('World averages added to the Governance_dataset')

#change WBG to PSE for State of Palestine
# Governance_dataset_Arab.loc[Governance_dataset_Arab['Code']=='WBG', 'Code']='PSE'
# logging.info('Code changed from WBG to PSE for State of Palestine')
Governance_dataset_Arab.loc[Governance_dataset_Arab['iso3']=='PSE', 'Country']='State of Palestine'
logging.info('Country/Territory changed from WBG to PSE for State of Palestine')

#add Note regarding State of Palestine
Governance_dataset_Arab.loc[Governance_dataset_Arab['iso3']=='PSE', 'Note']='Only for West Bank and Gaza'
logging.info('West bank and Gaza note is added')