#### bulk download World Bank Governance data

In [1]:
import pandas as pd
import numpy as np
import os
import time
import logging
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import openpyxl
import json
import io

# pd.set_option('display.max_colwidth', None)
#logger
logging.basicConfig(filename='WB governance.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

In [2]:
logging.info('#############getting data by downloading from https://info.worldbank.org/governance/wgi/#home ##############')

url='https://info.worldbank.org/governance/wgi/#home'

#get the url for download
response=requests.get(url)
logging.info(f'request status code for the main url {response.status_code}')
soup=BeautifulSoup(response.content, 'html.parser')

#getting the href for Download full dataset (Excel) link in the webpage
s1=soup.find_all("ul", {"class": "listItems"})

url=s1[1].find_all("a", {"class": "file-download"})[2]['href']
domain='https://info.worldbank.org'
#construct the url
bulk_download_url=domain+url
#to download the url
try:
    r=requests.get(bulk_download_url)
    logging.info(f'Download request is status code is {r.status_code}')
except Exception as e:
    logging.warning(e)

#write the raw data to excel
with open('WB governance.xlsx','wb') as f:
    try:
        f.write(r.content)
        logging.info('successfuly downloaded bulk excel file')
    except Exception as e:
        logging.warning(e)
logging.info('---------------------------------------------------')

In [3]:
Governance_dataset=pd.DataFrame()

#get the sheet names to loop over
wb=openpyxl.load_workbook("WB governance.xlsx")
sheetnames=wb.sheetnames[1:]

#counter to count rows of newly added dataframes
sheet_counter={}

for sh in sheetnames:
    logging.info(f'processing sheet {sh}')    
    df=pd.read_excel('WB governance.xlsx', sheet_name=sh)

    #EXTRACT THE DATAFRAME FROM UNSTRCUTURED EXCEL SHEET
    #get the dataframe filtered on idx at value = "Country/Territory"
    idx=df[df.iloc[:,0]=="Country/Territory"].index[0]
    df1=df.iloc[idx-1:, :].reset_index(drop=True)

    #zip the 1st 2 rows to make columns
    cols=tuple(zip(df1.iloc[0,:],df1.iloc[1,:]))
    #drop the 2 rows
    df1.drop(axis=0, index=[0,1], inplace=True)
    #add the cols as columns
    df1.columns=cols
    
    #add the sheet name as a column
    df1[sh]=sh

    #filter the dataframe on ['Country/Territory','Code','Estimate', 'Rank']
    cols_to_filter=[(a,b) for (a,b) in list(df1.columns) if (b in ['Country/Territory','Code','Estimate', 'Rank']) | (a==sh)]
    df1=df1.loc[:,cols_to_filter]

    #'Estimate', 'Rank' will be melted, so the melted dataframe must have double the row numbers of the unmelted dataframe. the number of  'Estimate', 'Rank' for timeseries = len(cols_to_filter) - 2 (remove 'Country/Territory','Code', sh)
    wide_df_rows= len(df1)
    logging.info(f'wide dataframe for {sh} has {wide_df_rows} rows') 
    n_wide=len(cols_to_filter)-3

    #RESHAPE THE DATAFRAME
    #choose 'Country/Territory','Code' and the sheetname col added at the end of the dataframe as id_vars (vars not to be reshaped) and the rest as value_vars (variables to reshape)
    id_variables=df1.columns.to_list()[:2]
    id_variables.append(df1.columns.to_list()[-1])

    df_melt=pd.melt(df1,
            id_vars=id_variables,
            value_vars=df1.columns.to_list()[2:-1],
            var_name=['Year','Type'])
            
    df_melt.columns=['Country/Territory','Code','Section','Year','Type','value']
    
    #check if melting went ok by checking the number of rows pre and post melting
    if len(df_melt)==wide_df_rows*n_wide:
        logging.info(f'sheet {sh} succesfully reshaped to long with {len(df_melt)} rows')
    else:
        logging.warning(f'WARNING!!!!!!!!! problem with melting {sh}, which has {len(df_melt)} rows whereas before melting it had {wide_df_rows} rows and n_wide was {n_wide}')

    #APPEND TO Governance_total DATAFRAME
    Governance_dataset=Governance_dataset.append(df_melt)
    logging.info(f'sheet {sh} is appended successfully')
    logging.info(f'Governance_total has {len(Governance_dataset)} rows')
    logging.info('---------------------------------------------------------')

Governance_dataset.to_excel('Governance_dataset.xlsx', index=False)
logging.info(f'Governance_dataset saved with {len(Governance_dataset)} rows')

#get countries codes
df_countries=pd.read_excel('Countries.xlsx')
df_countries=df_countries[(df_countries['Filter']==1)&(df_countries['dataset']=='Governance_dataset')]
countries=dict(zip(df_countries['iso3'],df_countries['Country']))

#filter on Arab countries
Governance_dataset_Arab=Governance_dataset[Governance_dataset['Country/Territory'].isin(countries.values())]
Governance_dataset_Arab.to_excel('Governance_dataset_Arab.xlsx', index=False)
logging.info(f'Governance_dataset_Arab saved with {len(Governance_dataset_Arab)} rows')

#change the code of West Bank and Gaza from WBG to PSE for State of Palestine
Governance_dataset_Arab['Code']=Governance_dataset_Arab['Code'].map({'WBG':'PSE'})
logging.info(f'{len(Governance_dataset_Arab["Code"][Governance_dataset_Arab["Code"]=="PSE"])} labels were changed from WBG to PSE')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Governance_dataset_Arab['Code']=Governance_dataset_Arab['Code'].map({'WBG':'PSE'})


#### ISPAR data

In [4]:
logging.info('#############getting data from ISPAR##############')

id={3:'Global Cybersecurity Index',
        4:'Global Gender Gap Index',
        5:'ICT Development Index',
        6:'Network Readiness Index',
        7:'Global Innovation Index',
        9:'E-Government Development Index',
        11:'Global Competitiveness Index',
        12:'E-Participation index',
        14:'AI Readiness Index',
        16:'Open Data Inventory Index'}


#get countries codes to add to the url
df_countries=pd.read_excel('Countries.xlsx')
df_countries=df_countries[(df_countries['Filter']==1)&(df_countries['dataset']=='ISPAR')]
# countries=dict(zip(df_countries['iso3'],df_countries['Country']))

url='https://datacatalog.unescwa.org/datastore/dump_v2?resource_id=e0d88222-f90c-4a92-8de4-397f6529c402&format=csv'

#get the url for download
response=requests.get(url)

try:
    if response.status_code!=200:
        logging.warning(f'!!!!!!WARNING!!!!!!status code is {response.status_code}')

    #read the response into pandas dataframe
    df = pd.read_csv(io.BytesIO(response.content))
    logging.info(f'dataframe returned with {len(df)} rows')

    #remove the 0.colname from column names
    df.columns=[i.replace("0.", "") for i in df.columns]

    #filter on Index ID and 'Country Name'
    df_filtered=df[(df['Index ID'].isin(id)) & (df['Country Name'].isin(df_countries['Country']))]

    df_filtered.to_excel('ISPAR_dataset.xlsx', index=False)
    logging.info(f'dataframe for Arab countries returned with {len(df_filtered)} rows')

    #goupby and aggregate as mean
    ispar_aggregated=df_filtered.groupby(['Index Name', 'Year']).aggregate({'Score':'mean'})
    ispar_aggregated.to_excel('ISPAR_AGGREGATED.xlsx')
    logging.info('groupby.aggregate() successful and saved as ISPAR_AGGREGATED.xlsx')

    # #to download to a csv
    # with open('ISPAR.csv', 'wb') as f:
    #     f.write(response.content)

except Exception as e:
    logging.warning(f'!!!WARNING!!!  {e}')

### Worldbank  TCdata360

In [5]:
try:

    logging.info('#############creating the url for API request for TCdata360##############')

    ind_url='https://tcdata360-backend.worldbank.org/api/v1/indicators/'
    #get the url for download
    response=requests.get(ind_url)
    logging.info(f'status code for response ~/api/v1/indicators/ is {response.status_code}')

    #get countries codes to add to the url
    df_countries=pd.read_excel('Countries.xlsx')
    df_countries=df_countries[(df_countries['Filter']==1)&(df_countries['dataset']=='TCdata360')]
    countries=dict(zip(df_countries['iso3'],df_countries['Country']))

    #get indicator codes to add to the url
    df_ind=pd.read_excel('indicators_TC360.xlsx')
    df_ind=df_ind[df_ind['Filter']==1]
    ind=dict(zip(df_ind['id'],df_ind['name']))

    main_data_url='https://tcdata360-backend.worldbank.org/api/v1/data?'

    # m_data_url='https://tcdata360-backend.worldbank.org/api/v1/data?countries=ARE%2CBHR%2CCOM%2CDJI%2CDZA%2CEGY%2CIRQ%2CJOR%2CKWT%2CLBN%2CLBY%2CMAR%2CMRT%2COMN%2CQAT%2CSAU%2CSDN%2CSOM%2CSYR%2CTUN%2CYEM%2CPSE&indicators=45410%2C45283%2C3294%2C1776%2C799%2C3500%2C960%2C3421%2C922%2C2073%2C27959%2C27962%2C40712%2C40711'

    cntry_url=''
    first_element=True
    for k,v in countries.items():
        if first_element:
            cntry_url=cntry_url+k
            first_element=False
        else:
            cntry_url=cntry_url+'%2C'+k


    ind_url=''
    first_element=True
    for k,v in ind.items():
        if first_element:
            ind_url=ind_url+str(k)
            first_element=False
        else:
            ind_url=ind_url+'%2C'+str(k)

    data_url=main_data_url+'countries='+cntry_url+'&'+'indicators='+ind_url
    logging.info(f'data_url constructed as {data_url}')

except Exception as e:
    logging.warning(f' !!! WARNING !!! {e}')


In [6]:
################get time series data #######################
try:
    logging.info('#############getting timeseries data for TCdata360##############')
    response=requests.get(data_url)
    logging.info(f'status code for response ~/api/v1/indicators/ is {response.status_code}')
    r=json.loads(response.content)

    '''r['data'] is a list of nested dictionaries of the below structure
    [{'id': 'DZA',
    'indicators': [{'id': 799,
        'values': {'1988': 6.3, '1995': 5.6, '2011': 0.4},
        'estimated': []}, ...'''

    final_data=[]
    #looping over each country data
    for c in r['data']:
        country=c['id']
        #looping over data to get metadata
        for m in c['indicators']:
            metadata=[m['id'],m['estimated']]
            metadata.append(country)
            #looping over data to get time series values and append to eat list above metadata
            country_data=[]
            for v in m['values'].items():
                datapoint=list(v)
                datapoint.extend(metadata)
                country_data.append(datapoint)

            final_data.extend(country_data)
                
    final_data_df=pd.DataFrame(final_data,columns=['Year','Value','indicator_id','estimated','Country'])
    logging.info(f'dataframe for time series created successfuly with  {len(final_data_df)} rows')

    # create a column for indicator name
    final_data_df['Indicator_name']=final_data_df['indicator_id'].map(ind)
    logging.info('Indicator_name created')
    final_data_df['country_name']=final_data_df['Country'].map(countries)
    logging.info('country_name created')
    final_data_df.to_excel('TC360_dataset.xlsx',index=False)
    logging.info('TC360_data.xlsx saved')

except Exception as e:
    logging.warning(f' !!! WARNING !!! {e}')

#######################################################################





#### WorldBank WDI

In [11]:
#get countries codes to add to the url
df_countries=pd.read_csv('C:/Users/511232/Desktop/Worldwide Governance Indicators/WDI metadata/WDICountry.csv')
df_countries=df_countries[df_countries['Filter']==1].copy()
countries=dict(zip(df_countries['Country Code'],df_countries['Short Name']))

#get indicator codes to add to the url
df_ind=pd.read_csv('C:/Users/511232/Desktop/Worldwide Governance Indicators/WDI metadata/WDISeries.csv')
df_ind=df_ind[df_ind['Filter']==1]
ind=dict(zip(df_ind['Series Code'],df_ind['Indicator Name']))

In [None]:
#concatenate the country and series codes
country_codes=''
i=0
for k in countries.keys():
    if i==0:
        country_codes=country_codes+k
        i=1
    else:
        country_codes=country_codes+';'+k

series_codes=''
i=0
for k in ind.keys():
    if i==0:
        series_codes=series_codes+k
        i=1
    else:
        series_codes=series_codes+';'+k

page=1
url='http://api.worldbank.org/v2/country/'+country_codes+'/indicator/'+series_codes+f'?format=json&page={page}'

url

In [114]:
# logging.info('#############creating the url for API request for WDI##############')
ind_url=url
#get the url for download
response=requests.get(ind_url)
print(response.status_code)

json.loads(response.content)

200


[{'message': [{'id': '120',
    'key': 'Invalid value',
    'value': 'The provided parameter value is not valid'}]}]

In [94]:
pd.DataFrame.from_dict(json.loads(response.content)[1], orient='columns')

In [115]:
url='http://api.worldbank.org/v2/country/ARB;ARE/indicator/CC.EST;DT.DOD.DECT.GN.ZS?format=json&page=1'
''
ind_url=url
#get the url for download
response=requests.get(ind_url)
print(response.status_code)

json.loads(response.content)

200


[{'message': [{'id': '120',
    'key': 'Invalid value',
    'value': 'The provided parameter value is not valid'}]}]