#### Bulk download of World Bank Governance (WGI) data

In [5]:
import pandas as pd
import numpy as np
import os
import time
import logging
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import openpyxl

# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.options.display.max_colwidth = None

In [13]:
# Logger: the log file is overwritten on every run and each message is timestamped.
logging.basicConfig(filename='WB governance.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

url='https://info.worldbank.org/governance/wgi/#home'

# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response=requests.get(url, timeout=60)
print(response.status_code)
# Fail here with a clear HTTP error instead of a confusing IndexError while
# parsing an error page below.
response.raise_for_status()

# Parse the landing page to locate the download link.
soup=BeautifulSoup(response.content, 'html.parser')

# The href of the "Download full dataset (Excel)" link is taken from the third
# "file-download" anchor inside the second "listItems" list of the page.
s1=soup.find_all("ul", {"class": "listItems"})

url=s1[1].find_all("a", {"class": "file-download"})[2]['href']
domain='https://info.worldbank.org'
# Construct the absolute download URL from the site domain and the link's href.
bulk_download_url=domain+url

# Download the workbook. A failed request (connection error, timeout or HTTP
# error status) is logged and re-raised so that nothing is written to disk and
# later cells do not silently continue with a missing or stale file.
try:
    r=requests.get(bulk_download_url, timeout=60)
    r.raise_for_status()
except requests.RequestException as e:
    logging.warning(e)
    raise
else:
    logging.info(f'Download request is sucessful {r.status_code}')

# Write the raw bytes to the Excel file. The file is only opened once the
# download has succeeded, so an existing copy is never truncated by a failed run.
try:
    with open('WB governance.xlsx','wb') as f:
        f.write(r.content)
except OSError as e:
    logging.warning(e)
    raise
else:
    logging.info('successfuly downloaded bulk excel file')

200


In [128]:
# Combined long-format table; stays an empty DataFrame when there is no sheet
# to process.
Governance_total=pd.DataFrame()

# Per-sheet long-format frames are collected here and concatenated once after
# the loop. DataFrame.append was removed in pandas 2.0 and copied the whole
# accumulated table on every iteration.
frames=[]
total_rows=0

# Get the sheet names to loop over (the first sheet of the workbook is skipped).
wb=openpyxl.load_workbook("WB governance.xlsx")
sheetnames=wb.sheetnames[1:]

# Open the workbook once for all reads instead of re-parsing it for every sheet.
with pd.ExcelFile('WB governance.xlsx') as xls:
    for sh in sheetnames:
        logging.info(f'processing sheet {sh}')
        df=pd.read_excel(xls, sheet_name=sh)

        # EXTRACT THE DATAFRAME FROM THE UNSTRUCTURED EXCEL SHEET
        # Locate the header row: the first row whose first cell is
        # "Country/Territory". The row directly above it supplies the first
        # level of the two-level column header, so it must exist.
        header_rows=df[df.iloc[:,0]=="Country/Territory"].index
        if len(header_rows)==0 or header_rows[0]<1:
            raise ValueError(
                f'sheet {sh}: no "Country/Territory" header row with a row above it was found')
        idx=header_rows[0]
        df1=df.iloc[idx-1:, :].reset_index(drop=True)
        # Add the sheet name as the last column. The label used here is
        # temporary: it is replaced when the header rows are applied below.
        df1['Section']=sh

        # Zip the first 2 rows into (upper, lower) column labels.
        cols=tuple(zip(df1.iloc[0,:],df1.iloc[1,:]))
        # Drop the 2 header rows from the data.
        df1=df1.drop(index=[0,1])
        # Apply the zipped labels as the column index.
        df1.columns=cols

        # RESHAPE THE DATAFRAME
        # id_vars (kept as-is): the first two columns plus the sheet-name column;
        # value_vars (reshaped to long): every column in between.
        all_columns=df1.columns.to_list()
        id_variables=all_columns[:2]
        id_variables.append(all_columns[-1])

        df_melt=pd.melt(df1,
                id_vars=id_variables,
                value_vars=all_columns[2:-1],
                var_name=['Year','Type'])

        df_melt.columns=['Country/Territory','Code','Section','Year','Type','value']
        logging.info(f'sheet {sh} succesfully reshaped to long with {len(df_melt)} rows')

        # COLLECT FOR THE Governance_total DATAFRAME
        frames.append(df_melt)
        total_rows+=len(df_melt)
        logging.info(f'sheet {sh} is appended successfully')
        logging.info(f'Governance_total has {total_rows} rows')
        logging.info('/////////////////////////////////////////////////////')

# pd.concat raises on an empty list, so only concatenate when sheets were read.
if frames:
    Governance_total=pd.concat(frames)


In [129]:
# Save the combined table to an Excel workbook, leaving out the row index.
Governance_total.to_excel(excel_writer='Governance_total.xlsx', index=False)