# World Bank API Exploration
## Data collection and baseline setup

goal: pull governance indicators and economic data from the World Bank API for our case-study countries

countries:
- Malaysia (MYS) - 1MDB scandal
- Mozambique (MOZ) - hidden debt crisis
- Canada (CAN) - control country

timeframe: 2010-2024

In [12]:
import wbdata
import pandas as pd
import datetime
import os

## setting up the query parameters

In [13]:
# world bank country codes: canada (control), malaysia, mozambique
countries = ["CAN", "MYS", "MOZ"]

# historical window to request, expressed as jan 1 of the first and last year
first_year, last_year = 2010, 2024
data_range = tuple(datetime.datetime(year, 1, 1) for year in (first_year, last_year))

In [14]:
# every indicator we want to pull, as {world bank indicator code: column name}
# governance indicators are the main focus, economic indicators give context

# governance indicators - these match table 1 from morgan's case study
governance_indicators = {
    'VA.EST': 'Voice_Accountability',
    'PV.EST': 'Political_Stability',
    'GE.EST': 'Government_Effectiveness',
    'RQ.EST': 'Regulatory_Quality',
    'RL.EST': 'Rule_of_Law',
    'CC.EST': 'Control_of_Corruption',
}

# economic indicators - useful for detecting financial patterns
economic_indicators = {
    'DT.DOD.DECT.GN.ZS': 'External_Debt_perc_GNI',
    'NY.GDP.MKTP.KD.ZG': 'GDP_Growth_annual_perc',
    'GC.XPN.TOTL.GD.ZS': 'Govt_Expenditure_perc_GDP',
    'BX.KLT.DINV.WD.GD.ZS': 'FDI_Inflows_perc_GDP',
    'SI.POV.DDAY': 'Poverty_Headcount_Ratio',
}

# combined mapping: governance entries first, then economic, same order as listed above
indicators = {**governance_indicators, **economic_indicators}

## fetching data from world bank api

In [15]:
# fetching data from api

# options for the query: which countries, which dates, and
# parse_dates=False so dates are kept as year strings
query_options = {
    'country': countries,
    'date': data_range,
    'parse_dates': False,
}

# get_dataframe pulls all indicators for the specified countries and dates
df = wbdata.get_dataframe(indicators, **query_options)


## cleaning and formatting the dataframe

In [16]:
# reset index so country and date become regular columns, then give them
# the capitalised names used in the rest of the notebook
df = df.reset_index().rename(columns={'date': 'Year', 'country': 'Country'})

# reorder columns for readability: identifiers first, then indicators in the
# order they were defined
column_order = ['Country', 'Year'] + list(indicators.values())
existing_columns = [col for col in column_order if col in df.columns]

# report anything expected but absent instead of dropping it silently;
# later cells select these columns by name and would fail with a KeyError
missing_columns = [col for col in column_order if col not in df.columns]
if missing_columns:
    print(f"warning: expected columns missing from dataframe: {missing_columns}")

df = df[existing_columns]

# sort by country then year
df = df.sort_values(by=['Country', 'Year']).reset_index(drop=True)

## inspecting the data

In [17]:
# quick size / coverage summary of the cleaned frame
n_rows, n_cols = df.shape
year_values = df['Year']

print(f"shape: {n_rows} rows, {n_cols} columns")
print(f"years covered: {year_values.min()} to {year_values.max()}")
print("\ncolumns in dataset:")
print(df.columns.tolist())

shape: 45 rows, 13 columns
years covered: 2010 to 2024

columns in dataset:
['Country', 'Year', 'Voice_Accountability', 'Political_Stability', 'Government_Effectiveness', 'Regulatory_Quality', 'Rule_of_Law', 'Control_of_Corruption', 'External_Debt_perc_GNI', 'GDP_Growth_annual_perc', 'Govt_Expenditure_perc_GDP', 'FDI_Inflows_perc_GDP', 'Poverty_Headcount_Ratio']


In [18]:
# count nulls per column once, then report them as counts and as a share of all rows
null_counts = df.isnull().sum()
null_percent = null_counts / len(df) * 100

print("missing values per column:")
print(null_counts)
print("\nmissing data percentage:")
print(round(null_percent, 2))

missing values per column:
Country                       0
Year                          0
Voice_Accountability          3
Political_Stability           3
Government_Effectiveness      3
Regulatory_Quality            3
Rule_of_Law                   3
Control_of_Corruption         3
External_Debt_perc_GNI       31
GDP_Growth_annual_perc        0
Govt_Expenditure_perc_GDP     7
FDI_Inflows_perc_GDP          0
Poverty_Headcount_Ratio      25
dtype: int64

missing data percentage:
Country                       0.00
Year                          0.00
Voice_Accountability          6.67
Political_Stability           6.67
Government_Effectiveness      6.67
Regulatory_Quality            6.67
Rule_of_Law                   6.67
Control_of_Corruption         6.67
External_Debt_perc_GNI       68.89
GDP_Growth_annual_perc        0.00
Govt_Expenditure_perc_GDP    15.56
FDI_Inflows_perc_GDP          0.00
Poverty_Headcount_Ratio      55.56
dtype: float64


In [19]:
# show the top of the table; the frame is the last expression of the cell,
# so the notebook renders it below the printed label
print("first 15 rows:")
df.head(n=15)

first 15 rows:


Unnamed: 0,Country,Year,Voice_Accountability,Political_Stability,Government_Effectiveness,Regulatory_Quality,Rule_of_Law,Control_of_Corruption,External_Debt_perc_GNI,GDP_Growth_annual_perc,Govt_Expenditure_perc_GDP,FDI_Inflows_perc_GDP,Poverty_Headcount_Ratio
0,Canada,2010,1.352659,0.936318,1.777827,1.69343,1.79859,2.061873,,3.090806,19.084707,1.837256,0.2
1,Canada,2011,1.380145,1.077176,1.772545,1.68484,1.72712,1.971133,,3.137194,17.850268,2.137833,0.2
2,Canada,2012,1.437505,1.113016,1.75697,1.707195,1.756421,1.918904,,1.755661,17.51752,2.700169,0.2
3,Canada,2013,1.45344,1.061422,1.780741,1.729891,1.747508,1.879378,,2.325814,17.084882,3.629804,0.5
4,Canada,2014,1.412332,1.175504,1.753718,1.838725,1.886297,1.832193,,2.873467,16.40205,3.553903,0.2
5,Canada,2015,1.467299,1.262337,1.730935,1.706058,1.807141,1.84565,,0.649971,17.059779,3.853895,0.5
6,Canada,2016,1.445611,1.240412,1.744541,1.727414,1.800915,1.944466,,1.038551,17.498604,2.23835,0.5
7,Canada,2017,1.478084,1.089681,1.815573,1.879656,1.763439,1.881446,,3.033835,17.606595,1.537521,0.5
8,Canada,2018,1.502411,0.963971,1.675134,1.69942,1.715142,1.790208,,2.742963,17.5409,2.469312,0.2
9,Canada,2019,1.430308,0.994934,1.697311,1.710002,1.719776,1.729897,,1.908432,18.10571,2.806767,0.2


## saving to data/raw/

In [20]:
# where the baseline dataset is written
output_path = '../data/raw/corruption_data_baseline.csv'

# make sure the target directory is there (no error if it already exists)
os.makedirs('../data/raw', exist_ok=True)

# write the dataframe without its integer row index
df.to_csv(output_path, index=False)

print(f"saved to: {output_path}")

saved to: ../data/raw/corruption_data_baseline.csv


## quick analysis of governance indicators

checking how Malaysia and Mozambique compare with Canada across the six governance indicators

In [21]:
# identifier columns plus the six governance score columns
id_cols = ['Country', 'Year']
score_cols = ['Voice_Accountability', 'Political_Stability',
              'Government_Effectiveness', 'Regulatory_Quality',
              'Rule_of_Law', 'Control_of_Corruption']
governance_cols = id_cols + score_cols

# governance-only view of the dataset
gov_df = df[governance_cols]

# mean of each score per country, rounded to two decimals
country_means = gov_df.groupby('Country')[score_cols].mean().round(2)

print("average governance scores by country (2010-2024):")
print(country_means)

average governance scores by country (2010-2024):
            Voice_Accountability  Political_Stability  \
Country                                                 
Canada                      1.44                 1.04   
Malaysia                   -0.26                 0.14   
Mozambique                 -0.40                -0.62   

            Government_Effectiveness  Regulatory_Quality  Rule_of_Law  \
Country                                                                 
Canada                          1.70                1.71         1.71   
Malaysia                        0.96                0.63         0.46   
Mozambique                     -0.78               -0.64        -0.90   

            Control_of_Corruption  
Country                            
Canada                       1.81  
Malaysia                     0.19  
Mozambique                  -0.73  


In [22]:
# the three years reported in table 1 (2013, 2018, 2023)
# kept as strings because the dates were fetched as year strings
key_years = ['2013', '2018', '2023']

# rows for those years only, ordered by year and then country
in_key_years = gov_df['Year'].isin(key_years)
key_year_scores = gov_df[in_key_years].sort_values(['Year', 'Country'])

print("governance scores for key years (matching table 1):")
print(key_year_scores)

governance scores for key years (matching table 1):
       Country  Year  Voice_Accountability  Political_Stability  \
3       Canada  2013              1.453440             1.061422   
18    Malaysia  2013             -0.339791             0.051792   
33  Mozambique  2013             -0.256402            -0.226966   
8       Canada  2018              1.502411             0.963971   
23    Malaysia  2018             -0.099501             0.248114   
38  Mozambique  2018             -0.484721            -0.833230   
13      Canada  2023              1.479646             0.822421   
28    Malaysia  2023              0.087619             0.168515   
43  Mozambique  2023             -0.593393            -1.268691   

    Government_Effectiveness  Regulatory_Quality  Rule_of_Law  \
3                   1.780741            1.729891     1.747508   
18                  0.993432            0.567808     0.341338   
33                 -0.635946           -0.417519    -0.820203   
8                