# Step 1: Expand Dataset with Additional Countries

**Goal**: Collect World Bank data for 16 additional countries, expanding the training dataset from 3 countries (42 rows) to 19 countries (up to 266 rows: 19 countries × 14 years)

**Strategy**:
- **High-Risk Countries (5)**: Countries with known corruption scandals, weak governance
- **Medium-Risk Countries (4)**: Countries with mixed governance scores
- **Low-Risk Countries (7)**: Stable, high-governance countries

**Timeframe**: 2010-2023 (same as baseline)

**Output**: Extended dataset saved to `data/raw/corruption_data_expanded.csv`


In [None]:
import wbdata
import pandas as pd
import datetime
import os

# Set the working directory to the project root so that relative paths such as
# 'data/raw/...' resolve the same way whether the notebook is launched from the
# project root or from somewhere inside the notebooks/ folder.
current_dir = os.getcwd()

# Walk upwards looking for a path component named exactly 'notebooks'.
# Comparing whole components (instead of substrings of the path) keeps folders
# such as 'notebooks-old' or 'my_notebooks' from triggering a move, and
# os.path.split uses the platform's own path separator.
probe_dir = current_dir
while True:
    parent_dir, leaf = os.path.split(probe_dir)
    if leaf == 'notebooks':
        # The folder that contains notebooks/ is treated as the project root.
        os.chdir(parent_dir)
        break
    if parent_dir == probe_dir:
        # Reached the filesystem root without finding a notebooks/ folder:
        # leave the working directory unchanged.
        break
    probe_dir = parent_dir

print(f"Working directory: {os.getcwd()}")


## Define Country Lists by Risk Category


In [None]:
# Countries whose data was already collected for the baseline dataset.
baseline_countries = {
    'CAN': 'Canada',       # low-risk control
    'MYS': 'Malaysia',     # high-risk (1MDB)
    'MOZ': 'Mozambique',   # high-risk (hidden debt)
}

# High-risk group: known corruption scandals, weak governance.
high_risk_countries = {
    'AGO': 'Angola',       # oil revenue corruption
    'VEN': 'Venezuela',    # PDVSA corruption, collapsing governance
    'ZWE': 'Zimbabwe',     # infrastructure project corruption
    'IRQ': 'Iraq',         # reconstruction fund corruption
    'UKR': 'Ukraine',      # pre-2014 development fund issues
}

# Medium-risk group: mixed governance scores, isolated incidents.
medium_risk_countries = {
    'BRA': 'Brazil',        # Lava Jato but stronger institutions
    'ZAF': 'South Africa',  # state capture but decent baseline
    'IND': 'India',         # mixed governance, large economy
    'PHL': 'Philippines',   # variable governance scores
}

# Low-risk group: stable, high-governance countries.
low_risk_countries = {
    'NOR': 'Norway',        # consistently top scores
    'DNK': 'Denmark',       # strong anti-corruption
    'SGP': 'Singapore',     # high effectiveness and rule of law
    'AUS': 'Australia',     # stable governance
    'NZL': 'New Zealand',   # clean governance record
    'CHE': 'Switzerland',   # strong institutions
    'DEU': 'Germany',       # solid governance throughout period
}

# Merge the three new groups (baseline excluded) into one code -> name mapping,
# keeping the high / medium / low ordering.
all_new_countries = {}
for _group in (high_risk_countries, medium_risk_countries, low_risk_countries):
    all_new_countries.update(_group)

# Report the size of each group, then the merged total and its codes.
for _label, _group in (
    ("Baseline", baseline_countries),
    ("High-risk", high_risk_countries),
    ("Medium-risk", medium_risk_countries),
    ("Low-risk", low_risk_countries),
):
    print(f"{_label} countries: {len(_group)}")
print(f"Total new countries: {len(all_new_countries)}")
print(f"\nAll new country codes: {list(all_new_countries)}")


## Define Indicators (Same as Baseline)


In [None]:
# World Bank indicator codes mapped to the column names used in the dataset
# (same indicators as the baseline notebook).

# governance indicators - these match table 1 from morgan's case study
governance_indicators = {
    'VA.EST': 'Voice_Accountability',
    'PV.EST': 'Political_Stability',
    'GE.EST': 'Government_Effectiveness',
    'RQ.EST': 'Regulatory_Quality',
    'RL.EST': 'Rule_of_Law',
    'CC.EST': 'Control_of_Corruption',
}

# economic indicators - useful for detecting financial patterns
economic_indicators = {
    'DT.DOD.DECT.GN.ZS': 'External_Debt_perc_GNI',
    'NY.GDP.MKTP.KD.ZG': 'GDP_Growth_annual_perc',
    'GC.XPN.TOTL.GD.ZS': 'Govt_Expenditure_perc_GDP',
    'BX.KLT.DINV.WD.GD.ZS': 'FDI_Inflows_perc_GDP',
    'SI.POV.DDAY': 'Poverty_Headcount_Ratio',
}

# Single mapping consumed by the fetch step: governance first, then economic.
indicators = {**governance_indicators, **economic_indicators}

print(f"Indicators to collect: {len(indicators)}")
print("  - Governance: 6")
print("  - Economic: 5")


## Load Baseline Data


In [None]:
# Read the previously collected baseline dataset if it is on disk; otherwise
# warn and fall back to an empty frame so later cells can test df_baseline.empty.
baseline_path = 'data/raw/corruption_data_baseline.csv'
if not os.path.exists(baseline_path):
    print(f"Warning: Baseline file not found at {baseline_path}")
    df_baseline = pd.DataFrame()
else:
    df_baseline = pd.read_csv(baseline_path)
    n_rows, n_cols = df_baseline.shape
    print(f"Loaded baseline data: {n_rows} rows, {n_cols} columns")
    print(f"Baseline countries: {df_baseline['Country'].unique()}")
    baseline_years = df_baseline['Year']
    print(f"Baseline years: {baseline_years.min()} to {baseline_years.max()}")


## Fetch Data for New Countries


In [None]:
# Date range: 2010-2023 (2024 is excluded, as in baseline processing).
# For annual data only the year of each endpoint is sent to the World Bank API
# and the range is inclusive at both ends, so the end date must fall in 2023;
# an end date of 2024-01-01 would also request 2024 and yield 15 rows per
# country instead of the expected 14.
data_range = (datetime.datetime(2010, 1, 1), datetime.datetime(2023, 12, 31))

# Country codes (ISO3) for the countries that are not part of the baseline
new_country_codes = list(all_new_countries.keys())

print(f"Fetching data for {len(new_country_codes)} new countries...")
print(f"Date range: {data_range[0].year}-{data_range[1].year}")
print(f"Country codes: {new_country_codes}")

# Fetch data from World Bank API. parse_dates=False keeps the date index as
# returned by wbdata rather than converting it to datetime objects.
try:
    df_new = wbdata.get_dataframe(indicators,
                                  country=new_country_codes,
                                  date=data_range,
                                  parse_dates=False)
    print(f"\n✓ Successfully fetched data!")
    print(f"  Shape: {df_new.shape[0]} rows, {df_new.shape[1]} columns")
except Exception as e:
    # Report the failure in the cell output, then re-raise so execution stops
    # here instead of continuing without df_new.
    print(f"\n✗ Error fetching data: {e}")
    raise


## Clean and Format New Data


In [None]:
# Flatten the fetched frame: the index becomes regular columns, and the
# 'country' / 'date' columns are renamed to Country / Year.
df_new = df_new.reset_index().rename(columns={'date': 'Year', 'country': 'Country'})

# Preferred layout (same as baseline): identifiers first, then the indicators
# in declaration order. Only columns that are actually present are kept.
column_order = ['Country', 'Year', *indicators.values()]
existing_columns = [name for name in column_order if name in df_new.columns]

# Select the columns, order rows by country then year, and renumber the index.
df_new = (
    df_new[existing_columns]
    .sort_values(by=['Country', 'Year'])
    .reset_index(drop=True)
)

n_rows, n_cols = df_new.shape
print(f"Cleaned data shape: {n_rows} rows, {n_cols} columns")
print("\nCountries in new data:")
print(df_new['Country'].unique())
year_values = df_new['Year']
print(f"\nYears: {year_values.min()} to {year_values.max()}")


## Inspect New Data Quality


In [None]:
# Per-column missing-value report: absolute count and share of all rows.
print("Missing values per column:")
missing_counts = df_new.isnull().sum()
missing_pct = (missing_counts / len(df_new)) * 100

missing_summary = pd.DataFrame({
    'Missing_Count': missing_counts,
    'Missing_Percent': missing_pct,
})

# Show only the columns that have gaps, worst first.
has_gaps = missing_summary['Missing_Count'] > 0
print(missing_summary.loc[has_gaps].sort_values('Missing_Count', ascending=False))

# Row count per country, listed alphabetically, compared with the 14 rows
# expected for 2010-2023.
print("\n" + "=" * 60)
print("Data availability by country (rows per country):")
country_counts = df_new['Country'].value_counts().sort_index()
print(country_counts)
print("\nExpected: 14 rows per country (2010-2023)")
n_complete = country_counts.eq(14).sum()
print(f"Countries with complete data: {n_complete} / {len(country_counts)}")


## Combine with Baseline Data


In [None]:
# Combine baseline and new data into a single frame, df_expanded.
if not df_baseline.empty:
    # Keep only the columns both frames share, in df_new's column order.
    # (A set intersection has no defined order, so the column layout of the
    # saved CSV could change from one run to the next.)
    common_columns = [col for col in df_new.columns if col in df_baseline.columns]
    df_baseline_subset = df_baseline[common_columns].copy()
    df_new_subset = df_new[common_columns].copy()

    # Give Year one dtype in both frames before concatenating. pd.read_csv
    # normally infers integers for a year column, while the preview cell of
    # this notebook compares Year against the string '2023'; mixing the two
    # yields an object column whose min()/max() raise TypeError and whose
    # equality filters miss half the rows. Strings are used because that is
    # the form the rest of the notebook compares against.
    df_baseline_subset['Year'] = df_baseline_subset['Year'].astype(str)
    df_new_subset['Year'] = df_new_subset['Year'].astype(str)

    # Stack baseline rows on top of the new rows with a fresh 0..n-1 index
    df_expanded = pd.concat([df_baseline_subset, df_new_subset], ignore_index=True)

    # Sort by country then year
    df_expanded = df_expanded.sort_values(by=['Country', 'Year']).reset_index(drop=True)

    print(f"Combined dataset:")
    print(f"  Total rows: {df_expanded.shape[0]}")
    print(f"  Total columns: {df_expanded.shape[1]}")
    print(f"  Total countries: {df_expanded['Country'].nunique()}")
    print(f"\nCountries in expanded dataset:")
    print(sorted(df_expanded['Country'].unique()))
else:
    # No baseline on disk: the expanded dataset is just the newly fetched data.
    print("No baseline data found, using only new data")
    df_expanded = df_new.copy()


## Save Expanded Dataset


In [None]:
# Create the output directory if it doesn't exist
os.makedirs('data/raw', exist_ok=True)

# Save expanded dataset (without the positional index)
output_path = 'data/raw/corruption_data_expanded.csv'
df_expanded.to_csv(output_path, index=False)

# Numeric view of Year used only for the report below. Year may hold strings,
# integers, or a mix of both (baseline rows read from CSV next to freshly
# fetched rows); calling min()/max() on such a mixed column raises TypeError.
# Unparseable values become <NA> instead of aborting the report.
years = pd.to_numeric(df_expanded['Year'], errors='coerce').astype('Int64')

print(f"✓ Saved expanded dataset to: {output_path}")
print(f"  Shape: {df_expanded.shape[0]} rows, {df_expanded.shape[1]} columns")
print(f"  Countries: {df_expanded['Country'].nunique()}")
print(f"  Years: {years.min()} to {years.max()}")

# Summary by country: number of rows with a usable year and the year span
print("\n" + "=" * 60)
print("Summary by country:")
summary = df_expanded.assign(Year=years).groupby('Country').agg({
    'Year': ['count', 'min', 'max']
})
summary.columns = ['Row_Count', 'Min_Year', 'Max_Year']
print(summary)


## Quick Preview of Governance Scores

Check that governance indicators are present and reasonable for new countries


In [None]:
# Governance indicator columns to preview
governance_cols = ['Voice_Accountability', 'Political_Stability', 'Government_Effectiveness',
                   'Regulatory_Quality', 'Rule_of_Law', 'Control_of_Corruption']

# Check average governance scores by country (for 2023 as example).
# Year is compared in string form so rows match whether the column holds
# strings ('2023'), integers (2023, e.g. rows that came from a CSV) or both;
# a plain == '2023' silently drops the integer rows.
print("Average governance scores by country (2023):")
df_2023 = df_expanded[df_expanded['Year'].astype(str) == '2023'].copy()
if not df_2023.empty:
    gov_summary = df_2023.groupby('Country')[governance_cols].mean().round(2)
    print(gov_summary)
else:
    print("No 2023 data available")

# Check a few sample countries from each risk category
print("\n" + "=" * 60)
print("Sample governance scores (2023) by risk category:")

# A category is previewed only when its first sample country is in the dataset.
sample_high = ['Angola', 'Venezuela', 'Zimbabwe'] if 'Angola' in df_expanded['Country'].values else []
sample_medium = ['Brazil', 'South Africa', 'India'] if 'Brazil' in df_expanded['Country'].values else []
sample_low = ['Norway', 'Denmark', 'Singapore'] if 'Norway' in df_expanded['Country'].values else []

country_names_2023 = df_2023['Country'].astype(str)

for country_list, category in [(sample_high, 'High-Risk'), (sample_medium, 'Medium-Risk'), (sample_low, 'Low-Risk')]:
    if not country_list:
        continue
    print(f"\n{category} countries:")
    for country in country_list:
        # Prefix match: World Bank display names can carry a suffix
        # (e.g. 'Venezuela, RB'), which an exact comparison would skip.
        country_data = df_2023[country_names_2023.str.startswith(country, na=False)]
        if country_data.empty:
            # Say so explicitly rather than leaving the country out silently.
            print(f"  {country}: no 2023 data")
            continue
        first_row = country_data.iloc[0]
        print(f"  {country}: Control_of_Corruption = {first_row['Control_of_Corruption']:.2f}, "
              f"Rule_of_Law = {first_row['Rule_of_Law']:.2f}")
