In [2]:
# Importing basic libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Imports the pandas library for data handling and analysis.
import pandas as pd

# Folder in the public GitHub repository that holds the three Findex survey files.
FINDEX_BASE_URL = 'https://raw.githubusercontent.com/sitahlango-maker/Financial_Inclusion/main/Colab%20Notebooks/FinancialInclution/'


def load_findex(country):
    """Read one country's Findex microdata CSV from the public GitHub repository.

    Parameters
    ----------
    country : str
        Country name exactly as it appears in the file name
        (e.g. 'Kenya' -> 'Findex_Microdata_2025_Kenya.csv').

    Returns
    -------
    pandas.DataFrame
        The survey file as loaded by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(f'{FINDEX_BASE_URL}Findex_Microdata_2025_{country}.csv')


# Loads the Kenya, Tanzania and Uganda survey data files.
df_ken = load_findex('Kenya')
df_tza = load_findex('Tanzania')
df_uga = load_findex('Uganda')

# Prints the number of rows and columns for each country's survey file to confirm successful loading.
for label, frame in (("Kenya", df_ken), ("Tanzania", df_tza), ("Uganda", df_uga)):
    print(f"{label} shape:", frame.shape)

# Shows the first 15 column names of the Kenya file to check the structure.
print("\nKenya columns:", df_ken.columns.tolist()[:15], "...")

Kenya shape: (1000, 183)
Tanzania shape: (1000, 183)
Uganda shape: (1000, 183)

Kenya columns: ['year', 'economy', 'economycode', 'regionwb', 'pop_adult', 'wpid_random', 'wgt', 'female', 'age', 'educ', 'inc_q', 'emp_in', 'urbanicity', 'account_fin', 'account_mob'] ...


**Cleaning the initial Findex dataset**

In [None]:

#Basic cleaning & harmonisation of Findex files

def clean_findex(df, country_code, country_name,
                 extra_prefixes=('fin', 'con', 'fh'), max_extra=20):
    """Tag a Findex survey frame with its country and reduce it to the analysis columns.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw survey data for one country.
    country_code : str
        Value written to the new 'country_code' column on every row.
    country_name : str
        Value written to the new 'country' column on every row.
    extra_prefixes : tuple of str, optional
        Columns whose name starts with any of these prefixes are kept in
        addition to the fixed core list. Defaults to ('fin', 'con', 'fh').
    max_extra : int or None, optional
        Maximum number of prefix-matched columns to keep, taken in the frame's
        column order. ``None`` keeps all of them. Defaults to 20.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` restricted to the core columns that are present plus
        the selected prefix-matched columns, in that order.
    """
    out = df.copy()
    out['country'] = country_name
    out['country_code'] = country_code

    # Survey weight and the key target variables are coerced to numbers;
    # anything unparseable becomes NaN instead of raising.
    for col in ('wgt', 'account_mob', 'dig_account', 'anydigpayment'):
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors='coerce')

    core = ['country', 'country_code', 'wgt', 'female', 'age', 'educ', 'inc_q',
            'emp_in', 'urbanicity', 'account_fin', 'account_mob', 'dig_account',
            'anydigpayment', 'internet_use']

    # Non-string labels cannot match a prefix, so skip them rather than fail.
    prefixes = tuple(extra_prefixes)
    extras = [c for c in out.columns if isinstance(c, str) and c.startswith(prefixes)]
    if max_extra is not None:
        extras = extras[:max_extra]

    # dict.fromkeys drops repeats while preserving order, so a custom prefix
    # that also matches a core column cannot select that column twice.
    present = set(out.columns)
    keep = [c for c in dict.fromkeys(core + extras) if c in present]
    return out[keep]

# Apply the shared cleaning routine to each country's raw survey frame.
df_ken_clean = clean_findex(df_ken, 'KEN', 'Kenya')
df_tza_clean = clean_findex(df_tza, 'TZA', 'Tanzania')
df_uga_clean = clean_findex(df_uga, 'UGA', 'Uganda')

# Stack the three cleaned frames into one table with a fresh 0..n-1 index.
df_micro = pd.concat([df_ken_clean, df_tza_clean, df_uga_clean], ignore_index=True)

print("Combined microdata shape:", df_micro.shape)
print(df_micro['country'].value_counts())

# isna().mean() is a 0-1 fraction; scale by 100 so the numbers match the "(%)" label.
missing_pct = df_micro.isna().mean().mul(100).sort_values(ascending=False).head(12)
print("\nMissing values (%):\n", missing_pct)

**Adding and cleaning other datasets**

In [None]:
# Load & prepare country-level data
# Regulation (Mobile Money Regulatory Index)
import pandas as pd

df_reg = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FinancialInclution/Mobile_Money_Regulatory_Index_Database_2025_v2(Data).csv')

# Source columns to keep, in output order
reg_source_cols = [
    'Country', 'Year', 'Index', 'Consumer Protection', 'KYC Proportionality',
    'Entry-level transaction limits', 'Maximum transaction limits', 'Agent Eligibility'
]

# Short analysis names for the source columns ('Year' keeps its name)
reg_short_names = {
    'Country': 'country_name',
    'Index': 'reg_index',
    'Consumer Protection': 'reg_cons_prot',
    'KYC Proportionality': 'reg_kyc_prop',
    'Entry-level transaction limits': 'reg_entry_lim',
    'Maximum transaction limits': 'reg_max_lim',
    'Agent Eligibility': 'reg_agent_el'
}

# Manual country-name -> ISO3 lookup for the three study countries
reg_iso3_by_name = {
    'Kenya': 'KEN',
    'Tanzania': 'TZA',
    'Uganda': 'UGA'
}

df_reg = (
    # 2025 rows only, restricted to the columns of interest
    df_reg.loc[df_reg['Year'] == 2025, reg_source_cols]
    .rename(columns=reg_short_names)
    # Names outside the lookup map to NaN ...
    .assign(country_code=lambda d: d['country_name'].map(reg_iso3_by_name))
    # ... and are removed here, leaving only the three countries
    .loc[lambda d: d['country_code'].isin(['KEN', 'TZA', 'UGA'])]
)

list(df_reg.columns)


In [None]:
# Load the Mobile Money Deployment Tracker
df_deploy = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FinancialInclution/Mobile Money Deployment.csv')

# Keep only rows for the three countries.
# .copy() makes the filtered result an independent frame, so adding the
# 'launch_year' column below is not an assignment onto a slice of the
# original (which triggers SettingWithCopyWarning on pandas < 3).
df_deploy = df_deploy[df_deploy['Country ISO Code'].isin(['KEN', 'TZA', 'UGA'])].copy()

# Launch year as a number; values that cannot be parsed become NaN
df_deploy['launch_year'] = pd.to_numeric(df_deploy['Launch Year'], errors='coerce')

# Count number of providers (rows) per country, keyed by 'country_code' for merging
df_providers = (
    df_deploy.groupby('Country ISO Code').size()
    .reset_index(name='num_providers')
    .rename(columns={'Country ISO Code': 'country_code'})
)

# Launch year of the oldest provider (earliest service) per country
df_oldest = (
    df_deploy.groupby('Country ISO Code')['launch_year'].min()
    .reset_index(name='earliest_launch_year')
    .rename(columns={'Country ISO Code': 'country_code'})
)

df_deploy.columns.tolist()

In [None]:
# Load the Mobile Money Prevalence Index (MMPI) file
df_preval = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FinancialInclution/Mobile Money Prevalent Index-2020-23-Public(MMPI 2020-23).csv')

df_preval = (
    # Select the columns to be used
    df_preval[['Country', 'ISO3', 'Mobile Money Prevalence (2023)']]
    # Remove rows without a country code
    .dropna(subset=['ISO3'])
    # Short, clear names; an explicit mapping does not depend on column position
    .rename(columns={
        'Country': 'country_name',
        'ISO3': 'country_code',
        'Mobile Money Prevalence (2023)': 'mmpi_2023'
    })
)

# Keep only three countries (Kenya, Uganda and Tanzania)
df_preval = df_preval[df_preval['country_code'].isin(['KEN', 'TZA', 'UGA'])]

df_preval.columns.tolist()

**Combining the three GSMA datasets above with the original Findex dataset**

In [None]:
# Deployment summary per country: provider count joined with earliest launch year
df_deploy_info = pd.merge(df_providers, df_oldest, on='country_code', how='left')

# Country facts: prevalence as the base, then regulatory scores, then deployment summary.
# Left joins keep every country present in the prevalence table.
df_country_facts = (
    df_preval[['country_code', 'mmpi_2023']]
    .copy()
    .merge(
        df_reg[['country_code', 'reg_index', 'reg_cons_prot', 'reg_kyc_prop',
                'reg_entry_lim', 'reg_max_lim', 'reg_agent_el']],
        on='country_code',
        how='left',
    )
    .merge(df_deploy_info, on='country_code', how='left')
)

In [None]:
# Prepare the stacked survey microdata held in df_micro.
# The ISO3 code is re-derived from the country name so that it is guaranteed to be set.
survey_iso3_by_country = {
    'Kenya': 'KEN',
    'Tanzania': 'TZA',
    'Uganda': 'UGA'
}
df_micro['country_code'] = df_micro['country'].map(survey_iso3_by_country)

# Survey columns carried into the analysis table, in output order
keep_survey = [
    'country_code', 'female', 'age', 'educ', 'inc_q', 'urbanicity',
    'account_mob', 'dig_account', 'anydigpayment', 'internet_use', 'wgt'
]

# Independent copy, so later edits do not write back into df_micro
df_survey_clean = df_micro.loc[:, keep_survey].copy()

# Build one small per-country facts table from the country-level sources,
# with the prevalence table as the base
df_country_facts = df_preval[['country_code', 'mmpi_2023']].copy()

# Attach the regulatory scores
reg_scores = df_reg[['country_code', 'reg_index', 'reg_cons_prot', 'reg_kyc_prop',
                     'reg_entry_lim', 'reg_max_lim', 'reg_agent_el']]
df_country_facts = pd.merge(df_country_facts, reg_scores, on='country_code', how='left')

# Provider count and earliest launch year per country, keyed by 'country_code'
deploy_by_country = df_deploy.groupby('Country ISO Code')
iso_key_rename = {'Country ISO Code': 'country_code'}
df_providers = (
    deploy_by_country.size()
    .rename('num_providers')
    .reset_index()
    .rename(columns=iso_key_rename)
)
df_earliest = (
    deploy_by_country['launch_year'].min()
    .rename('earliest_launch')
    .reset_index()
    .rename(columns=iso_key_rename)
)

# Combine the two deployment summaries, then attach them to the facts table
df_deploy_info = pd.merge(df_providers, df_earliest, on='country_code', how='left')
df_country_facts = pd.merge(df_country_facts, df_deploy_info, on='country_code', how='left')


# Joining the country facts to every row of the survey data.
# validate='many_to_one' makes pandas raise a MergeError if df_country_facts
# holds more than one row for a country_code; without it such duplicates would
# silently multiply the survey rows.
df_final = df_survey_clean.merge(
    df_country_facts,
    on='country_code',
    how='left',
    validate='many_to_one'
)

# Checking the result
print("Final combined dataset shape:", df_final.shape)
print("First few rows:\n", df_final.head())

# isna().mean() is a 0-1 fraction; scale by 100 so the numbers match the "(%)" label.
missing_pct = df_final.isna().mean().mul(100).sort_values(ascending=False).head(10)
print("\nMissing values (%):\n", missing_pct)

# Saving the final file
df_final.to_parquet(
    '/content/drive/MyDrive/Colab Notebooks/FinancialInclution/final_combined_data.parquet',
    index=False
)