In [13]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read the datasets
loan_data = pd.read_csv('Train.csv')
metadata = pd.read_csv('economic_indicators.csv')

# Convert date columns to datetime
loan_data['disbursement_date'] = pd.to_datetime(loan_data['disbursement_date'])
loan_data['due_date'] = pd.to_datetime(loan_data['due_date'])

# Extract year and month from dates
loan_data['disbursement_year'] = loan_data['disbursement_date'].dt.year
loan_data['disbursement_month'] = loan_data['disbursement_date'].dt.month

# Calculate derived features
loan_data['loan_to_repay_ratio'] = loan_data['Total_Amount_to_Repay'] / loan_data['Total_Amount']
loan_data['funded_amount_ratio'] = loan_data['Amount_Funded_By_Lender'] / loan_data['Total_Amount']
loan_data['lender_portion_ratio'] = loan_data['Lender_portion_to_be_repaid'] / loan_data['Lender_portion_Funded']

# Convert categorical variables to dummy variables
#loan_data = pd.get_dummies(loan_data, columns=['loan_type', 'New_versus_Repeat', 'country_id'])
loan_data

Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,...,New_versus_Repeat,Amount_Funded_By_Lender,Lender_portion_Funded,Lender_portion_to_be_repaid,target,disbursement_year,disbursement_month,loan_to_repay_ratio,funded_amount_ratio,lender_portion_ratio
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,...,Repeat Loan,120.85,0.014305,121.0,0,2022,8,1.000000,0.014305,8458.485726
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,...,Repeat Loan,7768.50,0.300000,7794.0,0,2022,7,1.003244,0.300000,25980.000000
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,...,Repeat Loan,1380.00,0.200000,1428.0,0,2024,9,1.035072,0.200000,7140.000000
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,...,Repeat Loan,2687.40,0.300000,2770.0,0,2022,10,1.030699,0.300000,9233.333333
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,...,Repeat Loan,1369.20,0.300000,1418.0,0,2022,11,1.035933,0.300000,4726.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68649,ID_244559228408267278,244559,Kenya,228408,267278,Type_1,1460.0,1515.0,2022-07-30,2022-08-06,...,Repeat Loan,438.00,0.300000,455.0,0,2022,7,1.037671,0.300000,1516.666667
68650,ID_260062217784267278,260062,Kenya,217784,267278,Type_1,5029.0,5116.0,2022-07-16,2022-07-23,...,Repeat Loan,1508.70,0.300000,1535.0,0,2022,7,1.017300,0.300000,5116.666667
68651,ID_259137216701267278,259137,Kenya,216701,267278,Type_1,5289.0,5289.0,2022-07-15,2022-07-22,...,Repeat Loan,1586.70,0.300000,1587.0,0,2022,7,1.000000,0.300000,5290.000000
68652,ID_266801303201267278,266801,Kenya,303201,267278,Type_1,3334.0,3334.0,2022-11-23,2022-11-30,...,Repeat Loan,741.09,0.222283,741.0,0,2022,11,1.000000,0.222283,3333.595110


In [14]:
# Process metadata
# Get unique indicators
indicators = metadata['Indicator'].unique()
processed_dfs = {}

# Process each indicator separately
for indicator in indicators:
    indicator_df = metadata[metadata['Indicator'] == indicator].copy()
    
    # Get the latest available year for each country
    latest_year_data = indicator_df.melt(
        id_vars=['Country', 'Indicator'],
        value_vars=[col for col in indicator_df.columns if col.startswith('YR')],
        var_name='Year',
        value_name=indicator.replace(', ', '_').replace(' ', '_').replace('(', '').replace(')', '').lower()
    )
    
    latest_year_data['Year'] = latest_year_data['Year'].str.replace('YR', '').astype(int)
    latest_year_data = latest_year_data.sort_values('Year', ascending=False).groupby('Country').first()
    processed_dfs[indicator] = latest_year_data

# Merge all indicator dataframes
final_metadata = pd.concat([df[df.columns[-1]] for df in processed_dfs.values()], axis=1)
final_metadata = final_metadata.reset_index()
final_metadata

Unnamed: 0,Country,inflation_consumer_prices_annual_%,official_exchange_rate_lcu_per_us$_period_average,real_interest_rate_%,average_precipitation_in_depth_mm_per_year,deposit_interest_rate_%,lending_interest_rate_%,interest_rate_spread_lending_rate_minus_deposit_rate_%,fossil_fuel_energy_consumption_%_of_total,unemployment_rate
0,Cote d'Ivoire,4.387117,606.56975,6.246748,1348.0,6.49,5.14,-1.35,26.49233,2.403
1,Ghana,38.106966,11.020408,,1187.0,11.416667,,,52.54306,3.079
2,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682


In [15]:
# Merge loan data with metadata based on country
final_data = loan_data.merge(
    final_metadata,
    left_on='country_id',
    right_on='Country',
    how='left'
)
final_data


Unnamed: 0,ID,customer_id,country_id,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,...,Country,inflation_consumer_prices_annual_%,official_exchange_rate_lcu_per_us$_period_average,real_interest_rate_%,average_precipitation_in_depth_mm_per_year,deposit_interest_rate_%,lending_interest_rate_%,interest_rate_spread_lending_rate_minus_deposit_rate_%,fossil_fuel_energy_consumption_%_of_total,unemployment_rate
0,ID_266671248032267278,266671,Kenya,248032,267278,Type_1,8448.0,8448.0,2022-08-30,2022-09-06,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
1,ID_248919228515267278,248919,Kenya,228515,267278,Type_1,25895.0,25979.0,2022-07-30,2022-08-06,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
2,ID_308486370501251804,308486,Kenya,370501,251804,Type_7,6900.0,7142.0,2024-09-06,2024-09-13,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
3,ID_266004285009267278,266004,Kenya,285009,267278,Type_1,8958.0,9233.0,2022-10-20,2022-10-27,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
4,ID_253803305312267278,253803,Kenya,305312,267278,Type_1,4564.0,4728.0,2022-11-28,2022-12-05,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68649,ID_244559228408267278,244559,Kenya,228408,267278,Type_1,1460.0,1515.0,2022-07-30,2022-08-06,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
68650,ID_260062217784267278,260062,Kenya,217784,267278,Type_1,5029.0,5116.0,2022-07-16,2022-07-23,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
68651,ID_259137216701267278,259137,Kenya,216701,267278,Type_1,5289.0,5289.0,2022-07-15,2022-07-22,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682
68652,ID_266801303201267278,266801,Kenya,303201,267278,Type_1,3334.0,3334.0,2022-11-23,2022-11-30,...,Kenya,7.671396,139.846384,6.546517,630.0,9.16769,13.588502,4.420812,17.379573,5.682


In [None]:
# Drop unnecessary columns
columns_to_drop = ['ID', 'customer_id', 'tbl_loan_id', 'lender_id', 'disbursement_date', 
                  'due_date', 'Country']
final_data = final_data.drop(columns=columns_to_drop)

# Handle missing values
final_data = final_data.fillna(final_data.mean())

# Separate features and target
X = final_data.drop('target', axis=1)
y = final_data['target']

# Print shape of final datasets
print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Print feature names
print("\nFeature names:")
print(X.columns.tolist())

# Basic statistics of numerical columns
print("\nNumerical features statistics:")
print(X.describe())