Team Gordon

Student Name	Student Number
 Alisha Sahota	20497348
 Anthony Ramelo	20499391
 Chris Wu	10182394
 Elizabeth Zhang	20161231
 Emily Zhao	10096273
 Sam Hossain	20466500


In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Set plot style
# Applied globally, so every figure saved by the functions below shares the 'whitegrid' look.
sns.set(style='whitegrid')

# Constants
# Folder that receives every artifact this script writes (plots and Excel files).
# It is created as soon as this cell/module runs so later writes cannot fail on a missing folder.
OUTPUT_DIR = 'output_repayment_risk'
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Calendar year captured once at load time; create_financial_features uses it to derive 'Age'.
CURRENT_YEAR = datetime.now().year

# Functions

def load_data(file_path, sheet_name):
    """
    Read one worksheet of an Excel workbook into a DataFrame.

    Leading and trailing whitespace is removed from every column label
    before the frame is returned.
    """
    frame = pd.read_excel(file_path, sheet_name=sheet_name)
    cleaned_labels = frame.columns.str.strip()
    frame.columns = cleaned_labels
    return frame

def ensure_column_exists(df, column_name, alternative_names=None):
    """
    Guarantee that `df` has a column called `column_name`.

    If the column is missing, the first entry of `alternative_names` that is
    present in `df` is renamed (in place) to `column_name`.

    Returns:
        `column_name`, once the column is known to exist.

    Raises:
        ValueError: neither `column_name` nor any alternative is a column of `df`.
    """
    if column_name not in df.columns:
        for alias in alternative_names or ():
            if alias in df.columns:
                # Rename on the caller's frame so later lookups use the canonical name.
                df.rename(columns={alias: column_name}, inplace=True)
                break
        else:
            # Loop finished without a match (or there were no alternatives at all).
            raise ValueError(f"Column '{column_name}' or alternatives {alternative_names} not found.")
    return column_name

def create_financial_features(df):
    """
    Create financial features for analysis.

    Adds 'Debt_to_Income_Ratio', 'Loan_to_Income_Ratio',
    'Monthly_Repayment_Burden', 'Age', 'Elapsed_Months' and 'Remaining_Tenure'
    to `df`, converts 'Date of Birth' and 'Disbursement Date' to datetimes
    (unparseable values become NaT), and finally replaces every infinite or
    missing value in the whole frame with 0.

    The frame is modified in place and also returned.
    """
    income_col = 'Qualified / Verified\nIncome'
    term_col = 'Loan Term (Months)'

    # Avoid division by zero. Assign the result back instead of calling
    # replace(..., inplace=True) on a selected column: that chained in-place
    # form no longer updates `df` when pandas Copy-on-Write is active.
    df[income_col] = df[income_col].replace(0, np.nan)
    df[term_col] = df[term_col].replace(0, np.nan)

    # Financial Ratios
    df['Debt_to_Income_Ratio'] = df['Outstanding Principal'] / df[income_col]
    df['Loan_to_Income_Ratio'] = df['Loan Amount'] / df[income_col]
    df['Monthly_Repayment_Burden'] = df['Outstanding Balance'] / df[term_col]

    # Date Calculations
    df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')
    # Year difference only; month and day of birth are not taken into account.
    df['Age'] = CURRENT_YEAR - df['Date of Birth'].dt.year

    df['Disbursement Date'] = pd.to_datetime(df['Disbursement Date'], errors='coerce')
    # Whole 30-day periods since disbursement, used as an approximation of months.
    now = datetime.now()
    df['Elapsed_Months'] = (now - df['Disbursement Date']).dt.days // 30
    df['Remaining_Tenure'] = df[term_col] - df['Elapsed_Months']

    # Handle infinite and missing values.
    # NOTE(review): this fills *every* column, so the zero incomes/terms blanked
    # above end up as 0 again and missing dates/text also become 0.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    return df

def validate_features(df):
    """
    Inspect pairwise correlations between the float64/int64 columns of `df`.

    Saves a correlation heatmap to OUTPUT_DIR and returns the set of column
    names whose absolute correlation with an earlier column exceeds 0.9.
    """
    corr = df.select_dtypes(include=['float64', 'int64']).corr()

    # Render and store the heatmap.
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, annot=False, fmt=".2f", cmap="coolwarm", cbar=True)
    plt.title("Feature Correlation Heatmap")
    plt.tight_layout()
    heatmap_path = os.path.join(OUTPUT_DIR, 'repayment_correlation_heatmap.png')
    plt.savefig(heatmap_path)
    plt.close()

    # Keep only the cells strictly above the diagonal so each pair is looked at
    # once; the later column of a highly correlated pair is the one reported.
    above_diagonal = np.triu(np.ones(corr.shape), k=1).astype(bool)
    upper = corr.where(above_diagonal)
    redundant_features = {
        name for name in upper.columns if (upper[name].abs() > 0.9).any()
    }

    print(f"Redundant features (correlation > 0.9): {redundant_features}")
    return redundant_features

def merge_transunion_data(df, trans_union_path, sheet_name='TransUnion Data'):
    """
    Left-join `df` with the TransUnion worksheet on df['ID'] == loan_id.

    Both key columns are cast to str first (df['ID'] is converted in place on
    the caller's frame). Every row of `df` is kept; the merged frame is returned.
    """
    bureau = pd.read_excel(trans_union_path, sheet_name=sheet_name)

    # Compare keys as text so the join does not depend on how each sheet typed them.
    df['ID'] = df['ID'].astype(str)
    bureau['loan_id'] = bureau['loan_id'].astype(str)

    return df.merge(bureau, how='left', left_on='ID', right_on='loan_id')

def create_credit_features(df):
    """
    Create additional credit-related features for analysis.

    Adds 'Total_Debt', 'Total_Credit_Limit', 'Debt_to_Credit_Ratio',
    'Revolving_Utilization' and 'Installment_Utilization' to `df`. A ratio
    whose denominator is zero or missing is reported as 0. Afterwards every
    infinite or missing value in the whole frame is replaced with 0.

    The frame is modified in place and also returned.
    """
    # Avoid division by zero. The guard is applied to the denominators only:
    # calling replace(..., inplace=True) on df[[...]] would act on a temporary
    # copy and change nothing, and blanking the limit columns themselves would
    # turn Total_Credit_Limit into NaN whenever just one of the limits is zero.
    revolving_limit = df['revolving_credit_limit'].replace(0, np.nan)
    instalment_limit = df['instalment_credit_limit'].replace(0, np.nan)

    df['Total_Debt'] = df['revolving_credit_balance'] + df['instalment_credit_balance']
    df['Total_Credit_Limit'] = df['revolving_credit_limit'] + df['instalment_credit_limit']
    total_limit = df['Total_Credit_Limit'].replace(0, np.nan)

    df['Debt_to_Credit_Ratio'] = df['Total_Debt'] / total_limit
    df['Revolving_Utilization'] = df['revolving_credit_balance'] / revolving_limit
    df['Installment_Utilization'] = df['instalment_credit_balance'] / instalment_limit

    # Undefined ratios (and any other gaps, e.g. rows without a bureau match) become 0.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    return df

def derive_repayment_risk(df, threshold=0.8):
    """
    Derive 'Repayment_Risk' feature.

    A row is flagged 1 when its 'Debt_to_Credit_Ratio' is strictly greater
    than `threshold`, otherwise 0 (a missing ratio is never flagged).

    Args:
        df: frame containing a 'Debt_to_Credit_Ratio' column; modified in place.
        threshold: cut-off applied to the ratio. Defaults to 0.8.

    Returns:
        The same frame with the integer 'Repayment_Risk' column added.
    """
    # NOTE(review): the label is a pure function of 'Debt_to_Credit_Ratio', which
    # is also listed among the model features in train_repayment_risk_model, so
    # the classifiers can recover the label from that input. Confirm this is
    # intended.
    df['Repayment_Risk'] = df['Debt_to_Credit_Ratio'].gt(threshold).astype(int)
    return df

def validate_credit_features(df):
    """
    Run the correlation check on a frame that already holds the credit features.

    Delegates entirely to validate_features and returns its set of redundant
    column names.
    """
    return validate_features(df)

def preprocess_data(df, columns_to_remove, target_column='Repayment_Risk'):
    """
    Preprocess data: remove columns, handle missing values, encode categorical variables, and scale features.

    Args:
        df: frame to preprocess. Column removal happens in place on this
            object, so pass a copy if the original must stay intact.
        columns_to_remove: column names to drop; names not present are ignored.
        target_column: name of the label column. It is passed through
            untouched (not imputed, encoded or scaled). Ignored if absent.

    Returns:
        A new frame with median-imputed and standardized float64/int64
        columns and one-hot encoded object columns (first level dropped).
    """
    # Remove unnecessary columns
    df.drop(columns=[col for col in columns_to_remove if col in df.columns], inplace=True)

    # Drop columns with >50% missing values
    df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)

    # Handle missing values.
    # The label is excluded from both lists: an int64 0/1 label would otherwise
    # be treated as a numeric feature and standardized, leaving non-integer
    # values that are no longer usable as class labels.
    numeric_cols = [
        col for col in df.select_dtypes(include=['float64', 'int64']).columns
        if col != target_column
    ]
    categorical_cols = [
        col for col in df.select_dtypes(include=['object']).columns
        if col != target_column
    ]

    if numeric_cols:
        imputer_num = SimpleImputer(strategy='median')
        df[numeric_cols] = imputer_num.fit_transform(df[numeric_cols])

    if categorical_cols:
        imputer_cat = SimpleImputer(strategy='most_frequent')
        df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

    # Encode categorical variables
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Scale numerical features (skipped when none remain, since the scaler
    # rejects an empty feature matrix).
    if numeric_cols:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

def clean_data(df):
    """
    Cleans the dataset by replacing infinities and NaN values with the median.

    Infinite values are first treated as missing; each numeric column's gaps
    are then filled with that column's median (computed without the
    infinities). Non-numeric columns are left as they are.

    Returns:
        A new DataFrame. The input is not modified, so it is safe to pass a
        slice of another frame.
    """
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    # numeric_only avoids a TypeError when the frame carries text columns.
    return without_inf.fillna(without_inf.median(numeric_only=True))

def create_model_dataset(df):
    """
    Create dataset for repayment risk prediction.

    Writes 'repayment_model_data.xlsx' into OUTPUT_DIR containing the 'ID'
    column (when `df` has one), whichever of the candidate model features are
    present in `df`, and the 'Repayment_Risk' label. If the label column is
    missing, a warning is printed and nothing is written.
    """
    if 'Repayment_Risk' not in df.columns:
        print("Warning: 'Repayment_Risk' not found. Repayment model data not created.")
        return

    repayment_features = [
        'Debt_to_Credit_Ratio', 'Revolving_Utilization', 'Installment_Utilization',
        'Total_Debt', 'Total_Credit_Limit', 'fico_score', 'count_of_inquiries',
        'Debt_to_Income_Ratio', 'Monthly_Repayment_Burden', 'Loan_to_Income_Ratio'
    ]
    repayment_features = [feat for feat in repayment_features if feat in df.columns]

    # 'ID' is optional: the frame handed over by main() has been through
    # preprocess_data, which is asked to remove 'ID', so selecting it
    # unconditionally raises KeyError.
    id_columns = ['ID'] if 'ID' in df.columns else []

    repayment_data = df[id_columns + repayment_features + ['Repayment_Risk']]
    output_path = os.path.join(OUTPUT_DIR, 'repayment_model_data.xlsx')
    repayment_data.to_excel(output_path, index=False)
    print(f"Repayment model data saved to '{output_path}'.")

def train_repayment_risk_model():
    """
    Train models to predict repayment risk and evaluate their performance.

    Reads 'repayment_model_data.xlsx' from OUTPUT_DIR, makes a stratified
    70/30 train/test split, oversamples the training part with SMOTE,
    standardizes the features, and fits a Random Forest and an XGBoost
    classifier on the same scaled training matrix. For each model a
    classification report and the ROC AUC on the test part are printed.

    Also writes to OUTPUT_DIR:
        - 'repayment_feature_importances.png' (Random Forest importances)
        - 'repayment_predictions.xlsx' (actual label and both predictions)
    """
    # Load the data
    data_path = os.path.join(OUTPUT_DIR, 'repayment_model_data.xlsx')
    data = pd.read_excel(data_path)

    # Define features and target
    repayment_features = [
        'Debt_to_Credit_Ratio', 'Revolving_Utilization', 'Installment_Utilization',
        'Total_Debt', 'Total_Credit_Limit', 'fico_score', 'count_of_inquiries',
        'Debt_to_Income_Ratio', 'Monthly_Repayment_Burden', 'Loan_to_Income_Ratio'
    ]
    repayment_features = [feat for feat in repayment_features if feat in data.columns]
    # Copy so the cleaning step never operates on a slice of `data`.
    X = data[repayment_features].copy()
    y = data['Repayment_Risk']

    # Ensure no NaN or infinite values
    X = clean_data(X)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Handle class imbalance using SMOTE (training part only)
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Feature Scaling: fitted on the training part, then applied to the test part
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    # Train Random Forest
    print("\nTraining Random Forest for Repayment Risk...")
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_scaled, y_train_resampled)
    y_pred_rf = rf.predict(X_test_scaled)
    print("\nRandom Forest Report:")
    print(classification_report(y_test, y_pred_rf))
    print(f"AUC (Random Forest): {roc_auc_score(y_test, rf.predict_proba(X_test_scaled)[:, 1]):.2f}")

    # Train XGBoost
    print("\nTraining XGBoost for Repayment Risk...")
    xgb = XGBClassifier(random_state=42, eval_metric='logloss')
    # Fit on the scaled matrix: the model is evaluated on X_test_scaled, so
    # training on the unscaled frame would put train and test on different scales.
    xgb.fit(X_train_scaled, y_train_resampled)
    y_pred_xgb = xgb.predict(X_test_scaled)
    print("\nXGBoost Report:")
    print(classification_report(y_test, y_pred_xgb))
    print(f"AUC (XGBoost): {roc_auc_score(y_test, xgb.predict_proba(X_test_scaled)[:, 1]):.2f}")

    # Feature importance for Random Forest
    feature_importances = pd.Series(rf.feature_importances_, index=repayment_features).sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    feature_importances.plot(kind='bar', color='skyblue')
    plt.title('Feature Importances (Random Forest) - Repayment Risk')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'repayment_feature_importances.png'))
    plt.close()

    # Save predictions for analysis
    results = pd.DataFrame({
        'Actual': y_test,
        'Random Forest': y_pred_rf,
        'XGBoost': y_pred_xgb
    })
    predictions_output_path = os.path.join(OUTPUT_DIR, 'repayment_predictions.xlsx')
    results.to_excel(predictions_output_path, index=False)
    print(f"Repayment risk predictions saved to '{predictions_output_path}'.")

# Main Execution

def main():
    """
    Run the repayment-risk pipeline end to end.

    Steps: load the financial sheet, engineer financial features, prune highly
    correlated columns, merge the TransUnion sheet, engineer credit features,
    prune again, derive the 'Repayment_Risk' label, preprocess, save the
    processed data, build the model dataset and train/evaluate the models.
    Input file names are hard-coded below; all outputs go to OUTPUT_DIR.
    """
    file_path = 'Data.xlsx'  # Update with your data file path
    sheet_name = 'Financial Data'  # Update with your sheet name

    # Load data
    df = load_data(file_path, sheet_name)

    # Ensure necessary columns exist
    required_columns = ['ID', 'Outstanding Principal', 'Qualified / Verified\nIncome', 'Loan Amount', 'Outstanding Balance', 'Loan Term (Months)']
    for col in required_columns:
        ensure_column_exists(df, col)

    # Create financial features
    df = create_financial_features(df)

    # Validate features and remove redundant ones.
    # 'ID' is the merge key for the next step, so it is never pruned even if
    # it happens to correlate with another numeric column.
    redundant_features = set(validate_features(df)) - {'ID'}
    if redundant_features:
        df.drop(columns=list(redundant_features), inplace=True)
        print("Dropped redundant features.")

    # Merge with TransUnion data
    trans_union_path = 'TransUnion_Data.xlsx'  # Update with your TransUnion data file path
    df_merged = merge_transunion_data(df, trans_union_path)

    # Create credit features
    df_merged = create_credit_features(df_merged)

    # Validate credit features and remove redundant ones.
    # 'Debt_to_Credit_Ratio' is kept regardless: the label is derived from it
    # in the next step, so pruning it would raise KeyError there.
    redundant_features = set(validate_credit_features(df_merged)) - {'ID', 'Debt_to_Credit_Ratio'}
    if redundant_features:
        df_merged.drop(columns=list(redundant_features), inplace=True)
        print("Dropped redundant credit features.")

    # Derive repayment risk
    df_merged = derive_repayment_risk(df_merged)

    # Preprocess data
    columns_to_remove = [
        'ID', 'Gender', 'Date of Birth', 'Disbursement Date', 'loan_id',
        'Loan Amount', 'Outstanding Balance', 'Outstanding Principal',
        'Loan Term (Months)', 'Stated Income on application',
        'Qualified / Verified\nIncome', 'Revolving_Utilization', 'Installment_Utilization'
    ]
    df_encoded = preprocess_data(df_merged.copy(), columns_to_remove)

    # Preprocessing keeps the row index, so the raw 0/1 label can be put back
    # by index. This guarantees the classifiers receive class labels rather
    # than a standardized numeric column.
    df_encoded['Repayment_Risk'] = df_merged['Repayment_Risk']

    # Save processed data
    processed_output_path = os.path.join(OUTPUT_DIR, 'processed_repayment_data.xlsx')
    df_encoded.to_excel(processed_output_path, index=False)
    print(f"Processed repayment risk data saved to '{processed_output_path}'.")

    # Create dataset for model.
    # 'ID' was removed during preprocessing (it must not be encoded as a
    # feature) but the model dataset selects it, so re-attach it here.
    df_encoded['ID'] = df_merged['ID']
    create_model_dataset(df_encoded)

    # Train and evaluate the repayment risk model
    train_repayment_risk_model()

if __name__ == '__main__':
    main()

Processed engagement data saved to 'output_engagement/processed_engagement_data.xlsx'.
Engagement model data saved to 'output_engagement/engagement_model_data.xlsx'.

Training Logistic Regression for Engagement...

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        32
           1       0.84      0.81      0.83        32

    accuracy                           0.83        64
   macro avg       0.83      0.83      0.83        64
weighted avg       0.83      0.83      0.83        64

AUC (Logistic Regression): 0.92

Training Random Forest for Engagement...

Random Forest Report:
              precision    recall  f1-score   support

           0       0.96      0.75      0.84        32
           1       0.79      0.97      0.87        32

    accuracy                           0.86        64
   macro avg       0.88      0.86      0.86        64
weighted avg       0.88      0.86      0.86        64

AUC (R