In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score


In [2]:
# 1. Load the Dataset
# Read the raw delinquency CSV into a DataFrame; the relative path is
# resolved against the notebook's working directory.
DATA_PATH = 'Delinquency_prediction_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# 2. Data Cleaning & Preprocessing
# Standardize Employment Status labels: lowercase every label first, then
# expand the abbreviation 'emp' (whole-value match) to 'employed'.
employment_lower = df['Employment_Status'].str.lower()
df['Employment_Status'] = employment_lower.replace({'emp': 'employed'})

# Handle Missing Values: fill gaps in the numerical columns with each
# column's median.
# NOTE(review): the medians are learned from every row of df, i.e. before the
# train/test split performed further down, so rows that later land in the
# test set contribute to the fill values.
num_cols = [
    'Age',
    'Income',
    'Credit_Score',
    'Credit_Utilization',
    'Loan_Balance',
    'Debt_to_Income_Ratio',
    'Account_Tenure',
]
imputer = SimpleImputer(strategy='median')
imputer.fit(df[num_cols])
df[num_cols] = imputer.transform(df[num_cols])

# Convert Categorical Payment History (Month 1-6) into Numeric Scores
# Mapping: On-time -> 0, Late -> 1, Missed -> 2
payment_map = {'On-time': 0, 'Late': 1, 'Missed': 2}
# Labels are matched case- and whitespace-insensitively (the same spirit as
# the Employment_Status clean-up), so variants such as 'on-time' or ' Late'
# are scored instead of silently turning into NaN.
payment_lookup = {label.lower(): score for label, score in payment_map.items()}
month_cols = ['Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6']
for col in month_cols:
    labels = df[col].str.strip().str.lower()
    scores = labels.map(payment_lookup)
    # Values that were already missing stay NaN; only a present-but-unknown
    # label is treated as an error, because Series.map would otherwise
    # convert it to NaN without any signal.
    unknown = labels[scores.isna() & labels.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Unrecognized payment label(s) in {col}: {sorted(unknown)}"
        )
    df[col] = scores

# One-Hot Encode remaining categorical variables.
# drop_first=True drops the first level of each column, leaving k-1
# indicator columns for a column with k levels.
categorical_cols = ['Employment_Status', 'Credit_Card_Type', 'Location']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [4]:
# 3. Define Features (X) and Target (y)
# The target is the Delinquent_Account column; the features are every other
# column except Customer_ID, which is just an identifier.
target_col = 'Delinquent_Account'
y = df[target_col]
X = df.drop(columns=['Customer_ID', target_col])

In [6]:
# 4. Split the Data into Training (80%) and Testing (20%) sets
# Stratifying on y keeps the class proportions aligned between the two
# parts; the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# 5. Build and Train the Random Forest Model
# Hyperparameters are collected in one place; class_weight='balanced' is
# used because only 16% of the data is delinquent.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [9]:
# 6. Model Prediction and Evaluation
# Hard class predictions feed accuracy and the classification report.
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (the delinquent class when the labels are 0/1); ROC-AUC
# needs these scores rather than the hard predictions.
y_prob = model.predict_proba(X_test)[:, 1]

print("--- Model Performance Metrics ---")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob):.2f}")
# zero_division=0 reports precision/F1 as 0 for a class that is never
# predicted, without emitting an UndefinedMetricWarning for each metric.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


--- Model Performance Metrics ---
Accuracy Score: 0.84
ROC-AUC Score: 0.35

Classification Report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91        84
           1       0.00      0.00      0.00        16

    accuracy                           0.84       100
   macro avg       0.42      0.50      0.46       100
weighted avg       0.71      0.84      0.77       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# 7. Identify Top Predictors (Feature Importance)
# Pair each fitted importance with its feature name, then rank from the
# largest importance to the smallest.
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
importances = importance_by_feature.sort_values(ascending=False)
print("\nTop 5 Predictors of Delinquency:")
print(importances.head(5))


Top 5 Predictors of Delinquency:
Credit_Utilization      0.120897
Credit_Score            0.098636
Age                     0.097290
Debt_to_Income_Ratio    0.092994
Income                  0.090539
dtype: float64
