In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Load the raw loan table from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/loan_data.csv')

# Columns routed to num_pipeline.
numeric_features = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]

# Columns routed to cat_nom_pipeline (no inherent order).
categorical_nominal = [
    'gender',
    'marital_status',
    'employment_status',
    'loan_purpose',
]

# Columns routed to cat_ord_pipeline.
# 'grade_subgrade' is included on the assumption that its labels
# (A1, A2, B1, ...) sort alphabetically in risk order.
categorical_ordinal = [
    'education_level',
    'grade_subgrade',
]

# --- 2. Define Preprocessing Pipelines ---
# Numeric columns: fill missing values with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Nominal columns: fill missing values with the mode, then one-hot encode.
# Categories not seen during fit are encoded as all zeros instead of raising.
cat_nom_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Ordinal columns: fill missing values with the mode, then map each category
# to an integer code. Categories not seen during fit are encoded as -1 so that
# scoring new data does not raise (mirrors the one-hot branch above).
# NOTE(review): with no explicit `categories=` the encoder orders labels
# lexicographically. That suits 'grade_subgrade'; 'education_level' probably
# needs an explicit ordered list - TODO confirm against the actual labels.
cat_ord_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1))
])

# --- 3. Combine into a Preprocessor ---
# Each branch only sees its own column list; columns not listed in any branch
# are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_features),
        ('cat_nom', cat_nom_pipeline, categorical_nominal),
        ('cat_ord', cat_ord_pipeline, categorical_ordinal)
    ])

# --- 4. Split Data ---
# Features are every column except the row identifier and the label;
# 'grade_subgrade' stays in as a predictor.
X = df.drop(columns=['id', 'loan_paid_back'])
y = df['loan_paid_back']

# 80/20 hold-out split, seeded for repeatability and stratified on the label
# so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data successfully split!")
print(f"Training shape: {X_train.shape}")
print(f"Testing shape: {X_test.shape}")


In [None]:
# Schema check: print every column name of the loaded frame as a plain list.
print(df.columns.to_list())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# 1. Chain the preprocessor and a random-forest classifier into one estimator
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

# 2. Fit preprocessing and classifier together on the training split
model_pipeline.fit(X_train, y_train)

# 3. Score the hold-out rows. AUC needs a continuous score, so take the
# probability column rather than hard class labels. Column 1 is used as the
# positive class - assumes a binary label whose second class is "paid back";
# TODO confirm against the classifier's classes_.
y_pred_probs = model_pipeline.predict_proba(X_test)[:, 1]

# 4. Area under the ROC curve on the hold-out split
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_probs)

print(f"Overall Model AUC on Test Set: {auc_score:.4f}")

In [None]:
# Function to calculate AUC for specific groups
def calculate_subgroup_metrics(X_test, y_test, y_pred_probs, column_name):
    """Compute ROC AUC separately for each distinct value of one column.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Raw (un-transformed) feature rows; must contain ``column_name``.
    y_test : array-like or pandas.Series
        True labels for the rows of ``X_test``. A Series is aligned on index
        by the column assignment; other sequences are taken positionally.
    y_pred_probs : array-like
        Predicted score for each row of ``X_test``, in row order.
    column_name : str
        Column whose distinct values define the subgroups.

    Returns
    -------
    dict
        Maps each distinct value of ``column_name`` (in order of first
        appearance, missing values included) to its AUC, or to the string
        ``"N/A (Not enough data)"`` when the group's labels do not contain
        at least two distinct classes.
    """
    # Only the grouping column is needed; copying it keeps X_test untouched.
    analysis_df = X_test[[column_name]].copy()
    analysis_df['target'] = y_test
    analysis_df['score'] = y_pred_probs

    group_values = analysis_df[column_name]
    results = {}

    # Iterate through unique values in the subgroup (e.g., 'PhD', 'Masters')
    for group in group_values.unique():
        # NaN never compares equal to anything, so an equality mask would
        # select no rows for the missing-value group; match it explicitly.
        if pd.isna(group):
            mask = group_values.isna()
        else:
            mask = group_values == group
        subset = analysis_df[mask]

        # We need at least one '0' and one '1' in the target to calculate AUC
        if len(subset['target'].unique()) > 1:
            results[group] = roc_auc_score(subset['target'], subset['score'])
        else:
            results[group] = "N/A (Not enough data)"

    return results

# 1. Analyze by Education Level
print("--- Fairness Report: Education Level ---")
edu_results = calculate_subgroup_metrics(X_test, y_test, y_pred_probs, 'education_level')

# Sort on the label's text so a missing-value (NaN) key cannot break the sort
# by being compared with string keys.
for level, score in sorted(edu_results.items(), key=lambda item: str(item[0])):
    # Groups without both classes carry a placeholder string instead of a
    # float; print it verbatim rather than applying a float format to it.
    auc_text = f"{score:.4f}" if isinstance(score, float) else str(score)
    print(f"Education: {str(level):20} | AUC: {auc_text}")


# 2. Analyze by Loan Purpose (Top 3 vs Bottom 3)
print("\n--- Fairness Report: Loan Purpose ---")
purpose_results = calculate_subgroup_metrics(X_test, y_test, y_pred_probs, 'loan_purpose')

# Keep only groups with a numeric AUC, then sort by AUC score (descending)
sorted_purposes = sorted(
    [item for item in purpose_results.items() if isinstance(item[1], float)],
    key=lambda x: x[1],
    reverse=True
)

# Note: with fewer than six scored purposes the two lists below overlap.
print("Top 3 Performing Purposes (Safest Predictions):")
for p in sorted_purposes[:3]:
    print(f"  {str(p[0]):20}: {p[1]:.4f}")

print("\nBottom 3 Performing Purposes (Hardest to Predict):")
for p in sorted_purposes[-3:]:
    print(f"  {str(p[0]):20}: {p[1]:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Output column names of the fitted preprocessor
# (covers the expanded one-hot columns as well)
feature_names = model_pipeline.named_steps['preprocessor'].get_feature_names_out()

# 2. Per-feature importances reported by the fitted random forest
importances = model_pipeline.named_steps['classifier'].feature_importances_

# 3. Pair each name with its importance
feat_imp_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# 4. Keep the ten largest importances and draw them as horizontal bars
top_10 = feat_imp_df.sort_values(by='Importance', ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_10, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Top 10 Drivers of Loan Default Risk')
ax.set_xlabel('Relative Importance')
ax.set_ylabel('Feature Name')
fig.tight_layout()
plt.show()

In [None]:
# 1. Look up the 'id' of every held-out row; X_test kept the original
# row index of df, so that index selects the matching ids.
test_ids = df.loc[X_test.index, 'id']

# 2-3. Build the per-loan table and order it from lowest to highest
# predicted probability of class 1 (paid back).
results_df = pd.DataFrame(
    {
        'id': test_ids,
        'predicted_probability': y_pred_probs,
        'actual_outcome': y_test,  # kept for side-by-side comparison
    }
).sort_values(by='predicted_probability')

# 4. First ten rows after sorting: the lowest predicted repayment probability
print("--- Lowest Probability of Repayment (High Risk) ---")
print(results_df.iloc[:10])

# 5. Last ten rows after sorting: the highest predicted repayment probability
print("\n--- Highest Probability of Repayment (Safe) ---")
print(results_df.iloc[-10:])