In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [11]:
df = pd.read_csv("Task 3 and 4_Loan_Data.csv")
X = df[['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding', 'income', 'years_employed', 'fico_score']]
y = df['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

In [14]:
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]


In [15]:
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC AUC Score: {roc_auc:.2f}")

ROC AUC Score: 1.00


In [16]:
def calculate_expected_loss(loan_amt_outstanding, pd, recovery_rate=0.1):
    expected_loss = pd * (1 - recovery_rate) * loan_amt_outstanding
    return expected_loss

In [17]:
sample_loan_amt = 100000  
sample_pd = y_pred_prob[0]  
expected_loss = calculate_expected_loss(sample_loan_amt, sample_pd)
print(f"Expected Loss: ${expected_loss:.2f}")


Expected Loss: $0.00


In [18]:
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix

print("Class distribution before resampling:")
print(df['default'].value_counts())

df_majority = df[df['default'] == 0]
df_minority = df[df['default'] == 1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     
                                 n_samples=len(df_majority), 
                                 random_state=42)  

df_balanced = pd.concat([df_majority, df_minority_upsampled])

print("Class distribution after resampling:")
print(df_balanced['default'].value_counts())

X_balanced = df_balanced[['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding', 'income', 'years_employed', 'fico_score']]
y_balanced = df_balanced['default']

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled, y_train)

y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]

y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

sample_loan_amt = 100000 
sample_pd = y_pred_prob[0]  
expected_loss = calculate_expected_loss(sample_loan_amt, sample_pd)

print(f"Expected Loss: ${expected_loss:.2f}")


Class distribution before resampling:
default
0    8149
1    1851
Name: count, dtype: int64
Class distribution after resampling:
default
0    8149
1    8149
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1619
           1       1.00      1.00      1.00      1641

    accuracy                           1.00      3260
   macro avg       1.00      1.00      1.00      3260
weighted avg       1.00      1.00      1.00      3260

[[1611    8]
 [   0 1641]]
Expected Loss: $89998.86


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_rf_pred_prob = rf_model.predict_proba(X_test_scaled)[:, 1]

y_rf_pred = (y_rf_pred_prob > 0.5).astype(int)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_rf_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_rf_pred))

roc_auc_rf = roc_auc_score(y_test, y_rf_pred_prob)
print(f"ROC AUC Score: {roc_auc_rf:.2f}")

def calculate_expected_loss(loan_amt_outstanding, pd, recovery_rate=0.1):
    expected_loss = pd * (1 - recovery_rate) * loan_amt_outstanding
    return expected_loss

sample_loan_amt = 100000  
sample_rf_pd = y_rf_pred_prob[0] 
expected_loss_rf = calculate_expected_loss(sample_loan_amt, sample_rf_pd)

print(f"Expected Loss using Random Forest: ${expected_loss_rf:.2f}")


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.66      1619
           1       0.00      0.00      0.00      1641

    accuracy                           0.50      3260
   macro avg       0.25      0.50      0.33      3260
weighted avg       0.25      0.50      0.33      3260

Confusion Matrix:
[[1619    0]
 [1641    0]]
ROC AUC Score: 0.70
Expected Loss using Random Forest: $9000.00


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression: The model shows perfect performance in terms of precision, recall, and F1-score for both classes, indicating it is classifying all instances correctly. The ROC AUC score of 1.00 confirms this model's excellent performance.
Random Forest: The model struggles with classifying defaults (class 1), showing a high number of false negatives (1641) and no true positives. The ROC AUC score of 0.70 suggests the model performs better than random guessing but is not as effective as the logistic regression model.

Observations:
Class Imbalance Handling: Logistic Regression seems to have handled the class imbalance better in this case. The Random Forest's performance might be suffering from its inability to generalize well on the minority class, even after resampling.
Model Choice: Based on the results, Logistic Regression is currently performing better and giving more reliable predictions for this dataset.

Next:
Hyperparameter Tuning: For Random Forest, tuning hyperparameters like the number of trees (n_estimators), tree depth, and splitting criteria might improve performance.
Ensemble Methods: Maybe using ensemble methods combining both models' predictions for better performance.
