In [11]:
# 03_model_development.ipynb

# Import necessary libraries
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import joblib
import os

# Load the train-test splits
splits_path = 'data/splits/'

X_train = pd.read_csv(os.path.join(splits_path, 'X_train.csv'))
X_test = pd.read_csv(os.path.join(splits_path, 'X_test.csv'))
y_train = pd.read_csv(os.path.join(splits_path, 'y_train.csv'))
y_test = pd.read_csv(os.path.join(splits_path, 'y_test.csv'))

# Display the first few rows of the data
print("X_train Preview:")
print(X_train.head())

# Step 1: Balance the classes using SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Check if the classes are balanced after SMOTE
print("\nClass Distribution After SMOTE:")
print(y_train_balanced.value_counts())

# Step 2: Train a model - You can use RandomForestClassifier or LogisticRegression
# Here we use RandomForest as an example
model = RandomForestClassifier(random_state=42)
model.fit(X_train_balanced, y_train_balanced)

# Step 3: Make predictions
y_pred = model.predict(X_test)

# Step 4: Evaluate the model
print("\nModel Evaluation - Accuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 5: Save the trained model
model_path = '../models/credit_risk_model.pkl'

# Make sure the models folder exists, if not create it
if not os.path.exists('../models'):
    os.makedirs('../models')

# Save the trained model
joblib.dump(model, model_path)
print(f"\nTrained model has been saved to: {model_path}")


X_train Preview:
        Age  Annual Income  Credit Score  Loan Amount  Debt to Income  \
0 -0.299879       1.167263     -1.163268    -0.800606        1.498765   
1 -0.500190       0.663436     -0.016393    -1.410475        1.037388   
2 -1.167897      -1.643562     -1.686294     1.252116        0.537564   
3  1.102304      -0.569615      0.128542     0.171241        1.344973   
4 -0.900814       0.902091      1.590493     1.319188       -0.192950   

   Existing Loan_1  
0             True  
1             True  
2             True  
3             True  
4            False  

Class Distribution After SMOTE:
Has Default
0              18469
1              18469
Name: count, dtype: int64


  return fit_method(estimator, *args, **kwargs)



Model Evaluation - Accuracy Score:
1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3346
           1       1.00      1.00      1.00      4654

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000


Trained model has been saved to: ../models/credit_risk_model.pkl


In [12]:
import joblib

# Load the trained model from the saved file
model_path = 'models/credit_risk_model.pkl'

try:
    model = joblib.load(model_path)
    print(f"Model loaded successfully from {model_path}")
except Exception as e:
    print(f"Error loading the model: {e}")





Model loaded successfully from models/credit_risk_model.pkl


In [13]:
print(type(model))


<class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [14]:

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model using the balanced training data
model.fit(X_train_balanced, y_train_balanced)

# Optionally, check the model’s score to ensure it's being trained properly
print("Training Accuracy:", model.score(X_train_balanced, y_train_balanced))

# Save the trained model
import joblib
model_path = 'models/credit_risk_model.pkl'
joblib.dump(model, model_path)
print(f"Trained model has been saved to: {model_path}")


  return fit_method(estimator, *args, **kwargs)


Training Accuracy: 1.0
Trained model has been saved to: models/credit_risk_model.pkl


In [15]:
# Assuming model is loaded correctly and X_test is available
y_pred = model.predict(X_test)

# Optionally, print some predictions to verify
print("First 5 predictions:", y_pred[:5])


First 5 predictions: [1 1 0 0 1]


In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Evaluate the model's performance on the test data
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy}")

# Step 2: Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 3: Confusion Matrix
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


Accuracy Score: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3346
           1       1.00      1.00      1.00      4654

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000


Confusion Matrix:
[[3346    0]
 [   0 4654]]


In [17]:
# Saving evaluation metrics to a text file
with open('../models/evaluation_results.txt', 'w') as f:
    f.write(f"Accuracy Score: {accuracy}\n\n")
    f.write("Classification Report:\n")
    f.write(classification_report(y_test, y_pred))
    f.write("\nConfusion Matrix:\n")
    f.write(str(conf_matrix))

print("Evaluation results have been saved to: models/evaluation_results.txt")


Evaluation results have been saved to: models/evaluation_results.txt


In [18]:
from sklearn.model_selection import cross_val_score

# Cross-validation
cv_scores = cross_val_score(model, X_train_balanced, y_train_balanced, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0


In [19]:
model.fit(X_train_balanced, y_train_balanced.values.ravel())


In [20]:
with open('../models/evaluation_results.txt', 'w') as f:
    f.write(f"Accuracy Score: {accuracy}\n\n")
    f.write("Classification Report:\n")
    f.write(classification_report(y_test, y_pred))
    f.write("\nConfusion Matrix:\n")
    f.write(str(conf_matrix))

print("Evaluation results have been saved to: models/evaluation_results.txt")

Evaluation results have been saved to: models/evaluation_results.txt
