In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [45]:
# Load the training data
train_data = pd.read_excel('train_data.xlsx')  # Adjust this line based on how you load your data

# Strip stray whitespace from column names *before* inspecting them, so the
# names printed below are exactly the ones later cells index by.
# Only string labels are touched: `.columns.str.strip()` fails on (or turns
# into NaN) any non-string header such as a numeric or date column label.
train_data.columns = train_data.columns.map(
    lambda label: label.strip() if isinstance(label, str) else label
)

# Check the columns and the first few rows of the DataFrame
print("Columns in the DataFrame:", train_data.columns.tolist())
print("First few rows of the DataFrame:")
print(train_data.head())

Columns in the DataFrame: ['customer_id', 'transaction_date', 'sub_grade', 'term', 'home_ownership', 'cibil_score', 'total_no_of_acc', 'annual_inc', 'int_rate', 'purpose', 'loan_amnt', 'application_type', 'installment', 'verification_status', 'account_bal', 'emp_length', 'loan_status']
First few rows of the DataFrame:
   customer_id transaction_date sub_grade        term home_ownership  \
0     10608026       2014-01-01        C5   36 months       MORTGAGE   
1     10235120       2014-01-01        E5   36 months       MORTGAGE   
2     10705805       2014-01-01        D2   36 months       MORTGAGE   
3     11044991       2014-01-01        B4   36 months       MORTGAGE   
4     10161054       2014-01-01        C3   60 months       MORTGAGE   

   cibil_score  total_no_of_acc  annual_inc  int_rate             purpose  \
0          665                9     70000.0     16.24  debt_consolidation   
1          660                8     65000.0     23.40    home_improvement   
2          660  

In [51]:
class LoanRepaymentModel:
    """Preprocessing helper that turns the raw loan table into model inputs."""

    def load(self, train_data):
        """Build the feature matrix and target vector from the raw loan data.

        The caller's DataFrame is never modified; all work happens on a copy,
        so this method can be re-run on the same input with the same result.

        Processing steps:
          1. Rows whose ``loan_status`` label is missing are dropped (a label
             must not be imputed like a feature).
          2. Every ``datetime64`` column is expanded into ``<col>_year``,
             ``<col>_month``, ``<col>_day`` and ``<col>_dayofweek`` and the
             original column is removed.
          3. Missing numeric values are replaced by the column mean; missing
             values in object columns are replaced by the column mode, and
             object columns are converted to ``category`` dtype.
          4. Categorical features are one-hot encoded with ``drop_first=True``.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Raw table that contains a ``loan_status`` column.

        Returns
        -------
        tuple
            ``(X, y)`` where ``X`` is the encoded feature DataFrame and ``y``
            is the ``loan_status`` Series, sharing the same index.

        Raises
        ------
        ValueError
            If the frame is empty, has no ``loan_status`` column, or has no
            row with a non-missing ``loan_status``.
        """
        # Check if the DataFrame is empty
        if train_data.empty:
            raise ValueError("The input DataFrame is empty.")

        # Check if the target column exists
        if 'loan_status' not in train_data.columns:
            raise ValueError("The target column 'loan_status' does not exist in the DataFrame.")

        # Work on a private copy restricted to labelled rows. Without this the
        # derived date columns and the imputed values would be written into the
        # caller's frame, and missing labels would be filled with a mean/mode.
        data = train_data.dropna(subset=['loan_status']).copy()
        if data.empty:
            raise ValueError("The target column 'loan_status' has no non-missing values.")

        # Expand datetime columns into numeric calendar parts
        for col in data.select_dtypes(include=['datetime64']).columns:
            stamps = data[col].dt
            data[col + '_year'] = stamps.year
            data[col + '_month'] = stamps.month
            data[col + '_day'] = stamps.day
            data[col + '_dayofweek'] = stamps.dayofweek
            data = data.drop(col, axis=1)  # the raw timestamp is not a usable feature

        # Mean-impute numeric columns (the target has no gaps left at this point)
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

        # Mode-impute object columns, then store them as categoricals
        for col in data.select_dtypes(include=['object']).columns:
            modes = data[col].mode()
            # An all-missing column has no mode; leave it as-is instead of
            # failing on `mode()[0]`. get_dummies emits no columns for it.
            if not modes.empty:
                data[col] = data[col].fillna(modes.iloc[0])
            data[col] = data[col].astype('category')

        # Define features and target
        X = data.drop('loan_status', axis=1)
        y = data['loan_status']

        # One-hot encode categorical features; drop_first avoids a redundant
        # indicator column per category.
        X = pd.get_dummies(X, drop_first=True)

        return X, y


In [52]:
# Initialize the model
loan_model = LoanRepaymentModel()

# Load and preprocess the data.
# A ValueError from `load` is intentionally allowed to propagate: printing it
# and carrying on would leave X / y undefined (or silently stale from an
# earlier run of this cell), and every later cell depends on them.
X, y = loan_model.load(train_data)

In [53]:
# Step 5: Split the Data
# (train_test_split is imported once, in the notebook's first cell.)

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
# Step 6: Train the Model
# (RandomForestClassifier is imported once, in the notebook's first cell.)

# Initialize the Random Forest Classifier with library-default
# hyperparameters; the fixed random_state makes training repeatable.
model = RandomForestClassifier(random_state=42)

# Fit the model on the training split only; X_test / y_test stay unseen
model.fit(X_train, y_train)

In [63]:
# Make predictions for the held-out test split using the fitted model.
# `y_pred` is compared against `y_test` in the evaluation cell that follows.
y_pred = model.predict(X_test)

# Status message only; it does not verify the predictions themselves.
print("Predictions completed successfully.")

Predictions completed successfully.


In [64]:
# Evaluate the held-out predictions.
# (classification_report and confusion_matrix are imported once, in the
# notebook's first cell.)

# Print the classification report (per-class precision / recall / F1)
print("\nClassification Report:")
print("-"*50)
print(classification_report(y_test, y_pred))

# Print the confusion matrix (rows = true class, columns = predicted class)
print("\nConfusion Matrix:")
print("-"*50)
print(confusion_matrix(y_test, y_pred))

print("\nModel evaluation completed successfully.")


Classification Report:
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.59      0.29      0.39      5917
           1       0.79      0.93      0.85     16824

    accuracy                           0.76     22741
   macro avg       0.69      0.61      0.62     22741
weighted avg       0.74      0.76      0.73     22741


Confusion Matrix:
--------------------------------------------------
[[ 1724  4193]
 [ 1206 15618]]

Model evaluation completed successfully.


In [65]:
# Step 10: Final Model Evaluation (re-fit on the training split)
print("="*50)
print("Step 10: Final Model Evaluation")
print("="*50)

# Re-fit the model on the training split.
# NOTE: this cell previously announced a "full dataset" retrain while fitting
# on X_train only; the message now states what the code does. With the fixed
# random_state this re-fit is expected to reproduce the earlier evaluation.
# To ship a model trained on every labelled row, fit a *separate* estimator on
# (X, y) and do not score it on X_test: those rows would then have been seen
# during training and the metrics would be inflated.
print("\nRe-training the model on the training split...")
model.fit(X_train, y_train)

# Make predictions with the re-trained model
y_pred_final = model.predict(X_test)

# Print the classification report for the re-trained model
print("\nClassification Report for Final Model:")
print("-"*50)
print(classification_report(y_test, y_pred_final))

# Print the confusion matrix for the re-trained model
print("\nConfusion Matrix for Final Model:")
print("-"*50)
print(confusion_matrix(y_test, y_pred_final))

print("\nFinal model evaluation completed successfully.")



Step 10: Final Model Evaluation

Training the model on the full dataset...

Classification Report for Final Model:
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.59      0.29      0.39      5917
           1       0.79      0.93      0.85     16824

    accuracy                           0.76     22741
   macro avg       0.69      0.61      0.62     22741
weighted avg       0.74      0.76      0.73     22741


Confusion Matrix for Final Model:
--------------------------------------------------
[[ 1724  4193]
 [ 1206 15618]]

Final model evaluation completed successfully.
