In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv("/content/drive/MyDrive/ML project files/credit_risk_dataset.csv")

In [3]:
print("Dataset Shape:", df.shape)
print(df.head())
print(df.isnull().sum())

Dataset Shape: (32581, 12)
   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y    

In [4]:
df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)
df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)


In [5]:
le = LabelEncoder()

In [6]:
cat_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file'
]


In [7]:
for col in cat_cols:
    df[col] = le.fit_transform(df[col])


In [8]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [10]:
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

In [11]:
rf.fit(X_train, y_train)

In [12]:
y_pred = rf.predict(X_test)

In [14]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9294132089369015

Confusion Matrix:
 [[6286   45]
 [ 530 1285]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96      6331
           1       0.97      0.71      0.82      1815

    accuracy                           0.93      8146
   macro avg       0.94      0.85      0.89      8146
weighted avg       0.93      0.93      0.93      8146



In [15]:
import pickle

# Save the trained model to a pickle file
filename = 'Loan Prediction.pkl'
pickle.dump(rf, open(filename, 'wb'))

print(f"Model successfully saved to {filename}")

Model successfully saved to Loan Prediction.pkl


In [16]:
import pickle
import pandas as pd

# Load the trained model
loaded_model = pickle.load(open('Loan Prediction.pkl', 'rb'))

print("Model loaded successfully!")

Model loaded successfully!


In [18]:
# Create a sample input for prediction (ensure it matches the training features)
# Note: Categorical features must be encoded to numerical values as done during training.
# For example, 'person_home_ownership' was encoded to numbers (e.g., RENT=3, OWN=2, MORTGAGE=0)
# 'cb_person_default_on_file' was encoded (e.g., N=0, Y=1)

sample_input = pd.DataFrame({
    'person_age': [28],                       # Example age
    'person_income': [75000],                 # Example income
    'person_home_ownership': [3],             # RENT (encoded value)
    'person_emp_length': [5.0],               # Example employment length
    'loan_intent': [1],                       # EDUCATION (encoded value, assuming 1 for EDUCATION from previous encoding)
    'loan_grade': [1],                        # B (encoded value, assuming 1 for B from previous encoding)
    'loan_amnt': [10000],                     # Example loan amount
    'loan_int_rate': [9.5],                   # Example interest rate
    'loan_percent_income': [0.13],            # Example loan to income ratio
    'cb_person_default_on_file': [0],         # N (encoded value)
    'cb_person_cred_hist_length': [7]         # Example credit history length
})

print("Sample Input Data:\n", sample_input)

# Make a prediction
prediction = loaded_model.predict(sample_input)

# Interpret the prediction
if prediction[0] == 0:
    result = "The loan is predicted to be approved (Not Default)."
else:
    result = "The loan is predicted to be rejected (Default)."

print(f"\nPrediction for the sample input: {result}")

Sample Input Data:
    person_age  person_income  person_home_ownership  person_emp_length  \
0          28          75000                      3                5.0   

   loan_intent  loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0            1           1      10000            9.5                 0.13   

   cb_person_default_on_file  cb_person_cred_hist_length  
0                          0                           7  

Prediction for the sample input: The loan is predicted to be approved (Not Default).
