In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE  # For balancing the dataset


In [3]:
# Load the raw loan dataset from a CSV file in the working directory
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index — confirm whether it
# should be read with index_col=0 or dropped.
DATA_PATH = "updated_loan_dataset.csv"
df = pd.read_csv(DATA_PATH)


In [4]:
# Preview the first rows to sanity-check the loaded columns and values
df.head()


Unnamed: 0,Property_Area,Gender,Married,Education,Self_Employed,ApplicantIncome,LoanAmount,CoapplicantIncome,Property_Value,Loan_Sanctioned,Loan_Term
0,Rural,Male,Yes,Graduate,No,380389,10624.0,125164.0,32188977,Yes,180
1,Urban,Male,Yes,Graduate,Yes,249000,5478.0,0.0,219128632,Yes,120
2,Urban,Male,Yes,Not Graduate,No,214389,9960.0,195714.0,232416683,Yes,120
3,Urban,Male,No,Graduate,No,498000,11703.0,0.0,232814751,Yes,180
4,Urban,Male,Yes,Not Graduate,No,193639,7885.0,125828.0,235532005,No,120


In [5]:
# Column groups used throughout the notebook.
# Categorical columns are label-encoded; numerical columns are imputed and scaled.
# NOTE(review): LoanAmount appears in the dataset preview but is not listed as
# a feature here — confirm the omission is intentional.
categorical_features = [
    "Gender",
    "Married",
    "Education",
    "Self_Employed",
    "Property_Area",
]
numerical_features = [
    "ApplicantIncome",
    "CoapplicantIncome",
    "Property_Value",
    "Loan_Term",
]

# Name of the column the classifier learns to predict
target = "Loan_Sanctioned"



In [6]:
# Drop rows where the target variable is missing. The explicit .copy() makes it
# unambiguous that the column assignment below targets an independent frame.
df = df.dropna(subset=[target]).copy()

# Encode target variable (Yes -> 1, No -> 0).
# Series.map turns every label that is not a key of the mapping into NaN, so
# the result is validated instead of letting NaN targets reach model training.
# This also makes an accidental re-run of this cell (labels already 0/1) fail
# loudly rather than silently wiping the target column.
target_mapping = {"Yes": 1, "No": 0}
encoded_target = df[target].map(target_mapping)
unmapped_labels = df.loc[encoded_target.isna(), target].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in '{target}': {list(unmapped_labels)}; "
        f"expected one of {list(target_mapping)}"
    )
df[target] = encoded_target.astype("int64")


In [7]:
# Label-encode every categorical column, keeping one fitted encoder per column
# so the same label -> integer mapping can be reused later.
# NOTE(review): astype(str) presumably turns missing values into a literal
# string that is then encoded as its own class — confirm this is intended.
# NOTE(review): the columns are overwritten with their integer codes, so
# re-running this cell re-fits the encoders on the stringified codes instead of
# the original labels.
label_encoders = {}
for column in categorical_features:
    encoder = LabelEncoder()
    # Cast to string first so the encoder sees a single value type per column
    df[column] = encoder.fit_transform(df[column].astype(str))
    label_encoders[column] = encoder


In [8]:
# Replace missing numerical values with the mean of their column
# NOTE(review): the imputer is fitted on the full frame, before the train/test
# split performed later in this notebook, so the learned means include rows
# that end up in the test set — consider fitting on the training rows only.
imputer = SimpleImputer(strategy="mean")
numeric_values = df[numerical_features]
imputer.fit(numeric_values)
df[numerical_features] = imputer.transform(numeric_values)


In [9]:
# Standardise the numerical columns with StandardScaler
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed later in this notebook — consider fitting on the training
# rows only.
scaler = StandardScaler()
imputed_values = df[numerical_features]
scaler.fit(imputed_values)
df[numerical_features] = scaler.transform(imputed_values)



In [10]:
# Separate the model inputs (categorical columns first, then numerical ones)
# from the encoded target column
feature_columns = categorical_features + numerical_features
X, y = df[feature_columns], df[target]


In [11]:
# Handle Imbalanced Dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [12]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [13]:
# Fit a random forest classifier on the training data:
# 200 trees, fixed seed for reproducibility, balanced class weights
rf_params = {
    "n_estimators": 200,
    "random_state": 42,
    "class_weight": "balanced",
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)


In [14]:
# Predictions
# Predict class labels for the test features; y_pred is compared against
# y_test in the evaluation step.
y_pred = model.predict(X_test)



In [15]:
# Score the predictions against the test labels: per-class
# precision/recall/F1 report plus overall accuracy
report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("Classification Report:\n", report_text)
print("✅ Accuracy Score:", test_accuracy)


Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        49
           1       0.80      0.83      0.82        48

    accuracy                           0.81        97
   macro avg       0.81      0.81      0.81        97
weighted avg       0.82      0.81      0.81        97

✅ Accuracy Score: 0.8144329896907216


In [19]:
# Persist the trained model and every fitted preprocessing object as pickle
# files in the working directory, one file per object
artifacts = {
    "loan_prediction_model.pkl": model,
    "label_encoders.pkl": label_encoders,
    "imputer.pkl": imputer,
    "scaler.pkl": scaler,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)



In [20]:
# Confirm that the training run and the artifact export above have finished
print("Model training complete. Files saved successfully!")

Model training complete. Files saved successfully!
