## Import the Libraries & Load the data

In [76]:
# Import necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [64]:
# Load the Dataset
# Both CSV files are read from the current working directory.

train_csv_path = "Train_data.csv"
test_csv_path = "Test_data.csv"

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

## Data Preprocessing

In [65]:
# Preview the first rows of the training data to check the columns and value formats
# (the training frame includes the Loan_Status target column).

train_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [66]:
# Preview the first rows of the test data.
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [67]:
# Summary statistics of the numeric training columns; a count below the number of rows
# indicates missing values in that column.
train_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [68]:
# Summary statistics of the numeric test columns, for comparison with the training data.
test_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,367.0,367.0,362.0,361.0,338.0
mean,4805.599455,1569.577657,136.132597,342.537396,0.825444
std,4910.685399,2334.232099,61.366652,65.156643,0.38015
min,0.0,0.0,28.0,6.0,0.0
25%,2864.0,0.0,100.25,360.0,1.0
50%,3786.0,1025.0,125.0,360.0,1.0
75%,5060.0,2430.5,158.0,360.0,1.0
max,72529.0,24000.0,550.0,480.0,1.0


In [69]:
# Report (rows, columns) for the training frame first, then the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

(614, 13)
(367, 12)


**Handle missing values**

In [70]:
# Fill missing categorical columns with their mode.
# The modes are computed from the training data only and reused for the test data, so
# both frames are imputed with the same values and no statistic derived from the test
# set enters the preprocessing.
categorical_fill_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
categorical_fill_values = {
    name: train_data[name].mode()[0] for name in categorical_fill_cols
}

# fillna returns a new frame; re-binding the name keeps the cell safe to re-run
# without relying on in-place mutation.
train_data = train_data.fillna(categorical_fill_values)
test_data = test_data.fillna(categorical_fill_values)

In [71]:
# Count the remaining missing values per column: training frame first, then test frame.
for frame in (train_data, test_data):
    print(frame.isnull().sum())

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           5
Loan_Amount_Term     6
Credit_History       0
Property_Area        0
dtype: int64


In [72]:
# Fill missing numerical columns with their median.
# The medians are computed from the training data only and reused for the test data, so
# both frames are imputed with the same values and no statistic derived from the test
# set enters the preprocessing.

numeric_fill_values = {
    'LoanAmount': train_data['LoanAmount'].median(),
    'Loan_Amount_Term': train_data['Loan_Amount_Term'].median(),
}

# fillna returns a new frame; re-binding the name avoids in-place mutation.
train_data = train_data.fillna(numeric_fill_values)
test_data = test_data.fillna(numeric_fill_values)

# Confirm what is still missing in each frame after imputation.
print(train_data.isnull().sum())
print(test_data.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64


In [73]:
# Encode categorical columns as integers.
# The single encoder is re-fitted on the training values of each column in turn, and the
# mapping learned from train is then applied to the same column of the test frame.

cols_to_encode = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
le = LabelEncoder()

for feature in cols_to_encode:
    train_codes = le.fit_transform(train_data[feature])
    train_data[feature] = train_codes
    test_data[feature] = le.transform(test_data[feature])

## Separate Features and Target

In [88]:
# Separate the target from the features, then hold out 20% of the rows for validation.
# Loan_ID is dropped from the features along with the target.

y = train_data['Loan_Status']
X = train_data.drop(['Loan_ID', 'Loan_Status'], axis=1)

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [89]:
# Standardize (only for Logistic Regression)
# X_test is built here from test_data, using exactly the feature columns (and order) of
# the training split; without this assignment the name is undefined and the last line
# of this cell raises NameError on a fresh kernel.
X_test = test_data[X_train.columns]

# The scaler is fitted on the training split only; the validation and test features are
# transformed with the parameters learned from it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## Model Building and Training

In [90]:
# Configure the three candidate classifiers.
logistic_model = LogisticRegression(max_iter=1000)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Logistic Regression is trained on the standardized features.
logistic_model.fit(X_train_scaled, y_train)

# The tree-based models are trained on the unscaled features.
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

## Prediction on Validation Data

In [91]:
# Predict on the validation split with each fitted model.

# Tree-based models take the unscaled validation features.
rf_preds = rf.predict(X_val)
dt_preds = dt.predict(X_val)

# Logistic Regression was fitted on standardized features, so it gets the scaled split.
logistic_preds = logistic_model.predict(X_val_scaled)

## Model Evaluation

In [92]:
# Validation metrics for Logistic Regression: accuracy, confusion matrix, per-class report.
logistic_accuracy = accuracy_score(y_val, logistic_preds)
logistic_confusion = confusion_matrix(y_val, logistic_preds)
logistic_report = classification_report(y_val, logistic_preds)

print("Logistic Regression:")
print("Accuracy:", logistic_accuracy)
print(logistic_confusion)
print(logistic_report)

Logistic Regression:
Accuracy: 0.7886178861788617
[[18 25]
 [ 1 79]]
              precision    recall  f1-score   support

           N       0.95      0.42      0.58        43
           Y       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123



In [93]:
# Validation metrics for the Decision Tree: accuracy, confusion matrix, per-class report.
dt_accuracy = accuracy_score(y_val, dt_preds)
dt_confusion = confusion_matrix(y_val, dt_preds)
dt_report = classification_report(y_val, dt_preds)

print("\nDecision Tree:")
print("Accuracy:", dt_accuracy)
print(dt_confusion)
print(dt_report)


Decision Tree:
Accuracy: 0.6910569105691057
[[23 20]
 [18 62]]
              precision    recall  f1-score   support

           N       0.56      0.53      0.55        43
           Y       0.76      0.78      0.77        80

    accuracy                           0.69       123
   macro avg       0.66      0.65      0.66       123
weighted avg       0.69      0.69      0.69       123



In [94]:
# Validation metrics for the Random Forest: accuracy, confusion matrix, per-class report.
rf_accuracy = accuracy_score(y_val, rf_preds)
rf_confusion = confusion_matrix(y_val, rf_preds)
rf_report = classification_report(y_val, rf_preds)

print("\nRandom Forest:")
print("Accuracy:", rf_accuracy)
print(rf_confusion)
print(rf_report)


Random Forest:
Accuracy: 0.7560975609756098
[[18 25]
 [ 5 75]]
              precision    recall  f1-score   support

           N       0.78      0.42      0.55        43
           Y       0.75      0.94      0.83        80

    accuracy                           0.76       123
   macro avg       0.77      0.68      0.69       123
weighted avg       0.76      0.76      0.73       123



In [95]:
# After model selection, predict on the real test data.
# The test features are selected from test_data with exactly the columns (and order) the
# model was trained on, so this cell does not depend on X_test having been defined by
# another cell.
# NOTE(review): in the validation results above, Logistic Regression reached a higher
# accuracy (about 0.79) than the Random Forest (about 0.76) -- confirm that using the
# Random Forest for the final predictions is intentional.

X_test = test_data[X_train.columns]
final_test_predictions = rf.predict(X_test)

## Submission file

In [96]:
# Assemble the submission table: one row per test loan, with its ID and predicted status.
loan_ids = test_data[['Loan_ID']]
submission = loan_ids.assign(Loan_Status=final_test_predictions)