## Importing Required Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
warnings.filterwarnings("ignore")

## Step 0: Load Dataset

In [5]:
# Read the loan-approval records from the CSV file in the working directory.
csv_path = "loan_approval_dataset.csv"
df = pd.read_csv(csv_path)

# Confirm the load and report (rows, columns).
dataset_shape = df.shape
print("Data Loaded Successfully")
print("Shape of dataset:", dataset_shape)


Data Loaded Successfully
Shape of dataset: (4269, 13)


In [6]:
# Preview the first two records to check how the columns were parsed.
df.head(n=2)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected


In [7]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


## Step 1: Divide X and y

In [11]:
# The target is the loan decision; every remaining column becomes a feature.
# The column label starts with a space, exactly as it appears in the loaded frame.
# NOTE(review): `loan_id` stays in X and is therefore used as a feature
# downstream; confirm that is intended.
target_col = " loan_status"
y = df[target_col]
X = df.drop(columns=[target_col])

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (4269, 12)
y shape: (4269,)


## Step 2: Split Data into Train and Test

In [12]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (3415, 12)
Test shape: (854, 12)


## Step 3: Feature Engineering on Train Data
- Categorical → Label Encoding
- Numerical → Normalization

In [14]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Start from the raw training rows every time this cell runs. Selecting them
# from the feature frame `X` (which no visible cell modifies) by index gives an
# explicit copy to assign into, and makes the cell safe to re-run: without this,
# a second run would see already-encoded columns, find no categorical columns,
# and leave `label_encoders` empty for the test-data step.
X_train = X.loc[X_train.index].copy()

# Fill missing values (train only): mode for categorical, median for numerical
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = X_train[col].fillna(X_train[col].mode()[0])
    else:
        X_train[col] = X_train[col].fillna(X_train[col].median())

# Separate categorical and numerical columns. This must happen before encoding,
# because encoding turns the categorical columns into integers.
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(exclude=['object']).columns

# Label Encoding for categorical features; each fitted encoder is kept so the
# test data can be transformed with the same mapping.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

# Normalization for numerical features; the scaler is fitted on train only.
scaler = MinMaxScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

print("Feature engineering on train data completed")


Feature engineering on train data completed


## Step 4: Model Building using KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-nearest-neighbour classifier and fit it on the training features
# and labels in one expression (fit returns the fitted estimator itself).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

print(" KNN model trained successfully")

 KNN model trained successfully


## Step 5: Feature Engineering on Test Data
- Use the same encoders and scaler that were fitted on the training data

In [16]:
# Start from the raw test rows (selected from the unmodified feature frame `X`)
# so this cell works on an explicit copy and can be re-run safely.
X_test = X.loc[X_test.index].copy()

# Raw training rows, i.e. before label encoding and scaling. By this point
# `X_train` has already been encoded and scaled, so its mode/median are in
# transformed units and must not be used to fill raw test values.
train_raw = X.loc[X_train.index]

# Fill missing values in test data using statistics learned from train only:
# mode for categorical columns, median for numerical columns.
for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = X_test[col].fillna(train_raw[col].mode()[0])
    else:
        X_test[col] = X_test[col].fillna(train_raw[col].median())

# Apply same LabelEncoders (fitted on train). Note that `transform` raises
# ValueError if the test data contains a category not seen during fitting.
for col in cat_cols:
    le = label_encoders[col]
    X_test[col] = le.transform(X_test[col])

# Apply same scaler (fitted on train)
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("Feature engineering on test data completed")


Feature engineering on test data completed


## Step 6: Prediction on Test Data

In [17]:
# Predict a loan_status label for every row of the transformed test features.
y_pred = knn.predict(X_test)

print("Predictions generated")

Predictions generated


## Step 7: Model Evaluation using Accuracy

In [18]:
from sklearn.metrics import accuracy_score

# Accuracy = share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f" Model Accuracy = {accuracy * 100:.2f}%")

 Model Accuracy = 88.52%


## Step 8: Save Model for deployment

In [19]:
import pickle

# Save model for deployment
with open("knn_loan_model.pkl", "wb") as model_file:
    pickle.dump(knn, model_file)

# The classifier was trained on label-encoded, min-max-scaled features, so the
# fitted encoders and scaler are saved as well. Without them, raw inputs cannot
# be transformed the same way at prediction time.
preprocessing = {
    "feature_order": list(X_train.columns),  # column order the model was fitted on
    "cat_cols": list(cat_cols),
    "num_cols": list(num_cols),
    "label_encoders": label_encoders,
    "scaler": scaler,
}
with open("knn_loan_preprocessing.pkl", "wb") as prep_file:
    pickle.dump(preprocessing, prep_file)

print(" Model saved as pickle files")

 Model saved as pickle files
