In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import joblib
import os

In [2]:
df = pd.read_csv('../data/processed_data/processed_data.csv')

In [3]:
df.head()

Unnamed: 0,Applicant_Income,Coapplicant_Income,Age,Dependents,Credit_Score,Existing_Loans,DTI_Ratio,Savings,Collateral_Value,Loan_Amount,...,Loan_Purpose_Education,Loan_Purpose_Home,Loan_Purpose_Personal,Property_Area_Semiurban,Property_Area_Urban,Gender_Male,Employer_Category_Government,Employer_Category_MNC,Employer_Category_Private,Employer_Category_Unemployed
0,17795.0,1387.0,51.0,0.0,637.0,4.0,0.53,19403.0,45638.0,16619.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2860.0,2679.0,46.0,3.0,621.0,2.0,0.3,2580.0,49272.0,38687.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,7390.0,2106.0,25.0,2.0,674.0,4.0,0.2,13844.0,6908.0,27943.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,13964.0,8173.0,40.0,2.0,579.0,3.0,0.31,9553.0,10844.0,27819.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,13284.0,4223.0,31.0,2.0,721.0,1.0,0.29,9386.0,37629.0,12741.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


## Feature Engineering

In [4]:
df['DTI_Ratio_sq'] = df['DTI_Ratio']**2
df['Credit_Score_sq'] = df['Credit_Score']**2

X = df.drop(columns = ['Loan_Approved', 'Credit_Score', 'DTI_Ratio'])
y = df['Loan_Approved']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scaling
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
model_path = 'models'
if not os.path.exists(model_path):
    os.makedirs(model_path)

### Logistic Regression Model

In [6]:
log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)

y_pred = log_model.predict(X_test_scaled)

print("Precision : " , precision_score(y_test, y_pred))
print("Recall score : " , recall_score(y_test, y_pred))
print("F1 Score : " , f1_score(y_test, y_pred))
print("Accuracy : " , accuracy_score(y_test, y_pred))
print("Confusion matrix : " , confusion_matrix(y_test, y_pred))

Precision :  0.7903225806451613
Recall score :  0.8032786885245902
F1 Score :  0.7967479674796748
Accuracy :  0.875
Confusion matrix :  [[126  13]
 [ 12  49]]


In [7]:
joblib.dump(log_model, '../models/logistic_regression_model.pkl')
print("Model saved successfully in the models/ folder!")

Model saved successfully in the models/ folder!


### KNN Model

In [8]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors = 5)
knn_model.fit(X_train_scaled, y_train)

y_pred = knn_model.predict(X_test_scaled)

print("Precision : " , precision_score(y_test, y_pred))
print("Recall score : " , recall_score(y_test, y_pred))
print("F1 Score : " , f1_score(y_test, y_pred))
print("Accuracy : " , accuracy_score(y_test, y_pred))
print("Confusion matrix : " , confusion_matrix(y_test, y_pred))

Precision :  0.62
Recall score :  0.5081967213114754
F1 Score :  0.5585585585585585
Accuracy :  0.755
Confusion matrix :  [[120  19]
 [ 30  31]]


In [9]:
joblib.dump(knn_model, '../models/knn_model.pkl')
print("Model saved successfully in the models/ folder!")

Model saved successfully in the models/ folder!


### Naive Bayes Model

In [10]:
nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)

y_pred = nb_model.predict(X_test_scaled)

print('Naive Bayes')
print("Precision : " , precision_score(y_test, y_pred))
print("Recall score : " , recall_score(y_test, y_pred))
print("F1 Score : " , f1_score(y_test, y_pred))
print("Accuracy : " , accuracy_score(y_test, y_pred))
print("Confusion matrix : " , confusion_matrix(y_test, y_pred))

Naive Bayes
Precision :  0.7833333333333333
Recall score :  0.7704918032786885
F1 Score :  0.7768595041322314
Accuracy :  0.865
Confusion matrix :  [[126  13]
 [ 14  47]]


In [11]:
joblib.dump(nb_model, '../models/naive_bayes_model.pkl')
print("Model saved successfully in the models/ folder!")

Model saved successfully in the models/ folder!


### Best model after feature engineering based on precision and recall is Logistic Regression