In [None]:
# We are solving a binary classification problem. So far we have covered three models for binary classification:
# logistic regression, KNN and naive Bayes.
# We train all three on the same data and then compare which one is best suited to this problem.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load data
loan_df = pd.read_csv("loan_approval_data.csv")

# Data cleaning - pandas represents missing CSV fields as NaN, so NaN is what has to be handled.

# A row without a label cannot be used for supervised training or evaluation; drop such rows
# instead of letting the mode imputation below invent a label for them.
loan_df = loan_df.dropna(subset=["Loan_Approved"])

# NOTE(review): the fill values are computed on the full data set, i.e. before the train/test
# split further down, so test rows influence them - consider fitting the imputation on the
# training split only.
fill_values = {}

# Numerical columns are filled with their mean value
numeric_cols = loan_df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    col_mean = loan_df[col].mean()
    if pd.notna(col_mean):  # the mean is NaN when the whole column is missing
        fill_values[col] = col_mean

# Non-numerical columns are filled with their mode (most frequent value)
non_numeric_cols = loan_df.select_dtypes(include=['object', 'category']).columns
for col in non_numeric_cols:
    col_modes = loan_df[col].mode()
    if not col_modes.empty:  # mode() is empty when the whole column is missing
        # mode() can return several values on a tie; take the first one
        fill_values[col] = col_modes.iloc[0]

# Columns that are absent from the mapping are left untouched by fillna
loan_df = loan_df.fillna(value=fill_values)

# sklearn's SimpleImputer can also be used to fill NaN values.
# Sanity check: per-column NaN counts; the count is 0 for every column that has been filled.
# In a notebook this is only displayed when it is the last expression of the cell.
loan_df.isnull().sum()

# Next step is to check for outliers by plotting the data
# In this data set there are no such outliers
# If there are outliers, some of them have to be removed, but not every outlier should be removed
# We remove two types of outliers -
# 1. Illogical/impossible/wrong values - e.g. a negative age, an age of 10 or 150 years, or a dependents value of -1
# 2. Values that carry no meaningful information

# Applicant_ID is only an identifier and has nothing to do with loan approval, so discard it
loan_df = loan_df.drop(columns=["Applicant_ID"])
loan_df

# Feature engineering - encoding: convert non-numerical values to numerical values.
# LabelEncoder is used for the ordinal feature and for the output column,
# OneHotEncoder for the nominal features.

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One encoder per column, so that each fitted mapping (classes_) stays available for
# inverse_transform; a single shared encoder would only remember the column it was fitted on last.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which may not match the real
# ordering of the education levels - confirm, or use OrdinalEncoder with explicit categories.
education_encoder = LabelEncoder()
loan_df["Education_Level"] = education_encoder.fit_transform(loan_df["Education_Level"])

target_encoder = LabelEncoder()
loan_df["Loan_Approved"] = target_encoder.fit_transform(loan_df["Loan_Approved"])
le = target_encoder  # backward-compatible alias; `le` holds the Loan_Approved encoding

cols = ["Employment_Status", "Marital_Status", "Loan_Purpose", "Property_Area", "Gender", "Employer_Category"]
ohe = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
# With sparse_output=False the result is a dense 2-D array, so wrap it in a DataFrame
# that carries the generated column names and shares loan_df's index.
encoded = ohe.fit_transform(loan_df[cols])

encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(cols), index=loan_df.index)
# Swap the original nominal columns for their one-hot encoded counterparts
loan_df = pd.concat([loan_df.drop(columns=cols), encoded_df], axis=1)

# Separate the target (y) from the features (X), then hold out 20% of the rows for testing
y = loan_df["Loan_Approved"]
X = loan_df.drop(columns=["Loan_Approved"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Feature scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only,
# then apply that same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


def _fit_and_report(label, model, X_fit, y_fit, X_eval, y_eval):
    """Fit *model*, print its label, precision and recall, and return the results.

    Args:
        label: Heading printed before the two scores.
        model: Unfitted estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_fit: Feature matrix used for fitting.
        y_fit: Labels used for fitting.
        X_eval: Feature matrix the predictions are made on.
        y_eval: True labels the predictions are scored against.

    Returns:
        A ``(predictions, precision, recall)`` tuple.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    # Evaluation
    # False positives are what we most need to reduce, so precision is reported first,
    # followed by recall (which reflects false negatives).
    print(label)
    precision = precision_score(y_eval, predictions)
    print(precision)
    recall = recall_score(y_eval, predictions)
    print(recall)
    return predictions, precision, recall


log_model = LogisticRegression()
knn_model = KNeighborsClassifier(n_neighbors=5)
bayes_model = GaussianNB()

# Scores are stored per model so they can still be compared after all three have run.
# y_pred, ps_score and rc_score end up holding the values of the last model (Bayes).
model_scores = {}
for model_label, model in (
    ("Logistic regression", log_model),
    ("KNN", knn_model),
    ("Bayes", bayes_model),
):
    y_pred, ps_score, rc_score = _fit_and_report(
        model_label, model, X_train_scaled, y_train, X_test_scaled, y_test
    )
    model_scores[model_label] = {"precision": ps_score, "recall": rc_score}

Logistic regression
0.7833333333333333
0.7704918032786885
KNN
0.6274509803921569
0.5245901639344263
Bayes
0.8035714285714286
0.7377049180327869
