In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [2]:
df=pd.read_csv("training_set.csv")
df['Loan_ID'] = df['Loan_ID'].str.replace("LP", "", regex=True).astype(int)
# Create one-hot encoded columns for property_Area while keeping the original column
df_encoded = pd.get_dummies(df['property_Area'], prefix='property_Area')
df = pd.concat([df, df_encoded], axis=1)
df.rename(columns={'property_Area_Urban': 'Urban', 'property_Area_Rural': 'Rural', 'property_Area_Semiurban': 'Semiurban'}, inplace=True)

df_encoded_gender = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, df_encoded_gender], axis=1)
df.rename(columns={'Gender_Male': 'Male', 'Gender_Female': 'Female'}, inplace=True)
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                       'property_Area', 'Loan_Status','Rural','Urban','Semiurban','Female','Male']

# Convert categorical values to numeric using label encoding
for col in categorical_columns:
    df[col] = df[col].astype('category').cat.codes  # Converts categories to numbers
df.replace(-1, np.nan, inplace=True)


for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['Loan_Income_Ratio'] = df['LoanAmount'] / df['Total_Income']
df['EMI'] = df['LoanAmount'] / df['Loan_Amount_Term']
df['Balance_Income'] = df['Total_Income'] - df['EMI']

len(df.columns)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

22

In [3]:
train, test = train_test_split(df,train_size=0.75, random_state=42)

In [4]:
y_train = train['Loan_Status']
X_train = train.drop(columns='Loan_Status')
X_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,property_Area,Rural,Semiurban,Urban,Female,Male,Total_Income,Loan_Income_Ratio,EMI,Balance_Income
92,1319,1.0,1.0,2.0,1.0,0.0,3273.0,1820.0,81.0,360.0,...,2,0,0,1,0,1,5093.0,0.015904,0.225,5092.775
304,1978,1.0,0.0,0.0,0.0,0.0,4000.0,2500.0,140.0,360.0,...,0,1,0,0,0,1,6500.0,0.021538,0.388889,6499.611111
68,1238,1.0,1.0,3.0,1.0,1.0,7100.0,0.0,125.0,60.0,...,2,0,0,1,0,1,7100.0,0.017606,2.083333,7097.916667
15,1032,1.0,0.0,0.0,0.0,0.0,3806.0,0.0,125.0,360.0,...,2,0,0,1,0,1,3806.0,0.032843,0.347222,3805.652778
211,1711,1.0,1.0,3.0,0.0,0.0,3430.0,1250.0,128.0,360.0,...,1,0,1,0,0,1,4680.0,0.02735,0.355556,4679.644444


In [5]:
y_test = test['Loan_Status']
X_test = test.drop(columns='Loan_Status')

In [6]:
def showStatistcs(Y_pred,Y,methodname):
    print(f'the performance metrics for test data using {methodname}:\n')
    print(f'Confusion matrix:\n{confusion_matrix(Y_pred,Y)}\n')
    print(f'Accuracy Score :{accuracy_score(Y_pred,Y)*100 :.4f}\n')
    print(f'Precision Score :{precision_score(Y_pred,Y)*100 :.4f}\n')
    print(f'Recall Score :{recall_score(Y_pred,Y)*100 :.4f}\n')
    print(f'f1 Score :{f1_score(Y_pred,Y)*100 :.4f}\n')

In [7]:
nb = GaussianNB()
nb.fit(X_train, y_train)

y_test_pred = nb.predict(X_test)

showStatistcs(y_test_pred, y_test, 'NaiveBayes')

the performance metrics for test data using NaiveBayes:

Confusion matrix:
[[25  4]
 [29 96]]

Accuracy Score :78.5714

Precision Score :96.0000

Recall Score :76.8000

f1 Score :85.3333



In [8]:
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1)
bst.fit(X_train, y_train)

y_test_pred = bst.predict(X_test)

showStatistcs(y_test_pred, y_test, 'XGBoost')

the performance metrics for test data using XGBoost:

Confusion matrix:
[[24  4]
 [30 96]]

Accuracy Score :77.9221

Precision Score :96.0000

Recall Score :76.1905

f1 Score :84.9558

