In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
df=pd.read_csv("training_set.csv")
df['Loan_ID'] = df['Loan_ID'].str.replace("LP", "", regex=True).astype(int)
# Create one-hot encoded columns for property_Area while keeping the original column
df_encoded = pd.get_dummies(df['property_Area'], prefix='property_Area')
df = pd.concat([df, df_encoded], axis=1)
df.rename(columns={'property_Area_Urban': 'Urban', 'property_Area_Rural': 'Rural', 'property_Area_Semiurban': 'Semiurban'}, inplace=True)

df_encoded_gender = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, df_encoded_gender], axis=1)
df.rename(columns={'Gender_Male': 'Male', 'Gender_Female': 'Female'}, inplace=True)
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                       'property_Area', 'Loan_Status','Rural','Urban','Semiurban','Female','Male']

# Convert categorical values to numeric using label encoding
for col in categorical_columns:
    df[col] = df[col].astype('category').cat.codes  # Converts categories to numbers
df.replace(-1, np.nan, inplace=True)


for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['Loan_Income_Ratio'] = df['LoanAmount'] / df['Total_Income']
df['EMI'] = df['LoanAmount'] / df['Loan_Amount_Term']
df['Balance_Income'] = df['Total_Income'] - df['EMI']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [7]:
train, test = train_test_split(df,train_size=0.75, random_state=42)

In [8]:
y_train = train['Loan_Status']
X_train = train.drop(columns='Loan_Status')

In [9]:
y_test = test['Loan_Status']
X_test = test.drop(columns='Loan_Status')

In [10]:
def showStatistcs(Y_pred,Y,methodname):
    print(f'the performance metrics for test data using {methodname}:\n')
    print(f'Confusion matrix:\n{confusion_matrix(Y_pred,Y)}\n')
    print(f'Accuracy Score :{accuracy_score(Y_pred,Y)*100 :.4f}\n')
    print(f'Precision Score :{precision_score(Y_pred,Y)*100 :.4f}\n')
    print(f'Recall Score :{recall_score(Y_pred,Y)*100 :.4f}\n')
    print(f'f1 Score :{f1_score(Y_pred,Y)*100 :.4f}\n')

In [11]:
lgbm = LGBMClassifier(n_estimators=2, max_depth=2, learning_rate=1)
lgbm.fit(X_train, y_train)

y_test_pred = lgbm.predict(X_test)

showStatistcs(y_test_pred, y_test, 'LightGBM')

[LightGBM] [Info] Number of positive: 322, number of negative: 138
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 460, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.700000 -> initscore=0.847298
[LightGBM] [Info] Start training from score 0.847298
the performance metrics for test data using LightGBM:

Confusion matrix:
[[25  6]
 [29 94]]

Accuracy Score :77.2727

Precision Score :94.0000

Recall Score :76.4228

f1 Score :84.3049



In [12]:
detree = DecisionTreeClassifier(random_state=0)
detree.fit(X_train, y_train)

y_test_pred = detree.predict(X_test)

showStatistcs(y_test_pred, y_test, 'DecisionTrees')

the performance metrics for test data using DecisionTrees:

Confusion matrix:
[[35 24]
 [19 76]]

Accuracy Score :72.0779

Precision Score :76.0000

Recall Score :80.0000

f1 Score :77.9487

