In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report


# Load the loan-application records from the Excel workbook.
DATA_PATH = "Copy of loan.xlsx"  # Replace with your data file path
data = pd.read_excel(DATA_PATH)

# Only the final expression of a cell is rendered automatically, so a bare
# `data.head()` in the middle of the cell is silently discarded; display it explicitly.
display(data.head())

# Column dtypes and non-null counts; the counts show which columns contain missing values.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [37]:

# Every object-typed column except the row identifier is treated as categorical.
# Note that this list also picks up the target column, Loan_Status.
categorical_features = [
    col for col in data.columns if data[col].dtype == "object" and col != "Loan_ID"
]

# Fill missing entries BEFORE casting to str: astype(str) turns NaN into the literal
# string "nan", after which fillna has nothing left to fill. fillna also returns a new
# Series rather than modifying in place, so its result must be assigned back.
for col in categorical_features:
    data[col] = data[col].fillna("other").astype(str)

# Integer-encode each column with its own LabelEncoder, stored by column name so the
# codes can later be mapped back to the original labels via inverse_transform.
# `le` is left bound to the encoder of the last column, as it was before.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

print(data[categorical_features].dtypes)

Gender           int32
Married          int32
Dependents       int32
Education        int32
Self_Employed    int32
Property_Area    int32
Loan_Status      int32
dtype: object


In [39]:

# Separate the predictors from the target; Loan_ID is a row identifier, not a feature.
features = data.drop(["Loan_Status", "Loan_ID"], axis=1)  # Drop Loan_Status and Loan_ID columns
target = data["Loan_Status"]

# Split BEFORE imputing so the imputer's statistics are learned from training rows only.
# Fitting it on the full dataset would leak information from the test rows into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Replace missing values with each column's most frequent training value.
imputer = SimpleImputer(strategy="most_frequent")
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Kept so that any later cell relying on the fully imputed matrix still finds it;
# it is filled using the training-set statistics learned above.
features_imputed = imputer.transform(features)




# Gaussian Naive Bayes: fit on the training split, then score on the held-out split.
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
gnb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_gnb)
print("GaussianNB Accuracy:", gnb_accuracy)


# Random forest of 100 trees with a fixed seed; fit on the training split and
# score on the held-out split.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

# Gradient boosting with 100 stages and a fixed seed; fit on the training split and
# score on the held-out split.
gb = GradientBoostingClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_gb)

# Overall accuracy first, then the per-class precision / recall / F1 breakdown.
print("Gradient Boosting Accuracy:", gb_accuracy)
print(classification_report(y_true=y_test, y_pred=y_pred_gb))

# Choose the model with the highest held-out accuracy.
# The previous expression fell back to `best_algorithm` itself, which is undefined on a
# fresh kernel (NameError) and could never select Random Forest. Comparing all three
# explicitly makes the cell self-contained. On a tie, max() keeps the earliest entry, so
# Gradient Boosting still wins only when it is strictly better than the other two.
accuracy_by_model = {
    "GaussianNB": gnb_accuracy,
    "Random Forest": rf_accuracy,
    "Gradient Boosting": gb_accuracy,
}
best_algorithm = max(accuracy_by_model, key=accuracy_by_model.get)
print("Best Algorithm:", best_algorithm)

GaussianNB Accuracy: 0.7886178861788617
Random Forest Accuracy: 0.7642276422764228
Gradient Boosting Accuracy: 0.7398373983739838
              precision    recall  f1-score   support

           0       0.72      0.42      0.53        43
           1       0.74      0.91      0.82        80

    accuracy                           0.74       123
   macro avg       0.73      0.67      0.67       123
weighted avg       0.74      0.74      0.72       123

Best Algorithm: GaussianNB
