In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [5]:
df=pd.read_csv('loan_data.csv')

In [6]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [7]:
X=df.drop(columns='loan_status',axis=1)
y=df['loan_status']

In [8]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [9]:
numerical_features = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score'
]

In [10]:
categorical_features = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file'
]

In [11]:
num_pipeline=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [12]:
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


In [13]:
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features)
])


In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [16]:
def evaluate_model(y_test,y_pred):
   accuracy=accuracy_score(y_test,y_pred)
   c_report=classification_report(y_test, y_pred)
   cm=confusion_matrix(y_test,y_pred)
   return accuracy,c_report,cm
    

In [17]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        random_state=42
    ),
    
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    
    "SVM": SVC(kernel='rbf', probability=True),
    
    "KNN": KNeighborsClassifier(n_neighbors=5),
    
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ),
    
    "CatBoost": CatBoostClassifier(
        verbose=0,
        random_state=42
    )
}
model_lists = []
accuracy_lists=[]
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred  = model.predict(X_test)

    # Evaluation
    train_acc, train_report, train_cm = evaluate_model(y_train, y_train_pred)
    test_acc, test_report, test_cm = evaluate_model(y_test, y_test_pred)

    print(name)

    print("Model performance for Training set")
    print(f"- Accuracy: {train_acc:.4f}")

    print("\nModel performance for Test set")
    print(f"- Accuracy: {test_acc:.4f}")
    print("\nClassification Report:\n", test_report)
    print("Confusion Matrix:\n", test_cm)

    model_lists.append((name, model))
    accuracy_lists.append(test_acc)
    print("=" * 40)


Logistic Regression
Model performance for Training set
- Accuracy: 0.8958

Model performance for Test set
- Accuracy: 0.8993

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.94      7000
           1       0.79      0.75      0.77      2000

    accuracy                           0.90      9000
   macro avg       0.86      0.84      0.85      9000
weighted avg       0.90      0.90      0.90      9000

Confusion Matrix:
 [[6600  400]
 [ 506 1494]]
Decision Tree
Model performance for Training set
- Accuracy: 1.0000

Model performance for Test set
- Accuracy: 0.8999

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      7000
           1       0.77      0.78      0.78      2000

    accuracy                           0.90      9000
   macro avg       0.85      0.86      0.86      9000
weighted avg       0.90      0.90      0.90      9000

Confus

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost
Model performance for Training set
- Accuracy: 0.9659

Model performance for Test set
- Accuracy: 0.9364

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      7000
           1       0.89      0.81      0.85      2000

    accuracy                           0.94      9000
   macro avg       0.92      0.89      0.90      9000
weighted avg       0.94      0.94      0.94      9000

Confusion Matrix:
 [[6808  192]
 [ 380 1620]]
CatBoost
Model performance for Training set
- Accuracy: 0.9544

Model performance for Test set
- Accuracy: 0.9369

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96      7000
           1       0.90      0.80      0.85      2000

    accuracy                           0.94      9000
   macro avg       0.92      0.89      0.90      9000
weighted avg       0.94      0.94      0.94      9000

Confusion Matrix:
 [[68

In [18]:
pd.DataFrame(list(zip(model_lists,accuracy_lists)), columns=['Model Name', 'accuracy_score']).sort_values(by=["accuracy_score"],ascending=False)

Unnamed: 0,Model Name,accuracy_score
7,"(CatBoost, <catboost.core.CatBoostClassifier o...",0.936889
6,"(XGBoost, XGBClassifier(base_score=None, boost...",0.936444
2,"(Random Forest, (DecisionTreeClassifier(max_fe...",0.929778
3,"(Gradient Boosting, ([DecisionTreeRegressor(cr...",0.925222
4,"(SVM, SVC(probability=True))",0.917778
1,"(Decision Tree, DecisionTreeClassifier(random_...",0.899889
0,"(Logistic Regression, LogisticRegression(max_i...",0.899333
5,"(KNN, KNeighborsClassifier())",0.895556
