In [124]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.metrics import accuracy_score, r2_score, classification_report, roc_auc_score


import warnings
warnings.filterwarnings('ignore')


In [125]:
df = pd.read_csv(r'C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\loan.csv')
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [126]:
df.set_index('Loan_ID', inplace=True)

In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [128]:
X = df.drop(['Loan_Status'], axis=1)
y = df['Loan_Status'] 

In [129]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 0)

In [130]:
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, make_column_selector(dtype_include=[int, float])),
        ('cat', cat_transformer, make_column_selector(dtype_include=object))
    ],
    remainder='passthrough'
)

# Define the full pipeline with logistic regression
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=200))
])

# Parameter grid for logistic regression solvers
params = {
    'classifier__solver': [
        'lbfgs',
        'liblinear',
        'newton-cg',
        'newton-cholesky',
        'sag'
    ]
}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)


# Apply GridSearchCV
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, error_score='raise')
gcv.fit(X_train, y_train)


# Output best parameters and scores
print("Best Parameters:", gcv.best_params_)
print("Best Cross-Validation Score:", gcv.best_score_)

# Test set predictions and performance
y_pred = gcv.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Best Parameters: {'classifier__solver': 'lbfgs'}
Best Cross-Validation Score: 0.8043083900226756
Test Accuracy: 0.8373983739837398
Classification Report:
               precision    recall  f1-score   support

           N       0.88      0.45      0.60        33
           Y       0.83      0.98      0.90        90

    accuracy                           0.84       123
   macro avg       0.86      0.72      0.75       123
weighted avg       0.84      0.84      0.82       123



In [131]:
best_model=gcv.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
y_pred_proba = y_pred_proba[:, 1]

print("\naccuracy_scor: ", accuracy_score(y_test, y_pred))
print("\nclassification_report: \n", classification_report(y_test, y_pred))
print("\nroc_auc_score: ", roc_auc_score(y_test, y_pred_proba))


accuracy_scor:  0.8373983739837398

classification_report: 
               precision    recall  f1-score   support

           N       0.88      0.45      0.60        33
           Y       0.83      0.98      0.90        90

    accuracy                           0.84       123
   macro avg       0.86      0.72      0.75       123
weighted avg       0.84      0.84      0.82       123


roc_auc_score:  0.7932659932659932


--- 
Logistic Regression

In [132]:
df.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [133]:
#Sir
loan = df.drop('Loan_Status', axis=1)

In [143]:
#Sir
#Testing

imp_cat = SimpleImputer(strategy='constant', fill_value="unknown").set_output(transform="pandas")
imp_num = SimpleImputer(strategy="median").set_output(transform="pandas")
trans_imp = make_column_transformer((imp_cat, make_column_selector(dtype_include=object)),
                                  (imp_num, make_column_selector(dtype_exclude=object)),
                                    verbose_feature_names_out=False)

trans_imp = trans_imp.set_output(transform='pandas')

loan_imp = trans_imp.fit_transform(loan)
print(loan_imp.isnull().sum().sum())
print(loan_imp.columns)



0
Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')


In [135]:
######
ohe = OneHotEncoder(handle_unknown='ignore',sparse_output=False, drop='first').set_output(transform='pandas')
trans_ohe = make_column_transformer((ohe, make_column_selector(dtype_include=object)),
                                    ('passthrough', make_column_selector(dtype_exclude=object)),
                                      verbose_feature_names_out=False)

# trans_ohe = trans_ohe.set_output(transform='pandas')

# X_imp_ohe trans_ohe.fit_transform(X_imp)
# X_imp_ohe.dtypes

In [136]:
lr = LogisticRegression(random_state=24)

pipe = Pipeline([('IMP', trans_imp), ('OHE', trans_ohe), ('LR', lr)])
kfold = StratifiedKFold(n_splits=5, random_state= 24, shuffle= True)
params = {'LR__solver': ['lbfgs", "liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}
gcv = GridSearchCV(pipe, param_grid=params, scoring ='roc_auc', cv=kfold, verbose=3)

gcv.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[CV 1/5] END .......LR__solver=lbfgs", "liblinear;, score=nan total time=   0.0s
[CV 2/5] END .......LR__solver=lbfgs", "liblinear;, score=nan total time=   0.0s
[CV 3/5] END .......LR__solver=lbfgs", "liblinear;, score=nan total time=   0.0s
[CV 4/5] END .......LR__solver=lbfgs", "liblinear;, score=nan total time=   0.0s
[CV 5/5] END .......LR__solver=lbfgs", "liblinear;, score=nan total time=   0.0s
[CV 1/5] END ..............LR__solver=newton-cg;, score=0.846 total time=   0.0s
[CV 2/5] END ..............LR__solver=newton-cg;, score=0.702 total time=   0.0s
[CV 3/5] END ..............LR__solver=newton-cg;, score=0.816 total time=   0.0s
[CV 4/5] END ..............LR__solver=newton-cg;, score=0.719 total time=   0.0s
[CV 5/5] END ..............LR__solver=newton-cg;, score=0.608 total time=   0.0s
[CV 1/5] END ........LR__solver=newton-cholesky;, score=0.846 total time=   0.0s
[CV 2/5] END ........LR__solver=newton-cholesky;, score=0.702 total time=   0.0s
[CV 3/5] END ........LR__sol

In [137]:
print(gcv.best_params_)
print(gcv.best_score_)

{'LR__solver': 'newton-cg'}
0.7381164193694285


In [138]:
best_model=gcv.best_estimator_

In [139]:
y_pred = best_model.predict(X_test)

In [140]:
y_pred_proba = best_model.predict_proba(X_test)
y_pred_proba = y_pred_proba[:, 1] #roc_auc_score: Takes y_test and the predicted probabilities (y_pred_proba) for the positive class.

In [141]:
print("\naccuracy_scor: ", accuracy_score(y_test, y_pred))
print("\nclassification_report: \n", classification_report(y_test, y_pred))
print("\nroc_auc_score: ", roc_auc_score(y_test, y_pred_proba))


accuracy_scor:  0.8373983739837398

classification_report: 
               precision    recall  f1-score   support

           N       0.88      0.45      0.60        33
           Y       0.83      0.98      0.90        90

    accuracy                           0.84       123
   macro avg       0.86      0.72      0.75       123
weighted avg       0.84      0.84      0.82       123


roc_auc_score:  0.81986531986532


---
KNN

In [156]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

imp_cat = SimpleImputer(
  strategy='constant', 
  fill_value="unknown"
).set_output(transform="pandas")

imp_num = SimpleImputer(
  strategy="median"
).set_output(transform="pandas") 

trans_imp = make_column_transformer(
  (imp_cat, make_column_selector(dtype_include=object)),
  (imp_num, make_column_selector(dtype_exclude=object)),
  verbose_feature_names_out=False
).set_output(transform='pandas')

scale_mm = MinMaxScaler()
scale_std = StandardScaler()

ohe = OneHotEncoder(handle_unknown='ignore',sparse_output=False, drop='first').set_output(transform='pandas')
trans_ohe = make_column_transformer((ohe, make_column_selector(dtype_include=object)),
                                    ('passthrough', make_column_selector(dtype_exclude=object)),
                                      verbose_feature_names_out=False)

knn = KNeighborsClassifier()

pipe = Pipeline([
    ('IMP', trans_imp), 
    ('OHE', trans_ohe),
    ('SCL' ,None),
    ('KNN', knn)
])

kfold = StratifiedKFold(n_splits=5, random_state= 24, shuffle= True)
params = {
    'KNN__n_neighbors': np.arange(1, 30, 3),
    'SCL' : [None, scale_mm, scale_std],
}
gcv = GridSearchCV(pipe, param_grid=params, scoring ='roc_auc', cv=kfold, verbose=3)

gcv.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.538 total time=   0.0s
[CV 2/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.506 total time=   0.0s
[CV 3/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.513 total time=   0.0s
[CV 4/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.450 total time=   0.0s
[CV 5/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.511 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.754 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.549 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.698 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.673 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.597 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.801 total time=   0.0s
[CV 2/5] END KNN__n_n

In [155]:

gcv.best_score_, gcv.best_params_

(0.741161745342204, {'KNN__n_neighbors': 22, 'SCL': StandardScaler()})