In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score,log_loss
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore") 

In [2]:
df = pd.read_csv("loan.csv").set_index('Loan_ID')

In [3]:
df.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [5]:
X = df.drop('Loan_Status',axis=1)
y = df['Loan_Status']

In [6]:
impnum = SimpleImputer(strategy = 'median',).set_output(transform='pandas')

In [7]:
impcat = SimpleImputer(strategy='constant',fill_value='unknown').set_output(transform='pandas')

In [8]:
trans_imp = make_column_transformer((impcat, make_column_selector(dtype_include=object)  ),
                             (impnum,make_column_selector(dtype_exclude = object)  ),
                             verbose_feature_names_out=False).set_output(transform='pandas')


In [9]:
X_imp = trans_imp.fit_transform(X)

In [10]:
X_imp.isna().sum().sum()


0

In [11]:
ohe  = OneHotEncoder(handle_unknown='ignore',sparse_output=False,drop='first').set_output(transform='pandas')
trans_ohe = make_column_transformer(('passthrough',make_column_selector(dtype_exclude=object)),
                             (ohe,make_column_selector(dtype_include=object)),verbose_feature_names_out=False).set_output(transform='pandas')

X_imp_ohe = trans_ohe.fit_transform(X_imp)

In [12]:
X_imp_ohe.dtypes

ApplicantIncome            float64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Gender_Male                float64
Gender_unknown             float64
Married_Yes                float64
Married_unknown            float64
Dependents_1               float64
Dependents_2               float64
Dependents_3+              float64
Dependents_unknown         float64
Education_Not Graduate     float64
Self_Employed_Yes          float64
Self_Employed_unknown      float64
Property_Area_Semiurban    float64
Property_Area_Urban        float64
dtype: object

In [13]:
knn = KNeighborsClassifier()
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()


In [23]:
pipe = Pipeline([('IMP',trans_imp),('OHE',trans_ohe),('SCL',None),('KNN',knn)])

In [24]:
kfold = StratifiedKFold(n_splits=5, random_state=24,shuffle=True)
params = {'KNN__n_neighbors': np.arange(1,8),
          'KNN__metric':['cityblock','minkowski','manhattan','haversine'],
          'SCL':[scaler_mm, scaler_std, None]
          }
gcv = GridSearchCV(pipe, param_grid=params,scoring='roc_auc',cv=kfold, verbose=3)                   

In [25]:
gcv.fit(X,y)

Fitting 5 folds for each of 84 candidates, totalling 420 fits
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.662 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.618 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.622 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.698 total time=   0.0s
[CV 5/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.699 total time=   0.0s
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=0.635 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=0.644 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=0.622 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=Standard

In [26]:
print("Best Params:", gcv.best_params_)
print("Best Score:", gcv.best_score_)

Best Params: {'KNN__metric': 'minkowski', 'KNN__n_neighbors': 3, 'SCL': MinMaxScaler()}
Best Score: 0.7246499733496637


In [27]:
bm = gcv.best_estimator_
bm

In [28]:
###INFERENCING

In [29]:
test = pd.read_csv('test.csv')

In [30]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [31]:
bm.predict(test)

array(['Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [32]:
submit = pd.read_csv('sample_submission_49d68Cx.csv')

In [33]:
submit

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,N
1,LP001022,N
2,LP001031,N
3,LP001035,N
4,LP001051,N
...,...,...
362,LP002971,N
363,LP002975,N
364,LP002980,N
365,LP002986,N


In [34]:
submit['Loan_Status'] = bm.predict(test)

In [35]:
submit.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,N
