# Data load and Cleaning

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
data=train.append(test, ignore_index=True,sort=False)

In [4]:
data.isna().sum()

Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64

In [5]:
for column in ["Gender","Married","Dependents","Self_Employed","Loan_Amount_Term","Credit_History"]:
    data[column].fillna(data[column].mode()[0],inplace=True)
    
data['LoanAmount'].fillna(data['LoanAmount'].median(),inplace=True)

data['Dependents'].replace('3+', 3,inplace=True)

In [6]:
data.isna().sum()

Loan_ID                0
Gender                 0
Married                0
Dependents             0
Education              0
Self_Employed          0
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount             0
Loan_Amount_Term       0
Credit_History         0
Property_Area          0
Loan_Status          367
dtype: int64

In [7]:
data['Loan_Status'].replace('N', 0,inplace=True) 
data['Loan_Status'].replace('Y', 1,inplace=True)

In [8]:
train=data[:614]
test=data[614:]
test_copy=data[614:]

In [9]:
train.drop(['Loan_ID'], axis=1, inplace=True)
test.drop(['Loan_ID'], axis=1, inplace=True)

test.drop(['Loan_Status'], axis=1, inplace=True)

In [10]:
train=pd.get_dummies(train) 
test=pd.get_dummies(test)

In [11]:
x=train.drop('Loan_Status',axis=1) 
y=train.Loan_Status

In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)

# Applying Models

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_knn=KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(x_train,y_train)
knn_pred=clf_knn.predict(x_test)

cfm_knn=confusion_matrix(y_test,knn_pred)
print(cfm_knn)
Accuracy_knn_train = clf_knn.score(x_train,y_train)
print("knn train score:-",Accuracy_knn_train)
Accuracy_knn_test = accuracy_score(y_test,knn_pred)
print("knn test score:-",Accuracy_knn_test)
cv_knn = cross_val_score(clf_knn, x_train, y_train, cv=5)
print("knn cross val score:-",round(cv_knn.mean(), 2) * 100)
print(classification_report(y_test,knn_pred))
knn_pred[0:10]

[[10 33]
 [26 85]]
knn train score:- 0.741304347826087
knn test score:- 0.6168831168831169
knn cross val score:- 65.0
              precision    recall  f1-score   support

         0.0       0.28      0.23      0.25        43
         1.0       0.72      0.77      0.74       111

    accuracy                           0.62       154
   macro avg       0.50      0.50      0.50       154
weighted avg       0.60      0.62      0.61       154



array([1., 1., 1., 1., 1., 1., 0., 1., 1., 0.])

In [18]:
from sklearn.model_selection import GridSearchCV

parameters=[{'n_neighbors':[1,2,3,4,5,6]}]
grid_search= GridSearchCV(estimator= clf_knn,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_knn=grid_search.best_score_
best_parameters_knn=grid_search.best_params_
print(best_accuracy_knn)
print(best_parameters_knn)

0.6478260869565218
{'n_neighbors': 5}


In [19]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_xgb=XGBClassifier(learning_rate=0.5,random_state=0)
clf_xgb.fit(x_train,y_train)
xgb_pred=clf_xgb.predict(x_test)

cfm_xgb=confusion_matrix(y_test,xgb_pred)
print(cfm_xgb)
Accuracy_xgb_train = clf_xgb.score(x_train,y_train)
print("xgb train score:-",Accuracy_xgb_train)
Accuracy_xgb_test = accuracy_score(y_test,xgb_pred)
print("xgb test score:-",Accuracy_xgb_test)
cv_xgb = cross_val_score(clf_xgb, x_train, y_train, cv=5)
print("xgb cross val score:-",round(cv_xgb.mean(), 2) * 100)
print(classification_report(y_test,xgb_pred))
xgb_pred[0:10]

[[ 24  19]
 [  9 102]]
xgb train score:- 0.9978260869565218
xgb test score:- 0.8181818181818182
xgb cross val score:- 74.0
              precision    recall  f1-score   support

         0.0       0.73      0.56      0.63        43
         1.0       0.84      0.92      0.88       111

    accuracy                           0.82       154
   macro avg       0.79      0.74      0.76       154
weighted avg       0.81      0.82      0.81       154



array([0., 1., 1., 1., 1., 0., 1., 1., 0., 1.])

In [20]:
from sklearn.model_selection import GridSearchCV

parameters=[{'learning_rate' : [0.2,0.5,0.8,0.9],'random_state':[0,1,2,3]}]
grid_search= GridSearchCV(estimator= clf_xgb,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_xgb=grid_search.best_score_
best_parameters_xgb=grid_search.best_params_
print(best_accuracy_xgb)
print(best_parameters_xgb)

0.7413043478260869
{'learning_rate': 0.5, 'random_state': 0}


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_rf=RandomForestClassifier(criterion='entropy',random_state = 1,n_estimators = 20)
clf_rf.fit(x_train,y_train)
rf_pred=clf_rf.predict(x_test)

cfm_rf=confusion_matrix(y_test,rf_pred)
print(cfm_rf)
Accuracy_rf_train = clf_rf.score(x_train,y_train)
print("rf train score:-",Accuracy_rf_train)
Accuracy_rf_test = accuracy_score(y_test,rf_pred)
print("rf test score:-",Accuracy_rf_test)
cv_rf = cross_val_score(clf_rf, x_train, y_train, cv=5)
print("rf cross val score:-",round(cv_rf.mean(), 2) * 100)
print(classification_report(y_test,rf_pred))
rf_pred[0:10]

[[ 23  20]
 [ 11 100]]
rf train score:- 0.9956521739130435
rf test score:- 0.7987012987012987
rf cross val score:- 78.0
              precision    recall  f1-score   support

         0.0       0.68      0.53      0.60        43
         1.0       0.83      0.90      0.87       111

    accuracy                           0.80       154
   macro avg       0.75      0.72      0.73       154
weighted avg       0.79      0.80      0.79       154



array([1., 1., 0., 1., 1., 0., 1., 0., 0., 1.])

In [22]:
from sklearn.model_selection import GridSearchCV

parameters=[{'n_estimators' : [10,20,30], 'criterion' : ['entropy'],'random_state':[0,1,2]},
           {'n_estimators' : [10,20,30], 'criterion' : ['gini'],'random_state':[0,1,2]}]
grid_search= GridSearchCV(estimator= clf_rf,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_rf=grid_search.best_score_
best_parameters_rf=grid_search.best_params_
print(best_accuracy_rf)
print(best_parameters_rf)

0.7782608695652175
{'criterion': 'entropy', 'n_estimators': 20, 'random_state': 1}


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_log=LogisticRegression(random_state=0)
clf_log.fit(x_train,y_train)
log_pred=clf_log.predict(x_test)

cfm_log=confusion_matrix(y_test,log_pred)
print(cfm_log)
Accuracy_log_train = clf_log.score(x_train,y_train)
print("log train score:-",Accuracy_log_train)
Accuracy_log_test = accuracy_score(y_test,log_pred)
print("log test score:-",Accuracy_log_test)
cv_log = cross_val_score(clf_log, x_train, y_train, cv=5)
print("log cross val score:-",round(cv_log.mean(), 2) * 100)
print(classification_report(y_test,log_pred))
log_pred[0:10]

[[ 20  23]
 [  6 105]]
log train score:- 0.7869565217391304
log test score:- 0.8116883116883117
log cross val score:- 80.0
              precision    recall  f1-score   support

         0.0       0.77      0.47      0.58        43
         1.0       0.82      0.95      0.88       111

    accuracy                           0.81       154
   macro avg       0.79      0.71      0.73       154
weighted avg       0.81      0.81      0.80       154



array([1., 1., 1., 1., 1., 0., 1., 1., 0., 1.])

In [25]:
from sklearn.model_selection import GridSearchCV

parameters=[{'random_state':[0,1,2,3,4,5,6,7,8,9,10]}]
grid_search= GridSearchCV(estimator= clf_log,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_log=grid_search.best_score_
best_parameters_log=grid_search.best_params_
print(best_accuracy_log)
print(best_parameters_log)

0.8
{'random_state': 0}


In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_nb=MultinomialNB()
clf_nb.fit(x_train,y_train)
nb_pred=clf_nb.predict(x_test)

cfm_nb=confusion_matrix(y_test,nb_pred)
print(cfm_nb)
Accuracy_nb_train = clf_nb.score(x_train,y_train)
print("nb train score:-",Accuracy_nb_train)
Accuracy_nb_test = accuracy_score(y_test,nb_pred)
print("nb test score:-",Accuracy_nb_test)
cv_nb = cross_val_score(clf_nb, x_train, y_train, cv=5)
print("nb cross val score:-",round(cv_nb.mean(), 2) * 100)
print(classification_report(y_test,nb_pred))
nb_pred[0:10]

[[22 21]
 [60 51]]
nb train score:- 0.49130434782608695
nb test score:- 0.474025974025974
nb cross val score:- 49.0
              precision    recall  f1-score   support

         0.0       0.27      0.51      0.35        43
         1.0       0.71      0.46      0.56       111

    accuracy                           0.47       154
   macro avg       0.49      0.49      0.45       154
weighted avg       0.59      0.47      0.50       154



array([1., 1., 0., 0., 0., 0., 1., 1., 1., 0.])

In [27]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score

clf_svc=SVC(gamma=0.1)
clf_svc.fit(x_train,y_train)
svc_pred=clf_svc.predict(x_test)

cfm_svc=confusion_matrix(y_test,svc_pred)
print(cfm_svc)
Accuracy_svc_train = clf_svc.score(x_train,y_train)
print("svc train score:-",Accuracy_svc_train)
Accuracy_svc_test = accuracy_score(y_test,svc_pred)
print("svc test score:-",Accuracy_svc_test)
cv_svc = cross_val_score(clf_svc, x_train, y_train, cv=5)
print("svc cross val score:-",round(cv_svc.mean(), 2) * 100)
print(classification_report(y_test,svc_pred))
svc_pred[0:10]

[[  1  42]
 [  0 111]]
svc train score:- 1.0
svc test score:- 0.7272727272727273
svc cross val score:- 68.0
              precision    recall  f1-score   support

         0.0       1.00      0.02      0.05        43
         1.0       0.73      1.00      0.84       111

    accuracy                           0.73       154
   macro avg       0.86      0.51      0.44       154
weighted avg       0.80      0.73      0.62       154



array([1., 1., 1., 0., 1., 1., 1., 1., 1., 1.])

In [28]:
from sklearn.model_selection import GridSearchCV

parameters=[{'gamma':[0.1,0.2,0.3]}]
grid_search= GridSearchCV(estimator= clf_svc,param_grid=parameters,scoring='accuracy',cv=5,n_jobs= -1)
grid_search=grid_search.fit(x_train,y_train)
best_accuracy_svc=grid_search.best_score_
best_parameters_svc=grid_search.best_params_
print(best_accuracy_svc)
print(best_parameters_svc)

0.6760869565217391
{'gamma': 0.1}


# We got good accuracy on Logistic Model.Therefore we are using it for the prediction.

In [15]:
pred_log=clf_log.predict(test)
df = pd.DataFrame({'Id':test_copy.Loan_ID,'Loan Status':pred_log})
df['Loan Status'].replace(0, 'N',inplace=True) 
df['Loan Status'].replace(1, 'Y',inplace=True)
df.to_csv('prediction.csv', index = False)