In [1]:
#metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np



In [6]:
# Read the training table from disk and preview its first five rows.
data = pd.read_csv('Dataset1.csv')
data.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0.0,0.0,0.0,1,0.0,2.160333,0.0,1.579397,360.0,1.0,0,Y
1,LP001003,0.0,1.0,1.0,1,0.0,2.13181,1508.0,1.579397,360.0,1.0,1,N
2,LP001005,0.0,1.0,0.0,1,1.0,2.080237,0.0,1.432618,360.0,1.0,0,Y
3,LP001006,0.0,1.0,0.0,0,0.0,2.061368,2358.0,1.566007,360.0,1.0,0,Y
4,LP001008,0.0,0.0,0.0,1,0.0,2.163267,0.0,1.599137,360.0,1.0,0,Y


In [35]:
# Split the training table into features and target, and load the test file.
# 'Loan_ID' is an identifier and 'Unnamed: 0' merely mirrors the row position
# (0, 1, 2, ...), so neither is a feature; both are removed from the
# training and the test matrices so the two stay column-aligned.
# errors='ignore' keeps the drop safe when a file lacks the index column.
NON_FEATURE_COLUMNS = ['Loan_ID', 'Unnamed: 0']
X = data.drop(NON_FEATURE_COLUMNS + ['Loan_Status'], axis=1, errors='ignore')
y = data['Loan_Status']
test = pd.read_csv('X_test.csv')
X_test = test.drop(NON_FEATURE_COLUMNS, axis=1, errors='ignore')

In [36]:
# Logistic regression: fit on every training row, print the mean 10-fold
# cross-validation score, then predict labels for the test features.
logistic_reg = LogisticRegression(max_iter=400)
logistic_reg.fit(X, y)
score1 = cross_val_score(estimator=logistic_reg, X=X, y=y, cv=10)
print(np.mean(score1))
predict1 = logistic_reg.predict(X_test)



0.8095716552088842


In [37]:
# Linear SVM (dual=False, C=10): fit on every training row, print the mean
# 10-fold cross-validation score, then predict labels for the test features.
clf = LinearSVC(dual=False, max_iter=1000, C=10)
clf.fit(X, y)
score2 = cross_val_score(estimator=clf, X=X, y=y, cv=10)
print(np.mean(score2))
predict2 = clf.predict(X_test)


0.8095716552088842


In [10]:
# k-nearest neighbours with k=100: fit on every training row and print the
# mean 10-fold cross-validation score.
# NOTE(review): the feature columns shown in the previews are on very
# different scales, which distance-based models are sensitive to -- confirm
# whether scaling was intended before this step.
knn_clf = KNeighborsClassifier(n_neighbors=100)
knn_clf.fit(X, y)
score3 = cross_val_score(estimator=knn_clf, X=X, y=y, cv=10)
print(np.mean(score3))

0.6873083024854575


In [20]:
# Random-forest baseline (100 trees, depth <= 5).
# random_state pins the bootstrap sampling and per-split feature selection so
# the printed cross-validation score is the same on every run.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    n_jobs=-1,
    bootstrap=True,
    max_depth=5,
    random_state=42,
)
clf_rf.fit(X, y)
score4 = cross_val_score(clf_rf, X, y, cv=10)
print(score4.mean())

0.8063194077207825


In [12]:
# XGBoost baseline: 100 depth-1 trees with a 0.01 learning rate.
clf_boost = XGBClassifier(
    max_depth=1,
    booster='gbtree',
    n_jobs=1,
    n_estimators=100,
    learning_rate=0.01,
    min_child_weight=1,
)
# Recent xgboost releases require class labels encoded as 0..n_classes-1 and
# raise on the raw 'Y'/'N' strings, so train and score on a 0/1 encoding
# (Y -> 1, N -> 0). The class balance, and therefore the stratified folds,
# are unchanged by the encoding.
y_encoded = (y == 'Y').astype(int)
clf_boost.fit(X, y_encoded)
score5 = cross_val_score(clf_boost, X, y_encoded, cv=10)
print(score5.mean())

0.8095716552088842


In [None]:
# Inspect the feature matrix; the constructed DataFrame is the cell's output.
pd.DataFrame(X)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.0,0.0,0.0,1,0.0,2.160333,0.0,1.579397,360.0,1.0,0
1,0.0,1.0,1.0,1,0.0,2.131810,1508.0,1.579397,360.0,1.0,1
2,0.0,1.0,0.0,1,1.0,2.080237,0.0,1.432618,360.0,1.0,0
3,0.0,1.0,0.0,0,0.0,2.061368,2358.0,1.566007,360.0,1.0,0
4,0.0,0.0,0.0,1,0.0,2.163267,0.0,1.599137,360.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,0.0,1,0.0,2.075994,0.0,1.449898,360.0,1.0,1
610,0.0,1.0,3.0,1,0.0,2.118687,0.0,1.305323,180.0,1.0,1
611,0.0,1.0,1.0,1,0.0,2.196797,240.0,1.710801,360.0,1.0,0
612,0.0,1.0,2.0,1,0.0,2.189827,0.0,1.654623,360.0,1.0,0


In [13]:
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
import random
from random import randint
# Candidate values for the randomized search run on the random forest.
parameters = [
    {
        'max_depth': [3, 5, 8, 10, None],
        'n_estimators': [100, 500, 1000],
        'max_features': [1, 2, 3],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        'min_samples_leaf': [1, 2, 3, 4],
    }
]

In [14]:
def hyperparamter_opt(model, parameters, features=None, target=None,
                      n_iter=10, cv=10, random_state=None):
    """Run a randomized hyperparameter search and report the best result.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RandomizedSearchCV``.
    parameters : dict or list of dict
        Passed through as ``param_distributions``.
    features, target : optional
        Data to fit the search on. When omitted, the notebook-level ``X``
        and ``y`` are used, which is what earlier callers rely on.
    n_iter : int, default 10
        Number of parameter settings sampled.
    cv : int, default 10
        Cross-validation setting forwarded to the search.
    random_state : optional
        Seed forwarded to the search; pass a value to make the sampled
        candidates repeatable. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    tuple
        ``(best_score_, best_params_)`` of the fitted search.
    """
    # Fall back to the notebook globals only when no data was supplied, so
    # existing two-argument calls keep working unchanged.
    if features is None:
        features = X
    if target is None:
        target = y

    search = RandomizedSearchCV(
        model,
        param_distributions=parameters,
        n_iter=n_iter,
        n_jobs=-1,
        cv=cv,
        random_state=random_state,
    )
    search.fit(features, target)
    return search.best_score_, search.best_params_



In [24]:
# Randomized search over the random-forest grid; the cell shows (best score, best params).
hyperparamter_opt(model=clf_rf, parameters=parameters)

(0.8079587519830775,
 {'bootstrap': True,
  'criterion': 'gini',
  'max_depth': 8,
  'max_features': 1,
  'min_samples_leaf': 3,
  'n_estimators': 500})

In [16]:
# Candidate values for the randomized search run on the XGBoost classifier.
# NOTE(review): 0.01 breaks the otherwise even 0.05-step progression of the
# learning rates; confirm that 0.10 was not the intended value.
parameters1 = [
    {
        'max_depth': [1, 2, 3, 4, 5],
        'booster': ['gbtree', 'dart'],
        'learning_rate': [0.05, 0.01, 0.15, 0.20, 0.30],
        'min_child_weight': [1, 3, 5, 7],
        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
        'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
    }
]

In [17]:
# Randomized search over the XGBoost grid; the cell shows (best score, best params).
hyperparamter_opt(model=clf_boost, parameters=parameters1)

(0.8095716552088842,
 {'booster': 'dart',
  'colsample_bytree': 0.7,
  'gamma': 0.4,
  'learning_rate': 0.01,
  'max_depth': 1,
  'min_child_weight': 3})

In [38]:
# Random forest rebuilt with the settings reported by the randomized search.
# random_state makes the score and the test predictions reproducible;
# n_jobs=-1 matches the baseline forest so the 500 trees are built in
# parallel (it does not affect the fitted model).
clf_rf = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    bootstrap=True,
    max_depth=8,
    max_features=1,
    min_samples_leaf=3,
    n_jobs=-1,
    random_state=42,
)
clf_rf.fit(X, y)
score4 = cross_val_score(clf_rf, X, y, cv=10)
print(score4.mean())
predict3 = clf_rf.predict(X_test)

0.8063194077207825


In [39]:
# XGBoost rebuilt with the settings reported by the randomized search.
# gamma=0.4 is part of that reported best configuration and is applied here
# together with the other tuned values.
clf_boost = XGBClassifier(
    max_depth=1,
    booster='dart',
    n_jobs=1,
    n_estimators=100,
    learning_rate=0.01,
    min_child_weight=3,
    colsample_bytree=0.7,
    gamma=0.4,
)
# Recent xgboost releases require class labels encoded as 0..n_classes-1 and
# raise on the raw 'Y'/'N' strings, so train and score on a 0/1 encoding
# (Y -> 1, N -> 0).
y_encoded = (y == 'Y').astype(int)
clf_boost.fit(X, y_encoded)
score5 = cross_val_score(clf_boost, X, y_encoded, cv=10)
print(score5.mean())
# Map the numeric predictions back to the original 'Y'/'N' labels so the
# exported frame keeps the same label format as the other models.
predict4 = np.where(clf_boost.predict(X_test) == 1, 'Y', 'N')

0.8095716552088842


In [41]:
# Two-column frame: each test Loan_ID next to its logistic-regression prediction.
logireg_predict = pd.concat(
    [test['Loan_ID'], pd.DataFrame(predict1)],
    axis=1,
).rename(columns={'Loan_ID': 'LoanId', 0: 'LoanStatus'})

In [43]:
# Two-column frame: each test Loan_ID next to its linear-SVM prediction.
SVC_predict = pd.concat(
    [test['Loan_ID'], pd.DataFrame(predict2)],
    axis=1,
).rename(columns={'Loan_ID': 'LoanId', 0: 'LoanStatus'})

In [44]:
# Two-column frame: each test Loan_ID next to its random-forest prediction.
randomforest_predict = pd.concat(
    [test['Loan_ID'], pd.DataFrame(predict3)],
    axis=1,
).rename(columns={'Loan_ID': 'LoanId', 0: 'LoanStatus'})

In [45]:
# Two-column frame: each test Loan_ID next to its XGBoost prediction.
XGBOOST_predict = pd.concat(
    [test['Loan_ID'], pd.DataFrame(predict4)],
    axis=1,
).rename(columns={'Loan_ID': 'LoanId', 0: 'LoanStatus'})

In [52]:
# Display the XGBoost prediction frame as the cell output.
XGBOOST_predict

Unnamed: 0,LoanId,LoanStatus
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y


In [53]:
# Preview the first five XGBoost predictions.
XGBOOST_predict.head(n=5)

Unnamed: 0,LoanId,LoanStatus
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y


In [54]:
# Write the logistic-regression predictions to CSV without the row index.
logireg_predict.to_csv(path_or_buf='LLogiReg.csv', index=False)

In [55]:
# Write the linear-SVM predictions to CSV without the row index.
SVC_predict.to_csv(path_or_buf='SVC.csv', index=False)

In [56]:
# Write the random-forest predictions to CSV without the row index.
randomforest_predict.to_csv(path_or_buf='RandomForest.csv', index=False)

In [57]:
# Write the XGBoost predictions to CSV without the row index.
XGBOOST_predict.to_csv(path_or_buf='XGBOOST.csv', index=False)