In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
data=pd.read_csv('Train.csv')
test=pd.read_csv('Test.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
data.shape

(614, 13)

In [4]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
data.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [6]:
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

# Null Imputation

In [7]:
data.Gender.fillna(data.Gender.mode()[0],inplace=True)
test.Gender.fillna(data.Gender.mode()[0],inplace=True)

data.Married.fillna(data.Married.mode()[0],inplace=True)

data.Dependents.fillna(data.Dependents.mode()[0],inplace=True)
test.Dependents.fillna(data.Dependents.mode()[0],inplace=True)

data.Self_Employed.fillna(data.Self_Employed.mode()[0],inplace=True)
test.Self_Employed.fillna(data.Self_Employed.mode()[0],inplace=True)

data.LoanAmount.fillna(data.LoanAmount.median(),inplace=True)
test.LoanAmount.fillna(data.LoanAmount.median(),inplace=True)

data.Loan_Amount_Term.fillna(data.Loan_Amount_Term.mode()[0],inplace=True)
test.Loan_Amount_Term.fillna(data.Loan_Amount_Term.mode()[0],inplace=True)

data.Credit_History.fillna("Unknown",inplace=True)
test.Credit_History.fillna("Unknown",inplace=True)

# Oulier Removal 

In [8]:
from scipy import stats
data['z_score_ApplicantIncome'] = np.abs(stats.zscore(data['ApplicantIncome']))
data['z_score_LoanAmount']=np.abs(stats.zscore(data['LoanAmount']))
data=data[data['z_score_ApplicantIncome']<=3]
data=data[data['z_score_LoanAmount']<=3]
data.drop(columns=['z_score_ApplicantIncome','z_score_LoanAmount'],axis=1,inplace=True)

# Encoding

In [9]:
data['Gender']=data['Gender'].map({"Male":0,"Female":1})
test['Gender']=test['Gender'].map({"Male":0,"Female":1})

data['Married']=data['Married'].map({"Yes":1,"No":0})
test['Married']=test['Married'].map({"Yes":1,"No":0})

data['Dependents']=data['Dependents'].map({"0":0,"1":1,"2":2,"3+":3})
test['Dependents']=test['Dependents'].map({"0":0,"1":1,"2":2,"3+":3})

data['Education']=data['Education'].map({"Graduate":1,'Not Graduate':0})
test['Education']=test['Education'].map({"Graduate":1,'Not Graduate':0})

data['Self_Employed']=data['Self_Employed'].map({"Yes":1,"No":0})
test['Self_Employed']=test['Self_Employed'].map({"Yes":1,"No":0})

data['Credit_History']=data['Credit_History'].astype(str)
test['Credit_History']=test['Credit_History'].astype(str)
data['Credit_History']=data['Credit_History'].map({"1.0":1,"0.0":0,"Unknown":2})
test['Credit_History']=test['Credit_History'].map({"1.0":1,"0.0":0,"Unknown":2})

data['Property_Area']=data['Property_Area'].map({'Urban':0,'Rural':1,'Semiurban':2})
test['Property_Area']=test['Property_Area'].map({'Urban':0,'Rural':1,'Semiurban':2})

# Feature Engineering

In [10]:
data['Total_Income']=data['ApplicantIncome']+data['CoapplicantIncome']
test['Total_Income']=test['ApplicantIncome']+test['CoapplicantIncome']

data['Debt_Income_Ratio'] = (data['ApplicantIncome']+ data['CoapplicantIncome']) / data['LoanAmount']
test['Debt_Income_Ratio'] = (test['ApplicantIncome']+ test['CoapplicantIncome']) / test['LoanAmount']

In [513]:
bins=[0,2500,4000,6000,81000]
group=[1,2,3,4]
data['Income_bin'] = pd.cut(data['ApplicantIncome'],bins,labels=group)
test['Income_bin'] = pd.cut(test['ApplicantIncome'],bins,labels=group)

bins=[0,1000,3000,42000] 
group=[1,2,3] 
data['Coapplicant_Income_bin']=pd.cut(data['CoapplicantIncome'],bins,labels=group)
test['Coapplicant_Income_bin']=pd.cut(test['CoapplicantIncome'],bins,labels=group)

bins=[0,2500,4000,6000,81000]
group=[1,2,3,4] 
data['Total_Income_bin']=pd.cut(data['Total_Income'],bins,labels=group)
test['Total_Income_bin']=pd.cut(test['Total_Income'],bins,labels=group)

interest_rate=0.08
data['EMI'] = data.apply(lambda x: ((x['LoanAmount']*1000)/x['Loan_Amount_Term']) ,axis=1)
data['Residual_monthly_income'] = (data['Total_Income']/12)-(data['EMI'])
test['EMI'] = test.apply(lambda x: ((x['LoanAmount']*1000)/x['Loan_Amount_Term']) ,axis=1)
test['Residual_monthly_income'] = (test['Total_Income']/12)-(test['EMI'])

data['Redisual_Status'] = data['Residual_monthly_income'].apply(lambda x: 0 if x<0 else 1)
test['Redisual_Status'] = test['Residual_monthly_income'].apply(lambda x: 0 if x<0 else 1)

In [514]:
data['Loan_Status']=data['Loan_Status'].map({"Y":1,"N":0})

In [515]:
data.drop(columns=['Loan_ID'],axis=1,inplace=True)
X_test=test.drop(columns=['Loan_ID'],axis=1)

In [516]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income,Debt_Income_Ratio,Income_bin,Coapplicant_Income_bin,Total_Income_bin,EMI,Residual_monthly_income,Redisual_Status
0,0,0,0,1,0,5849,0.0,128.0,360.0,1,0,1,5849.0,45.695312,3,,3,355.555556,131.861111,1
1,0,1,1,1,0,4583,1508.0,128.0,360.0,1,1,0,6091.0,47.585938,3,2.0,4,355.555556,152.027778,1
2,0,1,0,1,1,3000,0.0,66.0,360.0,1,0,1,3000.0,45.454545,2,,2,183.333333,66.666667,1
3,0,1,0,0,0,2583,2358.0,120.0,360.0,1,0,1,4941.0,41.175,2,2.0,3,333.333333,78.416667,1
4,0,0,0,1,0,6000,0.0,141.0,360.0,1,0,1,6000.0,42.553191,3,,3,391.666667,108.333333,1


# Train-Validation Split

In [517]:
from sklearn.model_selection import train_test_split
train, validation = train_test_split(
    data,
    test_size=0.20,
    random_state=0,
      )

X_train=train.drop(columns=['Loan_Status'])
y_train=train['Loan_Status']

X_validation=validation.drop(columns=['Loan_Status'])
y_validation=validation['Loan_Status']

X_train.shape, y_train.shape, X_validation.shape, y_validation.shape

((475, 19), (475,), (119, 19), (119,))

# Feature Selection

In [483]:
# from catboost import CatBoostClassifier
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# sfs= SFS(CatBoostClassifier(random_state=0,iterations=500),
#          k_features=(1,19),
#          forward=True,
#          floating=False,
#          verbose=2,
#          scoring='accuracy',
#          cv=3,
#          n_jobs=-1
#         ).fit(X_train,y_train)
# sfs.k_feature_names_

In [519]:
features=['Self_Employed',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Debt_Income_Ratio',
 'Residual_monthly_income',
 'Redisual_Status']

X_train=X_train[features]
X_validation=X_validation[features]
X_test=X_test[features]
X_train.head()

Unnamed: 0,Self_Employed,Loan_Amount_Term,Credit_History,Property_Area,Debt_Income_Ratio,Residual_monthly_income,Redisual_Status
231,0,180.0,1,1,88.47619,76.333333,1
564,0,360.0,0,0,34.104651,16.583333,1
481,0,360.0,1,1,27.389381,-55.972222,0
45,0,360.0,1,0,38.75,39.722222,1
213,1,360.0,1,1,43.869231,114.138889,1


# XGB Training Model

In [499]:
from sklearn import metrics

In [500]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=1)
xgb.fit(X_train,y_train)
y_pred_xgb=xgb.predict(X_validation)
print("Accuracy for xgb:",metrics.accuracy_score(y_validation, y_pred_xgb))
print("f1 score for xgb:",metrics.f1_score(y_validation,y_pred_xgb))
# Accuracy for xgb: 0.8151260504201681
# f1 score for xgb: 0.8735632183908046

Accuracy for xgb: 0.7815126050420168
f1 score for xgb: 0.8488372093023256


In [362]:
# params={
#  "learning_rate"    : [ 0.03,0.04,0.05] ,
#  "max_depth"        : [ 4,5,6],
#  "min_child_weight" : [ 1,2,3,],
#  "gamma"            : [ 0.0,0.05, 0.1 ],
#  "colsample_bytree" : [ 0.85,0.9,0.95,1 ],
#  "n_estimators"     : [70, 80, 90, 100]
# }
# classifier=XGBClassifier(random_state=0)
# from sklearn.model_selection import GridSearchCV
# grid_search=GridSearchCV(classifier,param_grid=params,scoring='accuracy',n_jobs=-1,cv=3,verbose=2)
# grid_search.fit(X_train,y_train)

In [524]:
#print(grid_search.best_params_)
xgb_tuned=XGBClassifier(random_state=0,colsample_bytree=0.85,learning_rate=0.03,max_depth=4,min_child_weight=2,
                       n_estimators=80)

In [525]:
xgb_tuned.fit(X_train,y_train)
y_pred_xgb_tuned=xgb_tuned.predict(X_validation)
print("Accuracy for xgb_tuned:",metrics.accuracy_score(y_validation, y_pred_xgb_tuned))
print("f1 score for xgb_tuned:",metrics.f1_score(y_validation,y_pred_xgb_tuned))
# Accuracy for xgb_tuned: 0.8235294117647058
# f1 score for xgb_tuned: 0.883977900552486

Accuracy for xgb_tuned: 0.8319327731092437
f1 score for xgb_tuned: 0.8901098901098902


# LGB Training Model

In [489]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(random_state=2)
lgb.fit(X_train,y_train)
y_pred_lgb=lgb.predict(X_validation)
print("Accuracy for lgb:",metrics.accuracy_score(y_validation, y_pred_lgb))
print("f1 score for lgb:",metrics.f1_score(y_validation,y_pred_lgb))

Accuracy for lgb: 0.7899159663865546
f1 score for lgb: 0.8571428571428571


In [398]:
# params={
#  "learning_rate"    : [0.008,0.009,0.01,0.02,0.03] ,
#  "boosting_type"    : ['gdbt','dart'],
#  "max_depth"        : [-1,2,4,6,8],
#  "n_estimators"     : [50,60,70,80,100,150]
# }

# classifier=LGBMClassifier(random_state=0)
# from sklearn.model_selection import GridSearchCV
# grid_search=GridSearchCV(classifier,param_grid=params,scoring='accuracy',n_jobs=-1,cv=3,verbose=2)
# grid_search.fit(X_train,y_train)

In [490]:
#print(grid_search.best_params_)
lgb_tuned=LGBMClassifier(random_state=0,boosting_type='dart',max_depth=-1,n_estimators=80,learning_rate=0.008)

In [491]:
lgb_tuned.fit(X_train,y_train)
y_pred_lgb_tuned=lgb_tuned.predict(X_validation)
print("Accuracy for lgb_tuned:",metrics.accuracy_score(y_validation, y_pred_lgb_tuned))
print("f1 score for lgb_tuned:",metrics.f1_score(y_validation,y_pred_lgb_tuned))
# Accuracy for lgb_tuned: 0.8235294117647058
# f1 score for lgb_tuned: 0.8864864864864865

Accuracy for lgb_tuned: 0.8235294117647058
f1 score for lgb_tuned: 0.8864864864864865


# CatB Training Model

In [520]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier(random_state=100,logging_level='Silent')
cat.fit(X_train,y_train)
y_pred_cat=cat.predict(X_validation)
print("Accuracy for cat:",metrics.accuracy_score(y_validation, y_pred_cat))
print("f1 score for cat:",metrics.f1_score(y_validation,y_pred_cat))
# Accuracy for cat: 0.8151260504201681
# f1 score for cat: 0.8791208791208791

Accuracy for cat: 0.8235294117647058
f1 score for cat: 0.88268156424581


In [370]:
# params={
#  "learning_rate"    : [0.005,0.007,0.0075,0.008,0.01],
#  "depth"            : [6,7,8,9],
#  "iterations"       : [800,1000,1200,1500,1800],
#  'l2_leaf_reg'      : [1,2,3],
# }
# classifier=CatBoostClassifier(random_state=0,logging_level='Silent')
# from sklearn.model_selection import GridSearchCV
# grid_search=GridSearchCV(classifier,param_grid=params,scoring='accuracy',n_jobs=-1,cv=3,verbose=2)
# grid_search.fit(X_train,y_train)

In [493]:
#print(grid_search.best_params_)
cat_tuned=CatBoostClassifier(random_state=0,logging_level='Silent',learning_rate=0.005,depth=8,
                             iterations=800,l2_leaf_reg=3)

In [494]:
cat_tuned.fit(X_train,y_train)
y_pred_cat_tuned=cat_tuned.predict(X_validation)
print("Accuracy for cat_tuned:",metrics.accuracy_score(y_validation, y_pred_cat_tuned))
print("f1 score for cat_tuned:",metrics.f1_score(y_validation,y_pred_cat_tuned))
# Accuracy for cat_tuned: 0.8235294117647058
# f1 score for cat_tuned: 0.8864864864864865

Accuracy for cat_tuned: 0.8235294117647058
f1 score for cat_tuned: 0.8852459016393442


# VC

In [495]:
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier([('cat', cat_tuned), ('lgb', lgb_tuned),('xgb',xgb_tuned)])
y_pred_vc=vc.fit(X_train, y_train).predict(X_validation)
print("Accuracy for cat_lgb_xgb:",metrics.accuracy_score(y_validation, y_pred_vc))
print("f1 score for cat_lgb_xgb:",metrics.f1_score(y_validation,y_pred_vc))

Accuracy for cat_lgb_xgb: 0.8235294117647058
f1 score for cat_lgb_xgb: 0.8852459016393442


# Logistic Regression

In [496]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train,y_train)
lr_pred=lr.predict(X_validation)
print(metrics.accuracy_score(y_validation, lr_pred))
print(metrics.f1_score(y_validation,lr_pred))

0.8235294117647058
0.8864864864864865


# Final Prediction

In [526]:
y_test=xgb_tuned.predict(X_test)

In [527]:
submission=pd.DataFrame({'Loan_ID': test["Loan_ID"],'Loan_Status':y_test})
submission['Loan_Status']=np.where(submission['Loan_Status']==1,'Y','N')

In [528]:
submission.to_csv('xgb_new_sub.csv',index=False)