In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

Dataset link: https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/

In [2]:
#reading the dataset
df=pd.read_csv("train.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
df.columns

Index([u'Loan_ID', u'Gender', u'Married', u'Dependents', u'Education',
       u'Self_Employed', u'ApplicantIncome', u'CoapplicantIncome',
       u'LoanAmount', u'Loan_Amount_Term', u'Credit_History', u'Property_Area',
       u'Loan_Status'],
      dtype='object')

In [4]:
cols = df.columns

for i in cols:
    if df[i].isnull().sum() != 0:
        print("Column name is: ", i)
        print(df[i].isnull().sum())

('Column name is: ', 'Gender')
13
('Column name is: ', 'Married')
3
('Column name is: ', 'Dependents')
15
('Column name is: ', 'Self_Employed')
32
('Column name is: ', 'LoanAmount')
22
('Column name is: ', 'Loan_Amount_Term')
14
('Column name is: ', 'Credit_History')
50


In [5]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [6]:
#filling missing values
print(df['Gender'].value_counts())
df['Gender'].fillna('Male', inplace=True)

print(df['Married'].value_counts())
df['Married'].fillna('Yes', inplace=True)

print(df['Dependents'].value_counts())
df['Dependents'].fillna(0, inplace=True)

print(df['Self_Employed'].value_counts())
df['Self_Employed'].fillna('No', inplace=True)

print(df.LoanAmount.describe())
df['LoanAmount'].fillna(df.LoanAmount.mean(), inplace = True)

print(df['Loan_Amount_Term'].value_counts())
df['Loan_Amount_Term'].fillna(512, inplace=True)

print(df['Credit_History'].value_counts())
df['Credit_History'].fillna(1.0, inplace=True)

Male      489
Female    112
Name: Gender, dtype: int64
Yes    398
No     213
Name: Married, dtype: int64
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
No     500
Yes     82
Name: Self_Employed, dtype: int64
count    592.000000
mean     146.412162
std       85.587325
min        9.000000
25%      100.000000
50%      128.000000
75%      168.000000
max      700.000000
Name: LoanAmount, dtype: float64
360.0    512
180.0     44
480.0     15
300.0     13
84.0       4
240.0      4
120.0      3
36.0       2
60.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64
1.0    475
0.0     89
Name: Credit_History, dtype: int64


In [7]:
# Get categorical columns
cat_cols = []
for i in cols:
    if df[i].dtypes == 'object' and i != 'Loan_ID':
        print(i)
        cat_cols.append(i)

# Do label encoding for categorical columns
le = LabelEncoder()
for i in cat_cols:
    df[i] = le.fit_transform(df[i])

Gender
Married
Dependents
Education
Self_Employed
Property_Area
Loan_Status


In [8]:
#split dataset into train and test

train, test = train_test_split(df, test_size=0.3, random_state=0)

x_train=train.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_train=train['Loan_Status']

x_test=test.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_test=test['Loan_Status']

#### LogisticRegression 

In [9]:
model = LogisticRegression(random_state=1)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.8324324324324325

#### NaiveBayes 

In [10]:
model = GaussianNB()
model.fit(x_train, y_train)
model.score(x_test,y_test)

0.8216216216216217

#### SVM 

In [11]:
model = svm.SVC()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.7297297297297297

#### Decission Tree 

In [12]:
model = tree.DecisionTreeClassifier(random_state=1)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.7621621621621621

#### Bagging Classifier 

In [13]:
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(x_train, y_train)
model.score(x_test,y_test)

0.772972972972973

#### Random Forest 

In [14]:
model= RandomForestClassifier(random_state=1)
model.fit(x_train, y_train)
model.score(x_test,y_test)

0.7891891891891892

In [15]:
# Get feature importance
for i, j in sorted(zip(x_train.columns, model.feature_importances_)):
    print(i, j)

('ApplicantIncome', 0.21221305811494995)
('CoapplicantIncome', 0.12076719593397603)
('Credit_History', 0.24335456412035414)
('Dependents', 0.041700468774629074)
('Education', 0.023579208134452568)
('Gender', 0.021702525832963608)
('LoanAmount', 0.21177174138170957)
('Loan_Amount_Term', 0.03897374806321142)
('Married', 0.020001105234624308)
('Property_Area', 0.05035372625128807)
('Self_Employed', 0.015582658157841281)


#### Adaboost 

In [16]:
model = AdaBoostClassifier(random_state=1)
model.fit(x_train, y_train)
model.score(x_test,y_test)

0.8162162162162162

#### Gradient boosting classifier 

In [17]:
model= GradientBoostingClassifier(learning_rate=0.01,random_state=1)
model.fit(x_train, y_train)
model.score(x_test,y_test)

0.827027027027027

#### XGBoost 

In [18]:
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model.fit(x_train, y_train)
model.score(x_test,y_test)

  if diff:


0.827027027027027

#### LightGBM 

In [19]:
train_data=lgb.Dataset(x_train,label=y_train)
#define parameters
params = {'learning_rate':0.001}
model= lgb.train(params, train_data, 100) 
y_pred=model.predict(x_test)
for i in range(0,185):
    if y_pred[i]>=0.5: 
        y_pred[i]=1
else: 
    y_pred[i]=0
accuracy_score(y_test, y_pred)

0.7189189189189189

#### CatBoost 

In [20]:
model=CatBoostClassifier()
categorical_features_indices = np.where(df.dtypes != np.float)[0]
model.fit(x_train,y_train,cat_features=([ 0,  1, 2, 3, 4, 10]),eval_set=(x_test, y_test))
model.score(x_test,y_test)

0:	learn: 0.6614145	test: 0.6606599	best: 0.6606599 (0)	total: 58.2ms	remaining: 58.2s
1:	learn: 0.6331088	test: 0.6317961	best: 0.6317961 (1)	total: 66.1ms	remaining: 33s
2:	learn: 0.6096568	test: 0.6079755	best: 0.6079755 (2)	total: 71.8ms	remaining: 23.9s
3:	learn: 0.5905877	test: 0.5878291	best: 0.5878291 (3)	total: 75.1ms	remaining: 18.7s
4:	learn: 0.5744878	test: 0.5706316	best: 0.5706316 (4)	total: 78.8ms	remaining: 15.7s
5:	learn: 0.5604612	test: 0.5558306	best: 0.5558306 (5)	total: 82.8ms	remaining: 13.7s
6:	learn: 0.5483062	test: 0.5425377	best: 0.5425377 (6)	total: 87ms	remaining: 12.3s
7:	learn: 0.5365207	test: 0.5309211	best: 0.5309211 (7)	total: 91.9ms	remaining: 11.4s
8:	learn: 0.5240110	test: 0.5203366	best: 0.5203366 (8)	total: 98.2ms	remaining: 10.8s
9:	learn: 0.5148467	test: 0.5108900	best: 0.5108900 (9)	total: 103ms	remaining: 10.2s
10:	learn: 0.5080082	test: 0.5035216	best: 0.5035216 (10)	total: 111ms	remaining: 10s
11:	learn: 0.5031146	test: 0.4978061	best: 0.4978

0.827027027027027

#### MaxVoting 

In [21]:
model1 = LogisticRegression(random_state=1)
model2 = tree.DecisionTreeClassifier(random_state=1)
model3 = GaussianNB()
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2), ('NB', model3)])
model.fit(x_train,y_train)
model.score(x_test,y_test)

  if diff:


0.827027027027027

#### Weighted Averaging 

In [22]:
model1 = tree.DecisionTreeClassifier()
model2 = GaussianNB()
model3= LogisticRegression()

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

pred1 = model1.predict_proba(x_test)
pred2 = model2.predict_proba(x_test)
pred3 = model3.predict_proba(x_test)

weighted_prediction = (0.2*pred1)+(0.4*pred2)+(0.4*pred3)
labelprediction = np.argmax(weighted_prediction, axis = 1)

accuracy_score(labelprediction, y_test)

0.827027027027027

#### Stacking 

In [23]:
def Stacking(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]

        model.fit(X=x_train,y=y_train)
        train_pred=np.append(train_pred,model.predict(x_val))
        
    model.fit(train, y)
    test_pred=model.predict(test)
    return test_pred.reshape(-1,1),train_pred

In [24]:
model1 = tree.DecisionTreeClassifier(random_state=1)

test_pred1 ,train_pred1=Stacking(model=model1,n_fold=10, train=x_train,test=x_test,y=y_train)

train_pred1=pd.DataFrame(train_pred1)
test_pred1=pd.DataFrame(test_pred1)

In [25]:
model2 = LogisticRegression()

test_pred2 ,train_pred2=Stacking(model=model2,n_fold=10,train=x_train,test=x_test,y=y_train)

train_pred2=pd.DataFrame(train_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [26]:
df = pd.concat([train_pred1, train_pred2], axis=1)
df_test = pd.concat([test_pred1, test_pred2], axis=1)

model = LogisticRegression(random_state=1)
model.fit(df,y_train)
model.score(df_test, y_test)

0.8324324324324325

#### Blending 

In [27]:
train, test = train_test_split(train, test_size=0.2, random_state=0)

x_train=train.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_train=train['Loan_Status']

x_val=test.drop(['Loan_Status', 'Loan_ID'], axis=1)
y_val=test['Loan_Status']

x_val = x_val.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)

model1 = tree.DecisionTreeClassifier()
model1.fit(x_train, y_train)

val_pred1=model1.predict(x_val)
test_pred1=model1.predict(x_test)

val_pred1=pd.DataFrame(val_pred1)
test_pred1=pd.DataFrame(test_pred1)

model2 = LogisticRegression()
model2.fit(x_train,y_train)

val_pred2=model2.predict(x_val)
test_pred2=model2.predict(x_test)

val_pred2=pd.DataFrame(val_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [28]:
df_val = pd.concat([x_val, val_pred1,val_pred2],axis=1)
df_test = pd.concat([x_test, test_pred1,test_pred2],axis=1)

model = LogisticRegression(random_state=1)
model.fit(df_val,y_val)
model.score(df_test,y_test)

0.8108108108108109

References:

    1. https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/
    2. https://mlwave.com/kaggle-ensembling-guide/
    3. https://www.analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/
    4. https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc
    5. https://www.analyticsvidhya.com/blog/2017/08/catboost-automated-categorical-data/