In [140]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB 
from sklearn.neural_network import MLPClassifier

In [2]:
path = 'C:\\Ravikanth\\Analytics Vidya\\1.Loan Prediction'

In [3]:
os.chdir(path)

In [4]:
train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

In [5]:
print('Train Shape: {}   Test Shape: {}' .format(train.shape , test.shape))

Train Shape: (614, 13)   Test Shape: (367, 12)


In [6]:
pd.value_counts(train['Loan_Status'].values)/len(train['Loan_Status'])*100

Y    68.729642
N    31.270358
dtype: float64

In [7]:
'''This is a perfect case of imbalanced class'''
'''Lets Do some data Preprocessing . We will cover following taks:
1. Check Data types such as Numerics , Date , Strings , Categories
2. Do Label encoding for object columns which have two categories
3. DO One Hot Encoding for object columns which have more than 2 categories
4. Do Imputation for missing values
'''


'Lets Do some data Preprocessing . We will cover following taks:\n1. Check Data types such as Numerics , Date , Strings , Categories\n2. Do Label encoding for object columns which have two categories\n3. DO One Hot Encoding for object columns which have more than 2 categories\n4. Do Imputation for missing values\n'

In [8]:
def miss_val(df):
    miss_values =  df.isnull().sum()
    miss_values_percent = df.isnull().sum()/len(df)*100
    miss_values_table = pd.concat([miss_values,miss_values_percent] , axis =1)
    miss_values_table_ren = miss_values_table.rename(columns ={ 0: 'Missing Value' , 1:'Percent'})
    return miss_values_table_ren 

In [9]:
miss_val(train)

Unnamed: 0,Missing Value,Percent
Loan_ID,0,0.0
Gender,13,2.117264
Married,3,0.488599
Dependents,15,2.442997
Education,0,0.0
Self_Employed,32,5.211726
ApplicantIncome,0,0.0
CoapplicantIncome,0,0.0
LoanAmount,22,3.583062
Loan_Amount_Term,14,2.28013


In [10]:
train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

Loan_ID          614
Gender             2
Married            2
Dependents         4
Education          2
Self_Employed      2
Property_Area      3
Loan_Status        2
dtype: int64

In [11]:
'''As you can see most of the object columns have 2 categories. So lets do label encoding to these object columns'''

'As you can see most of the object columns have 2 categories. So lets do label encoding to these object columns'

In [12]:
le = LabelEncoder()

In [13]:
#Lets do label encoding manually for train data first. Then we wil do the same for test data as well
train['Gender'] =  le.fit_transform(train['Gender'].astype(str))
train['Married'] =  le.fit_transform(train['Married'].astype(str))
train['Education'] =  le.fit_transform(train['Education'].astype(str))
train['Self_Employed'] =  le.fit_transform(train['Self_Employed'].astype(str))
mapper = { 'Y' : 1 , 'N' : 0}
train['Loan_Status'] = train['Loan_Status'].map(mapper)

In [14]:
#Lets do label encoding manually for test data first.
test['Gender'] =  le.fit_transform(test['Gender'].astype(str))
test['Married'] =  le.fit_transform(test['Married'].astype(str))
test['Education'] =  le.fit_transform(test['Education'].astype(str))
test['Self_Employed'] =  le.fit_transform(test['Self_Employed'].astype(str))

In [15]:
#Now lets do OHE (One hot encoding) for columns with more than 2 categories. Lets do it using getdummies function of Pandas
#Before doing OHE lets check the shape 
train.shape


(614, 13)

In [16]:
target_label = train['Loan_Status']
train_dm = pd.get_dummies(train.drop(columns=['Loan_ID','Loan_Status']))
train_dm['Loan_Status'] = target_label

In [17]:
train_dm.shape

(614, 17)

In [18]:
test_dm = pd.get_dummies(test.drop(columns='Loan_ID'))
test_dm.shape

(367, 16)

In [19]:
train_dm.dropna(axis = 0,inplace=True)
train_dm.shape

(529, 17)

In [20]:
miss_val(train_dm)

Unnamed: 0,Missing Value,Percent
Gender,0,0.0
Married,0,0.0
Education,0,0.0
Self_Employed,0,0.0
ApplicantIncome,0,0.0
CoapplicantIncome,0,0.0
LoanAmount,0,0.0
Loan_Amount_Term,0,0.0
Credit_History,0,0.0
Dependents_0,0,0.0


'''As you can see there is no missing value in train data. Lets do similar excersize for test data '''

In [21]:
test_dm.fillna(test_dm.median(), inplace= True)
test_dm.shape
miss_val(test_dm)

Unnamed: 0,Missing Value,Percent
Gender,0,0.0
Married,0,0.0
Education,0,0.0
Self_Employed,0,0.0
ApplicantIncome,0,0.0
CoapplicantIncome,0,0.0
LoanAmount,0,0.0
Loan_Amount_Term,0,0.0
Credit_History,0,0.0
Dependents_0,0,0.0


In [22]:
'''Now we are good to go for building model. Lets try first with logistic Regression , then with SVM, DTree, Random Forest'''

'Now we are good to go for building model. Lets try first with logistic Regression , then with SVM, DTree, Random Forest'

In [154]:
x_train , x_test , y_train , y_test = train_test_split(train_dm.iloc[:,:-1] , train_dm['Loan_Status'] , test_size = 0.2 , random_state = 10)

In [161]:
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=10)
svm = SVC(C=1.0)
dtree = DecisionTreeClassifier(criterion='gini' , random_state= 100)
rf = RandomForestClassifier(random_state=100)
mlp =MLPClassifier(max_iter=1000)

In [162]:
classifiers = [lr,knn,svm,dtree,rf,mlp]
Lables = ['Logistic Regression' , 'K Nearest Neighbours','Support Vector Machines', 'Decision Trees', 'Random Forest','MLP']
for clf , label in zip(classifiers , Lables):
    clf.fit(x_train,y_train)
    score = clf.score(x_test,y_test)
    print('Using {}  \n Accuracy: {} ' .format(label, score))

Using Logistic Regression  
 Accuracy: 0.7735849056603774 
Using K Nearest Neighbours  
 Accuracy: 0.7169811320754716 
Using Support Vector Machines  
 Accuracy: 0.6698113207547169 
Using Decision Trees  
 Accuracy: 0.7169811320754716 
Using Random Forest  
 Accuracy: 0.7358490566037735 
Using MLP  
 Accuracy: 0.5377358490566038 


In [26]:
'''As you can see the Accuracy is very bad except for Logistic Regression. Now lets try different methods to imporve the score'''

'As you can see the Accuracy is very bad except for Logistic Regression. Now lets try different methods to imporve the score'

In [27]:
'''First we we will do scaling'''

'First we we will do scaling'

In [28]:
sc = StandardScaler()
sc.fit(x_train)
x_train_sc = sc.transform(x_train)
x_test_sc = sc.transform(x_test)

In [29]:
classifiers = [lr,knn,svm,dtree,rf]
Lables = ['Logistic Regression' , 'K Nearest Neighbours','Support Vector Machines', 'Decision Trees', 'Random Forest']
for clf , label in zip(classifiers , Lables):
    clf.fit(x_train_sc,y_train)
    score = clf.score(x_test_sc,y_test)
    print('Using {}  \n Accuracy: {} ' .format(label, score))

Using Logistic Regression  
 Accuracy: 0.7830188679245284 
Using K Nearest Neighbours  
 Accuracy: 0.7452830188679245 
Using Support Vector Machines  
 Accuracy: 0.7830188679245284 
Using Decision Trees  
 Accuracy: 0.7169811320754716 
Using Random Forest  
 Accuracy: 0.7358490566037735 


In [30]:
'''As you can see , by using standardization the accuracy for SCV increased drastically. Now lets focus on creating polynomials
and decomposition'''

'As you can see , by using standardization the accuracy for SCV increased drastically. Now lets focus on creating polynomials\nand decomposition'

In [31]:
x_train['Loan_Status'] = target_label

In [32]:
x_train.corr()['Loan_Status'].sort_values(ascending = False)

Loan_Status                1.000000
Credit_History             0.570940
Property_Area_Semiurban    0.145525
Married                    0.104273
Dependents_2               0.092939
Gender                     0.064063
Self_Employed              0.047978
Dependents_3+              0.001829
Dependents_1              -0.002913
CoapplicantIncome         -0.006251
ApplicantIncome           -0.023401
Property_Area_Urban       -0.037390
Loan_Amount_Term          -0.041534
Dependents_0              -0.042372
LoanAmount                -0.067185
Education                 -0.080639
Property_Area_Rural       -0.118451
Name: Loan_Status, dtype: float64

In [33]:
polynomials = PolynomialFeatures(degree=2)

In [34]:
x_train_sc = polynomials.fit_transform(x_train_sc)
x_test_sc = polynomials.fit_transform(x_test_sc)
print('X_train shape : {}  X_test Shape: {}' .format(x_train_sc.shape , x_test_sc.shape))

X_train shape : (423, 153)  X_test Shape: (106, 153)


In [35]:
classifiers = [lr,knn,svm,dtree,rf]
Lables = ['Logistic Regression' , 'K Nearest Neighbours','Support Vector Machines', 'Decision Trees', 'Random Forest']
for clf , label in zip(classifiers , Lables):
    clf.fit(x_train_sc,y_train)
    score = clf.score(x_test_sc,y_test)
    print('Using {}  \n Accuracy: {} ' .format(label, score))

Using Logistic Regression  
 Accuracy: 0.7264150943396226 
Using K Nearest Neighbours  
 Accuracy: 0.7169811320754716 
Using Support Vector Machines  
 Accuracy: 0.7735849056603774 
Using Decision Trees  
 Accuracy: 0.7358490566037735 
Using Random Forest  
 Accuracy: 0.7358490566037735 


In [36]:
'''After creating polynomial , accuracy decreased.So lets do decomposition'''

'After creating polynomial , accuracy decreased.So lets do decomposition'

In [102]:
pca = PCA(n_components=5)

In [103]:
pca.fit(x_train)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [104]:
pca.explained_variance_ratio_.sum()

0.999999960942324

In [105]:
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)

In [106]:
classifiers = [lr,knn,svm,dtree,rf]
Lables = ['Logistic Regression' , 'K Nearest Neighbours','Support Vector Machines', 'Decision Trees', 'Random Forest']
for clf , label in zip(classifiers , Lables):
    clf.fit(x_train_pca,y_train)
    score = clf.score(x_test_pca,y_test)
    print('Using {}  \n Accuracy: {} ' .format(label, score))

Using Logistic Regression  
 Accuracy: 0.660377358490566 
Using K Nearest Neighbours  
 Accuracy: 0.6509433962264151 
Using Support Vector Machines  
 Accuracy: 0.6698113207547169 
Using Decision Trees  
 Accuracy: 0.5849056603773585 
Using Random Forest  
 Accuracy: 0.6320754716981132 


In [None]:
'''Surprisingly the accuracy dropped after doing PCA. Finally lets try XGBOOST and LGBM'''

In [68]:
import lightgbm as lgbm
import xgboost as xgb

In [69]:
xgb_clf = xgb.XGBClassifier(n_estimators=100 ,reg_alpha=0 ,random_state=10)

In [118]:
xgb_clf.fit(x_train_sc , y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [119]:
xgb_clf.score(x_test_sc,y_test)

  if diff:


0.7641509433962265

In [120]:
lgbm_clf = lgbm.LGBMClassifier()
lgbm_clf.fit(x_train_sc , y_train)
lgbm_clf.score(x_test_sc,y_test)

  if diff:


0.7452830188679245

In [111]:
lr = LogisticRegression(C=1)
lr.fit(train_dm.iloc[:,:-1] , train_dm['Loan_Status'])

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [128]:
sc = StandardScaler()
X_tr = train_dm.iloc[:,:-1]
Y_tr = train_dm['Loan_Status']
sc.fit(X_tr)
X_tr_sc = sc.transform(X_tr)
X_te_sc =  sc.transform(test_dm)
xgb_clf.fit(X_tr_sc , Y_tr)
pred_loan_status=lgbm_clf.predict(X_te_sc)

  if diff:


In [143]:
sc = StandardScaler()
X_tr = train_dm.iloc[:,:-1]
Y_tr = train_dm['Loan_Status']
sc.fit(X_tr)
X_tr_sc = sc.transform(X_tr)
X_te_sc =  sc.transform(test_dm)
mlp =MLPClassifier(max_iter=500)
mlp.fit(X_tr_sc , Y_tr)
pred_loan_status=mlp.predict(X_te_sc)



In [144]:
submission_1 = pd.DataFrame(test['Loan_ID'] , columns=['Loan_ID','Loan_Status'])
submission_1['Loan_Status'] = pred_loan_status
mapper = { 1 : 'Y' , 0 : 'N'}
submission_1['Loan_Status'] = submission_1['Loan_Status'].map(mapper)
submission_1.to_csv('Submission_mlp.csv' , index= False)

In [None]:
'''Now lets try stacking'''

In [None]:
def Stacking(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]
        model.fit(X=x_train,y=y_train)
        train_pred=np.append(train_pred,model.predict(x_val))
    model.fit(train,y)
    test_pred=model.predict(test)
    return test_pred ,train_pred

In [None]:
model1 =LogisticRegression(random_state=100)
test_pred1 ,train_pred1=Stacking(model=model1,n_fold=10, train=train_dm.iloc[:,:-1],y=train_dm['Loan_Status'] ,test=test_dm)
train_pred1=pd.DataFrame(train_pred1 , columns=['LR'])
test_pred1=pd.DataFrame(test_pred1, columns=['LR'])
                        
                        

In [None]:
model2 = DecisionTreeClassifier()
test_pred2 ,train_pred2=Stacking(model=model2,n_fold=10, train=train_dm.iloc[:,:-1],y=train_dm['Loan_Status'] ,test=test_dm)
train_pred2=pd.DataFrame(train_pred2, columns=['DT'])
test_pred2=pd.DataFrame(test_pred2, columns=['DT'])

In [None]:
model3 = KNeighborsClassifier()
test_pred3 ,train_pred3=Stacking(model=model3,n_fold=10, train=train_dm.iloc[:,:-1],y=train_dm['Loan_Status'] ,test=test_dm)
train_pred3=pd.DataFrame(train_pred3, columns=['KNN'])
test_pred3=pd.DataFrame(test_pred3, columns=['KNN'])

In [None]:
df = pd.concat([train_pred1,train_pred2,train_pred3], axis=1)
df_test = pd.concat([test_pred1,test_pred2,test_pred3], axis=1)

model = LogisticRegression(C = 1, penalty='l2',random_state=100)
#model = SVC()
model.fit(df,train_dm['Loan_Status'])
model.score(df, train_dm['Loan_Status'])

In [None]:
pred_loan_status = model.predict(df_test)

In [None]:
submission_2 = pd.DataFrame(test['Loan_ID'] , columns=['Loan_ID','Loan_Status'])
submission_2['Loan_Status'] = pred_loan_status
mapper = { 1 : 'Y' , 0 : 'N'}
submission_2['Loan_Status'] = submission_2['Loan_Status'].map(mapper)
submission_2.to_csv('Submission_5.csv' , index= False)

In [None]:
import mlxtend
from mlxtend.classifier import StackingClassifier

In [None]:
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression(penalty='l1')
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

In [None]:
sclf.fit(train_dm.iloc[:,:-1],train_dm['Loan_Status'])
sclf.score(train_dm.iloc[:,:-1], train_dm['Loan_Status'])

In [None]:
pred_loan_status = sclf.predict(test_dm)

In [None]:
submission_3 = pd.DataFrame(test['Loan_ID'] , columns=['Loan_ID','Loan_Status'])
submission_3['Loan_Status'] = pred_loan_status
mapper = { 1 : 'Y' , 0 : 'N'}
submission_3['Loan_Status'] = submission_3['Loan_Status'].map(mapper)
submission_3.to_csv('Submission_6.csv' , index= False)