In [1]:
# Initial imports

import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

In [2]:
# Imports for better visualization

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'



In [3]:
# Load training data
train_data = pd.read_csv('../data/train.csv')

In [4]:
# Load testing data
test_data = pd.read_csv('../data/test.csv')

In [5]:
# View train data head
train_data.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [6]:
# View test data head
test_data.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
5,LP001054,Male,Yes,0,Not Graduate,Yes,2165,3422,152.0,360.0,1.0,Urban
6,LP001055,Female,No,1,Not Graduate,No,2226,0,59.0,360.0,1.0,Semiurban
7,LP001056,Male,Yes,2,Not Graduate,No,3881,0,147.0,360.0,0.0,Rural
8,LP001059,Male,Yes,2,Graduate,,13633,0,280.0,240.0,1.0,Urban
9,LP001067,Male,No,0,Not Graduate,No,2400,2400,123.0,360.0,1.0,Semiurban


In [7]:
train_data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [8]:
# View train data shape
train_data.shape

(614, 13)

In [9]:
# Check for null values in train data
train_data.isnull().any()

Loan_ID              False
Gender                True
Married               True
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
Loan_Status          False
dtype: bool

In [10]:
# View test data shape
test_data.shape

(367, 12)

In [11]:
# Check for null values in test data
test_data.isnull().any()

Loan_ID              False
Gender                True
Married              False
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
dtype: bool

In [12]:
def num_missing(x):
  return sum(x.isnull())

In [13]:
print "Missing values per column:"
print train_data.apply(num_missing, axis=0) # axis=0 defines that function is to be applied on each column

Missing values per column:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [14]:
print "Missing values per column:"
print test_data.apply(num_missing, axis=0)

Missing values per column:
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


In [15]:
print 'Train:\n', train_data['Gender'].value_counts()
print 'Test:\n', test_data['Gender'].value_counts()

Train:
Male      489
Female    112
Name: Gender, dtype: int64
Test:
Male      286
Female     70
Name: Gender, dtype: int64


In [16]:
print 'Train:\n', train_data['Married'].value_counts()
print 'Test:\n', test_data['Married'].value_counts()

Train:
Yes    398
No     213
Name: Married, dtype: int64
Test:
Yes    233
No     134
Name: Married, dtype: int64


In [17]:
print 'Train:\n', train_data['Education'].value_counts()
print 'Test:\n', test_data['Education'].value_counts()

Train:
Graduate        480
Not Graduate    134
Name: Education, dtype: int64
Test:
Graduate        283
Not Graduate     84
Name: Education, dtype: int64


In [18]:
print 'Train:\n', train_data['Self_Employed'].value_counts()
print 'Test:\n', test_data['Self_Employed'].value_counts()

Train:
No     500
Yes     82
Name: Self_Employed, dtype: int64
Test:
No     307
Yes     37
Name: Self_Employed, dtype: int64


In [19]:
print 'Train:\n', train_data['Loan_Amount_Term'].value_counts()
print 'Test:\n', test_data['Loan_Amount_Term'].value_counts()

Train:
360.0    512
180.0     44
480.0     15
300.0     13
84.0       4
240.0      4
120.0      3
36.0       2
60.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64
Test:
360.0    311
180.0     22
480.0      8
300.0      7
240.0      4
84.0       3
6.0        1
120.0      1
36.0       1
350.0      1
12.0       1
60.0       1
Name: Loan_Amount_Term, dtype: int64


In [20]:
print 'Train:\n', train_data['Credit_History'].value_counts()
print 'Test:\n', test_data['Credit_History'].value_counts()

Train:
1.0    475
0.0     89
Name: Credit_History, dtype: int64
Test:
1.0    279
0.0     59
Name: Credit_History, dtype: int64


In [21]:
print 'Train:\n', train_data['Property_Area'].value_counts()
print 'Test:\n', test_data['Property_Area'].value_counts()

Train:
Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64
Test:
Urban        140
Semiurban    116
Rural        111
Name: Property_Area, dtype: int64


In [22]:
print 'Train:\n', train_data['Dependents'].value_counts()
print 'Test:\n', test_data['Dependents'].value_counts()

Train:
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
Test:
0     200
2      59
1      58
3+     40
Name: Dependents, dtype: int64


In [23]:
train_data['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [24]:
data = train_data.append(test_data, ignore_index=True)

In [25]:
data.head(10)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed
0,5849,0.0,1.0,0,Graduate,Male,,360.0,LP001002,Y,No,Urban,No
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No
5,5417,4196.0,1.0,2,Graduate,Male,267.0,360.0,LP001011,Y,Yes,Urban,Yes
6,2333,1516.0,1.0,0,Not Graduate,Male,95.0,360.0,LP001013,Y,Yes,Urban,No
7,3036,2504.0,0.0,3+,Graduate,Male,158.0,360.0,LP001014,N,Yes,Semiurban,No
8,4006,1526.0,1.0,2,Graduate,Male,168.0,360.0,LP001018,Y,Yes,Urban,No
9,12841,10968.0,1.0,1,Graduate,Male,349.0,360.0,LP001020,N,Yes,Semiurban,No


In [26]:
data.dtypes

ApplicantIncome        int64
CoapplicantIncome    float64
Credit_History       float64
Dependents            object
Education             object
Gender                object
LoanAmount           float64
Loan_Amount_Term     float64
Loan_ID               object
Loan_Status           object
Married               object
Property_Area         object
Self_Employed         object
dtype: object

In [27]:
from sklearn import preprocessing

In [28]:
data['Loan_Amount_Term'].value_counts()

360.0    823
180.0     66
480.0     23
300.0     20
240.0      8
84.0       7
120.0      4
36.0       3
60.0       3
12.0       2
350.0      1
6.0        1
Name: Loan_Amount_Term, dtype: int64

In [29]:
# loan_term_keys = dict(data['Loan_Amount_Term'].value_counts()).keys()
# loan_term_keys.sort()
# loan_term_keys

In [30]:
# loan_term = dict(zip(loan_term_keys, range(len(loan_term_keys))))
# loan_term

In [31]:
# train_data.ix[(train_data['Dependents'] == '3+'), 'Dependents'] = '3'

In [32]:
# train_data.Dependents.value_counts()

In [33]:
# train_data.Dependents.dtypes

In [34]:
def prepare_data(df, is_train):
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
    df['Dependents'] = df['Dependents'].map({'0': 0, '1': 1, '2': 2, '3+': 3})
    df['Education'] = df['Education'].map({'Graduate': 1, 'Not Graduate': 0})
    df['Self_Employed'] = df['Self_Employed'].map({'Yes': 1, 'No': 0})
#     df['Loan_Amount_Term'] = df['Loan_Amount_Term'].map(loan_term)
    dummies = pd.get_dummies(df['Property_Area'], prefix='Property_Area')
    dummies = dummies[dummies.columns[:-1]]
    df = pd.concat([df.drop(['Property_Area'], axis=1), dummies], axis=1)
    if is_train:
        df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})
    return df.drop('Loan_ID', axis=1)

In [35]:
train = prepare_data(train_data.copy(), 1)

In [36]:
train.head(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban
0,1.0,0.0,0.0,1,0.0,5849,0.0,,360.0,1.0,1,0,0
1,1.0,1.0,1.0,1,0.0,4583,1508.0,128.0,360.0,1.0,0,1,0
2,1.0,1.0,0.0,1,1.0,3000,0.0,66.0,360.0,1.0,1,0,0
3,1.0,1.0,0.0,0,0.0,2583,2358.0,120.0,360.0,1.0,1,0,0
4,1.0,0.0,0.0,1,0.0,6000,0.0,141.0,360.0,1.0,1,0,0
5,1.0,1.0,2.0,1,1.0,5417,4196.0,267.0,360.0,1.0,1,0,0
6,1.0,1.0,0.0,0,0.0,2333,1516.0,95.0,360.0,1.0,1,0,0
7,1.0,1.0,3.0,1,0.0,3036,2504.0,158.0,360.0,0.0,0,0,1
8,1.0,1.0,2.0,1,0.0,4006,1526.0,168.0,360.0,1.0,1,0,0
9,1.0,1.0,1.0,1,0.0,12841,10968.0,349.0,360.0,1.0,0,0,1


In [37]:
train.columns

Index([u'Gender', u'Married', u'Dependents', u'Education', u'Self_Employed',
       u'ApplicantIncome', u'CoapplicantIncome', u'LoanAmount',
       u'Loan_Amount_Term', u'Credit_History', u'Loan_Status',
       u'Property_Area_Rural', u'Property_Area_Semiurban'],
      dtype='object')

In [38]:
test = prepare_data(test_data.copy(), 0)

In [39]:
test.head(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban
0,1.0,1,0.0,1,0.0,5720,0,110.0,360.0,1.0,0,0
1,1.0,1,1.0,1,0.0,3076,1500,126.0,360.0,1.0,0,0
2,1.0,1,2.0,1,0.0,5000,1800,208.0,360.0,1.0,0,0
3,1.0,1,2.0,1,0.0,2340,2546,100.0,360.0,,0,0
4,1.0,0,0.0,0,0.0,3276,0,78.0,360.0,1.0,0,0
5,1.0,1,0.0,0,1.0,2165,3422,152.0,360.0,1.0,0,0
6,0.0,0,1.0,0,0.0,2226,0,59.0,360.0,1.0,0,1
7,1.0,1,2.0,0,0.0,3881,0,147.0,360.0,0.0,1,0
8,1.0,1,2.0,1,,13633,0,280.0,240.0,1.0,0,0
9,1.0,0,0.0,0,0.0,2400,2400,123.0,360.0,1.0,0,1


In [40]:
# No women with dependents
train[(train['Gender'] == 0) & (train['Dependents'] != 0)]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban
29,0.0,0.0,2.0,1,,3750,2083.0,120.0,360.0,1.0,1,0,1
54,0.0,1.0,1.0,1,1.0,11500,0.0,286.0,360.0,0.0,0,0,0
82,0.0,1.0,2.0,1,0.0,1378,1881.0,167.0,360.0,1.0,0,0,0
113,0.0,0.0,1.0,1,1.0,7451,0.0,,360.0,1.0,1,0,1
146,0.0,1.0,2.0,1,0.0,14866,0.0,70.0,360.0,1.0,1,0,0
219,0.0,1.0,2.0,1,0.0,4283,2383.0,127.0,360.0,,1,0,1
238,0.0,0.0,1.0,1,0.0,3812,0.0,112.0,360.0,1.0,1,1,0
251,0.0,0.0,2.0,1,0.0,3427,0.0,138.0,360.0,1.0,0,0,0
255,0.0,0.0,3.0,1,0.0,3083,0.0,255.0,360.0,1.0,1,1,0
262,0.0,0.0,1.0,1,0.0,3481,0.0,155.0,36.0,1.0,0,0,1


In [41]:
test.columns

Index([u'Gender', u'Married', u'Dependents', u'Education', u'Self_Employed',
       u'ApplicantIncome', u'CoapplicantIncome', u'LoanAmount',
       u'Loan_Amount_Term', u'Credit_History', u'Property_Area_Rural',
       u'Property_Area_Semiurban'],
      dtype='object')

In [42]:
continuous_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

In [43]:
def fill_na(df):
    df['Gender'] = df['Gender'].fillna(1)
    df['Married'] = df['Married'].fillna(1)
    df['Self_Employed'] = df['Self_Employed'].fillna(0)
    df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(360)
    df['Credit_History'] = df['Credit_History'].fillna(1)
    df['Dependents'] = int(stats.mode(df['Dependents'])[0])
    for col in continuous_features:
        df[col] = df[col].fillna(data[col].dropna().mean())
    return df

In [44]:
train = fill_na(train)
test = fill_na(test)

In [45]:
print train.apply(num_missing, axis=0)

Gender                     0
Married                    0
Dependents                 0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Loan_Status                0
Property_Area_Rural        0
Property_Area_Semiurban    0
dtype: int64


In [46]:
print test.apply(num_missing, axis=0)

Gender                     0
Married                    0
Dependents                 0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Property_Area_Rural        0
Property_Area_Semiurban    0
dtype: int64


In [47]:
def feature_engineering(df):
#     df['Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    df['Income'] = (df['ApplicantIncome'] + df['CoapplicantIncome'])/df['LoanAmount']
    return df.drop(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'], axis=1)

In [48]:
train = feature_engineering(train)
test = feature_engineering(test)

In [49]:
train.head(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Income
0,1.0,0.0,0,1,0.0,360.0,1.0,1,0,0,41.042293
1,1.0,1.0,0,1,0.0,360.0,1.0,0,1,0,47.585938
2,1.0,1.0,0,1,1.0,360.0,1.0,1,0,0,45.454545
3,1.0,1.0,0,0,0.0,360.0,1.0,1,0,0,41.175
4,1.0,0.0,0,1,0.0,360.0,1.0,1,0,0,42.553191
5,1.0,1.0,0,1,1.0,360.0,1.0,1,0,0,36.003745
6,1.0,1.0,0,0,0.0,360.0,1.0,1,0,0,40.515789
7,1.0,1.0,0,1,0.0,360.0,0.0,0,0,1,35.063291
8,1.0,1.0,0,1,0.0,360.0,1.0,1,0,0,32.928571
9,1.0,1.0,0,1,0.0,360.0,1.0,0,0,1,68.22063


In [50]:
test.head(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Income
0,1.0,1,0,1,0.0,360.0,1.0,0,0,52.0
1,1.0,1,0,1,0.0,360.0,1.0,0,0,36.31746
2,1.0,1,0,1,0.0,360.0,1.0,0,0,32.692308
3,1.0,1,0,1,0.0,360.0,1.0,0,0,48.86
4,1.0,0,0,0,0.0,360.0,1.0,0,0,42.0
5,1.0,1,0,0,1.0,360.0,1.0,0,0,36.756579
6,0.0,0,0,0,0.0,360.0,1.0,0,1,37.728814
7,1.0,1,0,0,0.0,360.0,0.0,1,0,26.401361
8,1.0,1,0,1,0.0,240.0,1.0,0,0,48.689286
9,1.0,0,0,0,0.0,360.0,1.0,0,1,39.02439


In [51]:
# Imports for model
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import train_test_split



In [52]:
# XGBoost Model

# Fit XGBRegressor

xgbr = XGBClassifier(n_estimators=100, silent=False)
x_train, x_test, y_train, y_test = train_test_split(train.drop('Loan_Status', axis=1), train['Loan_Status'], test_size=0.33, stratify=train['Loan_Status'])
xgbr.fit(x_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [53]:
# Predict values on test data
predicted_train_xgb = xgbr.predict(x_test)

In [54]:
# Print log-loss
print accuracy_score(y_test, predicted_train_xgb)

0.768472906404


In [55]:
params = xgbr.get_params()

In [56]:
# Use complete data and predict values on actual test data
predicted_xgbr = xgbr.predict(test[x_test.columns])

In [57]:
# Create solution dataframe
solution_xgbr = pd.DataFrame(test_data['Loan_ID'])
solution_xgbr = pd.concat([solution_xgbr, pd.DataFrame(predicted_xgbr, columns=['Loan_Status'])], axis=1)
solution_xgbr['Loan_Status'] = solution_xgbr['Loan_Status'].map({1: 'Y', 0: 'N'})
solution_xgbr.head(10)

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
5,LP001054,Y
6,LP001055,Y
7,LP001056,N
8,LP001059,Y
9,LP001067,Y


In [58]:
# Export solution to csv file
solution_xgbr.to_csv('../submissions/solution_xgbr.csv', index=False, header=True)

In [59]:
from sklearn.ensemble import RandomForestClassifier

In [60]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [61]:
predicted_train_rf = rf.predict(x_test)
predicted_train_rf

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1])

In [62]:
accuracy_score(y_test, predicted_train_rf)

0.72906403940886699

In [63]:
predicted_rf = rf.predict(test[x_test.columns])

In [64]:
# Create solution dataframe
solution_rf = pd.DataFrame(test_data['Loan_ID'])
solution_rf = pd.concat([solution_rf, pd.DataFrame(predicted_rf, columns=['Loan_Status'])], axis=1)
solution_rf['Loan_Status'] = solution_rf['Loan_Status'].map({1: 'Y', 0: 'N'})
solution_rf.head(10)

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
5,LP001054,N
6,LP001055,Y
7,LP001056,N
8,LP001059,Y
9,LP001067,Y


In [65]:
# Export solution to csv file
solution_rf.to_csv('../submissions/solution_rf.csv', index=False, header=True)

In [66]:
from __future__ import division
import xgboost as xgb

In [67]:
def calc_accuracy(predictions, y_test):
    labels = y_test.get_label()
    return 'accuracy_score', (1 - sum(labels != predictions)/len(labels))

In [68]:
params = {"objective": "multi:softmax",
          'num_class': 2,
          "booster" : "gbtree",
          'learning_rate': 0.1,
          "max_depth": 3,
          "subsample": 0.75,
          'colsample_bylevel': 1,
          'gamma': 0,
          'min_child_weight': 1,
          "silent": 1,
          }

num_boost_round = 50

In [69]:
print("Train a XGBoost model")
dtrain = xgb.DMatrix(x_train, y_train)
dvalid = xgb.DMatrix(x_test, y_test)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, watchlist, feval=calc_accuracy, maximize=True, early_stopping_rounds=50, verbose_eval=True)

Train a XGBoost model
[0]	train-accuracy_score:0.841849	eval-accuracy_score:0.763547
Multiple eval metrics have been passed: 'eval-accuracy_score' will be used for early stopping.

Will train until eval-accuracy_score hasn't improved in 50 rounds.
[1]	train-accuracy_score:0.836983	eval-accuracy_score:0.758621
[2]	train-accuracy_score:0.83455	eval-accuracy_score:0.763547
[3]	train-accuracy_score:0.83455	eval-accuracy_score:0.763547
[4]	train-accuracy_score:0.836983	eval-accuracy_score:0.763547
[5]	train-accuracy_score:0.839416	eval-accuracy_score:0.763547
[6]	train-accuracy_score:0.839416	eval-accuracy_score:0.763547
[7]	train-accuracy_score:0.836983	eval-accuracy_score:0.763547
[8]	train-accuracy_score:0.836983	eval-accuracy_score:0.763547
[9]	train-accuracy_score:0.836983	eval-accuracy_score:0.763547
[10]	train-accuracy_score:0.839416	eval-accuracy_score:0.763547
[11]	train-accuracy_score:0.839416	eval-accuracy_score:0.763547
[12]	train-accuracy_score:0.839416	eval-accuracy_score:0.76

In [70]:
predicted_train_xgb = gbm.predict(xgb.DMatrix(x_test))

In [71]:
predicted_train_xgb

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,  0.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,
        0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1

In [72]:
accuracy_score(y_test, predicted_train_xgb)

0.76847290640394084

In [73]:
predicted_xgb = gbm.predict(xgb.DMatrix(test))

In [74]:
# Create solution dataframe
solution_xgb = pd.DataFrame(test_data['Loan_ID'])
solution_xgb = pd.concat([solution_xgb, pd.DataFrame(predicted_xgb, columns=['Loan_Status'])], axis=1)
solution_xgb['Loan_Status'] = solution_xgb['Loan_Status'].map({1: 'Y', 0: 'N'})
solution_xgb.head(10)

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
5,LP001054,Y
6,LP001055,Y
7,LP001056,N
8,LP001059,Y
9,LP001067,Y


In [75]:
# Export solution to csv file
solution_xgb.to_csv('../submissions/solution_xgb.csv', index=False)

In [76]:
# Imports for Oversampling (since data is highly imbalanced + small)

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

In [77]:
train_features = train.drop('Loan_Status', axis=1)
target = train['Loan_Status']

In [78]:
# Generate the new dataset using over-sampling method

verbose = False
ratio = 'auto'

# 'Random over-sampling'
OS = RandomOverSampler(ratio=ratio)
train_os_features, train_os_target = OS.fit_sample(train_features, target.values.ravel())

# 'SMOTE'
smote = SMOTE(ratio=ratio, kind='regular')
train_smo_features, train_smo_target = smote.fit_sample(train_features, target.values.ravel())

In [79]:
# Convert the generated numpy arrays into data frames

train_os_features = DataFrame(train_os_features, columns=train_features.columns)
train_os_target = DataFrame(train_os_target, columns=['Loan_Status'])

train_smo_features = DataFrame(train_smo_features, columns=train_features.columns)
train_smo_target = DataFrame(train_smo_target, columns=['Loan_Status'])

In [80]:
# Check total number of samples

print train_os_target.shape
print train_smo_target.shape

(844, 1)
(844, 1)


In [81]:
# Check how the new balanced data are distributed

print train_os_target['Loan_Status'].value_counts()
print train_smo_target['Loan_Status'].value_counts()

1    422
0    422
Name: Loan_Status, dtype: int64
1    422
0    422
Name: Loan_Status, dtype: int64


In [82]:
features_dfs = [train_os_features, train_smo_features]
target_dfs = [train_os_target, train_smo_target]
os_types = ['ros', 'smo']

In [83]:
for i, df in enumerate(features_dfs):
    x_train, x_test, y_train, y_test = train_test_split(df, target_dfs[i]['Loan_Status'], test_size=0.33, stratify=target_dfs[i]['Loan_Status'])
    
    # SKLEARN XGBOOST
    xgbr = XGBClassifier(n_estimators=100, silent=False)
    xgbr.fit(x_train, y_train)

    predicted_train_xgb = xgbr.predict(x_test)
    print os_types[i] + ': ', accuracy_score(y_test, predicted_train_xgb)
    predicted_xgbr = xgbr.predict(test[x_test.columns])

    solution_xgbr = pd.DataFrame(test_data['Loan_ID'])
    solution_xgbr = pd.concat([solution_xgbr, pd.DataFrame(predicted_xgbr, columns=['Loan_Status'])], axis=1)
    solution_xgbr['Loan_Status'] = solution_xgbr['Loan_Status'].map({1: 'Y', 0: 'N'})
    solution_xgbr.to_csv('../submissions/solution_xgbr_' + os_types[i] + '.csv', index=False, header=True)
#     print solution_xgbr.head(10)


    # RANDOM FOREST
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(x_train, y_train)

    predicted_train_rf = rf.predict(x_test)
    print os_types[i] + ': ', accuracy_score(y_test, predicted_train_rf)
    predicted_rf = rf.predict(test[x_test.columns])

    solution_rf = pd.DataFrame(test_data['Loan_ID'])
    solution_rf = pd.concat([solution_rf, pd.DataFrame(predicted_rf, columns=['Loan_Status'])], axis=1)
    solution_rf['Loan_Status'] = solution_rf['Loan_Status'].map({1: 'Y', 0: 'N'})
    solution_rf.to_csv('../submissions/solution_rf_' + os_types[i] + '.csv', index=False, header=True)
#     print solution_rf.head(10)


    # XGBOOST
    params = {"objective": "multi:softmax",
              'num_class': 2,
              "booster" : "gbtree",
              'learning_rate': 0.1,
              "max_depth": 3,
              "subsample": 0.75,
              "colsample_bytree": 1,
              'colsample_bylevel': 1,
              'gamma': 0,
              'min_child_weight': 1,
              "silent": 1,
              }

    num_boost_round = 50

    print("Train a XGBoost model")
    dtrain = xgb.DMatrix(x_train, y_train)
    dvalid = xgb.DMatrix(x_test, y_test)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, watchlist, feval=calc_accuracy, maximize=True, early_stopping_rounds=50, verbose_eval=True)

    predicted_train_xgb = gbm.predict(xgb.DMatrix(x_test))
    print os_types[i] + ': ', accuracy_score(y_test, predicted_train_xgb)
    predicted_xgb = gbm.predict(xgb.DMatrix(test))

    solution_xgb = pd.DataFrame(test_data['Loan_ID'])
    solution_xgb = pd.concat([solution_xgb, pd.DataFrame(predicted_xgb, columns=['Loan_Status'])], axis=1)
    solution_xgb['Loan_Status'] = solution_xgb['Loan_Status'].map({1: 'Y', 0: 'N'})
    solution_xgb.to_csv('../submissions/solution_xgb_' + os_types[i] + '.csv', index=False)
#     print solution_xgb.head(10)

ros:  0.806451612903
ros:  0.853046594982
Train a XGBoost model
[0]	train-accuracy_score:0.704425	eval-accuracy_score:0.767025
Multiple eval metrics have been passed: 'eval-accuracy_score' will be used for early stopping.

Will train until eval-accuracy_score hasn't improved in 50 rounds.
[1]	train-accuracy_score:0.704425	eval-accuracy_score:0.767025
[2]	train-accuracy_score:0.720354	eval-accuracy_score:0.781362
[3]	train-accuracy_score:0.748673	eval-accuracy_score:0.767025
[4]	train-accuracy_score:0.746903	eval-accuracy_score:0.770609
[5]	train-accuracy_score:0.752212	eval-accuracy_score:0.774194
[6]	train-accuracy_score:0.741593	eval-accuracy_score:0.767025
[7]	train-accuracy_score:0.745133	eval-accuracy_score:0.767025
[8]	train-accuracy_score:0.743363	eval-accuracy_score:0.767025
[9]	train-accuracy_score:0.746903	eval-accuracy_score:0.767025
[10]	train-accuracy_score:0.746903	eval-accuracy_score:0.767025
[11]	train-accuracy_score:0.746903	eval-accuracy_score:0.767025
[12]	train-accu