# Loan Prediction 2

In [212]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.lda import LDA

In [213]:
# Load the training and test sets from CSV files in the working directory.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

## Data Exploration and Cleaning

In [214]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [215]:
# Print the column names, then display the dtype of each column.
print(train.columns)
train.dtypes

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [216]:
# Columns fed into the models (everything except the ID and the target).
feature_columns_to_use = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'Credit_History',
    'Dependents',
    'Education',
    'Gender',
    'LoanAmount',
    'Loan_Amount_Term',
    'Married',
    'Property_Area',
    'Self_Employed',
]

# Subset of the feature columns that hold strings and need encoding.
nonnumeric_columns = [
    'Dependents',
    'Education',
    'Gender',
    'Married',
    'Property_Area',
    'Self_Employed',
]

### Impute missing values using the median for numeric columns and the most common value for string columns [Ref](http://stackoverflow.com/a/25562948)

In [217]:
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Fill missing values in a DataFrame, column by column.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median. The fill values are computed in
    ``fit`` and stored on ``self.fill`` as a Series indexed by column name.
    """

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill missing entries of a single column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() excludes NaN by default, so a column with no
            # observed values gives an empty result. Fall back to NaN (the
            # column is left unfilled) rather than raising IndexError.
            return counts.index[0] if len(counts) > 0 else np.nan
        return column.median()

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns determine the fill values.
        y : ignored
            Accepted only for compatibility with the transformer interface.

        Returns
        -------
        self
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by ``self.fill``."""
        return X.fillna(self.fill)

### Join the features from train and test together before imputing missing values

In [218]:
# Stack the train and test feature columns row-wise (train rows first).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append, it keeps each frame's original row labels.
big_X = pd.concat([train[feature_columns_to_use], test[feature_columns_to_use]])

# NOTE(review): the fill values are computed over train and test rows together.
big_X_imputed = DataFrameImputer().fit_transform(big_X)

In [219]:
# Engineered features:
#   emi          - loan amount divided by the loan term
#                  (presumably an instalment-style ratio -- TODO confirm naming)
#   total_income - applicant income plus co-applicant income
big_X_imputed['emi'] = big_X_imputed['LoanAmount'].div(big_X_imputed['Loan_Amount_Term'])
big_X_imputed['total_income'] = big_X_imputed['ApplicantIncome'].add(big_X_imputed['CoapplicantIncome'])

### One-hot encode the categorical features (the target is label-encoded in a later section)

In [220]:
# One-hot encode the string-valued columns.
dummies_data = pd.get_dummies(big_X_imputed[nonnumeric_columns], columns=nonnumeric_columns)

# Append the indicator columns, then drop the original string columns and the
# raw numeric columns that the engineered features were derived from.
big_X_imputed = (
    pd.concat([big_X_imputed, dummies_data], axis=1)
    .drop(nonnumeric_columns, axis=1)
    .drop(['LoanAmount', 'Loan_Amount_Term', 'ApplicantIncome', 'CoapplicantIncome'], axis=1)
)
big_X_imputed.head()

Unnamed: 0,Credit_History,emi,total_income,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Gender_Female,Gender_Male,Married_No,Married_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Self_Employed_No,Self_Employed_Yes
0,1.0,0.35,5849.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.355556,6091.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.183333,3000.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.333333,4941.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,1.0,0.391667,6000.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [221]:
# Standardize every column, then wrap the resulting array back into a
# DataFrame that carries the original column names.
# NOTE(review): the scaler is fitted on train and test rows together.
ss = StandardScaler().fit(big_X_imputed)
big_X_imputed_ss = ss.transform(big_X_imputed)
big_X_imputed = pd.DataFrame(big_X_imputed_ss, columns=big_X_imputed.columns)

In [222]:
# Preview the first rows of the standardized feature frame.
big_X_imputed.head()

Unnamed: 0,Credit_History,emi,total_income,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Gender_Female,Gender_Male,Married_No,Married_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Self_Employed_No,Self_Employed_Yes
0,0.42151,-0.148626,-0.154913,0.849148,-0.441457,-0.441457,-0.319761,0.534522,-0.534522,-0.477268,0.477268,1.351699,-1.351699,-0.647828,-0.743112,1.366902,0.371552,-0.371552
1,0.42151,-0.142736,-0.114719,-1.177651,2.265226,-0.441457,-0.319761,0.534522,-0.534522,-0.477268,0.477268,-0.73981,0.73981,1.543619,-0.743112,-0.731581,0.371552,-0.371552
2,0.42151,-0.325324,-0.628099,0.849148,-0.441457,-0.441457,-0.319761,0.534522,-0.534522,-0.477268,0.477268,-0.73981,0.73981,-0.647828,-0.743112,1.366902,-2.691412,2.691412
3,0.42151,-0.166296,-0.305721,0.849148,-0.441457,-0.441457,-0.319761,-1.870829,1.870829,-0.477268,0.477268,-0.73981,0.73981,-0.647828,-0.743112,1.366902,0.371552,-0.371552
4,0.42151,-0.104452,-0.129833,0.849148,-0.441457,-0.441457,-0.319761,0.534522,-0.534522,-0.477268,0.477268,1.351699,-1.351699,-0.647828,-0.743112,1.366902,0.371552,-0.371552


### Prepare the inputs for the model

In [223]:
# Keep the test-set loan IDs so they can be paired with the predictions
# when the submission file is written.
IDcol = pd.Series(test['Loan_ID'])

In [224]:
# Split the combined frame back into train and test by position: the first
# train.shape[0] rows are the training rows, the rest are the test rows.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
n_train_rows = train.shape[0]
train_X = big_X_imputed.iloc[:n_train_rows].values
test_X = big_X_imputed.iloc[n_train_rows:].values

### Label-encode the target (required before fitting the models)

In [225]:
# Encode the Loan_Status labels as integers; `le` is kept so the predictions
# can be mapped back to the original labels later.
le = LabelEncoder()
train_y = le.fit_transform(train['Loan_Status'])

## AdaBoost

In [226]:
# Grid-search an AdaBoost classifier, scored by 10-fold cross-validated accuracy.
adb = AdaBoostClassifier()

# Candidate hyper-parameter values.
# NOTE(review): the grid only covers 1-2 estimators at a learning rate of 1e-7.
n_estimators_range = [1, 2]
learning_rate_range = [0.0000001]

param_grid = [
    {
        'n_estimators': n_estimators_range,
        'learning_rate': learning_rate_range,
    }
]

gs_adb = GridSearchCV(
    estimator=adb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=2,
).fit(train_X, train_y)

print("Best Score: ", gs_adb.best_score_)
print(gs_adb.best_params_)

Best Score:  0.809446254072
{'learning_rate': 1e-07, 'n_estimators': 1}


## XGBoost

In [227]:
# Grid-search an XGBoost classifier, scored by 10-fold cross-validated accuracy.
xgb_clf = xgb.XGBClassifier()

# Candidate hyper-parameter values.
# NOTE(review): the grid only covers 1-2 trees of depth 1-2 at very small
# learning rates.
n_estimators_range = [1, 2]
max_depth_range = [1, 2]
learning_rate_range = [0.0000001, 0.00002, 0.00003]

param_grid = [
    {
        'n_estimators': n_estimators_range,
        'max_depth': max_depth_range,
        'learning_rate': learning_rate_range,
    }
]

gs_xgb = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=3,
).fit(train_X, train_y)

print("Best Score: ", gs_xgb.best_score_)
print(gs_xgb.best_params_)

Best Score:  0.809446254072
{'learning_rate': 1e-07, 'n_estimators': 1, 'max_depth': 1}


### Submission

In [228]:
# Take the best estimator found by the XGBoost grid search. Its feature
# importances are printed as returned by the search, before the explicit
# fit on the full training data below.
clf = gs_xgb.best_estimator_
print(clf.feature_importances_)
clf = clf.fit(train_X, train_y)

# Predict on the test data and map the integer classes back to the original
# Loan_Status labels.
predictions = pd.Series(
    le.inverse_transform(clf.predict(test_X)),
    name='Loan_Status',
)

# Pair each test loan ID with its prediction and write the submission file.
submission = pd.concat([IDcol, predictions], axis=1)
submission.to_csv("model_xgb_01.csv", index=False)

[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
