# Loan Prediction 2

In [357]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score

In [358]:
#Read files:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## Data Exploration and Cleaning

In [359]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [360]:
print(train.columns)
train.dtypes

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [361]:
feature_columns_to_use = ['ApplicantIncome','CoapplicantIncome','Credit_History',
                          'Dependents','Education', 'Gender', 'LoanAmount', 
                          'Loan_Amount_Term', 'Married', 'Property_Area', 
                          'Self_Employed']
nonnumeric_columns = ['Dependents', 'Education', 'Gender', 'Married', 'Property_Area', 'Self_Employed']

### Impute missing values using the median for numeric columns and the most common value for string columns [Ref](http://stackoverflow.com/a/25562948)

In [362]:
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].median() for c in X],
            index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.fill)

### Join the features from train and test together before imputing missing values

In [363]:
big_X = train[feature_columns_to_use].append(test[feature_columns_to_use])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

In [364]:
big_X_imputed['emi'] = big_X_imputed['LoanAmount'] / big_X_imputed['Loan_Amount_Term']
big_X_imputed['total_income'] = big_X_imputed['ApplicantIncome'] + big_X_imputed['CoapplicantIncome']

### Transform categorical features and target

In [365]:
#le = LabelEncoder()
#for feature in nonnumeric_columns:
#    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

#print(big_X_imputed.head())

In [366]:
dummies_data = pd.get_dummies(big_X_imputed[nonnumeric_columns], columns=nonnumeric_columns)
dummies_data.head()

Unnamed: 0,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Gender_Female,Gender_Male,Married_No,Married_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Self_Employed_No,Self_Employed_Yes
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [368]:
a = big_X_imputed.add(dummies_data)
a.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education,Education_Graduate,...,Married_Yes,Property_Area,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Self_Employed,Self_Employed_No,Self_Employed_Yes,emi,total_income
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


### Prepare the inputs for the model

In [12]:
IDcol = pd.Series(test['Loan_ID'])

In [16]:
train_X = big_X_imputed[0:train.shape[0]].as_matrix()
test_X = big_X_imputed[train.shape[0]:].as_matrix()

### Label encoding for target is required

In [20]:
train_y = train['Loan_Status']
train_y = le.fit_transform(train_y)

## XGBoost

In [23]:
#XGBoost Classifier
xgb_clf = xgb.XGBClassifier()

n_estimators_range=[1, 2]
max_depth_range = [1, 2]
learning_rate_range = [0.0000001, 0.00002, 0.00003]

param_grid = [{'n_estimators': n_estimators_range, 
               'max_depth': max_depth_range, 
               'learning_rate': learning_rate_range}]

gs_xgb = GridSearchCV(estimator=xgb_clf, 
                  param_grid=param_grid, 
                  scoring='f1', 
                  cv=10,
                  n_jobs=3)

gs_xgb = gs_xgb.fit(train_X, train_y)

print("Best Score: ", gs_xgb.best_score_)
print(gs_xgb.best_params_)

Best Score:  0.876780994019
{'max_depth': 1, 'learning_rate': 1e-07, 'n_estimators': 1}


### Submission

In [24]:
clf = gs_xgb.best_estimator_
clf = clf.fit(train_X, train_y)

#prediction on test data
predictions = clf.predict(test_X)
predictions = le.inverse_transform(predictions)

predictions = pd.Series(predictions, name='Loan_Status')

submission = pd.concat([IDcol, predictions], axis=1)
submission.to_csv("model_xgb_01.csv", index=False)