In [2]:
import os 
import json
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore")

In [3]:
class font:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [4]:
data = pd.read_csv('../data/loan_train.csv')
print(data.shape)
print(data.columns)
data.head()

(614, 13)
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
factor_columns = ['Gender','Married','Education','Self_Employed','Property_Area']
for _ in factor_columns:
    print("labels {}: {}".format(_, set(data[_])))

labels Gender: {'Male', 'Female', nan}
labels Married: {nan, 'Yes', 'No'}
labels Education: {'Graduate', 'Not Graduate'}
labels Self_Employed: {nan, 'Yes', 'No'}
labels Property_Area: {'Rural', 'Semiurban', 'Urban'}


In [6]:
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [7]:
tb = missing_values_table(data)
tb

Your selected dataframe has 13 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Credit_History,50,8.1
Self_Employed,32,5.2
LoanAmount,22,3.6
Dependents,15,2.4
Loan_Amount_Term,14,2.3
Gender,13,2.1
Married,3,0.5


In [10]:
feature = data.drop(['Loan_ID', 'Loan_Status'], axis = 1)
label = data['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.33, random_state=42)

In [17]:
'''
Dependents: Assumption that there are no dependents
Self_Employed: Assumption that the applicant is not self-employed
Loan_Amount_Term: Assumption that the loan amount term is median value
Credit_History: Assumption that the person has a credit history
Married: If nothing specified, applicant is not married
Gender: Assuming the gender is Male for the missing values
'''

X_train['Dependents'] = X_train['Dependents'].fillna('0')
X_train['Self_Employed'] = X_train['Self_Employed'].fillna('No')
X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean())
X_train['Credit_History'] = X_train['Credit_History'].fillna(1)
X_train['Married'] = X_train['Married'].fillna('No')
X_train['Gender'] = X_train['Gender'].fillna('Male')
X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())

In [18]:
for _ in factor_columns:
    print("labels {}: {}".format(_, set(X_train[_])))

labels Gender: {'Male', 'Female'}
labels Married: {'Yes', 'No'}
labels Education: {'Graduate', 'Not Graduate'}
labels Self_Employed: {'Yes', 'No'}
labels Property_Area: {'Rural', 'Semiurban', 'Urban'}


In [19]:
# convert data 
gender_values = {'Female' : 0, 'Male' : 1} 
married_values = {'No' : 0, 'Yes' : 1}
education_values = {'Graduate' : 0, 'Not Graduate' : 1}
employed_values = {'No' : 0, 'Yes' : 1}
property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
X_train.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \
                'Self_Employed': employed_values, 'Property_Area': property_values, 'Dependents': dependent_values}\
                , inplace=True)

In [20]:
# replace check
for _ in factor_columns:
    print("labels {}: {}".format(_, set(X_train[_])))

labels Gender: {0, 1}
labels Married: {0, 1}
labels Education: {0, 1}
labels Self_Employed: {0, 1}
labels Property_Area: {0, 1, 2}


In [22]:
missing_values_table(X_train)

Your selected dataframe has 11 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


**Create pipelines**

In [86]:
myd = pd.read_csv('../data/loan_train.csv')

In [87]:
myd.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [148]:
myd_feature = data.drop(['Loan_ID', 'Loan_Status'], axis = 1)
myd_label = data['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(myd_feature, myd_label, \
                                                    test_size=0.25, random_state=42)

In [149]:
missing_values_table(X_train)

Your selected dataframe has 11 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Credit_History,36,7.8
Self_Employed,20,4.3
LoanAmount,16,3.5
Gender,11,2.4
Dependents,11,2.4
Loan_Amount_Term,11,2.4
Married,1,0.2


In [150]:
X_train.shape

(460, 11)

In [151]:
class PreProcessing(BaseEstimator, TransformerMixin):
    '''
        custom pre-processing estimator 
    '''
    def __int__(self):
        pass
    
    def transform(self, df):
        '''
            Regular transform() that is a help for training, validation & testing datasets
            (NOTE: The operations performed here are the ones that we did prior to this cell)
        '''
        #df = df.drop(['Loan_ID', 'Loan_Status'], axis = 1)
        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\
           'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']
        
        df = df[pred_var]
        # filling missing value
        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['Gender'] = df['Gender'].fillna('Male')
        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)
        
        # convert values of cat features
        gender_values = {'Female' : 0, 'Male' : 1} 
        married_values = {'No' : 0, 'Yes' : 1}
        education_values = {'Graduate' : 0, 'Not Graduate' : 1}
        employed_values = {'No' : 0, 'Yes' : 1}
        property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
        dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
        
        df.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \
                    'Self_Employed': employed_values, 'Property_Area': property_values, \
                    'Dependents': dependent_values}, inplace=True)
        
        return df.values
        
    def fit(self, df, y = None, **fit_params):
        '''
            Fitting the Training dataset & calculating the required values from train
            e.g: We will need the mean of X_train['Loan_Amount_Term'] that will be used in
                transformation of X_test
        '''
        
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()
        return self

In [152]:
preprocess = PreProcessing()
preprocess.fit(X_train)
X_train_transfromed = preprocess.transform(X_train)

In [153]:
X_train_transfromed.shape

(460, 11)

In [154]:
missing_values_table(X_test)

Your selected dataframe has 11 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Credit_History,14,9.1
Self_Employed,12,7.8
LoanAmount,6,3.9
Dependents,4,2.6
Loan_Amount_Term,3,1.9
Gender,2,1.3
Married,2,1.3


In [155]:
X_test.shape

(154, 11)

In [156]:
X_test_transformed = preprocess.transform(X_test)
missing_values_table(X_test)
X_test_transformed.shape

Your selected dataframe has 11 columns.
There are 7 columns that have missing values.


(154, 11)

In [157]:
y_test = y_test.replace({'Y':1, 'N':0}).values
y_train = y_train.replace({'Y':1, 'N':0}).values

In [158]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

param_grid = {"randomforestclassifier__n_estimators" : [10, 20, 30],
             "randomforestclassifier__max_depth" : [None, 6, 8, 10],
             "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20], 
             "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3]}

pipe = make_pipeline(PreProcessing(),RandomForestClassifier())
pipe

Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [159]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, param_grid = param_grid)
grid

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impu..._jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestclassifier__n_estimators': [10, 20, 30], 'randomforestclassifier__max_depth': [None, 6, 8, 10], 'randomforestclassifier__max_leaf_nodes': [None, 5, 10, 20], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [161]:
myd_feature = data.drop(['Loan_ID', 'Loan_Status'], axis = 1)
myd_label = data['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(myd_feature, myd_label, \
                                                    test_size=0.25, random_state=42)

In [162]:
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impu..._jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestclassifier__n_estimators': [10, 20, 30], 'randomforestclassifier__max_depth': [None, 6, 8, 10], 'randomforestclassifier__max_leaf_nodes': [None, 5, 10, 20], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [163]:
grid.best_params_

{'randomforestclassifier__max_depth': None,
 'randomforestclassifier__max_leaf_nodes': None,
 'randomforestclassifier__min_impurity_split': 0.3,
 'randomforestclassifier__n_estimators': 10}

In [164]:
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))

Test set score: 0.77


In [165]:
%ls

 Volume in drive C has no label.
 Volume Serial Number is C4E4-43B1

 Directory of C:\Users\RayHu\Documents\schoolProject

10/27/2018  09:53 PM    <DIR>          .
10/27/2018  09:53 PM    <DIR>          ..
10/27/2018  03:19 PM    <DIR>          .ipynb_checkpoints
10/22/2018  11:38 AM            46,496 costsenstive_card.ipynb
11/05/2016  09:09 AM       150,828,752 creditcard.csv
10/27/2018  12:11 PM           456,259 creditcard.ipynb
09/17/2018  11:01 AM    <DIR>          customerChurn
03/27/2018  04:37 PM           506,854 FinalReport-YuediWang&RuiyuHu.pdf
10/27/2018  09:43 PM    <DIR>          flask
10/27/2018  02:40 PM               158 flask101.py
10/27/2018  12:43 PM            21,957 loan_test.csv
10/27/2018  12:43 PM            38,013 loan_train.csv
10/27/2018  09:53 PM            42,520 pipeline101.ipynb
               8 File(s)    151,941,009 bytes
               5 Dir(s)  111,401,070,592 bytes free


In [166]:
test = pd.read_csv('../data/loan_test.csv')
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [167]:
len(test)

367

In [168]:
grid.predict(test)

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

### 3. Saving Machine Learning Model : Serialization & Deserialization

> In computer science, in the context of data storage, serialization is the process of translating data structures or object state into a format that can be stored (for example, in a file or memory buffer, or transmitted across a network connection link) and reconstructed later in the same or another computer environment.

In [169]:
# pickling
list_to_pickle = [1,'WW',2,'HH']

import pickle
# serialization
list_pickle = pickle.dumps(list_to_pickle)

In [170]:
list_pickle

b'\x80\x03]q\x00(K\x01X\x02\x00\x00\x00WWq\x01K\x02X\x02\x00\x00\x00HHq\x02e.'

In [171]:
# deserialization
loaded_pickle = pickle.loads(list_pickle)
loaded_pickle

[1, 'WW', 2, 'HH']

** We have a custom Class that we need to import while running our training, hence we'll be using dill module to packup the estimator Class with our grid object.It is advisable to create a separate training.py file that contains all the code for training the model (See here for example).**

In [174]:
import dill as pickle
filename = 'model_v1.pk'

In [178]:
with open('flask_api/models/' + filename, 'wb')as file:
    pickle.dump(grid, file)

In [179]:
'''
So our model will be saved in the location above. 
Now that the model pickled, creating a Flask wrapper around it would be the next step.
Before that, to be sure that our pickled file works fine -- let's load it back and do a prediction:
'''

with open('flask_api/models/' + filename, 'rb') as f:
    loaded_model = pickle.load(f)

In [180]:
loaded_model.predict(test)

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

> Since, we already have the preprocessing steps required for the new incoming data present as a part of the pipeline we just have to run predict(). While working with scikit-learn, it is always easy to work with pipelines. 

> Estimators and pipelines save you time and headache, even if the initial implementation seems to be ridiculous. Stich in time, saves nine!


## 4. creat flask API

>There are three important parts in constructing our wrapper function, apicall():
* Getting the request data (for which predictions are to be made)
* Loading our pickled estimator
* jsonify our predictions and send the response back with status code: 200