# Modeling

### Train 6 ML algorithm:
* Gaussian Naive Bayes 
* Logistic Regression
* Decision Tree
* Random Forest
* Gradient Boosting
* XGBoost


## Import libraries

In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, f1_score, fbeta_score, accuracy_score

import joblib


## Load data

In [2]:

Train = pd.read_csv('../input/Train.csv', index_col='id')
Val = pd.read_csv('../input/Val.csv', index_col='id')
Test = pd.read_csv('../input/Test.csv', index_col='id')

df00_not_norm = pd.read_csv('../input/df_not_norm.csv', index_col='id')

Train.head()

Unnamed: 0_level_0,loan_amnt,term,emp_length,annual_inc,addr_state,dti,mths_since_recent_inq,bc_open_to_buy,num_op_rev_tl,home_ownership_MORTGAGE,home_ownership_OTHERS,home_ownership_OWN,home_ownership_RENT,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
105253408,0.179897,-0.564595,0.2890743,0.119112,0.231563,0.41055,2.617179,-0.612346,-1.188047,1,0,0,0,1
16592425,0.546808,1.771181,-4.961623e-16,-0.183949,-0.964507,0.06924,-1.046193,0.647717,1.963179,0,0,1,0,0
58470345,-0.966701,-0.564595,0.2890743,-0.637249,0.967607,-1.046003,-0.679856,-0.67863,-1.188047,0,0,0,1,0
10092824,-0.508062,-0.564595,0.2890743,1.013113,-0.596485,-0.66214,0.785493,7.479771,5.789667,1,0,0,0,0
65965690,0.753196,-0.564595,1.127018,0.930595,-2.068572,0.22704,-0.13035,0.55854,-0.287696,1,0,0,0,1


## Split data at X and y

In [3]:
X_train = Train.drop(['target'], axis=1)
y_train = Train['target']


X_val = Val.drop(['target'], axis=1)
y_val = Val['target']


X_test = Test.drop(['target'], axis=1)
y_test = Test['target']

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape



((807186, 13), (807186,), (269062, 13), (269062,), (269062, 13), (269062,))

In [4]:
X_train.head()

Unnamed: 0_level_0,loan_amnt,term,emp_length,annual_inc,addr_state,dti,mths_since_recent_inq,bc_open_to_buy,num_op_rev_tl,home_ownership_MORTGAGE,home_ownership_OTHERS,home_ownership_OWN,home_ownership_RENT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
105253408,0.179897,-0.564595,0.2890743,0.119112,0.231563,0.41055,2.617179,-0.612346,-1.188047,1,0,0,0
16592425,0.546808,1.771181,-4.961623e-16,-0.183949,-0.964507,0.06924,-1.046193,0.647717,1.963179,0,0,1,0
58470345,-0.966701,-0.564595,0.2890743,-0.637249,0.967607,-1.046003,-0.679856,-0.67863,-1.188047,0,0,0,1
10092824,-0.508062,-0.564595,0.2890743,1.013113,-0.596485,-0.66214,0.785493,7.479771,5.789667,1,0,0,0
65965690,0.753196,-0.564595,1.127018,0.930595,-2.068572,0.22704,-0.13035,0.55854,-0.287696,1,0,0,0


## Training Models

### Model 1) Gaussian Naive Bayes
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html


In [5]:
# 1) Gaussian Naive Bayes


gaussian = GaussianNB()
gaussian.fit(X_train, y_train)


### Model 2) Logistic Regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [6]:
# 2) Logistic Regression

logreg = LogisticRegression(penalty='l2', 
                            dual=False, 
                            tol=0.0001, 
                            C=1, 
                            fit_intercept=True, 
                            intercept_scaling=1, 
                            class_weight=None, 
                            random_state=None, 
                            solver='liblinear', 
                            max_iter=100, 
                            multi_class='ovr', 
                            verbose=0, 
                            warm_start=False, 
                            n_jobs=1)
logreg.fit(X_train, y_train)



### Model 3) Decision Tree
https://scikit-learn.org/stable/modules/tree.html

In [7]:
#Decision Tree

dectree = DecisionTreeClassifier(criterion='gini', 
                                 splitter='best', 
                                 max_depth=4, 
                                 min_samples_split=2, 
                                 min_samples_leaf=2, 
                                 min_weight_fraction_leaf=0.0,
                                 max_features=None, 
                                 random_state=None, 
                                 max_leaf_nodes=None, 
                                 min_impurity_decrease=0.0, 
                                 class_weight=None)
dectree.fit(X_train, y_train)



### Model 4) Random Forest
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [8]:
# Random Forest

rndforest = RandomForestClassifier(n_estimators=10, 
                                   criterion='gini', 
                                   max_depth=3, 
                                   min_samples_split=2, 
                                   min_samples_leaf=1, 
                                   min_weight_fraction_leaf=0.0, 
                                   max_features='sqrt', 
                                   max_leaf_nodes=None, 
                                   min_impurity_decrease=0.0, 
                                   bootstrap=True, 
                                   oob_score=False, 
                                   n_jobs=None, 
                                   random_state=None, 
                                   verbose=0, 
                                   warm_start=False, 
                                   class_weight=None)
rndforest.fit(X_train, y_train)



### Model 5) Gradient Boosting Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [9]:
# Gradient Boosting Classifier

gbc = GradientBoostingClassifier(loss='log_loss', 
                                 learning_rate=0.1,
                                 n_estimators=100, 
                                 subsample=1.0, 
                                 criterion='friedman_mse',
                                 min_samples_split=2, 
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0,
                                 max_depth=3,
                                 min_impurity_decrease=0.0, 
                                #  min_impurity_split=None, 
                                 init=None, 
                                 random_state=None,
                                 max_features=None,
                                 verbose=0, 
                                 max_leaf_nodes=None, 
                                 warm_start=False, 
                                #  presort='auto',
                                 validation_fraction=0.2, 
                                 n_iter_no_change=None,
                                 tol=0.0001)

gbc.fit(X_train, y_train)


### Model 6) XGBoost


In [10]:


xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',  # for binary classification tasks
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    min_child_weight=1,           # similar to min_samples_leaf
    max_depth=3,
    gamma=0.0,                    # similar to min_impurity_decrease
    random_state=None,
    colsample_bytree=1.0,         # similar to max_features
    verbosity=1,                  # similar to verbose (1 for messages, 0 for silent)
    # early_stopping_rounds=10,     # similar to n_iter_no_change
    nthread=-1,                   # to use all available threads
    booster='gbtree'              # use gradient boosted trees as default
)

xgb_classifier.fit(
    X_train, y_train, 
    eval_set=[(X_train, y_train)], 
    eval_metric='logloss',        # for binary classification
    verbose=True,                 # print messages during training
    early_stopping_rounds=10      # stop if performance doesn't improve after 10 rounds
)



[0]	validation_0-logloss:0.65603
[1]	validation_0-logloss:0.62574
[2]	validation_0-logloss:0.60085
[3]	validation_0-logloss:0.58024
[4]	validation_0-logloss:0.56308
[5]	validation_0-logloss:0.54876
[6]	validation_0-logloss:0.53679
[7]	validation_0-logloss:0.52672
[8]	validation_0-logloss:0.51826
[9]	validation_0-logloss:0.51114
[10]	validation_0-logloss:0.50515
[11]	validation_0-logloss:0.50008
[12]	validation_0-logloss:0.49579
[13]	validation_0-logloss:0.49216
[14]	validation_0-logloss:0.48910
[15]	validation_0-logloss:0.48650
[16]	validation_0-logloss:0.48428
[17]	validation_0-logloss:0.48240
[18]	validation_0-logloss:0.48079
[19]	validation_0-logloss:0.47940
[20]	validation_0-logloss:0.47821
[21]	validation_0-logloss:0.47716
[22]	validation_0-logloss:0.47627
[23]	validation_0-logloss:0.47545
[24]	validation_0-logloss:0.47475
[25]	validation_0-logloss:0.47414
[26]	validation_0-logloss:0.47355
[27]	validation_0-logloss:0.47306
[28]	validation_0-logloss:0.47261
[29]	validation_0-loglos

## Save models

In [11]:
# Define the list of models to compare
models = [gaussian, logreg, dectree, rndforest, gbc, xgb_classifier]
model_names = ['Gaussian Naive Bayes','Logistic Regression', 'Decision Tree', 'RandomForest', 'Gradient Boosting', 'XGBoost']
models_names_short = ['gaussian', 'logreg', 'dectree', 'rndforest', 'gbc', 'xgb_classifier']

for i, model in enumerate(models):
        joblib.dump(model, "../artifacts/" + models_names_short[i] + "_model.joblib") 

