# Training and Testing Model Predictions

In [1]:
#importing packages

import sys, os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

sys.path.append(os.path.abspath(os.path.join('..', 'utils')))
from utility import evaluate_cat_models, plot_precision_recall_vs_threshold
import model_pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# initialize models

# One shared seed so the tree-based and ensemble models give the same scores on
# every Restart & Run All (the tuned model further down also uses 10).
SEED = 10

gnb = GaussianNB()  # no random_state parameter to set
tree = DecisionTreeClassifier(random_state=SEED)
forest = RandomForestClassifier(random_state=SEED)
gradient_boost = GradientBoostingClassifier(random_state=SEED)
ada_boost = AdaBoostClassifier(random_state=SEED)


### Oversampling model

In [3]:
# Load the oversampled train/test splits in one pass and preview the training frame.
# NOTE(review): the preview shows an `Unnamed: 0` column that looks like a saved
# row index; it is not dropped later, so it ends up as a model feature - confirm
# whether that is intended.
train_set_os, test_set_os = map(
    pd.read_csv,
    (
        '../Data/ProcessedData/train_set_os.csv',
        '../Data/ProcessedData/test_set_os.csv',
    ),
)
train_set_os.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,0.358453,1,0.113858,1,0,0,1.55916,0.370856,-1.364158,0
1,1,-0.268326,1,1.698224,0,0,1,-1.592367,0.370856,-0.647879,0
2,1,0.288811,1,-0.203015,0,0,1,0.181121,-1.381341,-0.062918,0
3,0,0.288811,1,-0.678324,0,2,1,0.189704,-1.381341,-0.098731,1
4,1,0.079884,1,0.113858,0,0,1,0.381285,0.370856,-0.588189,0


In [4]:
# Separate the predictors from the `Response` target for both oversampled splits.
X_train_os, y_train_os = train_set_os.drop(columns='Response'), train_set_os['Response']
X_test_os, y_test_os = test_set_os.drop(columns='Response'), test_set_os['Response']

In [None]:
# Candidate classifiers, in the order they are evaluated.
model_list = [
    tree,
    gnb,
    forest,
    gradient_boost,
    ada_boost,
]

# Per the logged output, each model is fitted and then cross-validated in turn.
evaluate_cat_models(
    model_list,
    X_train_os,
    X_test_os,
    y_train_os,
    y_test_os,
    cv=5,
)

Fitting DecisionTreeClassifier()
Done with fitting....
DecisionTreeClassifier() cross validation
Done with cross validation


Fitting GaussianNB()
Done with fitting....
GaussianNB() cross validation
Done with cross validation


Fitting RandomForestClassifier()
Done with fitting....
RandomForestClassifier() cross validation


### Undersampling

In [None]:
# Load the undersampled train/test splits in one pass and preview the training frame.
train_set_us, test_set_us = map(
    pd.read_csv,
    (
        '../Data/ProcessedData/train_set_us.csv',
        '../Data/ProcessedData/test_set_us.csv',
    ),
)
train_set_us.head()

In [None]:
# Separate the predictors from the `Response` target for both undersampled splits.
X_train_us, y_train_us = train_set_us.drop(columns='Response'), train_set_us['Response']
X_test_us, y_test_us = test_set_us.drop(columns='Response'), test_set_us['Response']

In [None]:
# Same candidate classifiers as before, now scored on the undersampled data.
model_list = [
    tree,
    gnb,
    forest,
    gradient_boost,
    ada_boost,
]

evaluate_cat_models(
    model_list,
    X_train_us,
    X_test_us,
    y_train_us,
    y_test_us,
    cv=5,
)

### Smote

In [None]:
# Load the SMOTE train/test splits in one pass and preview the training frame.
train_set_sm, test_set_sm = map(
    pd.read_csv,
    (
        '../Data/ProcessedData/train_set_sm.csv',
        '../Data/ProcessedData/test_set_sm.csv',
    ),
)
train_set_sm.head()

In [None]:
# Separate the predictors from the `Response` target for both SMOTE splits.
X_train_sm, y_train_sm = train_set_sm.drop(columns='Response'), train_set_sm['Response']
X_test_sm, y_test_sm = test_set_sm.drop(columns='Response'), test_set_sm['Response']

In [None]:
# Same candidate classifiers as before, now scored on the SMOTE data.
model_list = [
    tree,
    gnb,
    forest,
    gradient_boost,
    ada_boost,
]

evaluate_cat_models(
    model_list,
    X_train_sm,
    X_test_sm,
    y_train_sm,
    y_test_sm,
    cv=5,
)

## Hyperparameter tuning for Gradient Boosting Classifier

Boosting is a sequential technique that works on the principle of ensemble. It combines a set of weak learners and delivers improved prediction accuracy.

The parameters for gradient boosting classifier are broken down into three categories

1. Tree-Specific Parameters: These affect each individual tree in the model
2. Boosting Parameters: These affect the boosting operation in the model
3. Miscellaneous Parameters: Other parameters for overall function

**Tree-Specific Parameters**
1. min_samples_split: 
    - Defines the minimum number of observations required in a node for it to be considered for splitting
2. min_samples_leaf: 
    - Defines the minimum observations required in a terminal node or leaf
3. min_weight_fraction_leaf
    - Similar to min_samples_leaf but defined as fraction of the total number of observations instead of an integer
4. max_depth
    - The maximum depth of a tree
5. max_leaf_nodes
    - The maximum number of terminal nodes or leaves in a tree
6. max_features
    - The number of features to consider while searching for a best split


**Boosting Parameters**
1. learning_rate
    - This determines the impact of each tree on the final outcome. 
2. n_estimators
    - The number of sequential trees to be modeled
3. subsample
    - The fraction of observations to be selected for each tree. Values slightly less than 1 make the model more robust by reducing variance
    
    
**Other Parameters**
1. loss
    - It refers to the loss function
2. init
    - This affects initialization of the output
3. random_state
    - The random number seed
4. verbose
    - The type of output to be printed when the model fits
5. warm_start
    - This parameter helps us fit additional trees on previous fit of a model
6. presort
    - Select whether to presort data for faster splits (deprecated in scikit-learn 0.22 and removed in 0.24, so only available on older versions)

In [None]:
# Feature names used to label importances. They come from the training matrix
# rather than the full frame, so the `Response` target (dropped when X_train_os
# was built) is excluded and the names line up one-to-one with the features the
# models are fitted on.
columns = X_train_os.columns

In [None]:
def plot_feature_imp(feature_imp, columns):
    """Show feature importances as a bar chart, one bar per feature.

    Parameters
    ----------
    feature_imp : sequence of float
        Importance value for each feature.
    columns : sequence
        Feature names, in the same order as ``feature_imp``.

    Raises
    ------
    ValueError
        If ``feature_imp`` and ``columns`` differ in length.
    """
    if len(feature_imp) != len(columns):
        raise ValueError(
            f'got {len(feature_imp)} importances for {len(columns)} column names'
        )

    positions = range(len(columns))
    # plt.plot() draws lines and rejects kind='bar' (that keyword belongs to
    # pandas' .plot()), so draw the bars explicitly.
    plt.bar(positions, feature_imp)
    # Vertical labels keep long feature names from overlapping.
    plt.xticks(positions, columns, rotation=90)
    plt.ylabel('Feature importance')
    plt.tight_layout()
    plt.show()

In [None]:
# Base estimator for the hyperparameter search: the boosting-level settings are
# fixed here; the remaining parameters are tuned by the search.
gradient_boost = GradientBoostingClassifier(
    n_estimators=60,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)

In [None]:
# Search space: 34 x 10 x 50 = 17,000 combinations, of which n_iter=200 are sampled.
param_grid = {
    'min_samples_split': range(20000, 26667, 200),
    'max_depth': range(6, 16),
    'learning_rate': np.linspace(0.0001, 0.1, 50)
}

# Candidates are ranked by Matthews correlation coefficient over 3 folds.
# random_state pins which 200 candidates are drawn, so the selected
# best_estimator_ is the same on every run.
rs = RandomizedSearchCV(
    gradient_boost,
    param_grid,
    n_iter=200,
    cv=3,
    scoring=make_scorer(matthews_corrcoef),
    n_jobs=-1,
    verbose=21,
    random_state=10,
)

In [None]:
# Run the randomized search on the oversampled training data
# (200 sampled candidates x 3 folds, so this cell is slow).
rs.fit(X_train_os, y_train_os)

In [None]:
# Display the best-scoring estimator found by the search.
rs.best_estimator_

In [None]:
# Fit a GradientBoostingClassifier with default hyperparameters and persist it.
# NOTE(review): the next cell saves the tuned estimator to this same path, so
# running both leaves only the tuned model on disk.
model = GradientBoostingClassifier()
model.fit(X_train_os, y_train_os)
# Context manager guarantees the file is flushed and closed after the dump.
with open('../models/gradient_boost.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Persist the tuned estimator; the context manager closes the file after the dump.
with open('../models/gradient_boost.pkl', 'wb') as model_file:
    pickle.dump(rs.best_estimator_, model_file)

In [None]:
# Reload the persisted model. Only unpickle files this project wrote itself:
# pickle.load can execute arbitrary code from an untrusted file.
with open('../models/gradient_boost.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Evaluate the reloaded model alone, using the same helper and data as the
# earlier oversampling comparison.
evaluate_cat_models([model], X_train_os, X_test_os, y_train_os, y_test_os, cv=5)

## Model Interpretation

- `Positive Class`: A policyholder is interested in vehicle insurance
- `Negative Class`: A policyholder is not interested in vehicle insurance

In [None]:
# Hard class predictions on the oversampled test split, using the model's default decision rule.
y_pred = model.predict(X_test_os)

In [None]:
# Draw the confusion matrix for the test split and save it alongside the other plots.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement on newer versions.
fig, ax = plt.subplots(figsize=(12, 8))
plot_confusion_matrix(model, X_test_os, y_test_os, ax=ax)
# Save before plt.show() so the written file contains the rendered figure.
fig.savefig('../DataAnalysisAndViz/plots/conf_mat.jpeg', dpi=82)
plt.show()

**Observation**
1. The model predicts the positive class when it was actually the positive class 8682 times
2. The model predicts the positive class when it was actually the negative class 21749 times 
3. The model predicts the negative class when it was actually the positive class 712 times 
4. The model predicts the negative class when it was actually the negative class 45079 times 

In [None]:
# Precision with class 1 treated as the positive class.
precision_score(y_test_os, y_pred, pos_label=1)

In [None]:
# Recall with class 1 treated as the positive class.
recall_score(y_test_os, y_pred, pos_label=1)

In [None]:
# F1 score with class 1 treated as the positive class.
f1_score(y_test_os, y_pred, pos_label=1)

The `precision` is the proportion of true positives out of all detected positives or simply

**TP / (TP + FP)**

8682 / (8682 + 21749) = 0.285

The `recall` is the proportion of actual positives that the model correctly identifies, or simply

**TP / (TP + FN)**

8682 / (8682 + 712) = 0.924

Finally, the `f1 score` is the harmonic mean of precision and recall: 2 × (0.285 × 0.924) / (0.285 + 0.924) ≈ 0.436, i.e. about 44%

Precision, recall and, by extension, the f1 score are computed with respect to one class (the positive class). If we flip the positive class, we get entirely different precision, recall and f1 values.

In [None]:
# Precision with class 0 treated as the positive class.
precision_score(y_test_os, y_pred, pos_label=0)

In [None]:
# Recall with class 0 treated as the positive class.
recall_score(y_test_os, y_pred, pos_label=0)

In [None]:
# F1 score with class 0 treated as the positive class.
f1_score(y_test_os, y_pred, pos_label=0)

This is because these metrics do not take `True Negatives` into account: with class 1 as the positive class, they say nothing about how well the model detects policyholders who are not interested in vehicle insurance.

In business context

- If advertising vehicle insurance to health insurance policy holder would be very expensive, then we optimize the model for precision, because we want to be very sure that they would be interested
- If advertising vehicle insurance to health insurance policy holder would not be very expensive, then we optimize the model for recall, because we want to advertise to everyone that would be interested.
> A naive model would advertise to all policy holders

In this context, we would optimize for precision

In [None]:
# Column 1 of predict_proba is kept as the score for class 1.
y_scores = model.predict_proba(X_test_os)[:, 1]
# Precision/recall pairs across the candidate decision thresholds.
precisions, recalls, thresholds = precision_recall_curve(y_test_os, y_scores, pos_label=1)

In [None]:
# Plot precision and recall against the decision threshold using the project helper.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
# Inspect the threshold values returned by precision_recall_curve.
thresholds

In [None]:
# Decision threshold applied to the class-1 scores in the following cells.
threshold = .6

In [None]:
# Predict the positive class only when its score exceeds the chosen threshold.
y_pred_60 = y_scores > threshold
# scikit-learn metrics take (y_true, y_pred). With the ground truth first this is
# really precision; with the arguments swapped it would compute recall instead.
precision_score(y_test_os, y_pred_60, pos_label=1)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the ground truth first this is
# really recall; with the arguments swapped it would compute precision instead.
recall_score(y_test_os, y_pred_60, pos_label=1)

In [None]:
# Convert the boolean thresholded predictions to 0/1 integers.
# This rebinds y_pred, replacing the earlier model.predict() output.
y_pred = y_pred_60.astype(np.int32)

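At the 0.6 threshold this model has a precision of about 30% and a recall of about 84%. (scikit-learn's metric functions take `y_true` first and `y_pred` second; if the two arguments are swapped, the precision and recall figures trade places.)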

## Model Building and Saving

In [None]:
# model instantiation
# Reload the persisted estimator. Only unpickle files this project wrote itself:
# pickle.load can execute arbitrary code from an untrusted file.
with open('../models/gradient_boost.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Refit the reloaded estimator on the oversampled training data.
model.fit(X_train_os, y_train_os)

In [None]:
# Overwrite the saved model with the refitted one; the context manager closes
# the file after the dump.
with open('../models/gradient_boost.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Run the project pipeline on the raw training file and persist the pipeline it returns.
# NOTE(review): run_pipeline is defined in model_pipeline (not shown here);
# presumably it preprocesses `data` and predicts with `model` - confirm.
data = model_pipeline.load_data('../Data/train.csv')
pred, proba, full_pipeline = model_pipeline.run_pipeline(data, model)
# Context manager guarantees the file is flushed and closed after the dump.
with open('../models/pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(full_pipeline, pipeline_file)

In [None]:
# Score the raw test file with the saved model and saved pipeline.
model = model_pipeline.load_model('../models/gradient_boost.pkl')
data = model_pipeline.load_data('../Data/test.csv')
# NOTE(review): the third argument is presumably the path of the saved pipeline
# to reuse - confirm against model_pipeline.run_pipeline.
pred, proba, full_pipeline = model_pipeline.run_pipeline(data, model, '../models/pipeline.pkl')

# Re-read the raw file (rebinding `data`) and attach the predictions as a new column.
# Assumes `pred` has one entry per row of test.csv, in file order - TODO confirm.
data = pd.read_csv('../Data/test.csv')
data['Prediction'] = pred
data