### 09 Model Building & Grid Searching

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.optimizers import Adam
# from keras.layers import Dropout
from sklearn.model_selection import GridSearchCV
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
random_state = 42

In [None]:
!pip3 install xgboost

In [None]:
import sys
print(sys.base_prefix)

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
model = pd.read_csv('model data snow day.csv')

In [None]:
# feature engineering
# engineer a number out of stage
model.stage.value_counts()

#### Closed Won, Deal Signed, Invoice Sent = 1; otherwise 0

In [None]:
def translate_stage(stage):
    """Encode a deal stage as the binary modelling target.

    Returns 1 when ``stage`` is 'Closed Won', 'Deal Signed' or 'Invoice Sent',
    and 0 for any other value.
    """
    positive_stages = ('Closed Won', 'Deal Signed', 'Invoice Sent')
    return 1 if stage in positive_stages else 0

In [None]:
model['y'] = model['stage'].apply(translate_stage)

In [None]:
model.shape

#### Feature engineering

In [None]:
# Select the target column and the feature columns used by every model below.
feature_cols = [
    'lat',
    'lng',
    'mobility_score',
    'carshare',
    'bikeshare',
    'ridehailing',
    'masstransit',
    'closest_ts',
    'within_one_tenth',
    'within_one_half',
    'within_one',
]
y = model['y']
X = model[feature_cols]

#### Conducting a train/test split

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=14)

#### Calculating our baseline

In [None]:
# Baseline accuracy = share of the majority class (always predicting it).
# Computed from the data instead of hand-copied counts so it cannot go stale
# when the input file changes.
class_counts = y.value_counts()
print(class_counts)
print(class_counts.max() / class_counts.sum())

### Logistic Regression

In [None]:
% % time
logreg_params = {
    'penalty': ['l2'],
    'C': [1.0]
}

logreg_gd = GridSearchCV(LogisticRegression(), logreg_params, verbose=1, cv=5)
logreg_gd.fit(X_train, y_train)

In [None]:
print(logreg_gd.best_score_)
print(logreg_gd.best_params_)
logreg_best = logreg_gd.best_estimator_
logreg_best

In [None]:
# Fit a standalone logistic regression (L2 penalty, C=1.0) with a fixed seed.
logreg = LogisticRegression(C=1.0, penalty='l2', random_state=1)
logreg.fit(X_train, y_train)

# Inspect the fitted model. Note the probabilities are printed for every row
# of X (training and test rows alike), not only the held-out set.
print('Intercept', logreg.intercept_)
print('Coefficients', logreg.coef_)
print('Predicted probability', logreg.predict_proba(X))

In [None]:
cross_val_score(logreg, X_train, y_train, cv=5, scoring='accuracy').mean()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
logreg_pred = logreg.predict(X_test)

In [None]:
logreg_proba = logreg.predict_proba(X_test)
logreg_proba

#### Logistic Regression Confusion Matrix

In [None]:
# Rows are the actual classes (y_test), columns the predicted classes.
logreg_CM = confusion_matrix(y_test, logreg_pred)
logreg_CM


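- Specificity: 94.99% (360 / (360 + 19)); the 53.67% originally quoted here corresponds approximately to the positive-class precision, 22 / (22 + 19)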

- True negatives: 360
- False positives: 19
- False negatives: 104
- True positives: 22 

##### Specificity

In [None]:
# Specificity = TN / (TN + FP), read from the confusion matrix rather than
# hand-copied counts so it stays correct if the data or split changes.
tn, fp, fn, tp = logreg_CM.ravel()
print(tn / (tn + fp))

##### Sensitivity

In [None]:
# Sensitivity (recall of the positive class) = TP / (TP + FN), read from the
# confusion matrix rather than hand-copied counts.
tn, fp, fn, tp = logreg_CM.ravel()
print(tp / (tp + fn))

In [None]:
# plt.scatter(logreg_pred, ) #predictions vs. closed

In [None]:
from sklearn.metrics import classification_report

#### Logistic Regression Classification Report

In [None]:
print(classification_report(y_test, logreg_pred))

- Precision: 72%


In [None]:
from sklearn.metrics import roc_auc_score

#### Logistic Regression ROC-AUC Score

In [None]:
# ROC-AUC must be computed from a continuous score: use the predicted
# probability of the positive class instead of hard 0/1 labels, which would
# reduce the curve to a single threshold. `average='samples'` only applies to
# multilabel targets, so it is dropped for this binary problem.
roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

### Random Forest Classifier

#### Grid searching, hyperparameter tuning

In [None]:
% % time
ranfor_params = {
    'n_estimators': [500],
    'max_depth': [5],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': [3]

}

ranfor_gd = GridSearchCV(RandomForestClassifier(),
                         ranfor_params, verbose=1, cv=5)
ranfor_gd.fit(X_train, y_train)

In [None]:
print(ranfor_gd.best_score_)
print(ranfor_gd.best_params_)

#### Building our model with the tuned hyperparameters

In [None]:
ranfor_best = RandomForestClassifier(n_estimators=500, max_depth=5, min_samples_split=2,
                                     min_samples_leaf=1, max_features=3, random_state=2)

In [None]:
ranfor_best.fit(X_train, y_train)

In [None]:
ranfor_best_scores = cross_val_score(ranfor_best, X_train, y_train, cv=5)
print(ranfor_best_scores)
print(np.mean(ranfor_best_scores))

In [None]:
features_ranfor = pd.DataFrame(list(zip(
    X.columns, ranfor_best.feature_importances_)), columns=['feature', 'importance'])

In [None]:
features_ranfor.plot(kind='bar', title='Random Forest Classifier Model Feature Importance',
                     x='feature', y='importance', fontsize='large')
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
features_ranfor

In [None]:
ranfor_pred = ranfor_best.predict(X_test)

#### Random Forest Classifier Confusion Matrix

In [None]:
ranfor_CM = confusion_matrix(y_test, ranfor_pred)
ranfor_CM

- True positives: 24
- False negatives: 102
- False positives: 18
- True negatives: 361

##### Specificity

In [None]:
# Specificity = TN / (TN + FP), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = ranfor_CM.ravel()
print(tn / (tn + fp))

##### Sensitivity

In [None]:
# Sensitivity = TP / (TP + FN), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = ranfor_CM.ravel()
print(tp / (tp + fn))

#### Random Forest Classifier ROC-AUC Score

In [None]:
# Score with the positive-class probability; hard 0/1 predictions would
# collapse the ROC curve to a single operating point.
roc_auc_score(y_test, ranfor_best.predict_proba(X_test)[:, 1])

#### Random Forest Classifier Classification Report

In [None]:
print(classification_report(y_test, ranfor_pred))

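- Precision (weighted average): 73%
- Recall (weighted average, equal to overall accuracy): 76%. Note that the positive-class recall (sensitivity) is only 24 / (24 + 102) ≈ 19%.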

### Gradient Boost Classifier

#### Grid searching, hyperparameter tuning

In [None]:
% % time
gradboost_params = {
    'loss': ['exponential'],
    'learning_rate': [0.01],
    'n_estimators': [300],
    'max_depth': [1],
    'min_samples_leaf': [1],
    'max_features': [8]
}

gradboost_gd = GridSearchCV(
    GradientBoostingClassifier(), gradboost_params, verbose=1, cv=5)
gradboost_gd.fit(X_train, y_train)

In [None]:
print(gradboost_gd.best_score_)
print(gradboost_gd.best_params_)

In [None]:
gradboost_best = GradientBoostingClassifier(loss='exponential', learning_rate=0.01,
                                            n_estimators=300, max_depth=1, min_samples_leaf=1, max_features=8,
                                            random_state=3)
gradboost_best.fit(X_train, y_train)

In [None]:
gradboost_scores = cross_val_score(gradboost_best, X_train, y_train, cv=5)
print(gradboost_scores)
print(np.mean(gradboost_scores))

In [None]:
features_gradboost = pd.DataFrame(list(zip(
    X.columns, gradboost_best.feature_importances_)), columns=['feature', 'importance'])

In [None]:
# Bar chart of the gradient-boosting feature importances.
# The `sort_columns` argument was dropped: it is deprecated and then removed
# in recent pandas releases, and it is not needed for a single y column.
features_gradboost.plot(kind='bar', title='Gradient Boost Classifier Feature Importance',
                        x='feature', y='importance', fontsize='large', legend=False)
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
features_gradboost

In [None]:
gradboost_pred = gradboost_best.predict(X_test)

#### Gradient Boost Classifier Confusion Matrix

In [None]:
gradboost_CM = confusion_matrix(y_test, gradboost_pred)
gradboost_CM

- True positives: 24
- False negatives: 102
- False positives: 21
- True negatives: 358

##### Specificity

In [None]:
# Specificity = TN / (TN + FP), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = gradboost_CM.ravel()
print(tn / (tn + fp))

##### Sensitivity

In [None]:
# Sensitivity = TP / (TP + FN), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = gradboost_CM.ravel()
print(tp / (tp + fn))

#### Gradient Boost Classifier ROC-AUC Score

In [None]:
# Score with the positive-class probability; hard 0/1 predictions would
# collapse the ROC curve to a single operating point.
roc_auc_score(y_test, gradboost_best.predict_proba(X_test)[:, 1])

#### Gradient Boost Classifier Classification Report

In [None]:
print(classification_report(y_test, gradboost_pred))

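- Precision (weighted average): 72%
- Recall (weighted average, equal to overall accuracy): 76%. Note that the positive-class recall (sensitivity) is only 24 / (24 + 102) ≈ 19%.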

### AdaBoost Classifier

#### Grid Searching, Hyperparameter tuning

In [None]:
% % time
adaboost_params = {
    'n_estimators': [25],
    'learning_rate': [0.1],

}

adaboost_gd = GridSearchCV(
    AdaBoostClassifier(), adaboost_params, verbose=1, cv=5)
adaboost_gd.fit(X_train, y_train)

In [None]:
print(adaboost_gd.best_score_)
print(adaboost_gd.best_params_)

#### Building the model with the tuned hyperparameters

In [None]:
adaboost_best = AdaBoostClassifier(
    learning_rate=0.1, n_estimators=25, random_state=4)
adaboost_best.fit(X_train, y_train)

In [None]:
adaboost_scores = cross_val_score(adaboost_best, X_train, y_train, cv=5)
print(adaboost_scores)
print(np.mean(adaboost_scores))

#### AdaBoost Classifier Feature Importance

In [None]:
features_adaboost = pd.DataFrame(list(zip(
    X.columns, adaboost_best.feature_importances_)), columns=['feature', 'importance'])

In [None]:
features_adaboost.plot(kind='bar', title='AdaBoost Classifier Feature Importance',
                       x='feature', y='importance', fontsize='large')
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
features_adaboost

In [None]:
adaboost_pred = adaboost_best.predict(X_test)

#### AdaBoost Classifier Confusion Matrix

In [None]:
adaboost_CM = confusion_matrix(y_test, adaboost_pred)
adaboost_CM

- True positives: 25
- False negatives: 101
- False positives: 21
- True negatives: 358

##### Specificity

In [None]:
# Specificity = TN / (TN + FP), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = adaboost_CM.ravel()
print(tn / (tn + fp))

##### Sensitivity

In [None]:
# Sensitivity = TP / (TP + FN), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = adaboost_CM.ravel()
print(tp / (tp + fn))

#### AdaBoost Classifier ROC-AUC score

In [None]:
# Score with the positive-class probability; hard 0/1 predictions would
# collapse the ROC curve to a single operating point.
roc_auc_score(y_test, adaboost_best.predict_proba(X_test)[:, 1])

#### AdaBoost Classifier Classification Report

In [None]:
print(classification_report(y_test, adaboost_pred))

- Precision: 72%

### XGBoost

In [None]:
% % time
xgb_params = {
    'max_depth': [3],
    'learning_rate': [0.01],
    'n_estimators': [125],
    'gamma': [0.05],
    'min_child_weight': [1],
    'subsample': [1.0],
    'colsample_bytree': [0.6],
    'reg_lambda': [0.01],
    'reg_alpha': [0]
}

xgboost_gd = GridSearchCV(XGBClassifier(), xgb_params, verbose=1, cv=5)
xgboost_gd.fit(X_train, y_train)

In [None]:
print(xgboost_gd.best_score_)
print(xgboost_gd.best_params_)

In [None]:
xgboost_best = XGBClassifier(max_depth=3, learning_rate=0.01, n_estimators=125, gamma=0.05, min_child_weight=1.0,
                             subsample=1.0, colsample_bytree=0.6, reg_lambda=0.01, reg_alpha=0, random_state=5)
xgboost_best.fit(X_train, y_train)

In [None]:
xgb_scores = cross_val_score(xgboost_best, X_train, y_train, cv=5)
print(xgb_scores)
print(np.mean(xgb_scores))

In [None]:
features_xgboost = pd.DataFrame(list(zip(
    X.columns, xgboost_best.feature_importances_)), columns=['feature', 'importance'])

#### Extreme Gradient Boost Classifier Feature Importance

In [None]:
features_xgboost.plot(kind='bar', title='XGBoost Classifier Feature Importance',
                      x='feature', y='importance', fontsize='large')
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
features_xgboost

In [None]:
xgb_pred = xgboost_best.predict(X_test)

#### Extreme Gradient Boost Classifier Confusion Matrix

In [None]:
# confusion_matrix expects (y_true, y_pred). With this order the rows are the
# actual classes and the columns the predictions, matching the other models;
# the previous swapped order produced the transposed matrix.
xgb_CM = confusion_matrix(y_test, xgb_pred)
xgb_CM

- True positives: 23
- False negatives: 103
- False positives: 18
- True negatives: 361

##### Specificity

In [None]:
# Specificity = TN / (TN + FP). The matrix is rebuilt here with y_test as the
# ground truth so the false-positive count is not confused with the
# false-negative count.
tn, fp, fn, tp = confusion_matrix(y_test, xgb_pred).ravel()
print(tn / (tn + fp))

##### Sensitivity

In [None]:
# Sensitivity = TP / (TP + FN). The matrix is rebuilt here with y_test as the
# ground truth so the false-negative count is not confused with the
# false-positive count.
tn, fp, fn, tp = confusion_matrix(y_test, xgb_pred).ravel()
print(tp / (tp + fn))

#### Extreme Gradient Boost Classifier ROC-AUC Score

In [None]:
# Score with the positive-class probability; hard 0/1 predictions would
# collapse the ROC curve to a single operating point.
roc_auc_score(y_test, xgboost_best.predict_proba(X_test)[:, 1])

#### Extreme Gradient Boost Classifier Classification Report

In [None]:
# classification_report expects (y_true, y_pred); the swapped order reported
# precision and recall with the roles of truth and prediction exchanged.
print(classification_report(y_test, xgb_pred))

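- Precision: 72% (weighted average, with `y_test` as the true labels; the 89% figure results from passing the predictions as the true labels)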

### Extra Trees Classifier

In [None]:
% % time
ext_params = {
    'max_features': [2],
    'max_depth': [8],
    'min_samples_split': [15],
    'n_estimators': [800],
    'min_samples_leaf': [1],
    'criterion': ['entropy'],
}

ext_gd = GridSearchCV(ExtraTreesClassifier(), ext_params, verbose=1, cv=5)
ext_gd.fit(X_train, y_train)

In [None]:
print(ext_gd.best_score_)
print(ext_gd.best_params_)

In [None]:
ext_best = ExtraTreesClassifier(max_depth=8, min_samples_leaf=1, n_estimators=800, min_samples_split=15,
                                max_features=2, criterion='entropy', random_state=6)
ext_best.fit(X_train, y_train)

In [None]:
ext_scores = cross_val_score(ext_best, X_train, y_train, cv=5)
print(ext_scores)
print(np.mean(ext_scores))

#### Extra Trees Classifier Feature Importance

In [None]:
features_ext = pd.DataFrame(list(
    zip(X.columns, ext_best.feature_importances_)), columns=['feature', 'importance'])

In [None]:
features_ext.plot(kind='bar', title='Extra Trees Classifier Feature Importance',
                  x='feature', y='importance', fontsize='large')
plt.xticks(rotation=90)
plt.xlabel('Features', fontsize='large')
plt.ylabel('Feature importance', fontsize='large')

In [None]:
ext_pred = ext_best.predict(X_test)

#### Extra Trees Classifier Confusion Matrix

In [None]:
ext_CM = confusion_matrix(y_test, ext_pred)
ext_CM

- True positives: 21
- False negatives: 105
- False positives: 8
- True negatives: 371

##### Specificity

In [None]:
# Specificity = TN / (TN + FP), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = ext_CM.ravel()
print(tn / (tn + fp))

##### Sensitivity

In [None]:
# Sensitivity = TP / (TP + FN), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = ext_CM.ravel()
print(tp / (tp + fn))

#### Extra Trees Classifier ROC-AUC Score

In [None]:
# Score with the positive-class probability; hard 0/1 predictions would
# collapse the ROC curve to a single operating point.
roc_auc_score(y_test, ext_best.predict_proba(X_test)[:, 1])

#### Extra Trees Classifier Classification Report

In [None]:
print(classification_report(y_test, ext_pred))

- Precision: 77%

### Bagging Classifier

In [None]:
% % time
bag_params = {
    'n_estimators': [800],
    'max_samples': [50],
    'bootstrap': [False],
    'bootstrap_features': [True]

}

bag_gd = GridSearchCV(BaggingClassifier(), bag_params, verbose=1, cv=5)
bag_gd.fit(X_train, y_train)

In [None]:
print(bag_gd.best_score_)
print(bag_gd.best_params_)

In [None]:
bag_best = BaggingClassifier(n_estimators=800, max_samples=50,
                             bootstrap=False, bootstrap_features=True, random_state=7)
bag_best.fit(X_train, y_train)

In [None]:
bag_scores = cross_val_score(bag_best, X_train, y_train, cv=5)
print(bag_scores)
print(np.mean(bag_scores))

In [None]:
bag_pred = bag_best.predict(X_test)

#### Bagging Trees Classifier Confusion Matrix

In [None]:
bag_CM = confusion_matrix(y_test, bag_pred)
bag_CM

- True positives: 27
- False negatives: 99
- False positives: 14
- True negatives: 365

##### Specificity

In [None]:
# Specificity = TN / (TN + FP), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = bag_CM.ravel()
print(tn / (tn + fp))

##### Sensitivity

In [None]:
# Sensitivity = TP / (TP + FN), read from the confusion matrix rather than
# hand-copied counts.
tn, fp, fn, tp = bag_CM.ravel()
print(tp / (tp + fn))

#### Bagging Trees Classifier ROC-AUC Score

In [None]:
# Score with the positive-class probability; hard 0/1 predictions would
# collapse the ROC curve to a single operating point.
roc_auc_score(y_test, bag_best.predict_proba(X_test)[:, 1])

#### Bagging Trees Classifier Classification Report

In [None]:
print(classification_report(y_test, bag_pred))

- Precision: 75%

### Model Evaluation

Cross Validation Scores
- Logistic Regression: 76.74%
- Random Forest Classifier: 76.49%
- AdaBoost Classifier: 76.91%
- Gradient Boost Classifier: 76.91%
- Extreme Gradient Boost Classifier: 76.67%
- Extra Trees Classifier: 75.89%
- Bagging Classifier: 76.40%