# Part III: Ensembles and Final Result

## AdaBoost

Train an AdaBoost classifier using Decision Tree stubs as weak learners. Compare its performance to results obtained in Part II using 10 fold CV.

In [1]:
# AdaBoost code goes here
import proj2_lib.util as utils

file_config = utils.file_config

# ONLY NEED TO RUN THIS STEP ONCE (switch this to True to run it)
RUN_MAKE_TRAIN_TEST_FILES = False
if RUN_MAKE_TRAIN_TEST_FILES:
    utils.make_train_test_sets(config=file_config)
    
import proj2_lib.preprocess as preprocess

# ONLY NEED TO RUN THIS STEP ONCE
RUN_FIT_PREPROCESSING = False
if RUN_FIT_PREPROCESSING:
    preprocess.fit_save_pipelines(config=file_config)

train_X, train_y = preprocess.load_train_data(config=file_config)

print(train_X.shape)
print(train_y.shape)

(90526, 101)
(90526,)


In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

ab_class = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=600,
    learning_rate=1)

In [3]:
ab_class.fit(train_X,train_y)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=600, random_state=None)

In [4]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ab_class, train_X, train_y, 
                         scoring='accuracy', cv=10)

In [5]:
# mean accuracy of Decision Tree classifier
import numpy as np
np.mean(scores)

0.79720739760097881

In [7]:
auc_scores = cross_val_score(ab_class, train_X, train_y,
                            scoring='roc_auc', cv=10)

In [8]:
# mean AUC score of Decision Tree classifier
np.mean(auc_scores)

0.72810419110080737

## Stacking

Choose a set of 5 or so classifiers. Write a function that trains an ensemble using stacking

In [2]:
def build_stack_ensemble(X, y):
    
    # ********************************************************
    # ** THIS WAS IMPLEMENTED OUTSIDE THE FUNCTION BODY ******
    # ********************************************************

    return None

Use 10-fold cross validation to measure performance of your stacked classifier. See Part II solution to see how to roll your own sklearn classifier along with http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

# LinearSVC train/fit/prediction

In [10]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

svm_param_grid = [
    {'C': 2 ** np.linspace(-3, 5, num=5)}
]

svm_classifier = LinearSVC()
svm_tuner = GridSearchCV(svm_classifier, svm_param_grid, cv=5,
                        scoring='roc_auc', n_jobs=4, verbose=1)

In [12]:
svm_tuner.fit(train_X,train_y)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  1.9min finished


AttributeError: 'GridSearchCV' object has no attribute 'train'

In [16]:
svm_tuner_ypred = svm_tuner.predict(train_X)

# DecisionTreeClassifier train/fit/prediction

In [21]:
from sklearn.tree import DecisionTreeClassifier

tree_param_grid = [
    {'min_samples_split': range(2, 22, 4)}
]

tree_classifier = DecisionTreeClassifier()
tree_tuner = GridSearchCV(tree_classifier, tree_param_grid, cv=5,
                        scoring='roc_auc', n_jobs=4, verbose=1)

In [22]:
tree_tuner.fit(train_X,train_y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:   23.4s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=4,
       param_grid=[{'min_samples_split': range(2, 22, 4)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=1)

In [23]:
tree_tuner_ypred = tree_tuner.predict(train_X)

In [34]:
train_X.shape

(90526, 101)

# RandomForest train/fit/prediction

In [38]:
from sklearn.ensemble import RandomForestClassifier

rf_param_grid = { 
    'n_estimators': [10,50,100],
    'max_features': ['sqrt', 'log2']
}

rf_classifier = RandomForestClassifier(n_jobs=3)
rf_tuner = GridSearchCV(rf_classifier, rf_param_grid, cv=5,
                        scoring='roc_auc', n_jobs=4, verbose=1)

In [39]:
rf_tuner.fit(train_X,train_y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  2.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=3, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'max_features': ['sqrt', 'log2'], 'n_estimators': [10, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=1)

In [41]:
rf_tuner_ypred = rf_tuner.predict(train_X)

# AdaBoost train/fit/prediction

In [45]:
ab_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
             }

tree_ab_classifier = DecisionTreeClassifier()
ab_classifier = AdaBoostClassifier(base_estimator = tree_ab_classifier)
ab_tuner = GridSearchCV(ab_classifier, ab_param_grid, cv=5,
                        scoring='roc_auc', n_jobs=4, verbose=1)


In [47]:
ab_tuner.fit(train_X,train_y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 24.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'base_estimator__splitter': ['best', 'random'], 'base_estimator__criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=1)

In [48]:
ab_tuner_ypred = ab_tuner.predict(train_X)

In [94]:
train_X[:-1].shape

(90525, 101)

# Stacking train/fit/prediction

In [95]:
import pandas as pd
cols_name = ['dTree','rForest','lSvc','aBoost','noshow']
pred_cols = [tree_tuner_ypred,rf_tuner_ypred,svm_tuner_ypred,ab_tuner_ypred,train_y]


meta_pred_df = pd.DataFrame(np.column_stack(pred_cols),columns = cols_name)
meta_pred_df.head()


Unnamed: 0,dTree,rForest,lSvc,aBoost,noshow
0,1,1,-1,1,1
1,-1,-1,-1,-1,-1
2,-1,-1,-1,-1,-1
3,-1,1,-1,1,1
4,-1,-1,-1,-1,-1


# StratifiedShuffleSplit

In [96]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=20000, random_state=1234)

for train_meta_index, test_meta_index in split.split(meta_pred_df, meta_pred_df['noshow']):
    train_meta_set = meta_pred_df.iloc[train_meta_index]
    test_meta_set = meta_pred_df.iloc[test_meta_index]
    

In [98]:
# save train and test sets as csvs
PROCESSED_DATA_DIR = 'processed_data'

train_meta_set.to_csv(PROCESSED_DATA_DIR + '/train_meta_set.csv', index=False)
test_meta_set.to_csv(PROCESSED_DATA_DIR + '/test_meta_set.csv', index=False)

In [99]:
train_meta_set.shape

(70526, 5)

In [100]:
test_meta_set.shape

(20000, 5)

In [None]:
# Logistic Regression

In [147]:
#train_meta_X = train_meta_set[:3]
train_meta_X = train_meta_set[['dTree','rForest','lSvc','aBoost']]
train_meta_y = train_meta_set[['noshow']]
train_meta_y.shape

(70526, 1)

In [171]:
from sklearn import linear_model

lr_class = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
lr_class.fit(train_meta_X,train_meta_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

# Logistic Regression Scores

In [174]:

from sklearn.model_selection import cross_val_score
lr_auc_scores = cross_val_score(lr_class, train_meta_X.as_matrix(), train_meta_y.as_matrix().ravel(), scoring='roc_auc', cv=10)


In [175]:
np.mean(lr_auc_scores)

0.96407406288152941

In [176]:
lr_acc_scores = cross_val_score(lr_class, train_meta_X.as_matrix(), train_meta_y.as_matrix().ravel(), scoring='accuracy', cv=10)

In [177]:
np.mean(lr_acc_scores)

0.98044688418474224

## Final Result

Choose a single model based on all previous project steps. Train this model on the complete training dataset and measure it's performance on the held out test set.

Compare to the 10-fold CV estimate you got previously.

In [178]:
# final result goes here
test_X, test_y = preprocess.load_test_data(config=file_config)

print(test_X.shape)
print(test_y.shape)

(20000, 101)
(20000,)


In [181]:
#Using the results of the AdaBoost  as comparison
from sklearn import metrics

y_final_pred = ab_class.predict(test_X)

#Mean Square error
print(np.sqrt(metrics.mean_squared_error(test_y, y_final_pred)))


0.900777441991
