##### Jupyter Notebook, Step 3 - Testing Model Pipelines
- Considering these results, develop a strategy for building a final predictive model
- recommended approaches:
    - Use feature selection to reduce the dataset to a manageable size then use conventional methods
    - Use dimension reduction to reduce the dataset to a manageable size then use conventional methods
    - Use an iterative model training method to use the entire dataset
- This notebook should be a "playground" where you try various approaches to solving this problem

We have already used Josh's method to remove noise from the dataset as part of EDA, and have subsequently run feature selection to identify the few features in each data set that appear to be the most valuable.  Additionally, we ran PCA to reduce the remaining features to 5 components, which was successful.  Thus, we are following a version of the second recommended approach above: dimension reduction by removing noise, followed by further dimension reduction with PCA (going from 20 -> 5 features).  Conventional methods then follow, including our baseline models (logistic regression, decision tree, k-nearest neighbours, SVC) and tree-based ensembles (AdaBoost, bagging, random forest, extra trees).



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
from tqdm import tqdm
import pickle
from sklearn.pipeline import Pipeline


def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by the earlier notebooks of this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _load_sample(name):
    """Load the pickled sample stored in the file ``name`` and tag it with that name.

    The ``name`` attribute is read later when results are labelled per sample.
    """
    sample = _load_pickle(name)
    sample.name = name
    return sample


# Samples with the noise removed (file names end in "_clean").
UCIsample1_clean = _load_sample("UCIsample1_clean")
UCIsample2_clean = _load_sample("UCIsample2_clean")
UCIsample3_clean = _load_sample("UCIsample3_clean")
DBsample1_clean = _load_sample("DBsample1_clean")
DBsample2_clean = _load_sample("DBsample2_clean")
DBsample3_clean = _load_sample("DBsample3_clean")

# The same samples before noise removal.
UCIsample1 = _load_sample("UCIsample1")
UCIsample2 = _load_sample("UCIsample2")
UCIsample3 = _load_sample("UCIsample3")
DBsample1 = _load_sample("DBsample1")
DBsample2 = _load_sample("DBsample2")
DBsample3 = _load_sample("DBsample3")

# Parallel lists: sample_names[i] labels sample_list[i] (likewise for *_noise).
sample_list = [UCIsample1_clean, UCIsample2_clean, UCIsample3_clean,
               DBsample1_clean, DBsample2_clean, DBsample3_clean]
sample_names = ['UCIsample1_clean', 'UCIsample2_clean', 'UCIsample3_clean',
                'DBsample1_clean', 'DBsample2_clean', 'DBsample3_clean']
sample_list_noise = [UCIsample1, UCIsample2, UCIsample3,
                     DBsample1, DBsample2, DBsample3]
sample_names_noise = ['UCIsample1', 'UCIsample2', 'UCIsample3',
                      'DBsample1', 'DBsample2', 'DBsample3']

# Pickled feature-selection output for each data set.
# presumably the selected feature names per data set -- TODO confirm against the notebook that wrote them
DB_features = _load_pickle('DB_features')
UCI_features = _load_pickle('UCI_features')

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import (precision_score, 
                             accuracy_score, 
                             roc_auc_score, 
                             roc_curve, 
                             precision_recall_curve, 
                             recall_score,
                             make_scorer,
                             auc,
                             classification_report,
                             confusion_matrix
                            )

In [2]:
# Seed shared by every stochastic component defined in this cell, so that a
# Restart & Run All reproduces the same CV splits and fitted models.
SEED = 42


def _scaled_pca_pipeline(step_name, estimator):
    """Return a scale -> 5-component PCA -> ``estimator`` pipeline.

    ``step_name`` names the final step; the matching parameter grid addresses
    its hyper-parameters as ``'<step_name>__<param>'``.
    """
    return Pipeline([
        # with_mean=False: rescale to unit variance without centring.
        ('scaling', StandardScaler(with_mean=False)),
        ('pca', PCA(n_components=5, random_state=SEED)),
        (step_name, estimator)])


def _grid_search(pipeline, param_grid, **kwargs):
    """Wrap ``pipeline`` in a GridSearchCV over ``param_grid``.

    Cross-validation uses 5 stratified shuffle splits, each holding out 20% of
    the rows, with a fixed seed. Extra keyword arguments (e.g. ``n_jobs``) are
    forwarded to GridSearchCV unchanged.
    """
    cv = StratifiedShuffleSplit(n_splits=5, test_size=.2, random_state=SEED)
    return GridSearchCV(pipeline, param_grid, cv=cv, **kwargs)


# ---- Baseline models (final step is named 'classifier') ----

# solver='liblinear' supports both the 'l1' and 'l2' penalties searched below;
# newer scikit-learn defaults to a solver that rejects 'l1'.
logr_scaled = _scaled_pca_pipeline(
    'classifier', LogisticRegression(solver='liblinear', random_state=SEED))
dct_scaled = _scaled_pca_pipeline(
    'classifier', DecisionTreeClassifier(random_state=SEED))
knn_scaled = _scaled_pca_pipeline('classifier', KNeighborsClassifier())
svc_scaled = _scaled_pca_pipeline('classifier', SVC())

logr_params = {
               'classifier__penalty':['l1','l2'],
               'classifier__C':np.logspace(-3,3,7)}
dct_params = {
#               'classifier__max_features':np.arange(0,5),
              'classifier__max_depth':np.arange(1,8)}
knn_params = {
              'classifier__weights':['uniform','distance'],
              'classifier__n_neighbors':np.arange(3,13,2)}
svc_params = {
               'classifier__gamma':np.logspace(-3,3,7),
               'classifier__C':np.logspace(-3,3,7),
               'classifier__kernel':['rbf','sigmoid']}

logr_scaled_gs = _grid_search(logr_scaled, logr_params)
dct_scaled_gs = _grid_search(dct_scaled, dct_params)
knn_scaled_gs = _grid_search(knn_scaled, knn_params)
svc_scaled_gs = _grid_search(svc_scaled, svc_params)

# GradientBoostingClassifier - no

# ---- Ensemble models (final step is named 'clf') ----

ada_scaled = _scaled_pca_pipeline(
    'clf',
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), learning_rate=.5,
                       random_state=SEED))
bag_scaled = _scaled_pca_pipeline(
    'clf',
    BaggingClassifier(DecisionTreeClassifier(random_state=SEED), max_samples=.8,
                      random_state=SEED))
rfc_scaled = _scaled_pca_pipeline('clf', RandomForestClassifier(random_state=SEED))
etc_scaled = _scaled_pca_pipeline('clf', ExtraTreesClassifier(random_state=SEED))

# NOTE(review): the 'base_estimator' parameter name and max_features='auto'
# are spelled as this notebook was written; newer scikit-learn releases
# renamed/removed them -- confirm against the installed version.
ada_params = {
              "clf__base_estimator__criterion" : ["gini", "entropy"],
#               "base_estimator__splitter" :   ["best", "random"],
              'clf__n_estimators': [100]}

bag_params = {
    'clf__base_estimator': [DecisionTreeClassifier(max_depth=md, random_state=SEED) for md in [5,7,10,None]],
    'clf__n_estimators':[10,50,100]}

rfc_params = {
    'clf__n_estimators':[10,50,100,200],
    'clf__max_features':['auto','log2']}

etc_params = {
    'clf__bootstrap':[True, False],
    'clf__n_estimators':[10,50,100,200]}

# The ensemble searches are run on all available cores.
ada_scaled_gs = _grid_search(ada_scaled, ada_params, n_jobs=-1)
bag_gs = _grid_search(bag_scaled, bag_params, n_jobs=-1)
rfc_scaled_gs = _grid_search(rfc_scaled, rfc_params, n_jobs=-1)
etc_scaled_gs = _grid_search(etc_scaled, etc_params, n_jobs=-1)


# Labels are kept exactly as they appear in the recorded results table
# (including the 'svcp_scaled' spelling); pipe_names[i] labels pipe_list[i].
pipe_names = ['logr_scaled', 'dct_scaled', 'knn_scaled', 'svcp_scaled', 'ada_scaled_gs', 'bag_gs', 'rfc_scaled_gs', 'etc_scaled_gs']
pipe_list = [logr_scaled_gs, dct_scaled_gs, knn_scaled_gs, svc_scaled_gs, ada_scaled_gs, bag_gs, rfc_scaled_gs, etc_scaled_gs]
model_zip = list(zip(pipe_names, pipe_list))

In [4]:
def score_pipelines(sample_list, model_zip, test_size=0.5, random_state=42):
    """Fit every model on every sample and tabulate hold-out metrics.

    Parameters
    ----------
    sample_list : iterable of pandas.DataFrame
        Each frame must carry a ``name`` attribute. The last column is used as
        the target and all preceding columns as the features.
    model_zip : iterable of (str, estimator)
        ``(label, model)`` pairs. Each model is fitted in place on every
        sample in turn, so after the call it holds the fit for the last sample.
    test_size : float, default 0.5
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    pandas.DataFrame
        One row per (sample, model) pair with the keys ``sample``, ``name``,
        ``model``, ``best_params``, ``train_accuracy``, ``test_accuracy``,
        ``recall`` and ``precision``. ``model`` is a snapshot of the estimator
        as fitted on that row's sample; ``best_params`` is ``None`` when the
        estimator has no ``best_params_`` attribute.
    """
    from copy import deepcopy

    # Materialise the pairs: a lazy iterator (e.g. a bare zip) would otherwise
    # be exhausted after the first sample.
    model_pairs = list(model_zip)

    results = []
    for sample in tqdm(sample_list):
        y = sample.iloc[:, -1]
        X = sample.iloc[:, 0:-1]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state, test_size=test_size)
        for model_name, model in model_pairs:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results.append({
                'sample': sample.name,
                'name': '{}'.format(model_name),
                # Snapshot the fitted estimator: the same object is re-fitted
                # for the next sample, so storing a bare reference would make
                # every row point at the model trained on the last sample.
                'model': deepcopy(model),
                'best_params': getattr(model, 'best_params_', None),
                'train_accuracy': model.score(X_train, y_train),
                'test_accuracy': model.score(X_test, y_test),
                'recall': recall_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred),
            })
    return pd.DataFrame(results)

# Earlier per-pipeline approach, left commented out; `fit_sample_model` is not
# defined in the visible part of this notebook.
# for sample in sample_list:
#     for pipe in tqdm(pipe_list):
#         display(fit_sample_model(sample, pipe))

# Fit and score every (label, grid search) pair in `model_zip` on each of the
# `*_clean` samples in `sample_list`.
results = score_pipelines(sample_list, model_zip)

100%|██████████| 6/6 [02:42<00:00, 27.13s/it]


In [5]:
# Show the score table ordered from highest to lowest hold-out accuracy.
display(
    results.sort_values(by='test_accuracy', ascending=False)
)

Unnamed: 0,best_params,model,name,precision,recall,sample,test_accuracy,train_accuracy
15,"{'clf__bootstrap': False, 'clf__n_estimators':...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,etc_scaled_gs,0.690909,0.844444,UCIsample2_clean,0.76,1.0
10,"{'classifier__n_neighbors': 3, 'classifier__we...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,knn_scaled,0.678571,0.844444,UCIsample2_clean,0.75,1.0
47,"{'clf__bootstrap': True, 'clf__n_estimators': ...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,etc_scaled_gs,0.744,0.746988,DBsample3_clean,0.746,1.0
31,"{'clf__bootstrap': False, 'clf__n_estimators':...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,etc_scaled_gs,0.715827,0.799197,DBsample1_clean,0.742,1.0
5,{'clf__base_estimator': DecisionTreeClassifier...,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,bag_gs,0.826087,0.678571,UCIsample1_clean,0.74,1.0
2,"{'classifier__n_neighbors': 7, 'classifier__we...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,knn_scaled,0.773585,0.732143,UCIsample1_clean,0.73,1.0
30,"{'clf__max_features': 'auto', 'clf__n_estimato...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,rfc_scaled_gs,0.708029,0.779116,DBsample1_clean,0.73,1.0
29,{'clf__base_estimator': DecisionTreeClassifier...,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,bag_gs,0.705036,0.787149,DBsample1_clean,0.73,0.986
26,"{'classifier__n_neighbors': 3, 'classifier__we...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,knn_scaled,0.702509,0.787149,DBsample1_clean,0.728,1.0
3,"{'classifier__C': 100.0, 'classifier__gamma': ...",GridSearchCV(cv=StratifiedShuffleSplit(n_split...,svcp_scaled,0.741379,0.767857,UCIsample1_clean,0.72,0.92
