##### Jupyter Notebook, Step 1 - Benchmarking
- build pipeline to perform a naive fit for each of the base model classes:
	- logistic regression
	- decision tree
	- k nearest neighbors
	- support vector classifier
- to do this for logistic regression and the support vector classifier, set a high `C` value (for example `1E10`) so that only minimal regularization is applied


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
from tqdm import tqdm
import pickle
from sklearn.pipeline import Pipeline

def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Noise-reduced ("clean") samples.
UCIsample1_clean = _load_pickle("UCIsample1_clean")
UCIsample2_clean = _load_pickle("UCIsample2_clean")
UCIsample3_clean = _load_pickle("UCIsample3_clean")
DBsample1_clean = _load_pickle("DBsample1_clean")
DBsample2_clean = _load_pickle("DBsample2_clean")
DBsample3_clean = _load_pickle("DBsample3_clean")

# The same samples with all (noisy) features still present.
UCIsample1 = _load_pickle("UCIsample1")
UCIsample2 = _load_pickle("UCIsample2")
UCIsample3 = _load_pickle("UCIsample3")
DBsample1 = _load_pickle("DBsample1")
DBsample2 = _load_pickle("DBsample2")
DBsample3 = _load_pickle("DBsample3")

# Parallel lists: sample_names[i] labels sample_list[i] (likewise for *_noise).
sample_list = [
    UCIsample1_clean, UCIsample2_clean, UCIsample3_clean,
    DBsample1_clean, DBsample2_clean, DBsample3_clean,
]
sample_names = [
    'UCIsample1_clean', 'UCIsample2_clean', 'UCIsample3_clean',
    'DBsample1_clean', 'DBsample2_clean', 'DBsample3_clean',
]
sample_list_noise = [
    UCIsample1, UCIsample2, UCIsample3,
    DBsample1, DBsample2, DBsample3,
]
sample_names_noise = [
    'UCIsample1', 'UCIsample2', 'UCIsample3',
    'DBsample1', 'DBsample2', 'DBsample3',
]

# Tag every sample with its own label; score_pipelines reads `sample.name`
# to fill the 'sample' column of its results.
for _frame, _label in zip(sample_list + sample_list_noise,
                          sample_names + sample_names_noise):
    _frame.name = _label

In [2]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

# Bare estimators. C=1E10 makes the regularization penalty negligible for the
# two models that take a C parameter.
logr, dct, knn, svcp = (
    LogisticRegression(C=1E10),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    SVC(C=1E10),
)

# The same four model classes, each preceded by a StandardScaler step.
# Every pipeline gets its own fresh estimator instance.
logr_scaled, dct_scaled, knn_scaled, svcp_scaled = (
    make_pipeline(StandardScaler(), LogisticRegression(C=1E10)),
    make_pipeline(StandardScaler(), DecisionTreeClassifier()),
    make_pipeline(StandardScaler(), KNeighborsClassifier()),
    make_pipeline(StandardScaler(), SVC(C=1E10)),
)

# (label, estimator) pairs consumed by the benchmarking loop; the two flat
# lists are derived from the pairs so that they cannot drift apart.
model_zip = [
    ('logr', logr),
    ('dct', dct),
    ('knn', knn),
    ('svcp', svcp),
    ('logr_scaled', logr_scaled),
    ('dct_scaled', dct_scaled),
    ('knn_scaled', knn_scaled),
    ('svcp_scaled', svcp_scaled),
]
pipe_names = [label for label, _ in model_zip]
pipe_list = [estimator for _, estimator in model_zip]

In [3]:
from sklearn.metrics import (precision_score, 
                             accuracy_score, 
                             roc_auc_score, 
                             roc_curve, 
                             precision_recall_curve, 
                             recall_score,
                             make_scorer,
                             auc,
                             classification_report,
                             confusion_matrix
                            )

In [4]:
# Expected label ratios, derived from the targets.

# Average the per-sample label means for each data source.
DB_mean = sum(
    frame['target'].mean()
    for frame in (DBsample1_clean, DBsample2_clean, DBsample3_clean)
) / 3
UCI_mean = sum(
    frame[500].mean()
    for frame in (UCIsample1_clean, UCIsample2_clean, UCIsample3_clean)
) / 3

# UCI labels range from -1 to 1, so shift by 1 and halve to turn the mean
# into a ratio comparable with the DB one.
display("Label ratio in DB is", DB_mean)
display("Label ratio in UCI is", (UCI_mean + 1) / 2)

# Cross-check: an "always predict 1" baseline should reach the same accuracy,
# averaged across the three samples of each source.
benchmark_all_1_1000 = np.ones(1000, dtype=int)
benchmark_all_1_200 = np.ones(200, dtype=int)

display(
    "accuracy in DB is",
    sum(
        accuracy_score(frame['target'], benchmark_all_1_1000)
        for frame in (DBsample1_clean, DBsample2_clean, DBsample3_clean)
    ) / 3,
)

display(
    "accuracy in UCI is",
    sum(
        accuracy_score(frame[500], benchmark_all_1_200)
        for frame in (UCIsample1_clean, UCIsample2_clean, UCIsample3_clean)
    ) / 3,
)

'Label ratio in DB is'

0.49033333333333334

'Label ratio in UCI is'

0.52333333333333332

'accuracy in DB is'

0.49033333333333334

'accuracy in UCI is'

0.52333333333333343

In [5]:
# define a function to loop through each sample and each pipeline and output the results to a dataframe.  first, 
# doing this with the noisy features

def score_pipelines(sample_list, model_zip):
    """Fit and score every model on every sample; return one row per pair.

    For each sample the last column is taken as the label and all preceding
    columns as the features. The data is split with ``train_test_split``
    (``random_state=42``), each model is fitted on the training part and
    scored on both parts.

    Parameters
    ----------
    sample_list : iterable of pandas.DataFrame
        Samples to benchmark. Each one must carry a ``name`` attribute, which
        is copied into the ``sample`` column of the result.
    model_zip : iterable of (str, estimator)
        ``(label, estimator)`` pairs. Each estimator is cloned before it is
        fitted, so the objects passed in are left unfitted and every row of
        the result holds its own fitted copy.

    Returns
    -------
    pandas.DataFrame
        One row per (sample, model) with the keys ``sample``, ``name``,
        ``model``, ``train_accuracy``, ``test_accuracy``, ``recall`` and
        ``precision``.
    """
    # Local import so this function does not depend on an edit to the
    # notebook's import cell.
    from sklearn.base import clone

    # Materialise the pairs: a one-shot iterator (e.g. a bare ``zip``) would
    # otherwise be exhausted after the first sample.
    model_zip = list(model_zip)

    results = []
    for sample in tqdm(sample_list):
        y = sample.iloc[:, -1]
        X = sample.iloc[:, :-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
        for model_name, model in model_zip:
            # Fit a fresh copy. Refitting the shared instance would make every
            # row's 'model' entry point at the same object, which ends up
            # fitted on the last sample only.
            fitted = clone(model)
            fitted.fit(X_train, y_train)
            y_pred = fitted.predict(X_test)
            results.append({
                'sample': sample.name,
                'name': str(model_name),
                'model': fitted,
                'train_accuracy': fitted.score(X_train, y_train),
                'test_accuracy': fitted.score(X_test, y_test),
                'recall': recall_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred),
            })
    return pd.DataFrame(results)

# Benchmark every model on the noisy samples, best test accuracy first.
results = score_pipelines(sample_list_noise, model_zip)
display(results.sort_values(by='test_accuracy', ascending=False))

  'precision', 'predicted', average, warn_for)
100%|██████████| 6/6 [00:26<00:00,  4.40s/it]


Unnamed: 0,model,name,precision,recall,sample,test_accuracy,train_accuracy
10,"KNeighborsClassifier(algorithm='auto', leaf_si...",knn,0.642857,0.75,UCIsample2,0.68,0.813333
35,"SVC(C=10000000000.0, cache_size=200, class_wei...",svcp,0.62963,0.557377,DBsample2,0.624,1.0
42,"KNeighborsClassifier(algorithm='auto', leaf_si...",knn,0.635593,0.595238,DBsample3,0.624,0.758667
2,"KNeighborsClassifier(algorithm='auto', leaf_si...",knn,0.636364,0.75,UCIsample1,0.62,0.753333
27,"SVC(C=10000000000.0, cache_size=200, class_wei...",svcp,0.580882,0.647541,DBsample1,0.6,1.0
29,"Pipeline(steps=[('standardscaler', StandardSca...",dct_scaled,0.574627,0.631148,DBsample1,0.592,1.0
43,"SVC(C=10000000000.0, cache_size=200, class_wei...",svcp,0.6,0.547619,DBsample3,0.588,1.0
33,"DecisionTreeClassifier(class_weight=None, crit...",dct,0.57377,0.57377,DBsample2,0.584,1.0
37,"Pipeline(steps=[('standardscaler', StandardSca...",dct_scaled,0.576271,0.557377,DBsample2,0.584,1.0
25,"DecisionTreeClassifier(class_weight=None, crit...",dct,0.573913,0.540984,DBsample1,0.58,1.0


In [6]:
# Second pass: the same benchmark on the noiseless features (our core 20).
# Test accuracy is better overall, but the models still overfit the train set
# and do not do fantastically well.

results = score_pipelines(sample_list, model_zip)
display(results.sort_values(by='test_accuracy', ascending=False))

  'precision', 'predicted', average, warn_for)
100%|██████████| 6/6 [00:11<00:00,  1.94s/it]


Unnamed: 0,model,name,precision,recall,sample,test_accuracy,train_accuracy
7,"Pipeline(steps=[('standardscaler', StandardSca...",svcp_scaled,0.875,0.75,UCIsample1_clean,0.8,1.0
14,"Pipeline(steps=[('standardscaler', StandardSca...",knn_scaled,0.740741,0.833333,UCIsample2_clean,0.78,0.86
34,"KNeighborsClassifier(algorithm='auto', leaf_si...",knn,0.773913,0.729508,DBsample2_clean,0.764,0.821333
15,"Pipeline(steps=[('standardscaler', StandardSca...",svcp_scaled,0.75,0.75,UCIsample2_clean,0.76,1.0
10,"KNeighborsClassifier(algorithm='auto', leaf_si...",knn,0.714286,0.833333,UCIsample2_clean,0.76,0.806667
38,"Pipeline(steps=[('standardscaler', StandardSca...",knn_scaled,0.765217,0.721311,DBsample2_clean,0.756,0.813333
30,"Pipeline(steps=[('standardscaler', StandardSca...",knn_scaled,0.721805,0.786885,DBsample1_clean,0.748,0.833333
26,"KNeighborsClassifier(algorithm='auto', leaf_si...",knn,0.718519,0.795082,DBsample1_clean,0.748,0.818667
42,"KNeighborsClassifier(algorithm='auto', leaf_si...",knn,0.762712,0.714286,DBsample3_clean,0.744,0.832
46,"Pipeline(steps=[('standardscaler', StandardSca...",knn_scaled,0.746032,0.746032,DBsample3_clean,0.744,0.836


There is serious overfitting in the benchmark models, especially on the samples that still contain the noise (all 500 or 1000 features) compared to the ones that have had Josh's first-pass noise reduction run on them.  As a result, most of the models are only a little better than a coin flip at predicting an outcome on the test split.

Of these, KNN and SVC seem to perform the best, reaching roughly 70 percent accuracy on many of the test splits.  On some samples, however, SVC does worse than a coin flip, so its performance has high variance.

In [11]:
# NOTE(review): scratch arithmetic, apparently unrelated to the benchmarking
# above (looks like 7% of 112000 taken over 40-10 periods) — confirm intent or remove.
112000*0.07*(40-10)

235200.00000000003

In [12]:
# NOTE(review): scratch arithmetic, apparently unrelated to the benchmarking
# above (looks like 112000 increased by 7%) — confirm intent or remove.
112000*1.07

119840.0