In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# General idea

The idea of this notebook is to propose a methodology for hyperparameter tuning than we can use for all the models, here I apply it to a neural net, but using it for other models should change at most a couple of lines.

I noticed Maciej was using grid-search with 4-fold cross-validation. The issues I see are:

1. Using exhaustive search will be way to slow. For instance, for this neural net with the considered hyperparams to tune and its possible values we could have (as per my unreliable calculations) over 1000 configurations. Every configuration takes about 3 min to fit in Colab (with a subset of the dataset as I'll explain later), and that by 4 of the 4-fold would result in over a week of compute time.

2. K-fold cross validation is meant for smaller datasets than ours, where squishing every bit of data is important. Might be rushed to say this since we don't know the real complexity of the problem, but I'd dare to say that we do have data spare, so, K-fold might not be the best choice, at least for hyperparameter tuning. 

So, here I propose that we apply the alternative described here: https://stats.stackexchange.com/questions/34939/k-fold-cross-validation-strategy-for-large-data-set-in-statistical-learning#:~:text=k%2Dfold%20cross%2Dvalidation%20is,sample%20data%20is%20sufficiently%20limited.&text=If%20that%20is%20the%20case,on%20B%2C%20then%20vice%20versa, specifically where it says: _"Another option would be to randomly draw samples of ~ 10% of the data for training, then another 10% for testing and repeat that process multiple times and assess variability of your results."_

What I'm doing is choosing 10 random train/test splits from the whole dataset with sizes of 1%/0.1% of the whole data set, so in total we are evaluating on about 10% of the data (almost 900k) with low probability of overlap. On this 10 train/test subsets I apply a randomized search using https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html, where 50 configurations are sampled. It shouldn't take much more than 24 hours to compute.

My proposal would be using this to define one model that is worth to fine-tune by hand using the whole dataset (checking bias/variance tradeoffs and etc.), and finally use that model to derive results (use the real test set to obtain numerical results, and assess feature importance by permutation, and whatever we can think of in the little more than a week we have left).

## Setting up the data:

In [4]:
with open('/content/gdrive/My Drive/Notebooks Colab/cleanDF.pickle', 'rb') as handle:
    df = pickle.load(handle)

X = df.copy().drop('HasDetections', 1)
y = df.copy()['HasDetections']
del df

categoricalColumns = list(X.select_dtypes(include='category').columns)
numericalColumns = list(X.select_dtypes(exclude='category').columns)

classes = y.unique()
uniques = {}
for column in categoricalColumns:
    uniques[column] = X[column].unique()
uniquesDF = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in uniques.items() ]))
categoricalEncoder = OneHotEncoder(handle_unknown='ignore', dtype='uint8', sparse=True)
categoricalEncoder.fit(uniquesDF.astype(str))
del uniquesDF
# Numerical columns with mean 0.0 and variance of 1.0
for column in numericalColumns:
    X[column] = ((X[column] - np.mean(X[column])) / np.std(X[column])).astype('float64')

## Specific to neural nets (a lot of ugly hacking to manage using mini-batches with the RandomizedSearchCV):

In [None]:
def iterateMiniBatches(inputs, targets, batchSize, shuffle=False):
    assert inputs.shape[0] == targets.shape[0]
    if shuffle:
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
    for start_idx in range(0, inputs.shape[0] - batchSize + 1, batchSize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchSize]
        else:
            excerpt = slice(start_idx, start_idx + batchSize)
        yield inputs.iloc[excerpt], targets.iloc[excerpt]

In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin

class CustomMLPClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, batchSize=128, tolerance=1e-3, maxEpochs=10, hiddenLayerSizes=1024, learningRate=1e-4, l2RegCoefficient=1e-4):
        self.batchSize = batchSize
        self.maxEpochs = maxEpochs
        self.hiddenLayerSizes = hiddenLayerSizes
        self.learningRate = learningRate
        self.l2RegCoefficient = l2RegCoefficient
        self.tolerance = tolerance
        self.model = MLPClassifier(hidden_layer_sizes=self.hiddenLayerSizes, activation='relu', solver='adam', learning_rate_init=self.learningRate,
                                   alpha=self.l2RegCoefficient, batch_size=self.batchSize, max_iter=1)

    def fit(self, X, y=None):
        xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.1, stratify=y, random_state=11)
        xTestNumerical = xTest[numericalColumns].values
        xTestCategorical = xTest[categoricalColumns]
        xTestCategorical = categoricalEncoder.transform(xTestCategorical)
        xTest = np.concatenate([xTestNumerical, scipy.sparse.csr_matrix.toarray(xTestCategorical)], axis=1)
        del xTestNumerical
        del xTestCategorical
        numBatches = xTrain.shape[0] // self.batchSize
        
        classes = yTrain.unique()
        prevTestScore = 0
        for n in range(self.maxEpochs):
            batchCounter = 0
            print(f"---------------------- EPOCH {n + 1} ----------------------")
            for batch in tqdm(iterateMiniBatches(xTrain, yTrain, self.batchSize, shuffle=True)):
                batchCounter += 1
                xBatch, yBatch = batch
                xBatchNumerical = xBatch[numericalColumns].values
                xBatchCategorical = xBatch[categoricalColumns]
                xBatchCategorical = categoricalEncoder.transform(xBatchCategorical)
                xBatch = np.concatenate([xBatchNumerical, scipy.sparse.csr_matrix.toarray(xBatchCategorical)], axis=1)
                self.model.partial_fit(xBatch, yBatch, classes=classes)
            testScore = self.model.score(xTest, yTest)
            print(f"Test score: {testScore}")
            if (testScore - prevTestScore) < self.tolerance:
                print('Applied early stopping')
                break
            else:
                prevTestScore = testScore
        return self

    def predict(self, X, y=None):
        xNumerical = X[numericalColumns].values
        xCategorical = X[categoricalColumns]
        xCategorical = categoricalEncoder.transform(xCategorical)
        return self.model.predict(np.concatenate([xNumerical, scipy.sparse.csr_matrix.toarray(xCategorical)], axis=1), y)

    def score(self, X, y=None):
        xNumerical = X[numericalColumns].values
        xCategorical = X[categoricalColumns]
        xCategorical = categoricalEncoder.transform(xCategorical)
        return self.model.score(np.concatenate([xNumerical, scipy.sparse.csr_matrix.toarray(xCategorical)], axis=1), y)

## Here we would define the dictionaries for the other models:

In [6]:
# Neural net size layers
maxNumLayers = 4
neuralNetArch = [(layerSize,) * numLayers for numLayers in range(1, maxNumLayers + 1) for layerSize in [64, 128, 256, 512, 1024]]
# Learning rate
learningRates = np.logspace(start=-5, stop=-1, num=10, base=10)
# L2 Regularization coefficient
alphas = [0.0, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0]

neuralNetGrid = {
    'hiddenLayerSizes': neuralNetArch,
    'learningRate': learningRates,
    'l2RegCoefficient': alphas
}

## And here is the randomized search per se:

In [None]:
# Create the model to be tuned
neuralNetBase = CustomMLPClassifier()
# Create the random search Neural Net
neuralNetRandom = RandomizedSearchCV(estimator = neuralNetBase, param_distributions = neuralNetGrid, 
                               n_iter = 50, cv = ShuffleSplit(n_splits=10, test_size=0.001, train_size=0.01), verbose = 2, random_state = 11, 
                               n_jobs = 1)
# Fit the random search model
neuralNetRandom.fit(X, y)
# View the best parameters from the random search
neuralNetRandom.best_params_

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  5.98it/s]

---------------------- EPOCH 1 ----------------------


627it [01:07,  9.24it/s]
1it [00:00,  9.79it/s]

Test score: 0.6160053799596503
---------------------- EPOCH 2 ----------------------


627it [01:05,  9.50it/s]
1it [00:00,  9.88it/s]

Test score: 0.6221699170589554
---------------------- EPOCH 3 ----------------------


627it [01:06,  9.50it/s]
1it [00:00,  8.89it/s]

Test score: 0.6244115669132482
---------------------- EPOCH 4 ----------------------


627it [01:05,  9.64it/s]


Test score: 0.6217215870880969
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 4.6min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.6min remaining:    0.0s
1it [00:00,  8.73it/s]

---------------------- EPOCH 1 ----------------------


627it [01:08,  9.13it/s]
1it [00:00,  9.99it/s]

Test score: 0.6237390719569603
---------------------- EPOCH 2 ----------------------


627it [01:04,  9.74it/s]


Test score: 0.6217215870880969
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 2.3min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  8.27it/s]

---------------------- EPOCH 1 ----------------------


627it [01:05,  9.60it/s]
1it [00:00,  9.93it/s]

Test score: 0.610849585294777
---------------------- EPOCH 2 ----------------------


627it [01:04,  9.70it/s]
1it [00:00,  9.92it/s]

Test score: 0.612979152656355
---------------------- EPOCH 3 ----------------------


627it [01:04,  9.77it/s]
1it [00:00,  9.97it/s]

Test score: 0.623851154449675
---------------------- EPOCH 4 ----------------------


627it [01:04,  9.69it/s]


Test score: 0.6176866173503699
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 4.4min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  8.74it/s]

---------------------- EPOCH 1 ----------------------


627it [01:05,  9.56it/s]
0it [00:00, ?it/s]

Test score: 0.6192557722483748
---------------------- EPOCH 2 ----------------------


627it [01:04,  9.75it/s]
2it [00:00, 10.23it/s]

Test score: 0.6237390719569603
---------------------- EPOCH 3 ----------------------


627it [01:04,  9.70it/s]


Test score: 0.6228424120152433
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 3.3min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  8.61it/s]

---------------------- EPOCH 1 ----------------------


627it [01:04,  9.78it/s]
1it [00:00,  9.73it/s]

Test score: 0.6228424120152433
---------------------- EPOCH 2 ----------------------


627it [01:06,  9.46it/s]


Test score: 0.6216095045953822
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 2.2min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  8.75it/s]

---------------------- EPOCH 1 ----------------------


627it [01:07,  9.35it/s]
0it [00:00, ?it/s]

Test score: 0.6132033176417844
---------------------- EPOCH 2 ----------------------


627it [01:05,  9.52it/s]
1it [00:00,  8.46it/s]

Test score: 0.6144362250616454
---------------------- EPOCH 3 ----------------------


627it [01:06,  9.47it/s]
1it [00:00,  9.67it/s]

Test score: 0.6208249271463797
---------------------- EPOCH 4 ----------------------


627it [01:07,  9.34it/s]


Test score: 0.6212732571172382
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 4.5min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  8.75it/s]

---------------------- EPOCH 1 ----------------------


627it [01:07,  9.33it/s]
1it [00:00,  9.56it/s]

Test score: 0.6158932974669357
---------------------- EPOCH 2 ----------------------


627it [01:05,  9.56it/s]


Test score: 0.6165657924232235
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 2.3min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  8.32it/s]

---------------------- EPOCH 1 ----------------------


627it [01:03,  9.87it/s]
1it [00:00,  9.46it/s]

Test score: 0.6155570499887918
---------------------- EPOCH 2 ----------------------


627it [01:04,  9.75it/s]


Test score: 0.6130912351490697
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 2.2min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  9.09it/s]

---------------------- EPOCH 1 ----------------------


627it [01:05,  9.55it/s]
1it [00:00,  9.15it/s]

Test score: 0.6136516476126429
---------------------- EPOCH 2 ----------------------


627it [01:06,  9.48it/s]
1it [00:00,  9.23it/s]

Test score: 0.6188074422775163
---------------------- EPOCH 3 ----------------------


627it [01:05,  9.60it/s]


Test score: 0.615781214974221
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 3.3min
[CV] learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024) 


1it [00:00,  9.08it/s]

---------------------- EPOCH 1 ----------------------


627it [01:05,  9.61it/s]
1it [00:00,  9.91it/s]

Test score: 0.6165657924232235
---------------------- EPOCH 2 ----------------------


627it [01:05,  9.57it/s]
1it [00:00,  8.86it/s]

Test score: 0.6306881865052679
---------------------- EPOCH 3 ----------------------


627it [01:07,  9.35it/s]


Test score: 0.626316969289397
Applied early stopping
[CV]  learningRate=0.03593813663804626, l2RegCoefficient=1e-06, hiddenLayerSizes=(1024, 1024, 1024), total= 3.4min
[CV] learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512) 


1it [00:00,  8.51it/s]

---------------------- EPOCH 1 ----------------------


627it [01:06,  9.37it/s]
1it [00:00,  9.78it/s]

Test score: 0.6228424120152433
---------------------- EPOCH 2 ----------------------


627it [01:05,  9.51it/s]


Test score: 0.6152208025106478
Applied early stopping
[CV]  learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512), total= 2.3min
[CV] learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512) 


1it [00:00,  9.29it/s]

---------------------- EPOCH 1 ----------------------


627it [01:06,  9.38it/s]
1it [00:00,  9.16it/s]

Test score: 0.6176866173503699
---------------------- EPOCH 2 ----------------------


627it [01:06,  9.42it/s]
1it [00:00,  9.86it/s]

Test score: 0.6240753194351042
---------------------- EPOCH 3 ----------------------


627it [01:06,  9.38it/s]


Test score: 0.6228424120152433
Applied early stopping
[CV]  learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512), total= 3.4min
[CV] learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512) 


1it [00:00,  8.69it/s]

---------------------- EPOCH 1 ----------------------


627it [01:06,  9.43it/s]
0it [00:00, ?it/s]

Test score: 0.6151087200179332
---------------------- EPOCH 2 ----------------------


627it [01:05,  9.60it/s]
1it [00:00,  9.16it/s]

Test score: 0.6177986998430846
---------------------- EPOCH 3 ----------------------


627it [01:06,  9.44it/s]


Test score: 0.6160053799596503
Applied early stopping
[CV]  learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512), total= 3.4min
[CV] learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512) 


1it [00:00,  8.86it/s]

---------------------- EPOCH 1 ----------------------


627it [01:06,  9.48it/s]
1it [00:00,  9.99it/s]

Test score: 0.6171262048867967
---------------------- EPOCH 2 ----------------------


627it [01:04,  9.67it/s]
1it [00:00,  8.64it/s]

Test score: 0.6200403496973773
---------------------- EPOCH 3 ----------------------


627it [01:06,  9.48it/s]
1it [00:00,  9.27it/s]

Test score: 0.6223940820443846
---------------------- EPOCH 4 ----------------------


627it [01:05,  9.62it/s]


Test score: 0.6195920197265187
Applied early stopping
[CV]  learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512), total= 4.4min
[CV] learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512) 


1it [00:00,  8.69it/s]

---------------------- EPOCH 1 ----------------------


627it [01:05,  9.55it/s]
1it [00:00,  9.75it/s]

Test score: 0.619816184711948
---------------------- EPOCH 2 ----------------------


627it [01:05,  9.53it/s]
1it [00:00,  9.69it/s]

Test score: 0.623514906971531
---------------------- EPOCH 3 ----------------------


627it [01:05,  9.61it/s]


Test score: 0.6227303295225286
Applied early stopping
[CV]  learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512), total= 3.3min
[CV] learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512) 


1it [00:00,  8.53it/s]

---------------------- EPOCH 1 ----------------------


627it [01:06,  9.44it/s]
1it [00:00,  9.35it/s]

Test score: 0.6197041022192333
---------------------- EPOCH 2 ----------------------


627it [01:08,  9.11it/s]


Test score: 0.6155570499887918
Applied early stopping
[CV]  learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512), total= 2.3min
[CV] learningRate=0.012915496650148827, l2RegCoefficient=0.0001, hiddenLayerSizes=(512, 512) 


1it [00:00,  8.52it/s]

---------------------- EPOCH 1 ----------------------


216it [00:22,  9.41it/s]