In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/gdrive; later cells read and write pickles under
# '/content/gdrive/My Drive/Notebooks Colab/'.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime,
# so this cell is expected to fail elsewhere — confirm before running locally.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Setting up the data:

In [None]:
# Load the cleaned dataset.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that you produced yourself or otherwise fully trust.
with open('/content/gdrive/My Drive/Notebooks Colab/cleanDF.pickle', 'rb') as handle:
    df = pickle.load(handle)

# Split features / target. `drop` already returns a new frame, so no full-frame copy is
# needed; the target is copied on its own so it does not keep `df`'s data alive.
# (The axis is given by keyword: the positional form `drop(label, 1)` is rejected by pandas >= 2.0.)
X = df.drop(columns='HasDetections')
y = df['HasDetections'].copy()
del df

categoricalColumns = list(X.select_dtypes(include='category').columns)
numericalColumns = list(X.select_dtypes(exclude='category').columns)

classes = y.unique()
uniques = {}
for column in categoricalColumns:
    uniques[column] = X[column].unique()

# Fit the one-hot encoder on a small frame that holds every observed level of every
# categorical column (as strings) instead of on the full dataset.
# Columns have different numbers of levels, so shorter ones are padded by cyclically
# repeating their OWN levels. Padding with NaN (as a ragged DataFrame would) makes
# `.astype(str)` produce a phantom 'nan' category in every padded column.
levelStrings = {column: pd.Series(levels).astype(str).to_numpy() for column, levels in uniques.items()}
longestColumn = max((len(levels) for levels in levelStrings.values()), default=0)
uniquesDF = pd.DataFrame({column: np.resize(levels, longestColumn) for column, levels in levelStrings.items()})
# Sparse output is the encoder's default; the explicit `sparse=True` keyword is omitted
# because it was renamed to `sparse_output` in newer scikit-learn releases.
categoricalEncoder = OneHotEncoder(handle_unknown='ignore', dtype='uint8')
categoricalEncoder.fit(uniquesDF)
del uniquesDF
del levelStrings

# Numerical columns with mean 0.0 and variance of 1.0
for column in numericalColumns:
    columnMean = np.mean(X[column])
    columnStd = np.std(X[column])
    if not columnStd > 0:
        # Constant (or all-NaN) column: dividing by 0 would fill it with NaN/inf,
        # so only centre it.
        columnStd = 1.0
    X[column] = ((X[column] - columnMean) / columnStd).astype('float64')

## Specific to neural nets (workarounds needed to train on mini-batches inside `RandomizedSearchCV`):

In [None]:
def iterateMiniBatches(inputs, targets, batchSize, shuffle=False):
    """Yield successive ``(inputs, targets)`` mini-batches of exactly ``batchSize`` rows.

    Both arguments are sliced positionally with ``.iloc``. Only full batches are
    produced: trailing rows that do not fill a whole batch are skipped, and nothing is
    yielded when there are fewer than ``batchSize`` rows.

    Parameters
    ----------
    inputs, targets : objects exposing ``.shape`` and ``.iloc``
        Must have the same number of rows (checked with an ``assert`` when iteration starts).
    batchSize : int
        Number of rows per batch.
    shuffle : bool, default False
        If True, rows are visited in a random order drawn from NumPy's global RNG;
        otherwise batches are consecutive slices.
    """
    rowCount = inputs.shape[0]
    assert rowCount == targets.shape[0]

    order = None
    if shuffle:
        order = np.arange(rowCount)
        np.random.shuffle(order)

    for begin in range(0, rowCount - batchSize + 1, batchSize):
        end = begin + batchSize
        rows = slice(begin, end) if order is None else order[begin:end]
        yield inputs.iloc[rows], targets.iloc[rows]

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

class CustomMLPClassifier(BaseEstimator, ClassifierMixin):
    """Mini-batch ``MLPClassifier`` wrapper that can be tuned with ``RandomizedSearchCV``.

    Raw feature frames are encoded one mini-batch at a time (numerical columns as-is,
    categorical columns one-hot encoded and densified), so the dense design matrix
    for the whole training set is never materialised.

    Relies on three notebook-level globals defined in the data set-up cell:
    ``numericalColumns``, ``categoricalColumns`` and the fitted ``categoricalEncoder``.

    Parameters
    ----------
    batchSize : int
        Rows per mini-batch passed to ``partial_fit``.
    tolerance : float
        Training stops when the held-out accuracy improves by less than this
        amount from one epoch to the next.
    maxEpochs : int
        Upper bound on the number of passes over the training data.
    hiddenLayerSizes : int or tuple of int
        Forwarded to ``MLPClassifier(hidden_layer_sizes=...)``.
    learningRate : float
        Forwarded to ``MLPClassifier(learning_rate_init=...)``.
    l2RegCoefficient : float
        Forwarded to ``MLPClassifier(alpha=...)``.
    """

    def __init__(self, batchSize=128, tolerance=1e-3, maxEpochs=10, hiddenLayerSizes=1024, learningRate=1e-4, l2RegCoefficient=1e-4):
        self.batchSize = batchSize
        self.maxEpochs = maxEpochs
        self.hiddenLayerSizes = hiddenLayerSizes
        self.learningRate = learningRate
        self.l2RegCoefficient = l2RegCoefficient
        self.tolerance = tolerance
        # Kept so that `.model` exists before fit(); fit() rebuilds it from the
        # current hyperparameters.
        self.model = self._buildModel()

    def _buildModel(self):
        """Create a fresh, unfitted MLPClassifier from the current hyperparameters."""
        return MLPClassifier(hidden_layer_sizes=self.hiddenLayerSizes, activation='relu', solver='adam',
                             learning_rate_init=self.learningRate, alpha=self.l2RegCoefficient,
                             batch_size=self.batchSize, max_iter=1)

    def _encode(self, X):
        """Turn a raw feature frame into the dense array fed to the network."""
        numerical = X[numericalColumns].values
        # The encoder was fitted on string-typed levels, so cast the same way here.
        categorical = categoricalEncoder.transform(X[categoricalColumns].astype(str))
        return np.concatenate([numerical, categorical.toarray()], axis=1)

    def fit(self, X, y=None):
        """Train with mini-batches, stopping early on a stratified 10% hold-out split.

        Returns ``self``.
        """
        # Rebuild the network here rather than only in __init__: hyperparameters
        # assigned through set_params() (which is how RandomizedSearchCV applies each
        # candidate) would otherwise never reach the underlying MLPClassifier.
        self.model = self._buildModel()

        xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.1, stratify=y, random_state=11)
        # The hold-out part is encoded once up front; training rows are encoded per batch.
        xTest = self._encode(xTest)

        classes = yTrain.unique()
        prevTestScore = 0
        for _ in range(self.maxEpochs):
            for xBatch, yBatch in iterateMiniBatches(xTrain, yTrain, self.batchSize, shuffle=True):
                self.model.partial_fit(self._encode(xBatch), yBatch, classes=classes)
            testScore = self.model.score(xTest, yTest)
            if (testScore - prevTestScore) < self.tolerance:
                # Early stopping: hold-out accuracy did not improve enough.
                break
            prevTestScore = testScore
        return self

    def predict(self, X, y=None):
        """Predict class labels for ``X``; ``y`` is accepted for signature compatibility and ignored."""
        # MLPClassifier.predict takes only the feature matrix.
        return self.model.predict(self._encode(X))

    def score(self, X, y=None):
        """Return the mean accuracy of the fitted network on ``(X, y)``."""
        return self.model.score(self._encode(X), y)

## Hyperparameter search space for the neural net (the dictionaries for the other models would be defined here as well):

In [None]:
# Neural net size layers
maxNumLayers = 4
neuralNetArch = [(layerSize,) * numLayers for numLayers in range(1, maxNumLayers + 1) for layerSize in [64, 128, 256, 512, 1024]]
# Learning rate
learningRates = np.logspace(start=-5, stop=-1, num=8, base=10)
# L2 Regularization coefficient
alphas = [0.0, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0]

neuralNetGrid = {
    'hiddenLayerSizes': neuralNetArch,
    'learningRate': learningRates,
    'l2RegCoefficient': alphas
}
print(neuralNetGrid)

{'hiddenLayerSizes': [(64,), (128,), (256,), (512,), (1024,), (64, 64), (128, 128), (256, 256), (512, 512), (1024, 1024), (64, 64, 64), (128, 128, 128), (256, 256, 256), (512, 512, 512), (1024, 1024, 1024), (64, 64, 64, 64), (128, 128, 128, 128), (256, 256, 256, 256), (512, 512, 512, 512), (1024, 1024, 1024, 1024)], 'learningRate': array([1.00000000e-05, 3.72759372e-05, 1.38949549e-04, 5.17947468e-04,
       1.93069773e-03, 7.19685673e-03, 2.68269580e-02, 1.00000000e-01]), 'l2RegCoefficient': [0.0, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0]}


## And here is the randomized search per se:

In [None]:
# Create the model to be tuned
neuralNetBase = CustomMLPClassifier()
# Create the random search Neural Net
neuralNetRandom = RandomizedSearchCV(estimator = neuralNetBase, param_distributions = neuralNetGrid, 
                               n_iter = 50, cv = ShuffleSplit(n_splits=10, test_size=0.001, train_size=0.01), verbose = 2, random_state = 11, 
                               n_jobs = 1, refit=False)
# Fit the random search model
neuralNetRandom.fit(X, y)

# View the best parameters from the random search
print(neuralNetRandom.best_params_)
with open('/content/gdrive/My Drive/Notebooks Colab/bestParams.pickle', 'wb') as handle:
    pickle.dump(neuralNetRandom, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256) 
[CV]  learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256), total= 3.9min
[CV] learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.9min remaining:    0.0s


[CV]  learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256), total= 1.9min
[CV] learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256) 
[CV]  learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256), total= 2.8min
[CV] learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256) 
[CV]  learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256), total= 1.9min
[CV] learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256) 
[CV]  learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256), total= 3.8min
[CV] learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256) 
[CV]  learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256), total= 2.9min
[CV] learningRate=1e-05, l2RegCoefficient=1e-05, hiddenLayerSizes=(256, 256, 256, 256) 
[CV]  learningRate=1e-05, l2RegCoefficient=1e

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 274.9min finished
