In [None]:
import numpy as np
import pandas as pd; 
import matplotlib.pyplot as plt
import seaborn as sns


import sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree, svm, neural_network, neighbors, ensemble
from sklearn.utils import shuffle

import keras as K
from keras.wrappers.scikit_learn import KerasClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.utils import np_utils
from keras.datasets import mnist
from keras import optimizers
from keras.callbacks import Callback, TensorBoard

from tensorflow.errors import InvalidArgumentError

In [None]:
from IPython.display import Markdown, display

def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)


def printbold(string):
    """Render ``string`` in bold by wrapping it in Markdown ``**`` markers."""
    printmd('**{}**'.format(string))

# input generation

In [None]:
def generate_input_row(n_holes, avail_holes_ids=None, n_sectors=18):
    """Return one input row of ones in which ``n_holes`` positions are set to 0.

    Parameters
    ----------
    n_holes : int
        Number of positions ("holes") to zero out.
    avail_holes_ids : sequence of int, optional
        Positions a hole may be drawn from (list, range or numpy array).
        ``None`` means every position ``0 .. n_sectors-1`` is eligible.
    n_sectors : int, default=18
        Length of the generated row.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_sectors`` holding 1 everywhere except at
        the ``n_holes`` randomly chosen hole positions, which hold 0.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n_holes`` exceeds the number of
        available positions (holes are drawn without replacement).
    """
    # `is None`, not `== None`: comparing a numpy array with `==` is
    # element-wise, and using that result in `if` raises ValueError.
    if avail_holes_ids is None:
        avail_holes_ids = np.arange(n_sectors)
    row = np.ones(n_sectors, dtype=int)
    holes_id = np.random.choice(avail_holes_ids, size=n_holes, replace=False)
    row[holes_id] = 0
    return row


generate_input_row(2, [0, 1, 2, 3])

In [None]:
def generate_dataset(n_sample=1000, bad_fraction=0.25, n_holes=[1], avail_holes_ids=None):
    """Generate a labelled dataset of "good" (no holes) and "bad" (with holes) rows.

    Parameters
    ----------
    n_sample : int
        Total number of rows to generate.
    bad_fraction : float
        Fraction of rows, in [0, 1], that contain holes.
    n_holes : sequence of int
        Candidate hole counts; one is drawn at random for every bad row.
        (The default list is never mutated here.)
    avail_holes_ids : sequence of int, optional
        Positions where holes may appear; forwarded to ``generate_input_row``.

    Returns
    -------
    X : numpy.ndarray
        Rows stacked vertically, all good rows first, then all bad rows.
    Y : list of int
        Targets aligned with ``X``: 1 for good rows, 0 for bad rows.

    Raises
    ------
    ValueError
        If ``bad_fraction`` is outside [0, 1]. ``np.vstack`` also raises
        ValueError when no rows are generated (``n_sample == 0``).
    """
    if not 0 <= bad_fraction <= 1:
        raise ValueError(f'bad_fraction must be within [0, 1], got {bad_fraction}')

    # Derive both counts from a single rounded value. Truncating
    # n_sample*bad_fraction and n_sample*(1-bad_fraction) independently can
    # lose a row to float error (e.g. 100*0.29 == 28.999999999999996).
    n_bad = int(round(n_sample * bad_fraction))
    n_good = n_sample - n_bad

    good_rows_lst = [generate_input_row(0) for _ in range(n_good)]
    bad_rows_lst = [
        generate_input_row(np.random.choice(n_holes), avail_holes_ids=avail_holes_ids)
        for _ in range(n_bad)
    ]

    X = np.vstack(good_rows_lst + bad_rows_lst)
    Y = [1] * n_good + [0] * n_bad
    return X, Y


generate_dataset(n_sample=20, bad_fraction=0.25, n_holes=[1]);

In [None]:
# Visual check of the generator: one panel per hole layout, 30 rows each.
# Each entry is (panel title, positions where holes may appear).
hole_layouts = [
    ('uniform', range(18)),
    ('central sectors affected', [8, 9, 10, 11]),
    ('left sectors affected', [0, 1, 2, 3]),
]

fig, axes = plt.subplots(1, len(hole_layouts), figsize=(12, 8), sharey=True)
for ax, (title, layout_ids) in zip(axes, hole_layouts):
    # bad_fraction=1 -> every generated row contains a hole
    sample, _ = generate_dataset(n_sample=30, bad_fraction=1, avail_holes_ids=layout_ids)
    ax.imshow(sample, cmap='binary_r')
    ax.set_title(title)

# uniform holes coverage

## basic sklearn classifiers

In [None]:
# Train and test sets are generated independently with the same settings
# (holes allowed in every position, half of the rows bad).
X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5)
X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5)

# (estimator, label printed in the report)
classifiers = [(svm.SVC(gamma='auto'), 'SVM'),
               (tree.DecisionTreeClassifier(max_depth=3), 'DT-depth3'),
               (tree.DecisionTreeClassifier(max_depth=16), 'DT-depth16'),
               (tree.DecisionTreeClassifier(max_depth=20), 'DT-depth20'),
               (neighbors.KNeighborsClassifier(n_neighbors=2), 'KNN-k2'),
               (neighbors.KNeighborsClassifier(n_neighbors=15), 'KNN-k15'),
               (ensemble.RandomForestClassifier(n_estimators=10, max_depth=2), 'RF-n10-depth2'),
               (ensemble.RandomForestClassifier(n_estimators=1000, max_depth=2), 'RF-n1000-depth2'),
               (ensemble.RandomForestClassifier(n_estimators=10, max_depth=8), 'RF-n10-depth8'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[6,], max_iter=200), 'NN-6-iter200'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[6,], max_iter=15000), 'NN-6-iter15000'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[20,]), 'NN-20'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[32,16,8,4]), 'NN-32-16-8-4'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[32,32,32]), 'NN-32-32-32')
              ]

for clf, descr in classifiers:
    print(f'***{descr}***\n')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # ROC AUC needs a continuous score: computed from hard 0/1 predictions it
    # only reflects a single operating point. Use the probability of class 1
    # where available, otherwise the signed decision function (e.g. SVC
    # without probability=True).
    if hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(X_test)[:, 1]
    else:
        y_score = clf.decision_function(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_pred)
    auc = sklearn.metrics.roc_auc_score(y_test, y_score)
    print(f'Scores: accuracy = {acc}, ROC AUC = {auc}')
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print()

### Partial conclusions
Most classifiers can easily handle a dataset with ___uniformly___ distributed holes. Sometimes even an extremely simple NN (4 hidden units in 1 layer) manages, given sufficiently long training.  
The only exceptions are tree-based classifiers whose tree depth is < 18 (including a huge RF) -- each level excludes holes in one sector -- see the graph below.


In [None]:
import graphviz
# Fit a depth-6 decision tree on the current training set and render its
# structure with graphviz.
clf_dt = tree.DecisionTreeClassifier(max_depth=6)
clf_dt.fit(X_train, y_train)
dot_data = tree.export_graphviz(
    clf_dt,
    out_file=None,
    feature_names=[f'feat{i}' for i in range(18)],
    class_names=['bad', 'good'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, y_pred)
# plt.plot(fpr, tpr, color='darkorange',
#          lw=2, label='ROC curve (area = %0.2f)' % auc)
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.015])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.legend()

# Non-uniform holes coverage

## basic sklearn classifiers

In [None]:
# Training holes are restricted to positions 0-8; test holes may be anywhere,
# so half of the test hole positions never appear during training.
holes_ids_train = list(range(0, 9))
holes_ids_test  = list(range(0, 18))
X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_train)
X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)

# (estimator, label printed in the report)
classifiers = [(svm.SVC(gamma='auto'), 'SVM'),
               (tree.DecisionTreeClassifier(max_depth=3), 'DT-depth3'),
               (tree.DecisionTreeClassifier(max_depth=16), 'DT-depth16'),
               (tree.DecisionTreeClassifier(max_depth=20), 'DT-depth20'),
               (neighbors.KNeighborsClassifier(n_neighbors=2), 'KNN-k2'),
               (neighbors.KNeighborsClassifier(n_neighbors=15), 'KNN-k15'),
               (ensemble.RandomForestClassifier(n_estimators=10, max_depth=2), 'RF-n10-depth2'),
               (ensemble.RandomForestClassifier(n_estimators=1000, max_depth=2), 'RF-n1000-depth2'),
               (ensemble.RandomForestClassifier(n_estimators=10, max_depth=8), 'RF-n10-depth8'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[6,], max_iter=200), 'NN-6-iter200'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[6,], max_iter=15000), 'NN-6-iter15000'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[20,]), 'NN-20'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[32,16,8,4]), 'NN-32-16-8-4'),
               (neural_network.MLPClassifier(hidden_layer_sizes=[32,32,32]), 'NN-32-32-32'),
               # label now matches the actual layer sizes (it used to read 'NN-64-64-64-64')
               (neural_network.MLPClassifier(hidden_layer_sizes=[32,32,16,8,4]), 'NN-32-32-16-8-4'),
              ]

for clf, descr in classifiers:
    print(f'***{descr}***\n')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # ROC AUC needs a continuous score: computed from hard 0/1 predictions it
    # only reflects a single operating point. Use the probability of class 1
    # where available, otherwise the signed decision function (e.g. SVC
    # without probability=True).
    if hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(X_test)[:, 1]
    else:
        y_score = clf.decision_function(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_pred)
    auc = sklearn.metrics.roc_auc_score(y_test, y_score)
    print(f'Scores: accuracy = {acc}, ROC AUC = {auc}')
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print()

### Partial conclusions
None of the classifiers can, *of course*, recognise unseen data. Some larger NNs do quite a good job, reaching >80% when trained on a training sample containing holes in half of the sectors (the expected result is then 75% = 50% from correct classification of the "good" examples and 25% from correct classification of half of the "bad" ones). 

Maybe it is possible to train a good classifier without explicit invariance (i.e. convolutions)?

# DNN

## helper functions

In [None]:
def create_MLP(activation='relu', dropout=0.0, neurons_layers='2x64', input_shape=1,
               batch_size=64, lr=0.01, optimizer='Nadam',
               return_descr=False, use_as_subnet=False):
    """Build (and, unless used as a sub-network, compile) a fully-connected classifier.

    Parameters
    ----------
    activation : str
        Activation of every hidden Dense layer.
    dropout : float
        Rate of the Dropout layer inserted before each hidden layer except the
        first one; a network with a single hidden layer gets no dropout.
    neurons_layers : str
        Hidden structure, either ``'<n_layers>x<n_units>'`` (e.g. ``'2x64'``)
        or an explicit dash-separated list of sizes (e.g. ``'32-16-8-4'``).
    input_shape : int or sequence of int
        Shape of one input sample. A bare integer ``n`` is treated as ``(n,)``.
    batch_size : int
        Only recorded in the description string by this function.
    lr : float
        Learning rate used when ``optimizer`` is one of the known names.
    optimizer : str or optimizer object
        ``'SGD'``, ``'Adam'`` or ``'Nadam'`` are instantiated with ``lr``;
        anything else is handed to ``model.compile`` unchanged.
    return_descr : bool
        If True, return ``(model, description_string)`` instead of the model.
    use_as_subnet : bool
        If True, return the uncompiled hidden stack without the softmax layer
        (no description is printed or returned in that case).

    Returns
    -------
    model, or ``(model, model_descr)`` when ``return_descr`` is True.
    """
    # Keras expects a shape sequence; the integer default would otherwise fail.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (input_shape,)

    # Normalise both spec formats into one list of hidden-layer sizes.
    if 'x' in neurons_layers:
        n_layers, n_hidden = [int(n) for n in neurons_layers.split('x')]
        # at least one hidden layer is always created
        hidden_lst = [n_hidden] * max(n_layers, 1)
    else:
        hidden_lst = [int(h) for h in neurons_layers.split('-')]

    model = Sequential()
    model.add(Dense(hidden_lst[0], activation=activation, input_shape=input_shape))
    for n_hidden in hidden_lst[1:]:
        # add dropout before, not after dense layer,
        # as there should be no dropout between two last layers
        model.add(Dropout(dropout))
        model.add(Dense(n_hidden, activation=activation))

    if use_as_subnet:
        return model
    model.add(Dense(2, activation='softmax'))

    if   optimizer == 'SGD': opt = optimizers.SGD(lr=lr)
    elif optimizer == 'Adam': opt = optimizers.Adam(lr=lr)
    elif optimizer == 'Nadam': opt = optimizers.Nadam(lr=lr)
    else: opt = optimizer  # accept also optimizer objects
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    model_descr = 'struct=FC:{struct}_lr={lr}_dropout={dropout}__opt={opt}_act={act}_batchsize={batch_size}'.format(
        struct=neurons_layers,
        opt=str(opt.__class__.__name__),
        lr=lr,
        act=activation,
        dropout=dropout,
        batch_size=batch_size)
    print(model_descr)
    if return_descr:
        return model, model_descr
    return model

In [None]:
def create_ConvNet(n_conv_layers=2, n_filters_first=128, n_filters_change='down',
                   kernel_size=2, pool_size=2, strides=1,
                   n_fc_layers=4, n_fc_units=64,
                   activation='relu', dropout_fc=0.0, dropout_conv=0.0,
                   input_shape=1, batch_size=64, lr=0.0003, optimizer='Nadam',
                   return_descr=False, use_as_subnet=False):
    ''' Build a Conv1D(+MaxPooling1D) stack followed by Dense layers and a 2-way softmax.

        n_conv_layers : int
            number of Conv1D layers (at least one is always created)
        n_filters_first : int
            number of filters in the first conv layer
        n_filters_change : string 'down' or 'up' or 'flat'
            each next conv layer will have consecutively
            2x less or 2x more or same number of filters;
            any other value raises ValueError as soon as a second
            conv layer has to be created
        kernel_size, strides : passed to every Conv1D layer (padding='same')
        pool_size : int or None
            if None then no MaxPooling layers will be used,
            otherwise will be applied after each conv layer
        n_fc_layers, n_fc_units : int
            number and width of the Dense layers after Flatten
        dropout_conv, dropout_fc : float
            Dropout rate before each additional conv layer / before each
            Dense layer; Dropout layers are only added for rates > 1e-5
        input_shape :
            passed to the first Conv1D layer unchanged
            (NOTE(review): the default of 1 looks like a placeholder -- other
            cells of this notebook pass [18, 1] to Conv1D; confirm)
        batch_size : int
            accepted but not used by this function
        lr, optimizer :
            'SGD', 'Adam' or 'Nadam' are instantiated with lr, anything else
            is passed to model.compile unchanged
        return_descr : bool
            if True, (model, description string) is returned
        use_as_subnet : bool, default=False
            if True, then not compiled model without softmax layer is returned
    '''
    model = Sequential()
    # Conv Layers
    model.add( Conv1D(n_filters_first, kernel_size, strides=strides,  activation=activation, padding='same', input_shape=input_shape) )
    if pool_size:
        model.add( MaxPooling1D(pool_size=pool_size) )

    n_filters = n_filters_first
    for _ in range(n_conv_layers-1):

        if dropout_conv > 1e-5:
            model.add( Dropout(dropout_conv) )

        if n_filters_change == 'down':
            n_filters = int(n_filters/2)
        elif n_filters_change == 'up':
            n_filters = int(n_filters*2)
        elif n_filters_change != 'flat':
            # previously an unknown value surfaced as an UnboundLocalError
            raise ValueError("n_filters_change must be 'down', 'up' or 'flat', got {!r}".format(n_filters_change))
        model.add( Conv1D(n_filters, kernel_size, strides=strides,  activation=activation, padding='same') )
        if pool_size:
            model.add( MaxPooling1D(pool_size=pool_size) )

    model.add( Flatten() )

    # Fully-Connected Layers
    for _ in range(n_fc_layers):
        if dropout_fc > 1e-5:
            model.add( Dropout(dropout_fc) )
        model.add( Dense(n_fc_units, activation=activation) )

    if use_as_subnet:
        return model

    model.add(Dense(2, activation='softmax'))

    if   optimizer == 'SGD': opt = optimizers.SGD(lr=lr)
    elif optimizer == 'Adam': opt = optimizers.Adam(lr=lr)
    elif optimizer == 'Nadam': opt = optimizers.Nadam(lr=lr)
    else: opt = optimizer  # accept also optimizer objects
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    # Human-readable description; kernel/stride/pool sizes are only spelled
    # out when they differ from the defaults (2, 1 and 2 respectively).
    index_flatten = ['Flatten' in str(l) for l in model.layers].index(True)
    strides_str = '' if strides == 1  else 's='+str(strides)
    kernel_str = '' if kernel_size == 2 else 'k='+str(kernel_size)
    kernel_strides_str = '' if not strides_str and not kernel_str else '({},{})'.format(kernel_str, strides_str)
    pool_str = '' if pool_size == 2 else '(p={})'.format(pool_size)
    structure = '{conv_maxpool}x{n_conv}[{shape_first}->{shape_last}]+Dense({n_fc_units})x{n_fc_layers}'.format(
                        conv_maxpool='(Conv1D{}+MaxPool{})'.format(kernel_strides_str, pool_str) if pool_size else 'Conv1D'+kernel_strides_str,
                        n_conv=n_conv_layers,
                        shape_first=model.layers[0].output_shape[1:],
                        shape_last=model.layers[index_flatten].input_shape[1:],
                        n_fc_units=n_fc_units,
                        n_fc_layers=n_fc_layers)
    model_descr = 'struct={struct}_lr={lr}_dropouts=({dropout_conv},{dropout_fc})'.format(
                                                                     struct=structure,
                                                                     lr=lr,
                                                                     dropout_conv=dropout_conv,
                                                                     dropout_fc=dropout_fc)
    print(model_descr)
    if return_descr:
        return model, model_descr
    else:
        return model

## MLP

In [None]:
# Training holes only at even positions; test/validation holes anywhere.
holes_ids_train = list(range(0, 18, 2))
holes_ids_test  = list(range(0, 18))
X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_train)
X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)
X_val,  y_val  = generate_dataset(n_sample=1000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)

# Shuffle features and targets together so rows and labels stay aligned
# (same permutation as shuffling each with random_state=1 separately).
X_train, y_train = shuffle(X_train, y_train, random_state=1)
X_test, y_test = shuffle(X_test, y_test, random_state=1)

# One-hot targets for the softmax / categorical_crossentropy models.
Y_train = K.utils.np_utils.to_categorical(y_train)
Y_test = K.utils.np_utils.to_categorical(y_test)
Y_val = K.utils.np_utils.to_categorical(y_val)

# Hidden-layer structures to scan ('2x32' is listed twice, as before, so the
# positions in `history_lst` stay unchanged).
layer_specs = ['2x4', '2x16', '2x32', '4x16', '8x16', '2x32', '6x32',
               '32-16-8-4', '32-32-16-16-8-8-4']
paramsets = [dict(dropout=0.0, neurons_layers=spec, batch_size=64, lr=0.01, optimizer='Nadam')
             for spec in layer_specs]

classifiers = []
for paramset in paramsets:
    for n_epochs in [5,20,100]:
        for lr in [0.01, 0.001]:
            paramset['lr'] = lr
            print(paramset)
            # A fresh wrapper per (structure, epochs, lr) combination: sharing
            # one instance between both lr values left every entry with the
            # last lr that was set, so the 'lr0.01' runs used lr=0.001.
            model = KerasClassifier(build_fn=create_MLP, epochs=n_epochs, verbose=0, input_shape=[18,])
            model.set_params(**paramset)
            n_l = paramset['neurons_layers']
            descr = f'MLP_{n_l}_lr{lr}_epochs{n_epochs}'
            classifiers.append((model, descr))


history_lst = []
for clf, descr in classifiers:
    print(f'***{descr}***\n')
    history = clf.fit(X_train, Y_train, validation_data=(X_val, Y_val))
    y_pred = clf.predict(X_test)
    # ROC AUC is computed from the predicted probability of class 1 rather
    # than from hard labels.
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = sklearn.metrics.accuracy_score(y_test, y_pred)
    auc = sklearn.metrics.roc_auc_score(y_test, y_score)
    print(f'Scores: accuracy = {acc}, ROC AUC = {auc}')
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print()
    history_lst.append(history)

In [None]:
# Inspect one of the trained runs: architecture summary, then its
# per-epoch validation accuracy.
picked_history = history_lst[15]
picked_history.model.summary()
picked_history.history['val_acc']

### Partial Conclusions
<a id="section_ID"></a>

Some MLPs reach the expected accuracy value (75%), but it is curious that some of them get very high values (>90%).  

*Hypothesis:* A possible reason is that the unused columns (their value is always 1) can in principle be used as a replacement for the bias term in the weighted sum of the neurons. This could be checked by:
1. training the same model a couple of times and checking whether the results vary (according to the hypothesis above they can/should change)
2. introducing some noise to each column (even 0.2 should be enough) - then the unused columns would differ from a bias term and the dependence on them should vanish

### Test 1.

In [None]:
acc_lst, auc_lst = [], []

# Settings that do not change between repetitions.
holes_ids_train = list(range(0, 18, 2))
holes_ids_test  = list(range(0, 18))
paramset = dict(dropout=0.0, neurons_layers='2x8', batch_size=64, lr=0.01, optimizer='Nadam',)

# Train the same architecture 10 times on freshly generated data to see how
# much the test scores vary from run to run.
for _ in range(10):
    X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_train)
    X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)
    X_val,  y_val  = generate_dataset(n_sample=1000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)

    # shuffle features and targets together so they stay aligned
    X_train, y_train = shuffle(X_train, y_train, random_state=1)
    X_test, y_test = shuffle(X_test, y_test, random_state=1)

    Y_train = K.utils.np_utils.to_categorical(y_train)
    Y_test = K.utils.np_utils.to_categorical(y_test)
    Y_val = K.utils.np_utils.to_categorical(y_val)

    model = KerasClassifier(build_fn=create_MLP, epochs=3, verbose=0, input_shape=[18,])
    model.set_params(**paramset)

    clf = model
    history = clf.fit(X_train, Y_train, validation_data=(X_val, Y_val))
    y_pred = clf.predict(X_test)
    # ROC AUC from the predicted probability of class 1, not from hard labels
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = sklearn.metrics.accuracy_score(y_test, y_pred)
    auc = sklearn.metrics.roc_auc_score(y_test, y_score)
    print(f'Scores: accuracy = {acc}, ROC AUC = {auc}')
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print()
    acc_lst.append(acc)
    auc_lst.append(auc)

print(f'\n\taccuracy = {np.mean(acc_lst)} +/- {np.std(acc_lst)} \t max={np.max(acc_lst)}\n\t ROC AUC = {np.mean(auc_lst)} +/- {np.std(auc_lst)} \t max={np.max(auc_lst)}')


**Test 1 results**:  
Test 1 is rather inconclusive

### Test 2.

**without noise:**

In [None]:
# Regenerate a training set (holes at even positions only) and show its last
# 50 rows; generate_dataset puts the bad rows last.
X_train, y_train = generate_dataset(
    n_sample=10000,
    bad_fraction=0.5,
    avail_holes_ids=range(0, 18, 2),
)
tail_rows = X_train[-50:]
plt.imshow(tail_rows, cmap='binary_r')

In [None]:
acc_lst, auc_lst = [], []

# Settings that do not change between iterations.
holes_ids_train = list(range(0, 18, 2))
holes_ids_test  = list(range(0, 18))

# Random search over small MLPs on the noise-free inputs.
for it in range(10):
    X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_train)
    X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)
    X_val,  y_val  = generate_dataset(n_sample=1000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)

    # shuffle features and targets together so they stay aligned
    X_train, y_train = shuffle(X_train, y_train, random_state=1)
    X_test, y_test = shuffle(X_test, y_test, random_state=1)

    Y_train = K.utils.np_utils.to_categorical(y_train)
    Y_test = K.utils.np_utils.to_categorical(y_test)
    Y_val = K.utils.np_utils.to_categorical(y_val)

    # (no noise is added to X in this variant)

    model = KerasClassifier(build_fn=create_MLP, epochs=np.random.choice([5,15,25]), verbose=2, input_shape=[18,])
    paramset = dict(dropout=np.random.choice([0, 0, 0, 0.1, 0.25]),
                    neurons_layers=np.random.choice(['2x4', '2x8', '4x8', '4x16', '4x32', '6x8']),
                    batch_size=64,
                    lr=np.random.choice([1e-3, 3e-3, 1e-2]),
                    optimizer='Nadam',)
    model.set_params(**paramset)

    clf = model
    history = clf.fit(X_train, Y_train, validation_data=(X_val, Y_val))
    y_pred = clf.predict(X_test)
    # ROC AUC from the predicted probability of class 1, not from hard labels
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = sklearn.metrics.accuracy_score(y_test, y_pred)
    auc = sklearn.metrics.roc_auc_score(y_test, y_score)
    print(f'iter={it}\n {paramset}\n\n')
    print(f'Scores: accuracy = {acc}, ROC AUC = {auc}')
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print()
    acc_lst.append(acc)
    auc_lst.append(auc)
    print(f"{'+--'*30}")

print(f'\n\taccuracy = {np.mean(acc_lst)} +/- {np.std(acc_lst)} \t max={np.max(acc_lst)}\n\t ROC AUC = {np.mean(auc_lst)} +/- {np.std(auc_lst)} \t max={np.max(auc_lst)}')


**with noise:**

In [None]:
# Same preview as for the noise-free data, with Gaussian noise (sigma = 0.15)
# added to every entry.
X_train, y_train = generate_dataset(
    n_sample=10000,
    bad_fraction=0.5,
    avail_holes_ids=range(0, 18, 2),
)
gaussian_noise = np.random.randn(*X_train.shape) * 0.15
X_train = X_train.astype('float64') + gaussian_noise
plt.figure(figsize=(10, 15))
tail_rows = X_train[-50:]
plt.imshow(tail_rows, cmap='binary_r')

In [None]:
acc_lst, auc_lst = [], []

# Settings that do not change between iterations.
holes_ids_train = list(range(0, 18, 2))
holes_ids_test  = list(range(0, 18))

# Random search over small MLPs on inputs with Gaussian noise (sigma = 0.15).
for it in range(30):
    X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_train)
    X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)
    X_val,  y_val  = generate_dataset(n_sample=1000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)

    # shuffle features and targets together so they stay aligned
    X_train, y_train = shuffle(X_train, y_train, random_state=1)
    X_test, y_test = shuffle(X_test, y_test, random_state=1)

    Y_train = K.utils.np_utils.to_categorical(y_train)
    Y_test = K.utils.np_utils.to_categorical(y_test)
    Y_val = K.utils.np_utils.to_categorical(y_val)

    # add noise so that always-1 columns no longer act as a constant input
    X_train = X_train.astype('float64') + np.random.randn(*X_train.shape)*0.15
    X_test  = X_test.astype('float64')  + np.random.randn(*X_test.shape)*0.15
    X_val   = X_val.astype('float64')   + np.random.randn(*X_val.shape)*0.15

    model = KerasClassifier(build_fn=create_MLP, epochs=np.random.choice([5,15,25]), verbose=2, input_shape=[18,])
    paramset = dict(dropout=np.random.choice([0, 0, 0, 0.1, 0.25]),
                    neurons_layers=np.random.choice(['2x4', '2x8', '4x8', '4x16', '4x32', '6x8']),
                    batch_size=64,
                    lr=np.random.choice([1e-3, 3e-3, 1e-2]),
                    optimizer='Nadam',)
    model.set_params(**paramset)

    clf = model
    history = clf.fit(X_train, Y_train, validation_data=(X_val, Y_val))
    y_pred = clf.predict(X_test)
    # ROC AUC from the predicted probability of class 1, not from hard labels
    y_score = clf.predict_proba(X_test)[:, 1]
    acc = sklearn.metrics.accuracy_score(y_test, y_pred)
    auc = sklearn.metrics.roc_auc_score(y_test, y_score)
    print(f'iter={it}\n {paramset}\n\n')
    print(f'Scores: accuracy = {acc}, ROC AUC = {auc}')
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print()
    acc_lst.append(acc)
    auc_lst.append(auc)
    print(f"{'+--'*30}")

print(f'\n\taccuracy = {np.mean(acc_lst)} +/- {np.std(acc_lst)} \t max={np.max(acc_lst)}\n\t ROC AUC = {np.mean(auc_lst)} +/- {np.std(auc_lst)} \t max={np.max(auc_lst)}')


**TEST 2 results:**  
An MLP trained without noise can quite easily learn to distinguish bad and good examples, reaching an accuracy of 80% and more  
An MLP trained on the dataset with noise does not seem capable of achieving more than 75% (on the test set), even though its training accuracy is often > 90-95%  

[Hypothesis](#section_ID) was probably correct.

## ConvNet

**TODO**  
try various hole distributions in the training sample, like range(0,18,2), range(0,9), (1,1,0,0,1,1,0,0..) etc

In [None]:
%%time

history_lst_conv = []
acc_lst, auc_lst = [], []

# Training holes only at positions 0-8; test/validation holes anywhere.
holes_ids_train = list(range(0, 9))
holes_ids_test  = list(range(0, 18))

# Random search over small Conv1D architectures on noisy inputs.
for it in range(20):
    print(f'iter = {it}')
    X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_train)
    X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)
    X_val,  y_val  = generate_dataset(n_sample=1000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)

    # shuffle features and targets together so they stay aligned
    X_train, y_train = shuffle(X_train, y_train, random_state=1)
    X_test, y_test = shuffle(X_test, y_test, random_state=1)

    # add noise
    X_train = X_train.astype('float64') + np.random.randn(*X_train.shape)*0.1
    X_test  = X_test.astype('float64')  + np.random.randn(*X_test.shape)*0.1
    X_val   = X_val.astype('float64')   + np.random.randn(*X_val.shape)*0.1

    Y_train = K.utils.np_utils.to_categorical(y_train)
    Y_test = K.utils.np_utils.to_categorical(y_test)
    Y_val = K.utils.np_utils.to_categorical(y_val)

    # Conv1D expects (samples, steps, channels): add a trailing channel axis
    X_train = np.expand_dims(X_train, 2)
    X_val = np.expand_dims(X_val, 2)
    X_test = np.expand_dims(X_test, 2)

    # Randomly drawn configuration. filters2 == 0 / max_pool == [0] means
    # "skip that layer"; each conv layer uses its kernel size as its stride.
    pset = dict(    filters1 = np.random.choice([10,25,100]),
                    kernel_size1 = [1],
                    filters2 = np.random.choice([0,0,5,10,20]),
                    kernel_size2 = [np.random.choice([1,2,3,4])],
                    max_pool1 = [np.random.choice([0,0,2,3,4])],
                    max_pool2 = [np.random.choice([0,0,2,3,4])],
                    epochs = np.random.choice([5,15,25,50]),
                    lr = np.random.choice([3e-5, 1e-4, 3e-4]),
                    )

    print(f'\n paramset = {pset}\n')
    try:
        model = Sequential()
        # Conv Layers
        model.add( Conv1D(filters=pset['filters1'],
                          kernel_size=pset['kernel_size1'],
                          strides=pset['kernel_size1'],
                          padding='same', activation='relu', input_shape=[18,1])
                 )
        if pset['max_pool1'][0]: model.add( MaxPooling1D(pool_size=pset['max_pool1']) )
        if pset['filters2']: model.add( Conv1D(filters=pset['filters2'],
                                               kernel_size=pset['kernel_size2'],
                                               strides=pset['kernel_size2'],
                                               padding='same', activation='relu')
                                      )
        if pset['max_pool2'][0]: model.add( MaxPooling1D(pool_size=pset['max_pool2']) )
        model.add( Flatten() )

        model.add( Dense(16, activation='relu') )
        model.add( Dense(16, activation='relu') )
        model.add( Dense(8, activation='relu') )
        model.add(Dense(2, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer=optimizers.Nadam(lr=pset['lr']), metrics=['accuracy'], )
    except (InvalidArgumentError, ValueError) as err:
        # A drawn combination can fail to build; skip it, but report the
        # reason instead of only the exception name.
        print(f'{type(err).__name__}!! {err}')
        continue

    clf = model
    history = clf.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=pset['epochs'], verbose=2)

    # Column 1 of the softmax output is the probability of class 1 ("good").
    y_pred_proba = clf.predict(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)
    acc = sklearn.metrics.accuracy_score(y_test, y_pred)
    auc = sklearn.metrics.roc_auc_score(y_test, y_pred_proba)
    printmd(f'\nScores: accuracy = {acc}, ROC AUC = **{auc}**')
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print(f'\n{"+-"*50}\n')
    history_lst_conv.append(dict(history=history, paramset=pset, acc=acc, auc=auc))
    acc_lst.append(acc)
    auc_lst.append(auc)

# np.max raises on an empty list, so only summarise when something was trained
if acc_lst:
    print(f'\n\taccuracy = {np.mean(acc_lst)} +/- {np.std(acc_lst)} \t max={np.max(acc_lst)}\n\t ROC AUC = {np.mean(auc_lst)} +/- {np.std(auc_lst)} \t max={np.max(auc_lst)}')
else:
    print('No configuration could be built and trained - nothing to summarise.')


accuracy = 0.7222295454545454 +/- 0.06880813766469993 	 max=0.8767  
ROC AUC = 0.7653125054545454 +/- 0.0994628708587779 	 max=0.9289947  
Wall time: 21min 38s  

accuracy = 0.7754347826086957 +/- 0.055096284169629514 	 max=0.8818  
ROC AUC = 0.8188514591304348 +/- 0.07894300614814957 	 max=0.9635026200000001  
Wall time: 59min 31s  

accuracy = 0.7426666666666667 +/- 0.06864766727444001 	 max=0.8299  
ROC AUC = 0.7832579722222222 +/- 0.08914965431386786 	 max=0.9762938799999998  
Wall time: 31min 43s  

accuracy = 0.7621090909090907 +/- 0.09308272424955667 	 max=0.959  
ROC AUC = 0.805930949090909 +/- 0.12715505881597186 	 max=0.9970582  
Wall time: 16min 4s  


`paramset = {'filters1': 20, 'filters2': 10, 'strides2': [3], 'kernel_size1': [3], 'kernel_size2': [3], 'max_pool1': [0], 'max_pool2': [2], 'epochs': 50, 'lr': 3e-05}, padding='same'`


`paramset = {'filters1': 10, 'filters2': 10, 'strides2': [2], 'kernel_size1': [9], 'kernel_size2': [1], 'max_pool1': [2], 'max_pool2': [0], 'epochs': 25, 'lr': 0.0003}, padding='same'`

#### CONV 1D - WTF

In [None]:
# Single, fixed Conv1D configuration trained on noisy data
# (training holes at positions 0-8 only; test/validation holes anywhere).
holes_ids_train = list(range(0, 9))
holes_ids_test  = list(range(0, 18))
X_train, y_train = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_train)
X_test,  y_test  = generate_dataset(n_sample=10000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)
X_val,  y_val  = generate_dataset(n_sample=1000, bad_fraction=0.5, avail_holes_ids=holes_ids_test)

# shuffle features and targets together so they stay aligned
X_train, y_train = shuffle(X_train, y_train, random_state=1)
X_test, y_test = shuffle(X_test, y_test, random_state=1)

# add noise
X_train = X_train.astype('float64') + np.random.randn(*X_train.shape)*0.15
X_test  = X_test.astype('float64')  + np.random.randn(*X_test.shape)*0.15
X_val   = X_val.astype('float64')   + np.random.randn(*X_val.shape)*0.15

Y_train = K.utils.np_utils.to_categorical(y_train)
Y_test = K.utils.np_utils.to_categorical(y_test)
Y_val = K.utils.np_utils.to_categorical(y_val)

# Conv1D expects (samples, steps, channels): add a trailing channel axis
X_train = np.expand_dims(X_train, 2)
X_val = np.expand_dims(X_val, 2)
X_test = np.expand_dims(X_test, 2)


# filters2 == 0 / max_pool == [0] would mean "skip that layer"
pset = dict(    filters1 = 20,
                strides1 = [1],
                kernel_size1 = [3],
                filters2 = 20,
                kernel_size2 = [3],
                strides2 = [1],
                max_pool1 = [0],
                max_pool2 = [0],
                epochs = 15,
                lr = 3e-4,
                )


print(f'\n paramset = {pset}\n')

# Build errors are deliberately not caught here: there is nothing to skip to
# in a single run, and continuing after a failed build would try to fit an
# incomplete (or stale) `model`.
model = Sequential()
# Conv Layers
model.add( Conv1D(filters=pset['filters1'],
                  kernel_size=pset['kernel_size1'],
                  strides=pset['strides1'],
                  padding='same', activation='relu', input_shape=[18,1])
         )
if pset['max_pool1'][0]: model.add( MaxPooling1D(pool_size=pset['max_pool1']) )
if pset['filters2']: model.add( Conv1D(filters=pset['filters2'],
                                       kernel_size=pset['kernel_size2'],
                                       strides=pset['strides2'],
                                       padding='same', activation='relu')
                              )
if pset['max_pool2'][0]: model.add( MaxPooling1D(pool_size=pset['max_pool2']) )
model.add( Flatten() )

model.add( Dense(16, activation='relu') )
model.add( Dense(16, activation='relu') )
model.add( Dense( 8, activation='relu') )
model.add( Dense( 2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=optimizers.Nadam(lr=pset['lr']), metrics=['accuracy'], )

clf = model
history = clf.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=pset['epochs'], verbose=2)

# Column 1 of the softmax output is the probability of class 1 ("good").
y_pred_proba = clf.predict(X_test)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)
acc = sklearn.metrics.accuracy_score(y_test, y_pred)
auc = sklearn.metrics.roc_auc_score(y_test, y_pred_proba)
printmd(f'\nScores: accuracy = {acc}, ROC AUC = **{auc}**')
print(sklearn.metrics.confusion_matrix(y_test, y_pred))
print(f'\n{"+-"*50}\n')

In [None]:
# ROC curve of the last trained model, with the chance diagonal for reference.
fig, ax = plt.subplots(figsize=(6, 6))
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, y_pred_proba)
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label='ROC curve (area = {:0.2f})'.format(auc))
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.015])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.axis('equal')
ax.legend()

In [None]:
# Distribution of the predicted class-1 probability, split by true label:
# true 0 as a solid red outline, true 1 as a dotted outline.
plt.figure(figsize=(12, 6))
val = pd.DataFrame({'pred': y_pred_proba, 'true': y_test})
score_bins = np.linspace(0, 1, 30)
outline_styles = (
    (0, dict(color='red', lw=2)),
    (1, dict(lw=3, ls=':')),
)
for true_label, outline in outline_styles:
    plt.hist(val.loc[val['true'] == true_label, 'pred'], bins=score_bins, histtype='step', **outline);

In [None]:
# Print the layer-by-layer summary of whichever `model` is currently bound in the notebook namespace.
model.summary()

#### Parameters analysis

In [None]:
# List the random-search runs with ROC AUC above 0.85 together with the
# hyper-parameters of interest.
shown_keys = ('filters1', 'kernel_size1', 'max_pool1', 'filters2', 'epochs')
for exp in history_lst_conv:
    if not exp['auc'] > 0.85:
        continue
    shown_values = ', '.join(str(exp['paramset'][key]) for key in shown_keys)
    printmd(f"**{exp['auc']}**,  " + shown_values)

In [None]:
data = {}

def get_val(x):
    """Unwrap a single-element container: return ``x[0]`` for iterables, ``x`` otherwise.

    The random-search parameter sets store some values wrapped in a
    one-element list (e.g. ``[3]``) and others as plain scalars; this helper
    brings both to a plain value. Strings and bytes are iterable too, but are
    treated as scalars and returned whole rather than reduced to their first
    character.
    """
    from collections.abc import Iterable
    if isinstance(x, (str, bytes)):
        return x
    if isinstance(x, Iterable):
        return x[0]
    return x

# One column per hyper-parameter (unwrapped with get_val), one row per
# completed random-search run, plus the achieved scores.
param_names = history_lst_conv[0]['paramset'].keys()
data.update({
    name: [get_val(run['paramset'][name]) for run in history_lst_conv]
    for name in param_names
})

df = pd.DataFrame(data).assign(
    acc=[run['acc'] for run in history_lst_conv],
    auc=[run['auc'] for run in history_lst_conv],
)
df.query('acc > 0.8').head(50)

In [None]:
# vars = ['strides1', 'auc']
# Scatter one hyper-parameter against ROC AUC; x gets a small uniform jitter
# so that runs sharing the same parameter value do not overlap.
xy = df.query('acc > 0')[['kernel_size1', 'auc']]
xx, yy = xy.iloc[:, 0], xy.iloc[:, 1]
jitter = (np.random.rand(len(xx)) - 0.5) * 0.3
xx = np.array(xx) + jitter

plt.scatter(xx, yy, edgecolors='b', facecolors='none')
# plt.xlim([1e-5, 1e-3])

In [None]:
# Report every run with ROC AUC above 0.9: scores, parameter set and the
# number of model parameters.
for h in filter(lambda run: run['auc'] > 0.9, history_lst_conv):
    n_params = h['history'].model.count_params()
    printmd(f"**AUC = {h['auc']}**, acc = {h['acc']}  \n{h['paramset']}  \n{n_params}")

### Partial conclusions:

ConvNets can learn to identify holes in regions that were not corrupted in the training sample when the training holes are randomly selected from `range(0,18,2)`

**to be confirmed for `range(0,9)`** (~83% achieved so far)

## CD