# Keras / Optuna Starter

The intention of this notebook is to provide a simple example of how a basic Keras model can be tuned with Optuna. With a bit of experimentation with the architecture it should be fairly easy to get a nice score for this point in the competition. Hopefully this provides a more accessible starting point for those wishing to experiment with NNs for this project.
## Grab our data

In [None]:
import pandas as pd
import numpy as np

# Directory holding the competition CSV files.
DATA_DIR = '../input/tabular-playground-series-nov-2021/'

# Training features indexed by row id; the label column is split off into y.
train = pd.read_csv(f'{DATA_DIR}train.csv')
train = train.set_index('id')
y = train.pop('target').values

# Test features and the submission template share the same id index column.
test = pd.read_csv(f'{DATA_DIR}test.csv').set_index('id')
sample = pd.read_csv(f'{DATA_DIR}sample_submission.csv').set_index('id')

# Number of input features the network receives (columns of the test frame).
no_features = len(test.columns)

## Now let's go generate some normalisations for us to play with
I have taken out a few examples here and commented more. Robust and Z score seem to provide the best results for me.

Note: I am cheating here and using the test set to help scale the training set. This is one of those things that's cool in a Kaggle competition but probably not advisable in the wild.

In [None]:
from sklearn.preprocessing import QuantileTransformer, StandardScaler, RobustScaler, MinMaxScaler
from gc import collect

# Maps a normalisation name to its scaled 'train' and 'test' arrays.
data = {}

# Every scaler is fitted on train and test stacked together. Build that
# frame once instead of re-concatenating the full dataset for each scaler.
combined = pd.concat([train, test])

# (key in `data`, progress message, unfitted scaler)
scalers = [
    # This was no help at all
    ('norm', 'Fitting quantiles transformer: Normal',
     QuantileTransformer(output_distribution='normal', n_quantiles=1000)),
    ('z', 'Fitting standard scaler', StandardScaler()),
    ('robust', 'Fitting robust scaler', RobustScaler()),
    # This was no help at all
    ('min-max', 'Fitting min-max', MinMaxScaler(feature_range=(-1, 1))),
]

# `norm` is kept as the loop variable so that, as before, it ends up bound
# to the last fitted scaler once this cell has run.
for scaler_name, scaler_message, norm in scalers:
    print(scaler_message)
    norm.fit(combined)
    data[scaler_name] = {
        'train': norm.transform(train),
        'test': norm.transform(test),
    }

# The raw frames are no longer needed once the scaled copies exist.
del combined
del train
collect();  # trailing semicolon keeps the notebook from echoing the result

## Model
This is our basic model structure. A few additions that worked for me (they are not included in the model below, so they are left for you to try)...
- Adding an attention layer between the raw and noised features
- Using add to incorporate residuals

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, GaussianNoise, Input, Attention, Add, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam

def get_model(depth, noise, lr):
    """Build and compile the feed-forward binary classifier.

    The network is: input -> GaussianNoise -> `depth` swish Dense layers ->
    a single sigmoid output unit. The first hidden layer has
    ``2 ** (depth + 2)`` units; after each layer the width is divided by 8
    as long as it is still greater than 8 (e.g. depth=8: 1024, 128, 16, 2,
    2, ...).

    Args:
        depth: Number of hidden Dense layers.
        noise: Standard deviation passed to the GaussianNoise layer.
        lr: Learning rate given to the Adam optimiser.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss and an
        AUC metric named ``'auc'``.
    """
    # Fix the seeds so repeated builds start from the same initial weights.
    tf.random.set_seed(42)
    np.random.seed(42)

    features = 2 ** (depth + 2)

    # Input: pass the shape explicitly as a tuple rather than a bare int.
    inputs = Input(shape=(no_features,))

    # Add noise to the continuous
    x = GaussianNoise(noise)(inputs)

    # Build out some blocks
    for _ in range(depth):
        x = Dense(features, activation='swish')(x)
        if features > 8:
            # Integer division keeps the unit count an int; `/=` would turn
            # it into a float, which is not a valid `units` value.
            features //= 8

    out = Dense(1, activation='sigmoid')(x)

    # Build
    model = Model(inputs=inputs, outputs=out, name='perceptomanic')

    # Optimiser
    opt = Adam(learning_rate=lr)

    # Metrics
    auc = tf.keras.metrics.AUC(name='auc')

    # Compile
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=[auc])

    return model


## Optuna Search

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna

def nn(trial):
    """Optuna objective: out-of-fold ROC AUC of the network under 10-fold CV.

    Samples ``noise``, ``lr``, ``depth`` and ``norm_method`` from the trial,
    trains a fresh model per stratified fold and collects the validation
    predictions. After every fold the AUC over the folds predicted so far is
    checked; if it is below 0.735 the trial is abandoned and that partial
    score is returned.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The ROC AUC over all out-of-fold predictions, or the partial score
        if the trial was abandoned early.
    """
    # NOTE(review): `lr` is sampled uniformly on a linear scale; consider
    # whether a log-scale search would suit this range better.
    params = {
        'noise': trial.suggest_float('noise', 0.0, 0.06),
        'lr': trial.suggest_float('lr', 0.0001, 0.1),
        'depth': trial.suggest_int('depth', 1, 8),
    }
    norm_method = trial.suggest_categorical('norm_method', ['robust'])

    features = data[norm_method]['train']

    kf = StratifiedKFold(10, shuffle=True, random_state=42)
    # NaN marks rows that have not been predicted yet (np.nan, not the
    # np.NaN alias, which no longer exists in NumPy 2.0).
    y_hat_validation = np.full(len(y), np.nan)

    # callbacks
    callback_early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=False, mode='min', restore_best_weights=True)
    callback_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=False, mode='min')

    for train_index, test_index in kf.split(features, y):
        # Train model
        model = get_model(**params)

        # Split out test and train
        X_train = features[train_index]
        y_train = y[train_index]

        X_test = features[test_index]
        y_test = y[test_index]

        # Fit
        model.fit(x=X_train,
                  y=y_train,
                  batch_size=4096,
                  epochs=1000,
                  validation_data=(X_test, y_test),
                  verbose=False,
                  callbacks=[callback_early_stopping, callback_lr])

        # Predict
        y_hat_validation[test_index] = model.predict(X_test).squeeze()

        # Check for early stop: score only the rows predicted so far
        not_null = ~np.isnan(y_hat_validation)
        score = roc_auc_score(y[not_null], y_hat_validation[not_null])
        if score < 0.735:
            print('Early stopping')
            return score

        del model

    score = roc_auc_score(y, y_hat_validation)

    return score


In [None]:
# The objective returns a ROC AUC, so the study maximises it.
study = optuna.create_study(
    direction='maximize',
    study_name='Find me some params dude',
)

In [None]:
# Run the search for up to six hours, collecting garbage after each trial.
study.optimize(
    nn,
    timeout=6 * 60 * 60,
    gc_after_trial=True,
    show_progress_bar=True,
)

print(f'Number of finished trials: {len(study.trials)}')

# Report the best trial's score and the hyper-parameters that produced it.
print('Best trial:')
trial = study.best_trial

print(f'  Value: {trial.value}')

print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

### Now run it

In [None]:
def nn_run(params):
    """Train one model per CV fold and predict the test set with each.

    Uses the same 10-fold stratified split as the search. For every fold a
    fresh model is trained, its validation predictions are stored, and its
    predictions for the test set are appended to the result. The overall
    out-of-fold ROC AUC is printed at the end.

    Args:
        params: Mapping with the key ``'norm_method'`` plus the keyword
            arguments accepted by ``get_model``. It is not modified.

    Returns:
        A list with one array of test-set predictions per fold.
    """
    # Work on a copy so popping 'norm_method' does not mutate the caller's
    # dict (which would make a second call with the same dict fail).
    params = dict(params)
    norm_method = params.pop('norm_method')

    features = data[norm_method]['train']
    test_features = data[norm_method]['test']

    kf = StratifiedKFold(10, shuffle=True, random_state=42)
    estimates = []
    # NaN marks rows that have not been predicted yet (np.nan, not the
    # np.NaN alias, which no longer exists in NumPy 2.0).
    y_hat_validation = np.full(len(y), np.nan)

    # callbacks
    callback_early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=False, mode='min', restore_best_weights=True)
    callback_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=False, mode='min')

    for train_index, test_index in kf.split(features, y):
        # Train model
        model = get_model(**params)

        # Split out test and train
        X_train = features[train_index]
        y_train = y[train_index]

        X_test = features[test_index]
        y_test = y[test_index]

        # Fit
        model.fit(x=X_train,
                  y=y_train,
                  batch_size=4096,
                  epochs=1000,
                  validation_data=(X_test, y_test),
                  verbose=True,
                  callbacks=[callback_early_stopping, callback_lr])

        # Predict: out-of-fold rows for scoring, test rows for the submission
        y_hat_validation[test_index] = model.predict(X_test).squeeze()
        estimates.append(model.predict(test_features).squeeze())

        print('\n')

        del model

    print(roc_auc_score(y, y_hat_validation))

    return estimates

# Retrain with the best hyper-parameters found by the study.
best_params = study.best_trial.params
estimates = nn_run(best_params)

In [None]:
# Stack the per-fold test predictions (one row per fold) and average them
# across folds to get a single prediction per test row.
fold_predictions = np.vstack(estimates)
sample['target'] = fold_predictions.mean(axis=0)
sample.to_csv('simple_af_nn.csv')