In [7]:
import pickle
import random
from random import choices

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

In [8]:
# One seed shared by every random number generator the notebook imports,
# so that Restart & Run All reproduces the same numbers.
SEED = 1111

random.seed(SEED)         # Python stdlib RNG (backs the imported `random.choices`)
np.random.seed(SEED)      # NumPy legacy global RNG
tf.random.set_seed(SEED)  # TensorFlow global seed

In [9]:
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a feed-forward multi-label classifier.

    Architecture: Input -> BatchNormalization -> Dropout, then for every entry
    of ``hidden_units`` a Dense -> BatchNormalization -> swish -> Dropout
    stage, and finally Dense(num_labels) followed by a sigmoid activation.

    Args:
        num_columns: Number of input features (width of the input layer).
        num_labels: Number of sigmoid outputs.
        hidden_units: Sequence with the width of each hidden Dense layer.
        dropout_rates: Sequence of dropout rates. Element 0 is applied to the
            normalised input and element ``i + 1`` after hidden layer ``i``,
            so at least ``len(hidden_units) + 1`` values are required; any
            extra values are ignored.
        label_smoothing: ``label_smoothing`` passed to ``BinaryCrossentropy``.
        learning_rate: Learning rate passed to ``Adam``.

    Returns:
        The compiled ``tf.keras`` model, tracking an AUC metric named "AUC".

    Raises:
        ValueError: If ``dropout_rates`` has fewer than
            ``len(hidden_units) + 1`` entries.
    """
    required_rates = len(hidden_units) + 1
    if len(dropout_rates) < required_rates:
        # Fail before any layer is built instead of with an IndexError
        # halfway through the loop below.
        raise ValueError(
            "dropout_rates needs at least %d entries (one for the input plus "
            "one per hidden layer), got %d" % (required_rates, len(dropout_rates))
        )

    layers = tf.keras.layers

    inp = layers.Input(shape=(num_columns,))
    x = layers.BatchNormalization()(inp)
    x = layers.Dropout(dropout_rates[0])(x)

    # dropout_rates[0] was consumed by the input block; the remaining rates
    # pair up one-to-one with the hidden layers.
    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation(tf.keras.activations.swish)(x)
        x = layers.Dropout(rate)(x)

    x = layers.Dense(num_labels)(x)
    out = layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        # `metrics` is documented as a list; newer Keras releases reject a
        # bare metric object.
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    return model

In [10]:
# Order-preserving (unshuffled) train/test split helper.

def train_test_split(test_share, data):
    """Split ``data`` into a leading train part and a trailing test part.

    No shuffling is performed: the first ``floor(len(data) * (1 - test_share))``
    rows become the training set and the remaining rows the test set.

    Args:
        test_share: Fraction of rows assigned to the test set, in ``[0, 1]``.
        data: Any sliceable, sized container (list, array, DataFrame, ...).

    Returns:
        Tuple ``(train_set, test_set)`` of two slices of ``data``.

    Raises:
        ValueError: If ``test_share`` lies outside ``[0, 1]``.
    """
    if not 0 <= test_share <= 1:
        raise ValueError("test_share must be within [0, 1], got %r" % (test_share,))

    n_rows = len(data)
    # Round before truncating: binary floating point makes e.g.
    # 10 * (1 - 0.9) == 0.9999999999999998, which a bare int() would
    # truncate to 0 and silently drop a training row.
    train_size = int(round(n_rows * (1 - test_share), 9))

    return (data[:train_size], data[train_size:])

In [11]:
def load_data(path='df_down_sampled_alternative.p'):
    """Load the pickled dataset from disk.

    Args:
        path: Location of the pickle file. Defaults to the file the notebook
            has always used, so ``load_data()`` behaves as before.

    Returns:
        The unpickled object (the rest of the notebook treats it as a pandas
        DataFrame).
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point this at files you created or trust.
    # The context manager closes the file handle, which the bare
    # open(...) call it replaces never did.
    with open(path, 'rb') as fh:
        return pickle.load(fh)

In [12]:
# Load the pickled dataset via load_data(); the following cells work off `df`.
df = load_data()

In [13]:
# Binary target column: 1 where `resp` is strictly positive, 0 otherwise.

df['action'] = df['resp'].gt(0).astype(int)

In [14]:
# Impute missing values with the mean of the column they belong to.
df = df.fillna(value=df.mean())

In [15]:
# Keep only rows whose weight is strictly positive.
df = df.loc[df['weight'] > 0]

In [16]:
# Hold out the trailing 30% of rows as the test partition.
train_set, test_set = train_test_split(0.3, df)

In [17]:
# Display the training partition to eyeball the result of the split.
train_set

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,action
527894,86,0.859516,-0.003656,-0.005449,-0.017403,-0.028896,-0.021435,1,3.151305,5.467693,...,2.433699,4.282284,1.621115,4.331030,2.553220,3.799011,2.642943,3.998054,527894,0
527896,86,0.590949,0.000347,-0.000376,-0.004051,-0.007995,-0.004743,-1,-0.365888,0.824004,...,-0.702873,4.038753,-0.789767,4.133183,-1.207878,3.402796,-0.928290,3.511141,527896,0
527897,86,0.172997,0.000168,0.000333,-0.002375,-0.003064,0.001527,1,1.514607,0.596214,...,2.304354,1.530169,3.596848,4.613493,4.516110,3.341374,2.635798,1.535235,527897,1
527900,86,1.507813,0.001534,0.000481,-0.000082,-0.002271,-0.001663,-1,-1.008999,0.242132,...,-1.700655,-0.376747,-0.876730,1.645537,-1.318172,1.369452,-1.352577,0.733124,527900,0
527901,86,5.554003,0.003948,0.005943,0.015943,0.025001,0.017121,-1,-0.391784,0.152261,...,-1.690686,1.656818,-1.213169,2.949877,-2.033542,2.248799,-1.838696,1.905638,527901,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1837910,392,2.188796,-0.002056,-0.001794,-0.011772,-0.025964,-0.019898,-1,0.345525,1.332189,...,-1.420351,-0.821810,-0.953915,-0.136324,-1.408877,0.089653,-1.576863,-0.578055,1837910,0
1837911,392,0.753501,0.005347,0.009221,0.010137,0.013388,0.015555,-1,-0.973443,-0.884154,...,-1.224727,0.050566,-1.159235,-0.684111,-1.807972,-0.338881,-1.500253,-0.135812,1837911,1
1837913,392,0.302947,-0.001689,-0.006500,-0.060199,-0.131697,-0.103592,-1,0.517003,-0.660177,...,1.529932,-2.074714,3.674693,0.062659,5.004443,-0.009987,3.039767,-1.100508,1837913,0
1837914,392,0.183864,-0.000296,0.000871,0.014661,0.020175,0.009908,1,-2.408805,-2.348480,...,-0.890299,0.688568,-0.619310,1.729515,-0.896659,1.402814,-0.912040,0.923593,1837914,1


In [25]:
# Names of all columns whose label contains the substring "feature".
features = df.columns[df.columns.str.contains("feature", regex=False)].tolist()

In [26]:
# Per-column mean over all rows of `df`, for every feature except the first
# entry of `features`.
f_mean = df[features[1:]].to_numpy().mean(axis=0)

In [27]:
# Response columns turned into binary labels; the order here fixes the
# column order of the label matrix built from them.
resp_cols = [
    'resp_1',
    'resp_2',
    'resp_3',
    'resp',
    'resp_4',
]

In [28]:
# Training features come from the training partition only. Selecting from the
# full `df` would also include the rows held out in `test_set`, leaking test
# data into training.
X_train = train_set.loc[:, train_set.columns.str.contains('feature')]

In [33]:
# Multi-label targets: one 0/1 column per entry of `resp_cols` (1 where the
# response is strictly positive), giving shape (n_rows, len(resp_cols)).
# Rows are looked up through X_train.index so the labels always line up with
# the feature matrix. The previous version referenced an undefined `train`.
# NOTE(review): relies on `df` having unique index labels - confirm.
y_train = (df.loc[X_train.index, resp_cols] > 0).astype('int').to_numpy()

In [30]:
# Training hyperparameters for the MLP.
batch_size = 5000
hidden_units = [150] * 3     # three hidden layers of 150 units each
dropout_rates = [0.2] * 4    # one rate for the input plus one per hidden layer
label_smoothing = 0.01
learning_rate = 0.001

In [32]:
# Build the classifier. The number of outputs is derived from `resp_cols`
# rather than a hard-coded 5, so the model head cannot drift out of sync
# with the label matrix if the list of response columns changes.
clf = create_mlp(
    num_columns=len(features),
    num_labels=len(resp_cols),
    hidden_units=hidden_units,
    dropout_rates=dropout_rates,
    label_smoothing=label_smoothing,
    learning_rate=learning_rate,
)