In [None]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import tensorflow_addons as tfa
from random import choices
from numba import njit
import matplotlib.pyplot as plt
import os,gc
import random
from random import choices

SEED = 1111

# Fix the NumPy and TensorFlow seeds up front so reruns start from the same RNG state.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load the competition training set and keep only rows with date > 85.
train = (
    pd.read_csv('../input/jane-street-market-prediction/train.csv')
    .query('date > 85')
    .reset_index(drop=True)
)
# Drop zero-weight rows.
train = train.loc[train['weight'] != 0]
# Replace missing values with the per-column mean of the filtered frame.
train.fillna(train.mean(), inplace=True)
# Binary target: 1 where `resp` is strictly positive, else 0.
train['action'] = (train['resp'].to_numpy() > 0).astype(int)

# Every column whose name contains "feature" is a model input.
features = [col for col in train.columns if "feature" in col]
# Column means of all features except the first one.
f_mean = train[features[1:]].to_numpy().mean(axis=0)

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
X_train = train.loc[:, features]
# One 0/1 column per entry of resp_cols (1 where that response is positive).
y_train = np.column_stack([(train[col] > 0).astype('int') for col in resp_cols])

###### Here, we will now create a pipeline #####
#from sklearn.compose import ColumnTransformer
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.pipeline import Pipeline

#numerical_transformer = SimpleImputer(strategy = 'constant')
#categorical_transformer = Pipeline(steps = [('imputer',SimpleImputer(strategy = 'most_frequent')), 
#                                            ('one_hot',OneHotEncoder(handle_unknown = 'ignore'))])
#preprocessor = ColumnTransformer (transformers = [('num', numerical_transformer, resp#need to modify this#),
#                                                  ('cat', categorical_transformer, resp_3#need to modify this#)])
##### Pipeline basic opening over #####

def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a feed-forward multi-label classifier.

    Architecture: input -> BatchNormalization -> Dropout, then for every entry
    of ``hidden_units`` a Dense -> BatchNormalization -> swish -> Dropout block,
    followed by a Dense layer with ``num_labels`` units and a sigmoid activation.

    Args:
        num_columns: Number of input features (the input shape is ``(num_columns,)``).
        num_labels: Number of sigmoid outputs.
        hidden_units: Sequence with the width of each hidden Dense layer.
        dropout_rates: Sequence of dropout rates. ``dropout_rates[0]`` is applied
            right after the input batch normalisation and ``dropout_rates[i + 1]``
            after hidden block ``i``, so at least ``len(hidden_units) + 1`` values
            are required. Any additional values are ignored.
        label_smoothing: Label smoothing passed to the binary cross-entropy loss.
        learning_rate: Learning rate passed to the RectifiedAdam optimizer.

    Returns:
        The compiled ``tf.keras`` model, tracking an AUC metric named ``"AUC"``.

    Raises:
        ValueError: If ``dropout_rates`` has fewer than ``len(hidden_units) + 1``
            entries.
    """
    # Fail fast with a clear message instead of an IndexError raised half-way
    # through graph construction.
    required_rates = len(hidden_units) + 1
    if len(dropout_rates) < required_rates:
        raise ValueError(
            "dropout_rates needs at least len(hidden_units) + 1 = "
            f"{required_rates} entries, got {len(dropout_rates)}"
        )

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    for i, units in enumerate(hidden_units):
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        # Offset by one: index 0 is reserved for the input dropout above.
        x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x)
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)
    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        # Keras documents `metrics` as a list of metrics.
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    return model

# Training hyper-parameters.
epochs = 300
batch_size = 4096
hidden_units = [160, 160, 160]
# One rate for the input dropout plus one per hidden layer.
dropout_rates = [0.20, 0.20, 0.20, 0.20]
label_smoothing = 1e-2
learning_rate = 1e-3

# Clear any existing Keras session state and re-seed TensorFlow before building the model.
tf.keras.backend.clear_session()
tf.random.set_seed(SEED)
# The number of outputs is taken from the target matrix so it cannot drift
# out of sync with the list of response columns.
clf = create_mlp(
    len(features), y_train.shape[1], hidden_units, dropout_rates, label_smoothing, learning_rate
)
# Fit on the full training matrices; no validation data is passed.
clf.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=2)
clf.save('model.h5')

##### Here we will replace the clf with our pipeline for cleaner processing #####
#from sklearn.metrics import mean_absolute_error
#res_pipeline = Pipeline (steps = [('preprocessor',preprocessor),
#                                  ('clf',clf)                                 
#                                  ])
#res_pipeline.fit(X_train, y_train, epochs=200, batch_size=5000)
##### And the work ends ! #####

## Cross-validation ##
#from sklearn.model_selection import cross_val_score
#cv_scores = cross_val_score(clf, X_train, y_train, 
#                            cv=5,
#                            scoring='neg_mean_absolute_error')
#print("MAE scores:\n", cv_scores)
## Process ends !##

import janestreet

# Models whose predictions are averaged; currently just the single trained MLP.
models = [clf]
# Decision threshold applied to the aggregated prediction.
th = 0.502
# Aggregator that collapses the averaged model outputs into one score.
f = np.median

env = janestreet.make_env()
for (test_df, pred_df) in tqdm(env.iter_test()):
    # Default to "no action"; only rows with positive weight are scored.
    action = 0
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        # Every column except the first; a NaN anywhere here makes the sum NaN.
        tail = x_tt[:, 1:]
        if np.isnan(tail.sum()):
            # Zero out the NaNs, then add f_mean at exactly the NaN positions.
            x_tt[:, 1:] = np.nan_to_num(tail) + np.isnan(tail) * f_mean
        per_model = [model(x_tt, training = False).numpy() for model in models]
        pred = f(np.mean(per_model, axis=0))
        action = np.where(pred >= th, 1, 0).astype(int)
    pred_df.action = action
    env.predict(pred_df)

## All utility scores - Do consider these before making a submission ! ##

In [None]:
#def utility_score_LDMTWO(df, labels='action,.r0,.weight,.date'.split(',')):

#    action,resp,weight,date = labels
#    df = df.set_index(date)
#    p = df[weight]  * df[resp] * df[action]
#    p_i = p.groupby(date).sum()
#    t = (p_i.sum() / np.sqrt((p_i**2).sum())) * (np.sqrt(250 / p_i.index.size))
#    return np.clip(t,0,6) * p_i.sum()

#def utility_score_Jorijn(df):

#    df['p'] = df['weight']  * df['resp'] * df['action']
#    p_i = df.set_index('date')['p'].groupby('date').sum()
#    t = (p_i.sum() / np.sqrt((p_i**2).sum())) * (np.sqrt(250 / p_i.index.size))
#    return min(max(t, 0), 6) * p_i.sum()

#print('LDMTWO\'s:')
#%timeit utility_score_LDMTWO(train, labels = 'action,resp,weight,date'.split(','))
#print('-' * 70)
#print('Jorijn\'s:')
#%timeit utility_score_Jorijn(train)

## Finding correlation
The following guide explains how to build a correlational neural network model with Keras:
https://towardsdatascience.com/a-comprehensive-guide-to-correlational-neural-network-with-keras-3f7886028e4a