In [None]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.python.keras.utils.vis_utils import plot_model
import tensorflow as tf
import tensorflow_addons as tfa

from sklearn.cluster import KMeans

import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices

import janestreet

In [None]:
SEED = 1111
np.random.seed(SEED)

# Read the training set, keep rows with date > 85 (re-indexed from 0), drop
# rows whose weight is exactly 0, and replace every missing value with 0.
train = (
    pd.read_csv('../input/jane-street-market-prediction/train.csv')
    .query('date > 85')
    .reset_index(drop=True)
    .loc[lambda frame: frame['weight'] != 0]
    .fillna(0)
)

In [None]:
# Columns are bucketed by substring match on their name: anything containing
# 'feature' is a model input, anything containing 'resp' is a target.
features = list(filter(lambda name: 'feature' in name, train.columns))
targets = list(filter(lambda name: 'resp' in name, train.columns))

In [None]:
# Pairwise correlation matrix of the feature columns, as a plain array
features_df = train[features]
corr_features = features_df.corr()
corr_feat_mtx = corr_features.values

# k-means over the rows of the correlation matrix: each feature is described
# by its vector of correlations with all features
n_clusters = 5
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=0,
)
corr_feat_labels = kmeans.fit(corr_feat_mtx).labels_

# Two-column table pairing every feature name with its cluster label.
# Names and labels are stacked into one array before building the frame, so
# the labels end up stored as strings rather than integers.
corr_feat_clust_df = pd.DataFrame(
    np.column_stack((features, corr_feat_labels)),
    columns=["feature", "cluster"],
)
corr_feat_clust_df.head()

In [None]:
# Collect the feature names of each cluster into a list of lists, ordered by
# numeric cluster label.
#
# Grouping on the scalar key "cluster" (rather than the one-element list
# ["cluster"]) keeps the group keys plain scalars, so get_group() can be called
# with the label itself; pandas deprecates scalar lookups on list-keyed groups.
grouped = corr_feat_clust_df.groupby("cluster")

# Iterate over the labels that actually exist instead of assuming they are
# exactly "0".."n-1". key=int gives numeric order whether the labels are stored
# as strings or as integers (so "10" sorts after "9").
clusters = [
    grouped.get_group(label).feature.to_list()
    for label in sorted(grouped.groups, key=int)
]

In [None]:
# One input frame per feature cluster, in the same order as `clusters`
X = [train.loc[:, cluster_cols] for cluster_cols in clusters]

# Binary labels, one column per target: 1 where the target is strictly
# positive, 0 otherwise
Y = np.stack([(train[name] > 0).astype('int') for name in targets], axis=1)

In [None]:
# Hyperparameters of each input branch, keyed by the number of columns of the
# cluster it consumes: {input width: (dense units, dropout rate)}.
_BRANCH_PARAMS = {
    28: (50, 0.7),
    37: (52, 0.2),
    23: (51, 0.2),
    17: (17, 0.6),
    25: (51, 0.2),
}


def _dense_branch(inp, units, dropout_rate):
    """Apply Dense -> BatchNormalization -> swish -> Dropout to one input tensor."""
    x = Dense(units)(inp)
    x = BatchNormalization()(x)
    x = Activation(tf.keras.activations.swish)(x)
    return Dropout(dropout_rate)(x)


def create_model(X, Y):
    """Build and compile a multi-input classifier with one branch per cluster.

    Every element of ``X`` gets its own ``Input`` and a small dense branch whose
    width and dropout rate are looked up from ``_BRANCH_PARAMS`` by the
    element's column count. The branch outputs are concatenated and fed to a
    sigmoid layer with ``Y.shape[1]`` units.

    Args:
        X: Sequence of 2-D array-likes (anything exposing ``.shape``); only
            ``shape[1]`` of each element is read.
        Y: 2-D array-like of targets; only ``Y.shape[1]`` is read.

    Returns:
        A Keras ``Model`` compiled with RectifiedAdam (learning rate 1e-3),
        binary cross-entropy with label smoothing 1e-2, and accuracy as metric.

    Raises:
        ValueError: If ``X`` is empty, or if an element's column count has no
            entry in ``_BRANCH_PARAMS``.
    """
    widths = [block.shape[1] for block in X]
    if not widths:
        raise ValueError("X must contain at least one input block")

    # Validate every width before creating any layer. Without this check an
    # unrecognised width would silently reuse the previous branch's output and
    # add no Input for that block, producing a wrongly wired model.
    unsupported = sorted({w for w in widths if w not in _BRANCH_PARAMS})
    if unsupported:
        raise ValueError(
            f"No branch configuration for input width(s) {unsupported}; "
            f"configured widths are {sorted(_BRANCH_PARAMS)}"
        )

    inputs = []
    branches = []
    for width in widths:
        units, dropout_rate = _BRANCH_PARAMS[width]
        inp = Input(shape=(width,))
        inputs.append(inp)
        branches.append(_dense_branch(inp, units, dropout_rate))

    concat = tf.keras.layers.Concatenate()(branches)
    outputs = Dense(Y.shape[1])(concat)
    outputs = Activation("sigmoid")(outputs)

    model = Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=1e-3),
        loss=BinaryCrossentropy(label_smoothing=1e-2),
        metrics=["acc"],
    )

    return model

In [None]:
# Training hyperparameters
batch_size = 2048
epochs = 200

# Drop any existing Keras global state, then seed TensorFlow before the model
# is created.
tf.keras.backend.clear_session()
tf.random.set_seed(SEED)

# Build the multi-input classifier and fit it on the full training set
clf = create_model(X, Y)
clf.fit(
    x=X,
    y=Y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

In [None]:
# inference
th = 0.5065  # decision threshold applied to the averaged prediction
f = np.median  # NOTE: not used below; the outputs are averaged with np.mean
env = janestreet.make_env()
for (test_df, pred_df) in tqdm(env.iter_test()):
    test_df.fillna(0, inplace=True)

    # One array per feature cluster, in the same order as `clusters`. A separate
    # name is used so the training inputs held in `X` are not overwritten and
    # the training cell stays re-runnable after inference.
    X_test = [test_df.loc[:, cluster_cols].values for cluster_cols in clusters]

    # Average the model's outputs across the target columns; training=False
    # makes the inference-mode call explicit.
    pred = np.mean(clf(X_test, training=False).numpy(), axis=1)
    pred_df.action = np.where(pred >= th, 1, 0).astype(int)
    env.predict(pred_df)