**This is the cross-validation version. The final submission uses the same code without the train-test split, so that the final model is trained on all data points.**

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
#plt.style.use('fivethirtyeight')
import xgboost as xgb
import sklearn
import tqdm
import random
import janestreet
import tensorflow as tf

# Single seed shared by every random number generator used in this notebook.
SEED = 1111

# Seed Python's `random`, NumPy and TensorFlow, in that order.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# Load the competition training set.
train = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")

from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import tensorflow_addons as tfa

import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices

### Train - Test Split 

**An important difference compared to other kernels: no days were excluded (e.g. the first 85 days); only rows with weight = 0 were dropped.**

**Secondly, the test split starts at the beginning of a day, not in the middle of a day.**

In [None]:
# Keep only rows that carry a non-zero weight.
train = train[train['weight'] != 0]

# Replace missing values with the per-column mean of the remaining rows.
train.fillna(train.mean(), inplace=True)

# Binary label from the sign of the raw `resp`; this has to happen before
# `resp` itself is overwritten with its binarised version below.
train['action'] = (train['resp'].values > 0).astype(int)

# Model inputs: every column whose name contains "feature", minus feature_0.
features = [col for col in train.columns if "feature" in col]
features.remove('feature_0')

# Targets, in the column order used for y_train / y_test ('resp' is index 3).
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

# Overwrite each response column with a 0/1 indicator of (response * weight) > 0.
for col in resp_cols:
    train[col] = ((train[col].values * train['weight']) > 0).astype(int)

# Per-feature means used to impute missing values at prediction time.
# NOTE(review): features[1:] skips feature_1 because feature_0 was already
# removed above, so f_mean has one entry fewer than `features`.
f_mean = np.mean(train[features[1:]].values, axis=0)

# Positional index of the first row of date 410: everything before it is
# training data, everything from it onwards is the hold-out set.
a = np.where(train.date == 410)[0][0]

X_all = train.loc[:, features].values
#y_train = (train.loc[:, 'action'])
y_all = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T

X_train, X_test = X_all[:a, :], X_all[a:, :]
y_train, y_test = y_all[:a, :], y_all[a:, :]

The first model is a set of LightGBM classifiers (one per response column), each with 450 leaves and 450 maximum bins.

In [None]:
# Shared LightGBM settings for every per-response binary classifier.
# NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0,
# and bagging_freq is not set here — confirm whether row subsampling was intended.
params = {
    "num_leaves": 450,
    "max_bin": 450,    #### 450
    "feature_fraction": 0.52,
    "bagging_fraction": 0.52,
    "objective": "binary",
    "learning_rate": 0.05,
    "boosting_type": "gbdt",
    "metric": "auc",
}

# Train one booster per column of y_train and collect them in `models`.
models = []
for target_labels in y_train.T:
    train_set = lgbm.Dataset(X_train, label=target_labels)
    booster = lgbm.train(params, train_set, num_boost_round=1000)
    models.append(booster)

In [None]:
# Average the hold-out probabilities of all LightGBM boosters, row by row.
Preds1 = np.mean([model.predict(X_test) for model in models], axis=0)

# Hard 0/1 decision at a 0.5 probability cut-off (kept as floats).
predictions1 = (Preds1 >= 0.5).astype(float)

# Hold-out accuracy against the binarised `resp` label (column 3 of y_test).
np.mean(predictions1 == y_test[:, 3])

In [None]:
# Confusion table: rows are the true `resp` labels, columns the LightGBM decisions.
pd.crosstab(index=y_test[:, 3], columns=predictions1)

**This is an MLP model which will be used for ensembling in high risk trades**

In [None]:
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a feed-forward multi-label classifier.

    Architecture: BatchNorm -> Dropout on the input, then one
    Dense -> BatchNorm -> swish -> Dropout stage per entry of `hidden_units`,
    followed by a Dense layer with `num_labels` sigmoid outputs.

    Args:
        num_columns: Number of input features.
        num_labels: Number of sigmoid outputs.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rate for the input followed by one rate per
            hidden layer; needs at least ``len(hidden_units) + 1`` entries
            (any extra entries are ignored).
        label_smoothing: Passed to the binary cross-entropy loss.
        learning_rate: Learning rate of the RectifiedAdam optimizer.

    Returns:
        The compiled Keras model, tracking an AUC metric named "AUC".

    Raises:
        ValueError: If `dropout_rates` has fewer than
            ``len(hidden_units) + 1`` entries.
    """
    # Fail early with a clear message instead of an IndexError halfway
    # through building the graph.
    required_rates = len(hidden_units) + 1
    if len(dropout_rates) < required_rates:
        raise ValueError(
            f"dropout_rates needs at least {required_rates} entries "
            f"(one for the input plus one per hidden layer), got {len(dropout_rates)}"
        )

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)

    # dropout_rates[0] belongs to the input, so hidden layer k uses rate k + 1.
    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(rate)(x)

    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    return model

# MLP training hyper-parameters.
epochs = 44   ### 45
batch_size = 4096
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]  # input dropout + one rate per hidden layer
label_smoothing = 1e-2
learning_rate = 1e-3

# Start from a clean Keras state and re-seed TensorFlow before building the model.
tf.keras.backend.clear_session()
tf.random.set_seed(SEED)

# Five sigmoid outputs, matching the five response columns stacked into y_train.
clf = create_mlp(
    num_columns=len(features),
    num_labels=5,
    hidden_units=hidden_units,
    dropout_rates=dropout_rates,
    label_smoothing=label_smoothing,
    learning_rate=learning_rate,
)

clf.fit(x=X_train, y=y_train, epochs=epochs, batch_size=batch_size, verbose=2)

In [None]:
# Mean of the MLP's per-response probabilities for every hold-out row.
Preds2 = clf.predict(X_test).mean(axis=1)

# Hard 0/1 decision at a 0.5 probability cut-off (kept as floats).
predictions2 = (Preds2 >= 0.5).astype(float)

# Hold-out accuracy against the binarised `resp` label (column 3 of y_test).
np.mean(predictions2 == y_test[:, 3])

In [None]:
# Confusion table: rows are the true `resp` labels, columns the MLP decisions.
pd.crosstab(index=y_test[:, 3], columns=predictions2)

**The ensemble prediction is used for trades with weight above 15; for the remaining trades only LightGBM is used for prediction.**

In [None]:
# Positional mask over the hold-out rows (same order as X_test) selecting the
# heavily weighted trades. `.values` keeps the slice positional regardless of
# the frame's non-contiguous integer index.
# NOTE(review): the submission loop switches to the ensemble at weight > 10,
# not 15 — confirm which cut-off is intended.
high_weight = train.weight.values[a:] > 15

# Work on a copy: a plain assignment would alias predictions1, so the in-place
# write below would silently overwrite the LightGBM-only predictions as well.
predictions3 = predictions1.copy()

# For high-weight trades act only when both LightGBM and the MLP say 1.
predictions3[high_weight] = predictions1[high_weight] * predictions2[high_weight]

# Hold-out accuracy against the binarised `resp` label (column 3 of y_test).
np.mean(predictions3 == y_test[:, 3])

In [None]:
pd.crosstab(y_test[:,3],predictions3 )

**We can see that the ensemble model is an improvement over the previous two, but the change in the portfolio's value will be more evident (correct prediction of large trades means big improvements in revenue).**

In [None]:
from tqdm import tqdm
import janestreet

# Probability cut-off for turning a score into a 0/1 action.
threshold = 0.5
# Reducer that collapses each prediction array to a scalar; `.item()` on the
# weight below already requires test_df to hold exactly one row.
f = np.median

# Training means for EVERY model input column. The previous imputation used
# `x_test[:, 1:]` with `f_mean`, which (feature_0 having been removed from
# `features`) skipped feature_1 and let a NaN there reach the models unimputed.
fill_values = np.mean(train[features].values, axis=0)

env = janestreet.make_env()
for (test_df, pred_df) in tqdm(env.iter_test()):
    weight = test_df["weight"].item()
    if weight > 0:
        x_test = test_df.loc[:, features].values
        if np.isnan(x_test.sum()):
            # Zero out the NaNs, then add the training mean in exactly those slots.
            x_test = np.nan_to_num(x_test) + np.isnan(x_test) * fill_values

        # Mean probability over the per-response LightGBM boosters.
        lgbm_prob = f(np.mean([model.predict(x_test) for model in models], axis=0))
        lgbm_pred = np.where(lgbm_prob >= threshold, 1, 0).astype("int")

        # NOTE(review): the validation cell ensembles at weight > 15, while this
        # loop uses 10 — confirm which cut-off is intended.
        if weight > 10:
            # High-weight trade: act only if the MLP agrees with LightGBM.
            mlp_prob = f(np.mean(clf.predict(x_test), axis=1))
            mlp_pred = np.where(mlp_prob >= threshold, 1, 0).astype("int")
            pred_df["action"] = mlp_pred * lgbm_pred
        else:
            pred_df["action"] = lgbm_pred
    else:
        # Rows without positive weight are never traded.
        pred_df["action"] = 0
    env.predict(pred_df)