In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

In [None]:
# Read the competition training table from the Kaggle input directory.
TRAIN_CSV = '/kaggle/input/jane-street-market-prediction/train.csv'
train = pd.read_csv(TRAIN_CSV)
#train = reduce_mem_usage(train)

# Model inputs: every column whose name contains the substring 'feature',
# kept in the column order of the frame.
features = list(filter(lambda col: 'feature' in col, train.columns))

NAN_VALUE = -999

In [None]:
# Upcast any float16 columns to float32 (a no-op when there are none).
train = train.astype({c: np.float32 for c in train.select_dtypes(include='float16').columns})
# Impute missing values with the per-column mean of the whole table.
train = train.fillna(train.mean())
# Per-feature means for every feature except the first one; the inference
# loop reuses them to impute NaNs in incoming rows.
f_mean = np.mean(train[features[1:]].values, axis=0)
# Keep only rows with date > 85 and a non-zero weight.
train = train.query('date > 85').reset_index(drop=True)
train = train[train.weight != 0]
n_folds = 5
seed = 2020
skf = StratifiedKFold(n_splits=n_folds, shuffle=False)
resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']

# Hold-out set: rows with date >= 400. The mask is computed once instead of
# re-filtering the frame for every response column.
holdout_rows = train['date'] >= 400
X_test = train.loc[holdout_rows, features]
# Binary targets: 1 where the response is strictly positive, else 0;
# one column per entry of resp_cols.
y_test = (train.loc[holdout_rows, resp_cols] > 0).astype('int').values

# The training set is the full filtered table. The earlier `date < 400`
# versions of X_train / y_train were overwritten right away, so they are no
# longer computed.
# NOTE: X_train / y_train therefore also contain the hold-out rows above.
X_train = train[features]
y_train = (train[resp_cols] > 0).astype('int').values

In [None]:
TUNNING = False
def create_model(hp, input_dim, output_dim):
    """Build and compile a two-hidden-layer MLP whose hyperparameters come from `hp`.

    Layer stack: Input -> BatchNormalization -> GaussianNoise -> Dropout ->
    Dense -> Dropout -> Dense -> Dropout -> Dense(output_dim, sigmoid).
    Noise level, dropout rates, hidden widths, activations, learning rate
    and label smoothing are all requested from `hp` via `hp.Choice` /
    `hp.Int`.

    Args:
        hp: hyperparameter provider exposing `Choice` and `Int`.
        input_dim: number of input features (a scalar), or an already-formed
            shape tuple/list without the batch dimension.
        output_dim: number of sigmoid output units.

    Returns:
        The model, compiled with Adam, binary cross-entropy and an AUC
        metric named 'auc'.
    """
    # Keras documents `shape` as a tuple; wrap a scalar feature count so the
    # call does not rely on version-specific int handling.
    input_shape = (input_dim,) if np.isscalar(input_dim) else tuple(input_dim)
    inputs = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.BatchNormalization()(inputs)
    x = tf.keras.layers.GaussianNoise(hp.Choice('noise', [0.0, 0.03, 0.05]))(x)
    x = tf.keras.layers.Dropout(hp.Choice('init_dropout', [0.0, 0.3, 0.5]))(x)
    x = tf.keras.layers.Dense(
        hp.Int('num_units_1', 128, 2048, 64),
        activation=hp.Choice('activation_1', ['tanh', 'relu', 'swish']),
    )(x)
    x = tf.keras.layers.Dropout(hp.Choice('dropout_1', [0.0, 0.3, 0.5]))(x)
    x = tf.keras.layers.Dense(
        hp.Int('num_units_2', 128, 1024, 32),
        activation=hp.Choice('activation_2', ['tanh', 'relu', 'swish']),
    )(x)
    x = tf.keras.layers.Dropout(hp.Choice('dropout_2', [0.0, 0.3, 0.5]))(x)
    x = tf.keras.layers.Dense(output_dim, activation='sigmoid')(x)
    model = tf.keras.models.Model(inputs=inputs, outputs=x)

    # Hyperparameters are requested in the same order as before:
    # learning rate first, then label smoothing.
    optimizer = tf.keras.optimizers.Adam(hp.Choice('lr', [1e-2, 1e-3, 1e-5]))
    loss = tf.keras.losses.BinaryCrossentropy(
        label_smoothing=hp.Choice('label_smoothing', [0.0, 0.01, 0.1])
    )
    model.compile(optimizer=optimizer, loss=loss, metrics=[tf.keras.metrics.AUC(name = 'auc')])
    return model

# Fixed (non-tuned) network: a tanh MLP with widths 256 -> 64 -> 256, each
# hidden layer followed by BatchNormalization and Dropout(0.3), and five
# sigmoid outputs.
model = tf.keras.Sequential([
    # `shape` is documented as a tuple (batch dimension excluded), so pass a
    # 1-tuple rather than a bare int.
    tf.keras.Input(shape=(len(features),)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GaussianNoise(0.05),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(256, activation='tanh'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='tanh'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(256, activation='tanh'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(5, activation = 'sigmoid')
  ])

EPOCHS = 50
BATCH_SIZE = 4096

if TUNNING:
    # Hyperparameter search: Bayesian optimisation over create_model's search
    # space, maximising validation AUC.
    import kerastuner as kt
    MAX_TRIAL = 20
    # Validation must not overlap the fitting data, otherwise val_loss /
    # val_auc (and therefore early stopping and the tuner objective) are
    # measured on rows the model has already seen. Fit only on the rows of
    # X_train whose index label is absent from X_test.
    # (assumes X_train and X_test share unique row labels from the same frame)
    fit_rows = ~X_train.index.isin(X_test.index)
    model_fn = lambda hp: create_model(hp, X_train.shape[-1], y_train.shape[-1])
    tuner = kt.tuners.BayesianOptimization(model_fn, kt.Objective('val_auc', direction='max'), MAX_TRIAL, seed = 2020)
    tuner.search(
        X_train[fit_rows],
        y_train[fit_rows],
        # Same batch size as the non-tuning branch instead of the Keras default.
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_test, y_test),
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 10, restore_best_weights=True)],
    )
    model = tuner.get_best_models()[0]
else:
    # Final fit of the fixed network on all of X_train, with no validation
    # data; `callback` is only used by the commented-out variant below.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    #optimizer = tf.keras.optimizers.RMSprop()
    loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=1e-2)
    model.compile(loss = loss, optimizer=optimizer, metrics=[tf.keras.metrics.AUC()])
    #history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=[callback], validation_data=(X_test, y_test)) 
    history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS)
#0.63

In [None]:
import janestreet
from tqdm.notebook import tqdm
#janestreet.competition.make_env.__called__ = False
# Submission loop: the competition environment yields one test frame and one
# prediction frame at a time; every prediction frame must be handed back via
# env.predict().
env = janestreet.make_env()
iter_test = env.iter_test()
for test_df, sample_prediction_df in tqdm(iter_test):
    decision = 0  # default when the weight is not positive or the score is <= 0.5
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        tail = x_tt[:, 1:]  # every feature except the first
        if np.isnan(tail.sum()):
            # Zero out NaNs, then add the stored training mean at exactly
            # those positions.
            x_tt[:, 1:] = np.nan_to_num(tail) + np.isnan(tail) * f_mean

        # Average the model's outputs for the first row into one score.
        action = np.mean(model(x_tt, training = False).numpy()[0])
        if action > 0.5:
            decision = 1
    sample_prediction_df.action = decision
    env.predict(sample_prediction_df)