In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found beneath it, one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow.data.experimental import cardinality

In [None]:
# Load the competition training table; the feature-engineering and
# feature/target selection cells below read from this frame.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jane-street-market-prediction/train.csv',
)
# Force a garbage-collection pass right after the large read.
gc.collect()

In [None]:
# add in a feature tracking time since current day started
# Number each row within its own trading day: 0 for the day's first row, 1 for
# the second, and so on.  groupby().cumcount() does this in one vectorised
# pass and labels every row from its own 'date' value, so the result does not
# depend on the frame being sorted by date and no multi-million-element
# Python list has to be built (or left behind in the notebook namespace).
train['time_since_start'] = train.groupby('date').cumcount()

In [None]:
# get the features and target
# Model inputs: every column whose name starts with 'feature', followed by the
# engineered intraday counter.
feats = [col for col in train.columns if col.startswith('feature')]
feats.append('time_since_start')

# Missing feature values are replaced by 0; the target is the raw 'resp' column.
X = train[feats].fillna(0)
y = train['resp']

# The full frame is no longer needed once X and y have been extracted.
del train
gc.collect()

In [None]:
# normalize features
from sklearn import preprocessing

# Fit a standard scaler on the training features, then apply it.  The fitted
# `sc` object is kept because make_submission() reuses it on the test rows.
sc = preprocessing.StandardScaler()
sc.fit(X)
X_train_sc = sc.transform(X)

# Sanity check: overall shape, then a preview of the first five scaled rows.
print(X_train_sc.shape)
pd.DataFrame(X_train_sc[:5])

In [None]:
# Hyperparameters shared by the chunking, model-building and submission cells.
batch_size = 7  # rows per chunk; also the LSTM's time-step count and the submission window length
lstm_size = 32  # units in the LSTM layer
dense_size = 16  # units in the hidden Dense layer
beta = 0.1  # weight of the squared-return penalty term in my_loss
lstm_dropout = 0.2  # dropout rate passed to the LSTM layer

In [None]:
# 'chunk' the data into chunks of size batch_size for training the LSTM
# note that we're using separate chunks rather than a sliding window here
# i.e. [1,2,3,4,5,6] would become [[1,2,3],[4,5,6]] with a batch size of 3

# Pair every scaled feature row with its response, then group consecutive
# rows into blocks of `batch_size` rows each.
dataset_train = (
    tf.data.Dataset
    .from_tensor_slices((X_train_sc, y))
    .batch(batch_size)
)

# Drop the notebook-level references to the source arrays.
del X
del y
del X_train_sc
gc.collect()

In [None]:
# Materialise the chunked dataset as dense arrays in a single pass over it.
# Only chunks holding exactly `batch_size` rows are kept: a short trailing
# chunk is skipped (np.array needs uniformly shaped chunks), while a final
# chunk that happens to be full is retained rather than thrown away.
X_chunks, y_chunks = [], []
for X_chunk, y_chunk in dataset_train:
    if X_chunk.shape[0] == batch_size:
        X_chunks.append(X_chunk.numpy())
        y_chunks.append(y_chunk.numpy())

X_train = np.array(X_chunks)
del X_chunks  # release the per-chunk copies before building the targets

y_train = np.array(y_chunks)
del y_chunks

# Number of training chunks actually kept.
l_train = len(X_train)

del dataset_train

gc.collect()

In [None]:
# loss function attempting to penalize high variance of trade responses
# note it differs from the real utility score, where the penalization of high variance is 
# on the level of days; here it's on the level of individual trades

def my_loss(action, response):
    """Negative summed trade return plus a `beta`-weighted squared-return penalty.

    The element-wise product of the two arguments is treated as the per-trade
    return.  The loss is ``beta * sum(return**2) - sum(return)``, so minimising
    it rewards a large total return while penalising large individual returns.

    NOTE(review): when used via ``model.compile(loss=my_loss)`` Keras is
    expected to call this as ``(y_true, y_pred)``, i.e. `action` would receive
    the responses and `response` the predicted actions.  The formula is
    symmetric in its arguments, so the value is unaffected - confirm.

    `beta` is read from the notebook's global hyperparameters.
    """
    trade_returns = tf.multiply(action, response)
    total_return = tf.reduce_sum(trade_returns)
    squared_total = tf.reduce_sum(tf.square(trade_returns))
    return beta * squared_total - total_return

# accuracy function, looking for cases where a trade decision is 'successful'
# i.e. either resp<0 and no trade (prediction<0.5), or resp>=0 and a trade (prediction>=0.5)

def my_accuracy(y_true, y_pred):
    """Element-wise flag of whether the trade decision matched the response sign.

    A response counts as favourable when ``y_true >= 0`` and a prediction
    counts as "trade" when ``y_pred >= 0.5``.  The result is True wherever
    the two agree (favourable and traded, or unfavourable and not traded).
    """
    favourable = y_true >= 0
    traded = y_pred >= 0.5
    return favourable == traded

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

# Sequence model: one LSTM layer that emits an output at every time step,
# followed by a hidden Dense layer and a single sigmoid unit per step.
# The input is (time steps, features) = (batch_size, len(feats)); the feature
# count is taken from `feats` rather than hard-coded so the model stays in
# step with the feature list that make_submission() also uses.
model = keras.Sequential()
model.add(layers.LSTM(lstm_size,
                      input_shape=(batch_size, len(feats)),
                      return_sequences=True,
                      dropout=lstm_dropout))
# NOTE(review): this Dense layer has no activation, so together with the next
# layer it is a purely linear map before the sigmoid - confirm this is intended.
model.add(layers.Dense(dense_size))
model.add(layers.Dense(1, activation='sigmoid'))

# Train with the custom return/variance loss and track the custom hit-rate metric.
model.compile(optimizer='adam',
              loss=my_loss,
              metrics=[my_accuracy])
gc.collect()

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train on the chunked arrays for 10 epochs.  No batch_size is passed, so
# Keras' default applies.  `history` keeps the object returned by fit().
history = model.fit(X_train, y_train, epochs=10)

In [None]:
# submission
import janestreet
# Create the competition environment and obtain its test-set iterator.
# make_submission() iterates over `iter_test` and reports each prediction
# back through `env.predict`.
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
def make_submission():
    """Stream the test set through the trained model and submit one action per batch.

    Uses these notebook globals: `iter_test` (yields ``(test_df,
    sample_prediction_df)`` pairs), `env` (receives predictions), `sc` (fitted
    scaler), `feats` (ordered feature names), `model` and `batch_size`.

    For each test batch the function:
      1. derives `time_since_start` as the number of batches already seen on
         the same `date`, starting at 0 for the first batch of a day;
      2. fills missing values with 0 and scales the feature columns with `sc`;
      3. keeps a sliding window of the most recent `batch_size` scaled rows;
      4. submits action 0 until `batch_size` rows have been seen, and after
         that submits 1 when the model's output for the latest row exceeds
         0.5, else 0.

    NOTE(review): the window reshape assumes each test batch holds exactly
    one row - confirm against the competition API.

    Returns:
        list: the 'action' column submitted for each test batch, in order.
    """
    n_rows = batch_size
    row_history = []      # scaled rows, oldest first, at most n_rows long
    current_day = None    # no day seen yet, so the very first batch starts at 0
    time_of_day = 0
    subs = []

    for (test_df, sample_prediction_df) in iter_test:
        day = test_df['date'].iloc[0]
        if current_day is not None and day == current_day:
            time_of_day = time_of_day + 1
        else:
            # First batch of a new day (or of the whole run): restart the counter.
            current_day = day
            time_of_day = 0
            print(current_day, time_of_day)

        test_df = test_df.fillna(0)
        test_df['time_since_start'] = time_of_day
        r = sc.transform(test_df[feats])

        # Decide BEFORE appending: predictions start only once n_rows earlier
        # rows have been collected, i.e. no trades for the first n_rows rows.
        warmed_up = len(row_history) >= n_rows
        row_history.append(r)
        row_history = row_history[-n_rows:]  # keep only the newest n_rows rows

        if warmed_up:
            X = np.array(row_history).reshape(1, n_rows, len(feats))
            a = model(X, training=False)
            # The model emits one output per time step; use the latest step.
            sample_prediction_df['action'] = int(a[0][-1][0] > 0.5)
        else:
            sample_prediction_df['action'] = 0

        subs.append(sample_prediction_df['action'])
        env.predict(sample_prediction_df)

    return subs
    

In [None]:
# Run the submission loop over the test iterator; `subs` collects the
# 'action' column submitted for each test batch.
subs = make_submission()