In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import dill
import tensorflow as tf
from tensorflow import keras
import random
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Report how many GPU devices TensorFlow can see in this session.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
# Uncomment to log the device every op is placed on:
#tf.debugging.set_log_device_placement(True)

In [None]:
# Seed every random number generator the notebook relies on so that a
# Restart & Run All is as repeatable as possible.
SEED = 1230
np.random.seed(SEED)
random.seed(SEED)
# TensorFlow keeps its own global seed (used for weight initialisation and
# tf.data shuffling); seeding NumPy and `random` alone does not cover it.
tf.random.set_seed(SEED)

We converted the dataset to the **.feather** format in this [Notebook](https://www.kaggle.com/tchaye59/jane-street-market-dataset-to-feather). It is faster to load and supports all built-in pandas functionality.

### Objective:
We will build a recurrent neural network. The network will take **n** consecutive data **steps** and predict the next **action** to take

In [None]:
# Load the training rows and the feature metadata table from the feather files.
train_df = pd.read_feather('/kaggle/input/jane-street-market-dataset-to-feather/train.feather')
# The metadata table is indexed by its 'feature' column.
feature_df = (
    pd.read_feather('/kaggle/input/jane-street-market-dataset-to-feather/features.feather')
    .set_index('feature')
)
train_df.shape

In [None]:
# Keep only the rows whose weight is non-zero, then show the remaining shape.
has_weight = train_df['weight'] != 0
train_df = train_df.loc[has_weight]
train_df.shape

## Define The target

In [None]:
# Label a row 1 (make the trade) when its `resp` is strictly positive, else 0.
is_positive_resp = train_df['resp'].values > 0
train_df['target'] = is_positive_resp.astype(np.int8)
train_df.head(3)

In [None]:
# Use all columns in the dataframe as a feature except these
# (the response columns, the row id and the label itself).
ignore_columns = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp','ts_id','target']
feature_cols = [col for col in train_df.columns if col not in ignore_columns]

# We will need it during submission.
# The `with` block guarantees the file is flushed and closed once the dump
# finishes; passing a bare open(...) to dill.dump leaves the handle open.
with open('feature_cols.dill','wb') as feature_cols_file:
    dill.dump(feature_cols,feature_cols_file)

In [None]:
%%time
# We will work with NumPy arrays.
# Missing values are replaced by 0 before the conversion.
train_df.fillna(0,inplace=True)
y = train_df.target.values.astype(np.float32)
# Keep the features as float32. Casting to int8 would drop every fractional
# part and wrap any value outside [-128, 127], destroying the feature values.
X = train_df[feature_cols].values.astype(np.float32)
# The dataframe is no longer needed once X and y exist; drop the reference.
del train_df

#scaler = StandardScaler().fit(X)
#X = scaler.transform(X)

#dill.dump(scaler,open('scaler.dill','wb'))

In [None]:
# Return two generators: one for training and the second for validation
def build_data_gen(X,y,lookback=10,delay=1,batch_size=128,val_split=0.1):
    """Build shuffled training and validation batch generators over sliding windows.

    Every sample is a window of `lookback` consecutive rows of `X`; its label is
    the entry of `y` located `delay` rows after the window's last row.

    Args:
        X: NumPy array indexed by row along axis 0.
        y: NumPy array of labels with the same number of rows as `X`.
        lookback: number of consecutive rows in each window.
        delay: offset between a window's last row and its label row.
        batch_size: maximum number of windows per yielded batch.
        val_split: fraction of the windows reserved for validation.

    Returns:
        A pair of zero-argument callables `(train_gen, val_gen)`. Calling one
        returns an endless generator of `(X_batch, y_batch)` tuples, where
        `X_batch` has shape `(batch, lookback, ...)`. The windows are reshuffled
        after every full pass.

    Raises:
        ValueError: if `X` is too short to hold a single window plus its label.
            The returned generators raise ValueError on first use when their
            split contains no window.
    """
    n_rows = X.shape[0]
    # A window starting at row s needs its label row s+lookback-1+delay to exist.
    n_windows = n_rows - lookback - max(delay, 0) + 1
    if n_windows <= 0:
        raise ValueError(
            "X has %d rows, not enough for lookback=%d and delay=%d"
            % (n_rows, lookback, delay)
        )

    # Row s of the frame holds the indexes s, s+1, ..., s+lookback-1.
    # int64 is required: a narrower type such as int8 wraps above 127.
    starts = np.arange(n_windows, dtype=np.int64)[:, None]
    offsets = np.arange(lookback, dtype=np.int64)[None, :]
    idx = starts + offsets

    # We shuffle the frame and split it into validation and training.
    np.random.shuffle(idx)
    val_size = int(idx.shape[0]*val_split)
    # Split on an explicit position: slicing with -val_size breaks when
    # val_size is 0 (idx[:-0] is empty and idx[-0:] is the whole frame).
    split_at = idx.shape[0] - val_size
    train_idx = idx[:split_at]
    val_idx = idx[split_at:]
    print(idx.shape,train_idx.shape,val_idx.shape)
    del idx

    # This function is our generator
    def fn_data_gen(idx):
        # An empty split would otherwise loop forever without yielding anything.
        if idx.shape[0] == 0:
            raise ValueError("no windows available for this split")
        while True:
            # We return batches from the frame, then shuffle it and repeat.
            for i in range(0,idx.shape[0],batch_size):
                idx_tmp = idx[i:i+batch_size]
                # The target position is the last step+delay
                y_idx = idx_tmp[...,-1]+delay
                yield X.take(idx_tmp,axis=0),y.take(y_idx,axis=0)
            np.random.shuffle(idx) # shuffle and repeat

    return lambda: fn_data_gen(train_idx),lambda: fn_data_gen(val_idx)

In [None]:
def prepare_dataset(X,y,window_length,batch_size,cache_name='train'):
    """Build an endlessly repeating, shuffled and batched tf.data pipeline.

    Each example pairs `window_length` consecutive rows of `X` (stride 1) with
    the entry of `y` at the window's last row.

    Args:
        X: array of feature rows, sliced along its first axis.
        y: array of labels aligned row-by-row with `X`.
        window_length: number of consecutive rows per example.
        batch_size: number of examples per batch.
        cache_name: currently unused; kept for interface compatibility.

    Returns:
        A `tf.data.Dataset` yielding `(windows, labels)` batches forever.
    """
    # Skip the first window_length-1 labels so that label k lines up with the
    # last row of window k.
    labels = tf.data.Dataset.from_tensor_slices(y[(window_length-1):])

    # Sliding windows over the rows; each window (itself a small dataset) is
    # collapsed into a single tensor by batching it to its full length.
    windows = (
        tf.data.Dataset.from_tensor_slices(X)
        .window(window_length,shift=1,drop_remainder=True)
        .flat_map(lambda rows: rows.batch(window_length))
    )

    paired = tf.data.Dataset.zip((windows, labels)).repeat()

    # Shuffle with a 10000-example buffer, batch, then keep 30 batches prefetched.
    return paired.shuffle(10000).batch(batch_size).prefetch(30)

In [None]:
# Training hyper-parameters.
batch_size = 2**12
lookback = 10
val_split = 0.2

# Save the model to `filepath` whenever the monitored `val_acc` reaches a new maximum.
filepath="val_model.hdf5"
best_val_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [best_val_checkpoint]
batch_size

# Define the model

In [None]:
# Input: windows of `lookback` steps with one value per feature column.
input_layer = keras.layers.Input(shape=(lookback,len(feature_cols)))

# Normalise the raw features inside the model.
input_norm = keras.layers.BatchNormalization()(input_layer)

# The LSTM must consume the normalised tensor; wiring it to `input_layer`
# leaves the BatchNormalization layer disconnected from the model graph.
encoder = keras.layers.LSTM(64)(input_norm)

# Single sigmoid unit for the binary target.
output = keras.layers.Dense(1,activation='sigmoid')(encoder)


model = keras.models.Model(input_layer,output)
# The accuracy metric is named 'acc' so that Keras logs it as 'val_acc' on the
# validation data, which is the quantity the checkpoint callback monitors.
model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.Adam(0.01),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='acc')]
             )
model.summary()

### Prepare training and validation dataset 

In [None]:
# The last `val_size` rows form the validation split; everything before them
# is used for training. Row order is preserved (no shuffling here).
size = X.shape[0]
val_size = int(size*val_split)
X_train, y_train = X[:-val_size], y[:-val_size]
X_test, y_test = X[-val_size:], y[-val_size:]
X_train.shape,X_test.shape

In [None]:
# Number of whole batches in the full array, shared out between training and
# validation according to `val_split`.
total_batches = X.shape[0]//batch_size
train_steps = int(total_batches*(1-val_split))
val_steps = int(total_batches*(val_split))
train_steps,val_steps

In [None]:
# Build the windowed tf.data pipelines for the training and validation splits.
dataset = prepare_dataset(X_train, y_train, window_length=lookback, batch_size=batch_size)
val_dataset = prepare_dataset(X_test, y_test, window_length=lookback, batch_size=batch_size)

## Train

In [None]:
# Both datasets repeat forever, so the epoch length has to be given explicitly
# through the step counts.
fit_options = dict(
    steps_per_epoch=train_steps,
    validation_data=val_dataset,
    validation_steps=val_steps,
    epochs=60,
    callbacks=callbacks_list,
)
history = model.fit(dataset, **fit_options)
# Save the model as it stands after the final epoch.
model.save('train_model.hdf5')

In [None]:
# Plot every quantity recorded in the training history, one line per column.
history_df = pd.DataFrame(history.history)
history_df.plot()

## [Check the submission notebook](https://www.kaggle.com/tchaye59/jmarket-rnn-with-keras-submit)