# Jane Street Market Prediction

Reference:  https://www.kaggle.com/c/jane-street-market-prediction

In [None]:

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sb

import tensorflow as tf

import sys

from tensorflow.keras.callbacks import EarlyStopping

%matplotlib inline

# Fix the NumPy and TensorFlow global random seeds to the same value so
# that repeated runs of the notebook start from the same random state.
SEED = 1111
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Functions To Get Training Data

Because the training data is large, helper functions are provided to create and read a smaller sample of it.

In [None]:
# Full competition training set, read by get_trainning_data().
TRAIN_FILE = '/kaggle/input/jane-street-market-prediction/train.csv'
# Sample file written by create_trainning_sample() and read back by
# get_sample_trainning_data().
SAMPLE_TRAINNING_FILE = './train-sample.csv'


def get_trainning_data():
    """Load the complete training CSV (``TRAIN_FILE``) into a DataFrame.

    Prints a progress message before reading.

    Returns:
        The DataFrame produced by ``pd.read_csv(TRAIN_FILE)``.
    """
    print("Reading training data...")
    return pd.read_csv(TRAIN_FILE)

def create_trainning_sample(frac = 0.2):
    """Write a random sample of the full training data to ``SAMPLE_TRAINNING_FILE``.

    Args:
        frac: Fraction of the rows to keep, passed to ``DataFrame.sample``.
    """
    full_frame = get_trainning_data()
    print("Creating sample file...")
    full_frame.sample(frac=frac).to_csv(SAMPLE_TRAINNING_FILE, header=True, index=False)
    
def get_sample_trainning_data():
    """Load the sample CSV (``SAMPLE_TRAINNING_FILE``) into a DataFrame.

    Prints a progress message before reading.

    Returns:
        The DataFrame produced by ``pd.read_csv(SAMPLE_TRAINNING_FILE)``.
    """
    print("Reading sample training data...")
    sample_frame = pd.read_csv(SAMPLE_TRAINNING_FILE)
    return sample_frame

### Create Training Data Sample

Run this block to create or recreate the sample training data file.

In [None]:
#create_trainning_sample(frac = 0.02);

# Start Exploration

In [None]:
# Option 1 (disabled): load only the sample file written by
# create_trainning_sample(). Uncomment the next line and comment out
# option 2 to use it.
#df_train = get_sample_trainning_data()

# Option 2 (active): load the complete training set.
df_train = get_trainning_data()


In [None]:
print("Generate action column...")
# Binary label: 1 where 'resp' is strictly positive, 0 everywhere else
# (including rows where 'resp' is missing, since NaN > 0 is False).
df_train['action'] = (df_train['resp'] > 0).astype('int64')


In [None]:
# Print the column names, dtypes and memory usage of the loaded frame.
df_train.info()

In [None]:
# Check for class balance: draw one bar per label value (0 / 1).
# countplot is used because 'action' is a discrete label; distplot fits a
# density to it and is deprecated in recent seaborn releases.
# The trailing semicolon suppresses the notebook's echo of the returned Axes.
sb.countplot(x=df_train['action']);

## Sort By Time

In [None]:
# Order the rows by date first, then by ts_id within each date.
df_train = df_train.sort_values(['date', 'ts_id'])

## Delete zero-weight rows

In [None]:
# Keep only the rows whose weight is different from zero.
has_weight = df_train['weight'].ne(0)
df_train = df_train[has_weight]

## Drop unnecessary columns

In [None]:
# Columns that are not used as model inputs.
columns_to_delete = ['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp', 'ts_id']

# Drop everything in a single call and rebind the result. Dropping one
# column at a time with inplace=True rebuilds the frame once per column and,
# because df_train was produced by a boolean filter, can trigger pandas'
# SettingWithCopyWarning.
df_train = df_train.drop(columns=columns_to_delete)

# Every remaining column except the 'action' label is a model input.
feature_count = len(df_train.columns) - 1

## Handle Missing Data

In [None]:
# Replace each missing value with the mean of its own column.
# NOTE(review): the means are computed before the train/validation split,
# so they include rows that later become validation data.
column_means = df_train.mean()
df_train = df_train.fillna(column_means)


# Reset Index

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
df_train = df_train.reset_index(drop=True)
# Bare expression: the notebook renders the frame as the cell output.
df_train

# Train

In [None]:
# Normalisation modes understood by normalize_data().
NORMALIZE_NONE = 0      # return the data unchanged
NORMALIZE_MIN_MAX = 1   # (x - min) / (max - min), per column
NORMALIZE_MEAN = 2      # (x - mean) / std, per column

# Mode currently in effect.
NORMALIZE_TYPE = NORMALIZE_NONE


In [None]:
# 20 / 80 split: the first 20% of the rows become the validation set and
# the remaining 80% the training set. Positional slicing is used instead of
# np.split, which on a DataFrame goes through the deprecated
# DataFrame.swapaxes.
# NOTE(review): the rows were sorted by time earlier, so validation data
# precedes training data chronologically - confirm this is intended.
split_at = int(.2 * len(df_train))
df_validation = df_train.iloc[:split_at].reset_index(drop=True)
df_train = df_train.iloc[split_at:].reset_index(drop=True)

# pop() removes the label column from the feature frame and returns it.
df_train_labels = df_train.pop('action')
df_validation_labels = df_validation.pop('action')

print("Shapes: (train={}, train_labels={}, validation={}, validation_labels={})".format(df_train.shape, df_train_labels.shape, df_validation.shape, df_validation_labels.shape))


## Normalize Data

In [None]:
def normalize_data(df, reference=None):
    """Scale the columns of ``df`` according to ``NORMALIZE_TYPE``.

    Args:
        df: DataFrame to scale.
        reference: Optional DataFrame whose per-column statistics
            (min/max or mean/std) are used for the scaling. When omitted,
            the statistics of ``df`` itself are used.

    Returns:
        The scaled DataFrame, or ``df`` itself when ``NORMALIZE_TYPE`` is
        neither ``NORMALIZE_MIN_MAX`` nor ``NORMALIZE_MEAN``.
    """
    stats = df if reference is None else reference
    if NORMALIZE_TYPE == NORMALIZE_MIN_MAX:
        low = stats.min()
        # A constant column has a zero span; divide by 1 instead so it
        # maps to 0 rather than NaN/inf.
        span = (stats.max() - low).replace(0, 1)
        return (df - low) / span
    elif NORMALIZE_TYPE == NORMALIZE_MEAN:
        # Same guard for columns with zero standard deviation.
        spread = stats.std().replace(0, 1)
        return (df - stats.mean()) / spread
    else:
        return df

# Validation data is scaled with the training statistics so both sets go
# through the same transformation; it has to be done before df_train is
# overwritten with its scaled version.
df_validation = normalize_data(df_validation, reference=df_train)
df_train = normalize_data(df_train)

In [None]:

input_length = 7   # rows per input window
batch_size = 70    # windows per batch

# Wrap the feature frames and their labels in Keras TimeseriesGenerator
# objects; both generators use the same window length and batch size.
_TimeseriesGenerator = tf.keras.preprocessing.sequence.TimeseriesGenerator
_window_kwargs = {'length': input_length, 'batch_size': batch_size}

generator_train = _TimeseriesGenerator(
    df_train, df_train_labels, **_window_kwargs)

generator_validation = _TimeseriesGenerator(
    df_validation, df_validation_labels, **_window_kwargs)

In [None]:
# Training hyper-parameters; some may go unused while experimenting.
EPOCHS = 100
# Number of batches per epoch. Floor division keeps this a whole number,
# which is what a step count has to be.
STEPS = len(df_train) // batch_size
LEARNING_RATE = 0.0005


In [None]:

# LSTM -> Dropout -> single sigmoid unit, trained with binary cross-entropy.
# NOTE(review): return_sequences=True makes the LSTM emit one output per
# time step, so the model output has a time axis, while the generators
# supply one label per window - confirm the loss is computed as intended.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(
        feature_count,
        activation='relu',
        input_shape=(input_length, feature_count),
        return_sequences=True),
    tf.keras.layers.Dropout(0.02),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"])
model.summary()

In [None]:

# Step counts must be whole numbers; STEPS (and STEPS / 10) may be
# fractional, so coerce them and never go below one step.
fit_steps = max(1, int(STEPS))
fit_validation_steps = max(1, int(STEPS / 10))

# Train until EPOCHS is reached or either early-stopping callback fires:
# one watches the training loss, the other the validation loss, each with
# a patience of 10 epochs.
model.fit(generator_train,
          validation_data=generator_validation,
          epochs=EPOCHS,
          steps_per_epoch=fit_steps,
          validation_steps=fit_validation_steps,
          verbose=1,
          callbacks=[
              EarlyStopping(monitor='loss', verbose=1, patience=10),
              EarlyStopping(monitor='val_loss', verbose=1, patience=10),
          ])


In [None]:
train_history = model.history.history

# One entry per subplot:
# (subplot position, y-axis upper limit, history key, y label, x label, title)
panels = [
    (211, 5, 'loss', 'Average Loss Per Epoch', 'Epoch',
     'Average Loss Per Epoch vs Epoch'),
    (212, 15, 'val_loss', 'Val Loss per Epoch', 'epoch',
     'Val Loss per Epoch vs Epoch'),
]

# Training loss on top, validation loss underneath, in a single figure.
plt.figure(1)
for position, y_top, key, y_label, x_label, title in panels:
    plt.subplot(position)
    plt.ylim(top=y_top)
    plt.plot(train_history[key])
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.title(title)
plt.show()


## Prediction

In [None]:
# A model output strictly greater than this threshold is turned into
# action = 1 during the prediction phase; anything else stays action = 0.
PREDICTION_THRESHOLD = 0.5
# Read by the prediction loop: the model is only queried for a row when
# this flag is False and the row's weight is non-zero.
PREDICTION_IGNORE_WEIGHT = False

In [None]:
# Competition-provided module that serves the test set and collects the
# predictions.
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
print("Starting predictions...")

zero_weight_count = 0
prediction_count = 0
prediction_within_threshold_count = 0
prediction_value = 0

# Submission rows are collected in a plain list and converted to a
# DataFrame once after the loop. DataFrame.append copied the whole frame
# on every call and no longer exists in pandas 2.x.
submission_rows = []

# Per-column means of the training features, used to fill missing test
# values. A test frame holds a single row, so its own column mean is NaN
# wherever the value is missing and cannot be used to fill it.
# NOTE(review): if NORMALIZE_TYPE is not NORMALIZE_NONE, df_train is already
# scaled here and these means are in scaled units - confirm before enabling
# normalisation. normalize_data() on a single row is equally questionable.
feature_fill_values = df_train.mean()

# TODO:
#   1) Figure out how to add a rolling cache of previous test_df. per Gena (https://www.kaggle.com/gdonchyts)
#

for (test_df, sample_prediction_df) in iter_test:

    # Remember the weight, then drop the columns the model was not trained on.
    weight = test_df['weight'].item()
    columns_to_delete = ['date', 'weight']
    test_df = test_df.drop(columns=columns_to_delete)

    ts_id = sample_prediction_df.index.values[0]
    action = 0

    # NOTE(review): with PREDICTION_IGNORE_WEIGHT set to True this branch is
    # never taken and every row gets action 0 - confirm that is the intent.
    if weight != 0 and not PREDICTION_IGNORE_WEIGHT:
        # handle missing values
        test_df = test_df.fillna(feature_fill_values)
        # normalize
        test_df = normalize_data(test_df)
        # predict on a batch of one window of one time step; -1 lets NumPy
        # infer the feature count instead of hard-coding it
        prediction = model.predict(np.array(test_df.values).reshape(1, 1, -1))
        prediction_value = np.ravel(prediction)[0]
        if prediction_value > PREDICTION_THRESHOLD:
            action = 1
            prediction_within_threshold_count += 1
        prediction_count += 1
    else:
        zero_weight_count += 1

    sample_prediction_df.action = action
    # "\r" with end="" rewrites the same console line on every iteration.
    print("\rPrediciton: (index={}, action={}, prediction={}) Metrics: (submission_size={}, prediction_count={}, prediction_within_threshold_count={}, zero_weight_count={})".format(
        # Prediction
        ts_id,
        action,
        prediction_value,
        # Notes
        len(submission_rows),
        prediction_count,
        prediction_within_threshold_count,
        zero_weight_count
    ), end="")

    submission_rows.append({'ts_id': ts_id, 'action': action})
    env.predict(sample_prediction_df)

print("\nFinished predictions")
df_submission = pd.DataFrame(submission_rows, columns=['ts_id', 'action'])
df_submission.to_csv("./submission.csv", index=False, header=True)
print("Submission file created")
print("Metrics: (submission_size={}, prediction_count={}, prediction_within_threshold_count={}, zero_weight_count={})".format(
    len(df_submission),
    prediction_count,
    prediction_within_threshold_count,
    zero_weight_count
))
