The purpose of this notebook is to train a neural network on the data provided for the Jane Street Market Prediction competition.

### Import the relevant packages and libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.layers import Input, BatchNormalization, Dropout, Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
# !pip install tensorflow_addons
import tensorflow_addons as tfa
from matplotlib import pyplot

### DataSets Loading 

In [None]:
# Directory prefix for the competition CSV files read in the next cell.
folder_path = '../input/jane-street-market-prediction/'

In [None]:
%%time
# Only the first 1,800,000 rows of train.csv are read (nrows cap).
train_df = pd.read_csv(folder_path +'train.csv' , nrows=1800000)
features_df = pd.read_csv(folder_path + 'features.csv')
# The sample submission and example test frames are only displayed later
# in this notebook for inspection.
sample_df = pd.read_csv(folder_path + 'example_sample_submission.csv')
test_data_df = pd.read_csv(folder_path + 'example_test.csv')

### Data Analysis

In [None]:
# Group the column names: model inputs contain 'feature', return columns
# contain 'resp'.
features = [col for col in train_df.columns if 'feature' in col]
resps = [col for col in train_df.columns if 'resp' in col]

In [None]:
# Keep only the rows whose weight is non-zero.
train_df = train_df[train_df['weight'] != 0]

In [None]:
# Binary target: 1 when resp is positive, otherwise 0. The comparison is
# vectorised over the whole column instead of calling a lambda per row.
train_df['action'] = (train_df['resp'] > 0).astype(int)

In [None]:
# Per-feature medians of the training rows; used to impute missing values
# in the training features and again in the test batches at prediction time.
train_df_median = train_df[features].median()

In [None]:
# Feature matrix with missing values replaced by the per-feature medians.
X = train_df[features].fillna(train_df_median)

In [None]:
# Target vector: the 0/1 'action' column derived from resp.
y = train_df['action']

In [None]:
# Drop the raw frame; the cells shown after this point only use X, y and
# train_df_median.
del train_df

### Data reduction

In [None]:
# PCA expects features on a common scale, so standardise them first
# (zero mean, unit variance). The fitted scaler is reused at prediction time.
scaler = StandardScaler()
x_norm = scaler.fit_transform(X)

# Fit a PCA that keeps every component so the full explained-variance
# curve can be inspected; comp holds the value returned by fit().
pca = PCA()
comp = pca.fit(x_norm)

In [None]:
# Plot the cumulative explained variance of the input features against
# the number of principal components kept.
plt.plot(comp.explained_variance_ratio_.cumsum())
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance')
plt.grid()
sns.despine();

# Observations noted from the curve:
# The first 15 principal components explain about 80% of the variation.
# The first 40 principal components explain about 95% of the variation.

In [None]:
# Refit PCA keeping 50 components and project the standardised features
# onto them. This fitted pca is reused to transform the test batches.
pca = PCA(n_components=50).fit(x_norm)
x_transform = pca.transform(x_norm)

In [None]:
def create_mlp(num_columns, num_labels, hidden_units,
               dropout_rates, label_smoothing, learning_rate):
    """Build and compile a fully connected classifier.

    Architecture: input -> BatchNormalization -> Dropout, then for every
    entry of ``hidden_units`` a Dense -> BatchNormalization -> swish ->
    Dropout stage, followed by a Dense layer with a sigmoid activation.

    Args:
        num_columns: Number of input features per sample.
        num_labels: Number of sigmoid output units.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rate for the input followed by one rate per
            hidden layer, i.e. at least ``len(hidden_units) + 1`` entries.
        label_smoothing: Label smoothing passed to the binary
            cross-entropy loss.
        learning_rate: Learning rate of the RectifiedAdam optimizer.

    Returns:
        The compiled Keras model. Its only metric is an AUC named ``'AUC'``.

    Raises:
        IndexError: If ``dropout_rates`` has too few entries.
    """
    inp = Input(shape=(num_columns,))
    x = BatchNormalization()(inp)
    x = Dropout(dropout_rates[0])(x)

    # dropout_rates[0] belongs to the input, so hidden layer k (1-based)
    # uses dropout_rates[k].
    for rate_index, units in enumerate(hidden_units, start=1):
        x = Dense(units)(x)
        x = BatchNormalization()(x)
        x = Activation(tf.keras.activations.swish)(x)
        x = Dropout(dropout_rates[rate_index])(x)

    x = Dense(num_labels)(x)
    out = Activation('sigmoid')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        # Keras documents `metrics` as a list; a bare metric object is not
        # accepted by every Keras version.
        metrics=[tf.keras.metrics.AUC(name='AUC')],
    )

    return model

In [None]:
# Split features and targets into training (80%) and validation (20%)
# sets, with a fixed seed so the split is repeatable.
xtrain, xval, ytrain, yval = train_test_split(
    x_transform, y, train_size=0.8, random_state=42
)

In [None]:
# Hyperparameters. epochs and batch_size each hold two candidate values;
# the fit call in this notebook uses epochs[0] and batch_size[1].
epochs = [400, 40] # PGTSCV folds all stopped bf. 40
batch_size = [4096, 8192]
# One width per hidden Dense layer.
hidden_units = [160, 160, 160]
# First rate is for the dropout on the input, then one rate per hidden layer.
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

In [None]:
# Take the input width from the training matrix instead of hard-coding 50,
# so the model always matches the number of PCA components actually kept.
model = create_mlp(xtrain.shape[1], 1, hidden_units,
                   dropout_rates, label_smoothing, learning_rate)

In [None]:
# Stop training once val_loss has not improved for 8 epochs and roll the
# model back to the best weights seen.
er = EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True,
)

In [None]:
# The model is compiled with a single metric named 'AUC', so the validation
# log key is 'val_AUC'. 'val_accuracy' is never logged, which meant the
# checkpoint was never written.
mc = ModelCheckpoint('best_model.h5', monitor='val_AUC', mode='max', verbose=1, save_best_only=True)

In [None]:
# Train with early stopping and checkpointing; history keeps the per-epoch
# training and validation logs.
history = model.fit(
    xtrain,
    ytrain,
    validation_data=(xval, yval),
    epochs=epochs[0],
    batch_size=batch_size[1],
    callbacks=[er, mc],
)

In [None]:
# Evaluate the Keras model on both splits. The model is compiled with a
# single AUC metric, so evaluate() returns (loss, AUC), not accuracy.
_, train_auc = model.evaluate(xtrain, ytrain)
print(' Train AUC: %.4f' % train_auc)
_, val_auc = model.evaluate(xval, yval)
print(' Validation AUC: %.4f' % val_auc)

In [None]:
# Plot the per-epoch training loss against the validation loss
# (the curve labelled 'test' is the validation loss).
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Display the example test frame for inspection.
test_data_df

In [None]:
# Display the example submission frame for inspection.
sample_df

### Data Submission

In [None]:
def fillna_npwhere(array, values):
    """Replace NaN entries of ``array`` with the matching entries of ``values``.

    When the sum over the whole array is not NaN, the input is returned
    as-is (same object). Otherwise a new array is built with ``np.where``,
    taking ``values`` at the NaN positions and ``array`` everywhere else.
    """
    total = array.sum()
    if not np.isnan(total):
        return array
    missing = np.isnan(array)
    return np.where(missing, values, array)

In [None]:
import janestreet
from tqdm import tqdm

# NOTE(review): presumably resets a "already called" guard so make_env()
# can be invoked again in the same session — confirm against the
# janestreet module.
janestreet.competition.make_env.__called__ = False
env = janestreet.make_env()

# Loop-invariant: training medians used to impute missing test features.
median_values = train_df_median[features].values

start_time = time.time()
for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df.iloc[0].weight == 0:
        # Zero-weight rows are always answered with action 0, without
        # running the model.
        pred_df.action = 0
    else:
        # Same preprocessing chain as training: median imputation,
        # standardisation, then PCA projection.
        x_tt = fillna_npwhere(test_df[features].values, median_values)
        prob = model(pca.transform(scaler.transform(x_tt)), training=False)
        # Threshold the sigmoid output of the first row at 0.5.
        pred_df.action = np.int64(prob[0].numpy()[0] > 0.5)
    env.predict(pred_df)
print(f"took: {time.time() - start_time} seconds")