The purpose of this notebook is to train a neural network on the data provided for the Jane Street Market Prediction competition.

### Import the relevant packages and libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

### Dataset Loading

In [None]:
# Directory prefix for every CSV read below (relative path; presumably the
# Kaggle input mount for this competition — confirm when running elsewhere).
folder_path = '../input/jane-street-market-prediction/'

In [None]:
%%time
# Read the four competition tables from `folder_path`.
# Only the first 1,800,000 rows of train.csv are loaded.
train_df = pd.read_csv(f'{folder_path}train.csv', nrows=1_800_000)
features_df = pd.read_csv(f'{folder_path}features.csv')
sample_df = pd.read_csv(f'{folder_path}example_sample_submission.csv')
test_data_df = pd.read_csv(f'{folder_path}example_test.csv')

### Data Analysis

In [None]:
# Collect column names by substring match in a single pass over the columns:
# `features` holds every name containing 'feature', `resps` every name
# containing 'resp'.
features, resps = [], []
for column in train_df.columns:
    if 'feature' in column:
        features.append(column)
    if 'resp' in column:
        resps.append(column)

In [None]:
# Keep only the rows whose weight is different from zero.
nonzero_weight = train_df['weight'] != 0
train_df = train_df.loc[nonzero_weight]

In [None]:
# Binary target: 1 when `resp` is strictly positive, 0 otherwise.
# The comparison runs vectorized on the whole column instead of invoking a
# Python lambda once per row; the resulting values are the same.
train_df['action'] = (train_df['resp'] > 0).astype(int)

In [None]:
# Median of each feature column over the rows currently in train_df;
# used below as the fill value for missing entries.
train_df_median = train_df[features].median()

In [None]:
# Feature matrix with NaNs replaced by the corresponding per-feature median.
X = train_df[features].fillna(train_df_median)

In [None]:
# Target vector: the 0/1 `action` column.
y = train_df['action']

In [None]:
# Drop the name bound to the full training frame (presumably to release
# memory); X and y are used from here on.
del train_df

### Data reduction

In [None]:
# PCA is sensitive to feature scale, so the features are standardised first
# (StandardScaler: zero mean, unit variance per column).
scaler = StandardScaler()
x_norm = scaler.fit_transform(X)

# Fit a PCA that keeps every component; `comp` is the fitted estimator and is
# used to inspect the explained-variance ratios.
pca = PCA()
pca.fit(x_norm)
comp = pca

In [None]:
# Cumulative explained-variance ratio as a function of the number of
# principal components kept.
cumulative_variance = np.cumsum(comp.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance')
plt.grid()
sns.despine();

# Author's reading of the curve:
#   - the first 15 principal components explain about 80% of the variation
#   - the first 40 principal components explain about 95% of the variation

In [None]:
# Refit PCA keeping only the first 50 components, then project the
# standardised features onto them.
pca = PCA(n_components=50)
pca.fit(x_norm)
x_transform = pca.transform(x_norm)

In [None]:
# Split the PCA-transformed features and the labels into two equal halves:
# a training set and a validation set (random_state fixed for reproducibility).
# NOTE(review): train_test_split shuffles rows by default — if the rows are
# time-ordered this mixes past and future samples; confirm this is intended.
xtrain,xval, ytrain, yval = train_test_split(x_transform, y,train_size=0.5,random_state=42)

In [None]:
# Inspection only: display the first 100000 rows of the training matrix.
xtrain[:100000]

In [None]:
# Inspection only: show the container type of the training matrix.
type(xtrain)

In [None]:
# Keras LSTM layers take 3-D input (samples, timesteps, features).
# Each row becomes a sequence of exactly one timestep by inserting a
# length-1 axis in the middle.
print(xtrain.shape, xval.shape)
X_train = xtrain[:, np.newaxis, :]
X_val = xval[:, np.newaxis, :]

In [None]:
# Show the shapes of the two 3-D arrays that are fed to the LSTM.
# (Previously the 2-D `xval` was printed next to the 3-D `X_train`.)
print(X_train.shape, X_val.shape)

In [None]:
# Earlier experiment: an empty Sequential container for a fully connected
# network. The Dense/Dropout stack below is commented out, and `model` is
# rebound to the LSTM network in a later cell, so this object is not trained.
model = Sequential()
#model.add(Dense(50, input_dim=50, activation='relu'))
#model.add(Dropout(0.2))
#model.add(Dense(42, activation='relu'))
#model.add(Dropout(0.2))
#model.add(Dense(36, activation='relu'))
#model.add(Dropout(0.2))
#model.add(Dense(24, activation='relu'))
#model.add(Dropout(0.2))
#model.add(Dense(16, activation='relu'))
#model.add(Dropout(0.2))
#model.add(Dense(8, activation='relu'))
#model.add(Dropout(0.2))
#model.add(Dense(1, activation='sigmoid'))

In [None]:
# Mini-batch size shared by the training and validation pipelines.
batch_size = 256

# Stacked LSTM classifier on one-timestep sequences of 50 PCA components.
# Every LSTM keeps the time axis (return_sequences=True), so the final
# sigmoid layer emits one probability per timestep: output shape (batch, 1, 1).
model = Sequential()
model.add(LSTM(64, input_shape=(1, 50), return_sequences=True))
model.add(Dropout(0.25))
for units in (32, 16, 8):
    model.add(LSTM(units, return_sequences=True))
    model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))

# Binary classification: cross-entropy loss, Adam optimiser, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
#model.add(LSTM(64, batch_size=256, return_sequences = True, input_shape=(1,50)))
#model.add(Dropout(0.2))

#model.add(LSTM(32, return_sequences = True))
#model.add(Dropout(0.2))

#model.add(LSTM(16, return_sequences = True))
#model.add(Dropout(0.2))

#model.add(LSTM(8, return_sequences = True))
#model.add(Dropout(0.2))
#model.add(Dense(1, activation='sigmoid'))

In [None]:
import tensorflow as tf

# The network keeps the time axis through every layer, so its predictions
# have shape (batch, 1, 1). With 1-D labels of shape (batch,) the element-wise
# loss and accuracy broadcast to (batch, 1, batch), comparing every prediction
# with every label in the batch. Reshaping the labels to (n, 1, 1) makes each
# label line up with exactly its own prediction.
ytrain_3d = np.asarray(ytrain).reshape(-1, 1, 1)
yval_3d = np.asarray(yval).reshape(-1, 1, 1)

# Training pipeline: cached, batched and repeated indefinitely (the number of
# batches per epoch is decided by the caller of `fit`).
# Validation pipeline: batched only.
train = tf.data.Dataset.from_tensor_slices((X_train, ytrain_3d))
val = tf.data.Dataset.from_tensor_slices((X_val, yval_3d)).batch(batch_size)
train = train.cache().batch(batch_size).repeat()

In [None]:
# Fit the already-compiled model on the repeating training dataset:
# 20 epochs of 200 batches each, validating on the first 50 validation
# batches after every epoch.
model.fit(train, epochs=20, steps_per_epoch=200, validation_data=val, validation_steps=50)

# Evaluate on the full validation set. The labels are reshaped to (n, 1, 1)
# so they line up one-to-one with the model's (n, 1, 1) predictions; with
# 1-D labels the metric would be computed on a broadcast of every prediction
# against every label in the batch.
_, accuracy = model.evaluate(X_val, np.asarray(yval).reshape(-1, 1, 1))
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# `summary()` prints the layer table itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
model.summary()

In [None]:
# Inspection only: display the example test table loaded from example_test.csv.
test_data_df

In [None]:
# Inspection only: display the example submission table.
sample_df

### Data Submission

In [None]:
def fillna_npwhere(array, values):
    """Replace NaN entries of ``array`` with the matching entries of ``values``.

    The total sum is used as a cheap NaN detector: when it is not NaN the
    input is returned untouched (same object). Otherwise a new array is built
    in which every NaN position takes the value that ``values`` broadcasts to
    at that position.
    """
    if not np.isnan(array.sum()):
        return array
    return np.where(np.isnan(array), values, array)

In [None]:
import janestreet
from tqdm import tqdm

# Clear the `__called__` flag on make_env before creating the environment
# (presumably this lets the cell be re-run in the same session — confirm).
janestreet.competition.make_env.__called__ = False
env = janestreet.make_env()

# Loop-invariant: the per-feature training medians used to fill missing values.
fill_values = train_df_median[features].values

start_time = time.time()
for (test_df, pred_df) in tqdm(env.iter_test()):
    wt = test_df.iloc[0].weight
    if wt == 0:
        # Zero-weight rows get action 0 without running the model.
        pred_df.action = 0
    else:
        # Same preprocessing as in training: median fill -> standardise -> PCA,
        # then add the length-1 timestep axis expected by the LSTM.
        x_filled = fillna_npwhere(test_df[features].values, fill_values)
        xpred = pca.transform(scaler.transform(x_filled))
        X_pred = xpred.reshape((xpred.shape[0], 1, xpred.shape[1]))
        # Inference mode is requested explicitly so dropout is inactive.
        action = model(X_pred, training=False)
        # Take the first predicted probability regardless of the output rank.
        probability = float(action.numpy().ravel()[0])
        pred_df.action = np.int64(probability > 0.5)
    env.predict(pred_df)
print(f"took: {time.time() - start_time} seconds")