The purpose of this notebook is to train a neural network on the data provided for the Jane Street Market Prediction competition.

### Import the relevant packages and libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.layers import Input, BatchNormalization, Dropout, Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
# !pip install tensorflow_addons
import tensorflow_addons as tfa
from matplotlib import pyplot
import time

### Dataset Loading

In [None]:
# Directory holding the competition input files. File names are appended with
# plain string concatenation, so the trailing slash is required.
folder_path = '../input/jane-street-market-prediction/'

In [None]:
%%time

# Read the feature table and the example submission file from the input
# directory, in that order.
features_df, sample_df = (
    pd.read_csv(folder_path + file_name)
    for file_name in ('features.csv', 'example_sample_submission.csv')
)


In [None]:
# Load the training data.
# pandas / numpy / pyplot are imported again here so the cell can run on its own.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build a {column name: dtype name} mapping from the first 10 rows, with every
# "64" in the dtype name replaced by "32" (e.g. float64 -> float32).
# NOTE(review): `types` is not passed to the active read_csv call below, so the
# load uses pandas' default dtypes; the variant that used it is commented out.
types = pd.read_csv('../input/jane-street-market-prediction/train.csv', nrows=10).dtypes.astype(str).str.replace('64','32').to_dict()
#train_df = pd.read_csv('../input/jane-street-market-prediction/train.csv', dtype = types, nrows=1200000)
# Only the first 1,400,000 rows of train.csv are read.
train_df = pd.read_csv(folder_path +'train.csv', nrows=1400000)


In [None]:
# Same 64 -> 32 dtype-name mapping, rebuilt from the first 10 rows of the
# example test file (this rebinds `types`).
# NOTE(review): `types` is not passed to the active read_csv call below; the
# variant that used it is commented out.
types = pd.read_csv('../input/jane-street-market-prediction/example_test.csv', nrows=10).dtypes.astype(str).str.replace('64','32').to_dict()
#test_df = pd.read_csv('../input/jane-street-market-prediction/example_test.csv', dtype = types)
test_df = pd.read_csv(folder_path + 'example_test.csv')

In [None]:
%%time
## Step2: Memory reduction, precision and data type transformation
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. The target dtype is
    chosen purely from the column's min and max, with inclusive bounds, so a
    column whose extremes sit exactly on a dtype limit (e.g. -128 and 127)
    still gets the narrow type.

    Note that the float choice is range-based only: values that fit the
    float16/float32 *range* are downcast even if that reduces their precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print memory usage and column count before and
            after the conversion. When false, nothing is printed.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage before optimization 2 is: {:.2f} MB'.format(start_mem))
        print("The dataframe 2 has {} columns.".format(df.shape[1]))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Narrowest integer type whose inclusive range covers the column.
            # If min/max are NaN (empty column) nothing matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower range fits (this is
            # also where all-NaN columns end up, since NaN compares false).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization 2 is: {:.2f} MB'.format(end_mem))
        print("The reduced dataframe 2 has {} columns.".format(df.shape[1]))
    return df
# Downcast both frames. reduce_mem_usage rewrites the columns of the frame it
# is given and returns that same frame, so the rebinding keeps the same object.
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)



In [None]:
"""
%%time
# Add group statistics features
train_df = get_group_stats(train_df)
print(train_df.head())
"""


In [None]:
"""
test_df = get_group_stats(test_df)
#print(test_df.head())
"""

In [None]:
# Show the last rows of the training frame as a quick sanity check.
print(train_df.tail())

### Data Analysis

In [None]:
# Column-name lists picked by substring match on the training frame's columns.
features = [name for name in train_df.columns if 'feature' in name]
resps = [name for name in train_df.columns if 'resp' in name]

In [None]:
# Keep only the rows whose weight is non-zero.
train_df = train_df.loc[train_df['weight'] != 0]

In [None]:
# Binary target: 1 where resp is strictly positive, 0 otherwise. The comparison
# is done on the whole column at once instead of calling a Python lambda per row.
train_df['action'] = (train_df['resp'] > 0).astype(int)

In [None]:
# Per-feature median over the training rows.
# NOTE(review): not referenced again in the visible part of this notebook.
train_df_median = train_df[features].median()

In [None]:

# Per-feature spread (max - min) over the training rows.
val_range = train_df[features].max()-train_df[features].min()
# Per-feature constant used to replace missing values: the column minimum
# minus 1% of the column's spread, i.e. a value just below the observed range.
filler = pd.Series(train_df[features].min()-0.01*val_range, index=features)
# This filler value will be used as a constant replacement of missing values.

# Feature-column dtypes captured at this point; consistent_dtype casts frames
# back to them.
dtype_dict = dict(train_df[features].dtypes)
def consistent_dtype(df):
    """Return ``df`` cast with the module-level ``dtype_dict`` mapping.

    ``dtype_dict`` maps column names to dtypes; the cast is delegated to
    ``DataFrame.astype`` and the resulting frame is returned.
    """
    recast = df.astype(dtype_dict)
    return recast

def fill_missing(df):
    """Replace missing feature values in ``df`` with the ``filler`` constants.

    Works on the columns listed in the module-level ``features``. NaNs are
    first turned into 0 by ``np.nan_to_num`` and the per-column ``filler``
    value is then added at exactly the positions that were NaN. The feature
    columns of ``df`` are overwritten, and the same ``df`` is returned.
    """
    feature_block = df[features]
    zero_filled = np.nan_to_num(feature_block)
    was_missing = np.isnan(feature_block)
    df[features] = zero_filled + filler*was_missing
    return df

# Fill the missing feature values (fill_missing overwrites the feature columns
# of train_df itself), then cast the feature columns back to the dtypes
# recorded in dtype_dict.
X = fill_missing(train_df)
X = consistent_dtype(X)

In [None]:
# Narrow X to the feature columns only, taking any value still missing in
# train_df from the filled frame built in the previous cell.
X = train_df[features].fillna(X)

In [None]:
# Training target: the binary 'action' column of the training frame.
y = train_df['action']

In [None]:
#del train_df

### Data reduction

In [None]:
# PCA expects standardised inputs, so scale every feature to zero mean and
# unit variance first. Any NaN still present in X is replaced by 0.01.
scaler = StandardScaler()
x_norm = scaler.fit_transform(X.fillna(0.01))

# Fit a PCA without a component limit; `comp` is the fitted estimator and is
# used to inspect the explained variance.
pca = PCA()
comp = pca.fit(x_norm)

In [None]:
# Plot the cumulative explained-variance ratio against the number of principal
# components kept.
plt.plot(np.cumsum(comp.explained_variance_ratio_))
plt.grid()
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance')
sns.despine();

# Author's reading of the plot:
# The first 15 principal components explain about 80% of the variation.
# The first 40 principal components explain about 95% of the variation.

In [None]:
# Refit PCA keeping 50 components and project the standardised features onto
# them. This rebinds `pca`; it is the estimator used later at prediction time.
pca = PCA(n_components=50).fit(x_norm)
x_transform = pca.transform(x_norm)

In [None]:
#TODO add LSTM to mix


def create_mlp(num_columns, num_labels, hidden_units,
               dropout_rates, label_smoothing, learning_rate):
    """Build and compile a fully connected binary classifier.

    Architecture: input -> BatchNormalization -> Dropout, then for every entry
    of ``hidden_units`` a Dense -> BatchNormalization -> swish -> Dropout
    stack, and finally a Dense layer with a sigmoid activation.

    Args:
        num_columns: Width of the input vector.
        num_labels: Number of units in the output layer.
        hidden_units: Sequence with the size of each hidden Dense layer.
        dropout_rates: Dropout rates; index 0 is applied to the input, index
            ``i + 1`` after hidden layer ``i``.
        label_smoothing: Passed to ``BinaryCrossentropy``.
        learning_rate: Passed to the ``RectifiedAdam`` optimizer.

    Returns:
        The compiled Keras model, tracking a single metric named ``'AUC'``.
    """
    inputs = Input(shape=(num_columns,))
    net = BatchNormalization()(inputs)
    net = Dropout(dropout_rates[0])(net)

    for layer_idx, units in enumerate(hidden_units):
        net = Dense(units)(net)
        net = BatchNormalization()(net)
        net = Activation(tf.keras.activations.swish)(net)
        # dropout_rates[0] was used for the input, hence the +1 offset.
        net = Dropout(dropout_rates[layer_idx + 1])(net)

    logits = Dense(num_labels)(net)
    outputs = Activation('sigmoid')(logits)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    optimizer = tfa.optimizers.RectifiedAdam(learning_rate=learning_rate)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing)
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=tf.keras.metrics.AUC(name='AUC'),
    )
    return model

In [None]:
# Split the PCA-transformed features and the target into training (80%) and
# validation (20%) parts; random_state fixes the shuffle for reproducibility.
# NOTE(review): this is a random row split — confirm that is intended for
# data that looks time-ordered.
xtrain,xval, ytrain, yval = train_test_split(x_transform, y,train_size=0.8,random_state=42)

In [None]:
# Training hyper-parameters. The fit cell uses epochs[0] and batch_size[1];
# the other list entries are not used in the visible part of this notebook.
epochs = [800, 40] # PGTSCV folds all stopped bf. 40
batch_size = [4096, 8192]
# One entry per hidden Dense layer.
hidden_units = [160, 160, 160]
# One rate for the input plus one per hidden layer.
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

In [None]:
# Size the input layer from the training matrix itself rather than repeating
# the PCA component count as a literal, so the two cannot drift apart.
model = create_mlp(xtrain.shape[1], 1, hidden_units,
                      dropout_rates, label_smoothing, learning_rate)

In [None]:
# Stop training once the validation loss has not improved for 10 epochs, and
# roll the model back to the weights of the best epoch.
er = EarlyStopping(patience = 10, 
                    restore_best_weights = True, 
                    monitor = 'val_loss')

In [None]:
# Save the model whenever the validation AUC improves. The model is compiled
# with a single metric named 'AUC', so the logged validation key is 'val_AUC';
# monitoring 'val_accuracy' would never find a value and nothing would be saved.
mc = ModelCheckpoint('best_model.h5', monitor='val_AUC', mode='max', verbose=1, save_best_only=True)

In [None]:
%%time

# Train with the first epoch budget and the second (larger) batch size,
# validating on the held-out split. Early stopping and checkpointing are
# attached as callbacks; `history` keeps the per-epoch logs.
history = model.fit(xtrain, ytrain,
              validation_data = (xval, yval),
              epochs = epochs[0],
              batch_size = batch_size[1], callbacks = [er, mc])

In [None]:
# evaluate the keras model
_, train_accuracy = model.evaluate(xtrain, ytrain)
print(' Validation Accuracy: %.2f' % (train_accuracy*100))
_, val_accuracy = model.evaluate(xval, yval)
print(' Validation Accuracy: %.2f' % (val_accuracy*100))

In [None]:
# plot training history
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [None]:
test_df

In [None]:
sample_df

In [None]:
train_df

In [None]:
def fillna_npwhere(array, values):
    """Return ``array`` with its NaN entries replaced by ``values``.

    The sum of ``array`` is used as a cheap NaN detector: when it is not NaN
    the input is returned unchanged (same object). Otherwise a new array is
    built with ``np.where``, taking ``values`` at the NaN positions and the
    original entries everywhere else; ``values`` is broadcast against
    ``array``.
    """
    if not np.isnan(array.sum()):
        return array
    nan_mask = np.isnan(array)
    return np.where(nan_mask, values, array)

In [None]:
import janestreet
from tqdm import tqdm

# NOTE(review): presumably resets the module's "already called" guard so that
# make_env() can be invoked again when this cell is re-run — confirm.
janestreet.competition.make_env.__called__ = False
env = janestreet.make_env()

# Per-feature replacement values for missing test inputs: the same `filler`
# constants that replaced NaNs in the training features, ordered like
# `features`. Filling from the whole training matrix instead would make
# np.where broadcast every one-row test frame to one row per training row.
fill_values = filler[features].values

start_time = time.time()
for (test_df, pred_df) in tqdm(env.iter_test()):
    wt = test_df.iloc[0].weight
    if wt == 0:
        # Rows with zero weight are answered with 0 without running the model.
        pred_df.action = 0
    else:
        # Same preprocessing chain as in training: fill NaNs, standardise,
        # project onto the PCA components, then score with the network.
        x_tt = fillna_npwhere(test_df[features].values, fill_values)
        action = model(pca.transform(scaler.transform(x_tt)), training=False)
        a = 1 if action[0].numpy()[0] > 0.5 else 0
        pred_df.action = np.int64(a)
    env.predict(pred_df)
print(f"took: {time.time() - start_time} seconds")

### Data Submission