In [None]:
import os
import pandas as pd
import missingno as msn
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import datatable as dt
import seaborn as sns
from numba import njit

In [None]:
# notebook created by R. Geiges 200119
# implementing a simple serial NN with feature engineering and normalization
# of each highly correlated feature pair, one feature is dropped
ccut = 0.975  # absolute-correlation threshold above which a feature pair counts as redundant
dropout = 0.2  # dropout rate used after every hidden Dense layer of the model
dateCut = 'date > 85'  # pandas query string applied to the training rows after loading
nepochs = 140  # number of epochs passed to model.fit
print("Running prediction with correlation cutoff: ", ccut)
print('Layer dropouts set to: ', dropout)
runjane = True  # True: load the Kaggle competition data and run the janestreet submission loop; False: load a local CSV

In [None]:
if runjane:
    print('# File sizes')
    # Add up the size of every file found below the competition input directory.
    start_path = '../input/jane-street-market-prediction'
    total_size = sum(
        os.path.getsize(os.path.join(folder, filename))
        for folder, _subdirs, filenames in os.walk(start_path)
        for filename in filenames
    )
    print("Directory size: " + str(round(total_size/ 1000000, 2)) + 'MB')
    # Load the full competition training set.
    train_raw = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
    # Other competition files, currently not loaded:
    # featureData = pd.read_csv('../input/jane-street-market-prediction/features.csv')
    # example_test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')
    # sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')
else:
    # Local run: read a CSV sample from the working directory instead.
    train_raw = pd.read_csv('train85_100k.csv')

print("Data is loaded")

In [None]:
# Keep only the rows that satisfy the `dateCut` query string (set to 'date > 85'
# in the configuration cell), then rebuild the index so it runs from 0 again.
print("Reading in data with cutoff: ", dateCut)
train_raw=train_raw.query( dateCut ).reset_index(drop = True)

In [None]:
# Every column whose name contains 'feature' is a candidate model input.
features = [column for column in train_raw.columns if 'feature' in column]
# These four features are excluded from the model inputs.
for excluded in ('feature_0', 'feature_41', 'feature_42', 'feature_43'):
    features.remove(excluded)
# Summary statistics of the loaded frame, shown as the cell output.
train_raw.describe()

In [None]:
%%time
#Function to reduce memory usage. from Kaggle -> maxwienandts
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are tried against int8, int16, int32 and int64; float
    columns against float16 and float32, falling back to float64. A dtype is
    accepted when the column's minimum and maximum both lie within its limits,
    limits included.

    Caveats:
        * ``df`` is modified in place; the returned object is the same frame.
        * The float downcast is chosen by value range only. float16 keeps
          roughly three significant decimal digits, so precision can be lost.
        * Columns whose min/max are NaN or infinite end up as float64.

    Args:
        df: DataFrame whose numeric columns should be downcast.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same DataFrame with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the limits themselves are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or NaN/inf extremes: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df



#train_raw = reduce_mem_usage(train_raw)

In [None]:
# Find pairs of highly correlated features and drop one feature of each pair.
# print(features)
# Absolute pairwise correlations between the candidate features.
correl = train_raw[features].corr().abs()
# Keep only the strictly lower triangle (row position > column position) so every
# pair appears exactly once and the diagonal of self-correlations is removed.
# Masking by position, rather than zeroing values equal to 1, keeps perfectly
# correlated off-diagonal pairs visible to the cutoff below.
pair_mask = np.tril(np.ones(correl.shape, dtype=bool), k=-1)
correl = correl.where(pair_mask, 0)
# unstack() gives a Series indexed by (column label, row label). No value-based
# drop_duplicates() here: two different pairs may share the same correlation.
cflist = correl.unstack().sort_values(ascending=False)
# select the pairs with correlation bigger than ccut
cflcut = cflist[cflist > ccut]
cflcutis = cflcut.index
print("List of correlated features with absolute value > ", ccut)
feat2drop = []
print("Building list of features to drop")
# Iterate label/value pairs directly; integer keys on a label-indexed Series
# rely on a positional fallback that newer pandas versions no longer provide.
for pair, value in cflcut.items():
    print("Index: %s , value: %f " % (pair, value))
    # pair[1] is the row label, i.e. the feature that comes later in `features`.
    if pair[1] not in feat2drop:
        feat2drop.append(pair[1])

print("Features to drop: ", len(feat2drop))
print(feat2drop)
# remove feature from feature index list
for feat in feat2drop:
    features.remove(feat)
print("Number of remaining features is: ", len(features))

In [None]:
print("Building model and stetting up training now!")
# Modelling dependencies: scikit-learn for the hold-out split, TensorFlow/Keras for the network.
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# NOTE(review): the next three imports come from the standalone `keras` package
# rather than `tensorflow.keras`, and the names are not referenced in the cells
# visible here — confirm whether they are still needed.
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import Normalization

In [None]:
# Feature engineering: replace every missing value with the mean of its column,
# computed on the date-filtered training frame.
column_means = train_raw.mean()
train = train_raw.fillna(column_means)
print(train.head())
print("Null values found in train: ", train.isna().sum().sum())
# Downcast the numeric columns to save memory.
train = reduce_mem_usage(train)

In [None]:
# prepare train and test datasets

# Keep only rows with a non-zero weight. The explicit copy makes the filtered
# frame independent, so adding the 'action' column below does not raise
# pandas' SettingWithCopyWarning.
train = train[train['weight'] != 0].copy()

# Binary target: 1 when the response is positive, otherwise 0. Without a 'resp'
# column the sign of the weight is used instead.
if 'resp' in train.columns:
    train['action'] = np.where(train['resp'] > 0, 1, 0)
else:
    train['action'] = np.where(train['weight'] > 0, 1, 0)

X = train.loc[:, features]
y = train.loc[:, 'action']
# Fixed random_state so the 80/20 hold-out split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_test.head
# y_test.head

In [None]:
# create numpy datasets for normalization (first 200000 training rows)
X_nptrain = X_train[:200000].to_numpy()
y_nptrain = y_train[:200000].to_numpy()
# define shapes of model layers
print("Shape X_train: ", X_train.shape[1])
dim_lay1 = X_train.shape[1]
# Dense(units=...) is documented as taking a positive integer, so the scaled
# widths are truncated to int here instead of being passed on as floats.
# The expressions are the original float chain, so the truncated values match
# what int() yields for the former float widths.
dim_lay2 = int(dim_lay1 / 1.5)
dim_lay3 = int(dim_lay1 / 1.5 / 2)
# The last hidden layer is capped at 10 units.
dim_lay4 = int(min(10, dim_lay1 / 1.5 / 2 / 2))

In [None]:
# Shape of a single sample (number of features), used for the model input.
input_shape = X_train.shape[1:]
# Normalization layer whose statistics are fitted on the numpy training sample.
normalizer = Normalization()
normalizer.adapt(X_nptrain)

In [None]:
# Serial network: BatchNormalization on the inputs, four tanh Dense layers each
# followed by Dropout, and a single sigmoid output unit.
inputs = keras.Input(shape=input_shape)
# The adapted `normalizer` layer is not wired in; BatchNormalization is used instead.
x = layers.BatchNormalization()(inputs)
for width in (dim_lay1, dim_lay2, dim_lay3, dim_lay4):
    x = layers.Dense(width, activation="tanh")(x)
    x = layers.Dropout(dropout)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# now train model
# Fits for `nepochs` epochs with batches of 4096 rows; validation_split=0.05 sets
# aside 5% of X_train as validation data. The returned `history` is used by the
# plotting cell, and %time reports how long the fit took.
%time history = model.fit(X_train, y_train, epochs=nepochs, batch_size = 4096, validation_split=0.05, verbose=1)

In [None]:
def _plot_training_curve(history, metric, title, ylabel):
    """Plot one metric from a Keras History for the training and validation data.

    Args:
        history: History object returned by ``model.fit``.
        metric: Key in ``history.history``; the validation curve is read from
            ``'val_' + metric``.
        title: Figure title.
        ylabel: Label for the y-axis.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    # The val_* series come from the validation_split of model.fit, not from
    # the separate X_test hold-out, so the legend says 'Validation'.
    ax.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


print('Show trainint statistics...')
# The compiled metric is accuracy, so the y-axis is labelled accordingly.
_plot_training_curve(history, 'accuracy', 'Model accuracy', 'Accuracy')
_plot_training_curve(history, 'loss', 'Model loss', 'loss')

In [None]:
# Evaluate on the hold-out split; index 1 of the result is reported as accuracy.
score = model.evaluate(X_test, y_test)
test_accuracy = round(score[1], 4)
print("Accuracy Score on X_test:  " + str(test_accuracy))

In [None]:
if runjane :
    # specials for janestreet submission
    # The janestreet module is only imported when the submission run is enabled.
    import janestreet
    # NOTE(review): presumably resets a guard that allows make_env() to be called
    # only once per kernel, so this cell can be re-run — confirm against the janestreet API.
    janestreet.make_env.__called__ = False
    env = janestreet.make_env() # initialize the environment
    iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
%%time
if runjane :
    # Training replaced missing values with the column means of train_raw
    # (train = train_raw.fillna(train_raw.mean())). Live rows are imputed with the
    # same means so the model sees inputs prepared like its training data,
    # instead of zeros.
    feature_means = train_raw[features].mean().to_numpy()
    for (test_df, sample_prediction_df) in iter_test:
        # predict only trades that generate results
        if test_df['weight'].item() > 0:
            # A separate name keeps the hold-out X_test from being overwritten.
            live_features = test_df.loc[:, features].to_numpy()
            live_features = np.where(np.isnan(live_features), feature_means, live_features)
            # Catch anything left over (infinite values, or a NaN column mean).
            live_features = np.nan_to_num(live_features)
            # training=False: run the model in inference mode (no dropout).
            y_preds = model(live_features, training=False)
            sample_prediction_df.action = np.where(y_preds >= 0.5, 1, 0).astype(int)
        else:
            sample_prediction_df.action = 0

        submission = env.predict(sample_prediction_df)

    print("test_df prediction completed!")
    print(sample_prediction_df)
    