In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cudf
pd.set_option('display.max_columns', 500)


# Standard plotly imports
import plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import cufflinks as cf
import plotly.figure_factory as ff
import os


import warnings
warnings.filterwarnings("ignore")

## Create Environment

In [None]:
# Competition API module: `env` hands out the test set through `iter_test`, and
# the prediction loop at the end of the notebook answers each batch through
# `env.predict`.
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
import xgboost as xgb
# Record the installed XGBoost version in the cell output.
print("XGBoost version:", xgb.__version__)

In [None]:
%%time
# One root for all competition files, so every read resolves the same way
# regardless of the notebook's current working directory.
DATA_DIR = '/kaggle/input/jane-street-market-prediction'

# The training file is read with cuDF and then converted to a pandas frame,
# which is what the rest of the notebook works with.
train_cudf = cudf.read_csv(f'{DATA_DIR}/train.csv')
train = train_cudf.to_pandas()
del train_cudf  # the cuDF frame is not needed once converted

# The remaining files are read directly with pandas.
features = pd.read_csv(f'{DATA_DIR}/features.csv')
example_test = pd.read_csv(f'{DATA_DIR}/example_test.csv')
sample_prediction_df = pd.read_csv(f'{DATA_DIR}/example_sample_submission.csv')
print("Data is loaded!")

In [None]:
# Report the (rows, columns) shape of every frame loaded above.
# Only the shape tuples are stored, so no extra reference to the frames is kept.
frame_shapes = {
    'train': train.shape,
    'features': features.shape,
    'example_test': example_test.shape,
    'sample_prediction_df': sample_prediction_df.shape,
}
for frame_name, frame_shape in frame_shapes.items():
    print('{} shape is {}'.format(frame_name, frame_shape))

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train.head()

### Missing Values Count

In [None]:
# Number of missing (NaN) cells in each column of the training frame.
missing_values_count = train.isnull().sum()
print(missing_values_count)

# Overall share of missing cells: total NaNs divided by rows * columns.
# np.prod is used because np.product is a deprecated alias that NumPy 2.0 removed.
total_cells = np.prod(train.shape)
total_missing = missing_values_count.sum()
print("% of missing data = ", (total_missing / total_cells) * 100)

# Is the data balanced or not?

In [None]:
# Keep only the rows whose weight is non-zero.
has_weight = train['weight'] != 0
train = train[has_weight]

# Binary target: 1 where resp is strictly positive, 0 otherwise.
train['action'] = train['resp'].gt(0).astype('int')

# Feature matrix = every column whose name contains 'feature'; target = 'action'.
X_train = train.filter(like='feature')
y_train = train['action']

In [None]:
# Model params and some modifications taken from this kernel
# https://www.kaggle.com/wilddave/xgb-starter
#X_train = X_train.fillna(-999)

# Fill null cells with the mean of their column.
# DataFrame.fillna returns a new frame and leaves its input untouched, so the
# result has to be assigned back; a bare call would only display a filled copy
# (which the notebook's output cache would then keep alive).
f_mean = X_train.mean()
X_train = X_train.fillna(f_mean)

In [None]:
# Count the rows per target class once and reuse the result for both axes.
action_counts = train['action'].value_counts()
x = action_counts.index
y = action_counts.values

# One bar per class, coloured by its count on a reversed Viridis scale.
bar_marker = dict(color=y, colorscale='Viridis', reversescale=True)
trace2 = go.Bar(x=x, y=y, marker=bar_marker, name="Imbalance")

# Title plus axis styling: automatic x margin, no grid or axis line on y.
layout = dict(
    title="Data imbalance - action",
    xaxis=go.layout.XAxis(automargin=True),
    yaxis=dict(showgrid=False, showline=False, showticklabels=True),
)

fig1 = go.Figure(data=[trace2], layout=layout)
iplot(fig1)

In [None]:
# Drop the names that are no longer needed; X_train, y_train and f_mean are
# kept because the cells below still use them.
del x, y, train, features, example_test, sample_prediction_df

In [None]:
# Replace the NaNs of each feature column with that column's mean
# (f_mean = X_train.mean(), computed in an earlier cell).
X_train = X_train.fillna(f_mean)

In [None]:
# Add the target as column 'y' so the features can be grouped by class below.
# Note: this mutates X_train in place.
X_train['y'] = y_train

In [None]:
# Mean of every remaining column for each distinct value of 'y'
# (the result is indexed by the class label).
X_train_mean = X_train.groupby('y').mean()

In [None]:
# Absolute gap between the class-0 and class-1 mean of every feature.
class_mean_gap = (X_train_mean.loc[0] - X_train_mean.loc[1]).abs()

# Names of the 20 features with the largest gap, biggest first.
indices = class_mean_gap.sort_values(ascending=False).head(20).index

# Restrict the training matrix to those columns; this also leaves out the
# helper column 'y'. y_train itself is not modified.
X_train = X_train[list(indices)]

## Training
##### To activate GPU usage, simply pass `tree_method='gpu_hist'` (it took me an hour to figure out; I wish the XGBoost documentation were clearer about that).

In [None]:
# Fully connect
import keras
from keras import backend as K
from numpy import loadtxt
from tensorflow.keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.layers import Input, Concatenate, concatenate, BatchNormalization
from keras.models import Model
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import LSTM, Reshape
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.callbacks import TensorBoard
from tensorflow_addons.layers import WeightNormalization
from functools import partial
import tensorflow as tf

# Build and compile a small Keras binary classifier:
#   Input -> LSTM -> BatchNormalization -> Dropout -> Dense(20) -> Dense(1, sigmoid)
# NOTE(review): in the visible cells this model is never fitted or used for
# prediction (the model.fit / model.predict calls are commented out); the
# submitted predictions come from the XGBoost classifier.

# Each sample is declared with shape (number of X_train columns, 1).
# NOTE(review): presumably one LSTM timestep per feature column - confirm.
InputLayer = Input(shape=(X_train.shape[1], 1))

# ConvLayer = Conv1D(filters=20,
#                            kernel_size=20,
#                            padding='valid',
#                            activation='sigmoid',
#                            strides=1)(InputLayer)
# LSTM whose number of units equals the number of X_train columns.
LSTM_Layer = LSTM(X_train.shape[1])(InputLayer)
# PoolingLayer = GlobalMaxPooling1D()(LSTM_Layer)
# PoolingLayer = BatchNormalization()(PoolingLayer)
# Despite its name, PoolingLayer holds the batch-normalised LSTM output; the
# pooling variants above are commented out.
PoolingLayer = BatchNormalization()(LSTM_Layer)
# Dropout with rate 0.4 ahead of the dense layers.
merge = Dropout(0.4)(PoolingLayer)

# Hidden dense layer: 20 units, sigmoid activation.
merge = Dense(20, activation='sigmoid')(merge)

# Single sigmoid output unit, matching the binary 'action' target.
OutputLayer = Dense(1,
                                        activation='sigmoid',
                                        #kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
                                        #activity_regularizer=regularizers.l2(1e-3)
                   )(merge)

model = Model(inputs=InputLayer, outputs=OutputLayer)
# merged = WeightNormalization(Dense(1000, activation='relu'))(merged)
# merged = BatchNormalization()(merged)
# Metrics reported by Keras during fit/evaluate.
METRICS = [
            'accuracy'
        ]
# Binary cross-entropy loss optimised with Adam.
model.compile(loss='binary_crossentropy',
              #loss="binary_crossentropy",
              optimizer='adam',
              metrics=METRICS)

In [None]:
# Hold out 2000 randomly drawn rows (fixed seed) as a validation set and look
# up their labels by index.
X_train_val = X_train.sample(n=2000, random_state=1)
y_train_val = y_train.loc[X_train_val.index]
validation = (X_train_val, y_train_val)

# Every row that was not held out remains available for training.
X_train_sampling = X_train.drop(index=X_train_val.index)
y_train_sampling = y_train.loc[X_train_sampling.index]


# for random_state in range(10):
#     X_train_batch = X_train_sampling.sample(1000,random_state = random_state)
#     y_train_batch = y_train.loc[X_train_batch.index]
    
#     model.fit(X_train_batch, y_train_batch, validation_data = validation, epochs=50)

    


In [None]:
# Draw a single 1000-row batch from the non-validation rows with seed 0.
# The batch is not used by the fit below.
random_state = 0
X_train_batch = X_train_sampling.sample(n=1000, random_state=random_state)
y_train_batch = y_train.loc[X_train_batch.index]

# Hyper-parameters of the gradient-boosted tree classifier.
xgb_params = dict(
    n_estimators=500,
    max_depth=11,
    min_child_weight=9.15,
    gamma=0.59,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.7,
    alpha=10.4,
    nthread=5,
    missing=-999,
    random_state=2020,
    tree_method='gpu_hist',  # the setting the Training note credits with enabling GPU usage
)
clf = xgb.XGBClassifier(**xgb_params)

# Fit on every row except the validation hold-out.
holdout_index = X_train_val.index
clf.fit(X_train.drop(holdout_index), y_train.drop(holdout_index))

In [None]:
# %time clf.fit(X_train, y_train)

In [None]:
# Columns the classifier was fitted on, in training order. X_train was cut down
# to the selected features before clf.fit, so passing every 'feature*' column
# of the test frame would not match the fitted model.
model_features = list(X_train.columns)

# Training-set means of those columns, so test rows get the same NaN
# imputation that was applied to the training rows.
model_feature_means = f_mean[model_features]

# Answer each test batch served by the competition environment.
for (test_df, sample_prediction_df) in iter_test:
    X_test = test_df[model_features].fillna(model_feature_means)

    y_preds = clf.predict(X_test)
    # Bracket assignment always writes the column; attribute assignment only
    # does so when the column already exists.
    sample_prediction_df['action'] = y_preds
    env.predict(sample_prediction_df)