In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import xgboost as xgb
import sklearn
import tqdm
import random
import janestreet

In [None]:
# One seed shared by NumPy's and Python's global random number generators.
SEED = 9899
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Every competition file lives in the same Kaggle input directory. Resolve all
# three paths from one constant so they cannot disagree (the original mixed an
# absolute "/kaggle/input/..." path with working-directory-relative "../input/..."
# paths, which only coincide when the notebook runs from /kaggle/working).
DATA_DIR = "/kaggle/input/jane-street-market-prediction"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
example_test = pd.read_csv(f"{DATA_DIR}/example_test.csv")
sample_prediction_df = pd.read_csv(f"{DATA_DIR}/example_sample_submission.csv")

In [None]:
# Preview the first five rows of the freshly loaded training frame.
train.head(5)

In [None]:
def eda(data):
    """Print a quick textual overview of a DataFrame.

    Sections, in order: the first five rows, the ``DataFrame.info()`` summary,
    the column dtypes, per-column null counts via ``isnull()``, the same counts
    via ``isna()``, and the frame's shape.

    Args:
        data: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    print("----------Top-5- Record----------")
    print(data.head(5))
    print("-----------Information-----------")
    # info() writes its own report and returns None, so wrapping it in print()
    # appended a stray "None" line after the report. Call it directly instead.
    data.info()
    print("-----------Data Types-----------")
    print(data.dtypes)
    print("----------Missing value-----------")
    print(data.isnull().sum())
    # isna() reports the same counts as isnull(); both sections are kept so the
    # printed layout stays the same as before.
    print("----------Null value-----------")
    print(data.isna().sum())
    print("----------Shape of Data----------")
    print(data.shape)

def graph_insight(data):
    """Print the distinct column dtypes, then histogram the float64/int64 columns.

    Args:
        data: pandas DataFrame to inspect. Only columns whose dtype is exactly
            ``float64`` or ``int64`` are plotted.

    Returns:
        None. The dtype set is printed and the histograms are drawn as a side
        effect.
    """
    distinct_dtypes = set(data.dtypes.tolist())
    print(distinct_dtypes)
    numeric_frame = data.select_dtypes(include=['float64', 'int64'])
    numeric_frame.hist(figsize=(16, 16), bins=50, xlabelsize=8, ylabelsize=8)
    
def drop_duplicate(data, subset):
    """Drop duplicate rows from ``data`` in place and report how many were removed.

    Args:
        data: pandas DataFrame; modified in place (duplicates removed, index
            renumbered from zero).
        subset: list of the columns that take part in the duplicate check.

    Returns:
        None. Shapes before/after and the number of removed rows are printed.
    """
    shape_before = data.shape
    print('Before drop shape:', shape_before)
    # Keep the first occurrence of every duplicated key, then renumber the rows.
    data.drop_duplicates(subset=subset, keep='first', inplace=True)
    data.reset_index(drop=True, inplace=True)
    shape_after = data.shape
    print('After drop shape:', shape_after)
    print('Total Duplicate:', shape_before[0] - shape_after[0])

In [None]:
# Print the dtypes present in train and draw histograms of its float64/int64 columns.
graph_insight(train)

In [None]:
# Print the textual overview of train (head, info, dtypes, null counts, shape).
eda(train)

In [None]:
# Overwrite each auxiliary response column with a 0/1 flag that is 1 exactly
# when weight * resp_k is strictly positive.
for resp_col in ('resp_1', 'resp_2', 'resp_3', 'resp_4'):
    train[resp_col] = ((train[resp_col].values * train['weight']) > 0).astype(int)

In [None]:
# Keep only rows with a non-zero weight and with date > 85, then renumber the index.
train = (
    train.loc[train['weight'] != 0]
    .query('date > 85')
    .reset_index(drop=True)
)

# Downcast every float64 column to float32 to limit memory use.
train = train.astype(dict.fromkeys(train.select_dtypes(include='float64').columns, np.float32))

# Binary target: 1 when weight * resp is strictly positive, otherwise 0.
train['action'] = (train['weight'].values * train['resp'].values > 0).astype('int')

# Fill remaining NaNs with the column means of the filtered frame.
train.fillna(train.mean(), inplace=True)

features = [col for col in train.columns if 'feature' in col]

# Random 80/20 split: df_valid is every row that the sample did not pick.
df_train = train.sample(frac=0.8, random_state=0)
df_valid = train.drop(index=df_train.index)

# Model inputs are the columns whose name contains 'feature'; the label is 'action'.
X_train, y_train = df_train.loc[:, df_train.columns.str.contains('feature')], df_train['action']
X_valid, y_valid = df_valid.loc[:, df_valid.columns.str.contains('feature')], df_valid['action']


In [None]:
# Number of columns collected in `features`.
len(features)

In [None]:
# Regression plot of action against one feature, with a separate colour and
# panel for every value of feature_0 (three panels per row).
feature = 'feature_11'
sns.lmplot(
    data=df_train,
    x=feature,
    y="action",
    hue="feature_0",
    col="feature_0",
    col_wrap=3,
    height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Pairwise correlation of the X_train columns, computed once.
corrmat = X_train.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# Plot the heat map. corrmat is already the correlation matrix over exactly the
# columns in top_corr_features, so it is reused instead of recomputing
# X_train[top_corr_features].corr() a second time.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

feature_0, the only categorical variable, does not seem to have any impact on the results.

In [None]:
# xgboost is already imported in the notebook's first cell; this re-import is a no-op.
import xgboost as xgb
# Show which XGBoost release this run uses.
print("XGBoost version:", xgb.__version__)

The features below were selected by taking the features with the top importance scores and then removing the correlated ones.

In [None]:
# Feature subset used both to fit the classifier and to query it at inference time.
selected_features = [
    'feature_0', 'feature_1', 'feature_3', 'feature_6',
    'feature_20', 'feature_27',
    'feature_31', 'feature_37', 'feature_39',
    'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45',
    'feature_60', 'feature_62',
    'feature_83',
    'feature_107',
]

In [None]:
clf = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.01,
    gamma = 0.3,
    min_child_weight=5,
    random_state=SEED,
    subsample=0.8, 
    colsample_bytree= 0.8,
    eval_metric = "error",
    use_label_encoder=False,
    scale_pos_weight=1,
    nthread=4,
    tree_method='gpu_hist'  # THE MAGICAL PARAMETER
)
%time clf.fit(X_train[selected_features], y_train)


Selecting features by thresholding on feature importance and then removing the correlated features raises the score from 0.5635 to 0.5695.

The XGBoost parameters went through many rounds of tuning; both the baseline model and `clf` use only the tuned values.

In [None]:
import time
from tqdm.notebook import tqdm

In [None]:
TRAINING = True

start_time = time.time()

env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# Column means of the training frame, used to impute missing test values.
# The training frame was itself mean-imputed, which leaves each column mean
# unchanged, so these are the same values the model saw in place of NaNs.
train_feature_means = train[selected_features].mean()

for (test_df, sample_prediction_df) in iter_test:
    # .item() requires test_df to hold exactly one row per iteration.
    if test_df['weight'].item() > 0:
        X_test = test_df.loc[:, features]
        # The previous X_test.fillna(X_test.mean(), inplace=True) was a no-op:
        # on a one-row frame the mean of a missing value is itself NaN, so
        # nothing was filled (and it mutated a slice of test_df). Impute with
        # the training means instead, and without in-place mutation.
        select_X_test = X_test[selected_features].fillna(train_feature_means)
        y_preds = clf.predict(select_X_test)
        sample_prediction_df.action = y_preds.astype(int)
    else:
        # Zero-weight rows are always answered with action 0.
        sample_prediction_df.action = 0
    env.predict(sample_prediction_df)

print(f"took: {time.time() - start_time} seconds")