In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read only the first two million rows of the training CSV, then print a
# summary of the resulting frame (dtypes, non-null info, memory usage).
train_set = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    nrows=2_000_000,
)
train_set.info()

In [None]:
# Show the column labels of the loaded training frame.
train_set.columns

In [None]:
# Preview the first rows of the raw training data.
train_set.head()

In [None]:
# Discard rows whose weight is exactly zero; only the remaining rows are used for training.
train = train_set[train_set['weight'].ne(0)]

In [None]:
# Binary target: 1 when `resp` is strictly positive, otherwise 0.
# - The comparison is vectorised instead of calling a Python lambda per row.
# - `assign` returns a new frame, so the column is never written into a
#   filtered slice of `train_set` (which triggers SettingWithCopyWarning).
# Re-running this cell simply recomputes the same column.
train = train.assign(action=(train['resp'] > 0).astype(int))

In [None]:
# Preview the filtered training frame, now including the `action` column.
train.head()

In [None]:
# Model inputs: every column whose name contains the substring 'feature'.
features = [name for name in train.columns if 'feature' in name]

In [None]:
# Split the frame into the feature matrix and the binary target vector.
X, y = train[features], train['action']

In [None]:
# Preview the feature matrix before any imputation.
X.head()

# Data Cleaning
NULL values are observed in feature_120 through feature_121. We need to fill them first, and we also need to remove feature_0.

In [None]:
# Per-feature spread (max minus min); used below to derive the fill value
# for missing entries.
feature_max = X[features].max()
feature_min = X[features].min()
val_range = feature_max - feature_min
val_range

In [None]:
# Fill value per feature: 1% of the feature's range below its observed
# minimum, so imputed entries sit just outside the real data.
below_min = X[features].min() - 0.01 * val_range
missing_substitute = pd.Series(below_min, index=features)

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the per-feature fill values computed above.
missing_substitute.plot.bar(
    figsize=(20, 5),
    rot=90,
    color='green',
)
plt.show()

In [None]:
# Total number of null cells across the whole feature matrix.
n_missing = X.isnull().sum().sum()
print("Now we have %d missing values in our data" % n_missing)

In [None]:
# Replace each missing value with that column's entry in `missing_substitute`
# (fillna with a Series matches fill values to columns by label).
# `X` is rebound to the filled frame rather than assigned into: `X` was taken
# as a slice of `train`, and writing into a slice triggers
# SettingWithCopyWarning. Re-running this cell is a no-op.
X = X[features].fillna(missing_substitute)

In [None]:
# Preview the feature matrix after missing values have been filled.
X.head()

In [None]:
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# `random_state` is fixed so the split is identical on every run; the global
# `random` / `numpy` seeds are only set in a later cell, so without it the
# split (and the validation score) would differ each time.
# The literal matches the notebook-wide SEED value defined further down.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=9899
)

# XGBoost

In [None]:
import random
import numpy as np

# Single seed shared by the model and by both global random number generators.
SEED = 9899

# Seed Python's `random` module first, then NumPy's legacy global generator.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [None]:
import xgboost as xgb
clf = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.01,
    gamma = 0.3,
    min_child_weight=5,
    random_state=SEED,
    subsample=0.8, 
    colsample_bytree= 0.8,
    eval_metric = "error",
    use_label_encoder=False,
    scale_pos_weight=1,
    nthread=4,
    tree_method='gpu_hist'  # THE MAGICAL PARAMETER
)
%time clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

# Predict hard 0/1 labels for the held-out rows.
predictions = clf.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order; the arguments were
# previously swapped. For unweighted MAE the value is the same either way.
# Because both arrays hold 0/1 labels, this MAE equals the fraction of
# misclassified validation rows.
validation_mae = mean_absolute_error(y_valid, predictions)
print("Mean Absolute Error: " + str(validation_mae))

In [None]:
import time
from tqdm.notebook import tqdm
import janestreet

In [None]:
TRAINING = True

start_time = time.time()

env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# Iterate the iterator created above instead of calling env.iter_test() a
# second time.
for (test_df, sample_prediction_df) in tqdm(iter_test):
    if test_df['weight'].item() > 0:
        # Each test_df holds a single row (`.item()` above requires that), so
        # the column mean of a missing value is itself NaN and mean-imputation
        # filled nothing. Use the same per-feature fill values the model was
        # trained with, so inference inputs match the training distribution.
        X_test = test_df.loc[:, features].fillna(missing_substitute)
        y_preds = clf.predict(X_test)
        sample_prediction_df.action = y_preds.astype(int)
    else:
        # Zero-weight rows are always submitted as action 0.
        sample_prediction_df.action = 0
    env.predict(sample_prediction_df)

print(f"took: {time.time() - start_time} seconds")