In [None]:
import warnings

import numpy as np
import pandas as pd

# Notebook-wide configuration.
# Let wide frames render up to 500 columns instead of being elided.
pd.set_option('display.max_columns', 500)
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
import janestreet
import xgboost as xgb

# Create the janestreet environment object and ask it for its test iterator.
env = janestreet.make_env()
iter_test = env.iter_test()

# Record the library version in the notebook output.
print("XGBoost version:", xgb.__version__)

In [None]:
# Read the four competition CSV files, in the order the names are listed.
# NOTE: `features` is rebound further down the notebook to a list of column
# names, which replaces the frame loaded here.
train, features, example_test, sample_prediction_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jane-street-market-prediction/train.csv',
        '../input/jane-street-market-prediction/features.csv',
        '../input/jane-street-market-prediction/example_test.csv',
        '../input/jane-street-market-prediction/example_sample_submission.csv',
    )
)

In [None]:
# Print the (rows, columns) shape of each loaded frame, one line per frame.
for shape_template, frame in (
    ('train shape is {}', train),
    ('features shape is {}', features),
    ('example_test shape is {}', example_test),
    ('sample_prediction_df shape is {}', sample_prediction_df),
):
    print(shape_template.format(frame.shape))

In [None]:
# Display the first rows of the training frame as the cell output.
train.head()

## Handle Missing Values

In [None]:
# Per-column count of missing values in the training frame.
print(train.isna().sum())

In [None]:
# Remove columns in which every value is missing; rows are left untouched.
train = train.dropna(axis="columns", how="all")

In [None]:
# Names of the model inputs: every remaining column whose name contains
# the substring 'feature'. This rebinds `features` to a plain list.
features = [column_name for column_name in train.columns if 'feature' in column_name]

In [None]:
# Build the training matrix and the binary target from `train`.

# Keep only the rows whose weight is non-zero.
train = train[train['weight'] != 0]

# Binarize the target: 1 where `resp` is strictly positive, 0 otherwise.
# The label is built as its own Series (same index, name 'action') rather
# than written back as a new column on the filtered frame; assigning into a
# filtered frame is the pattern pandas flags with SettingWithCopyWarning,
# which the notebook-wide warning filter would otherwise hide.
y_train = pd.Series(
    (train['resp'].values > 0).astype(int),
    index=train.index,
    name='action',
)

# The previous `f_mean = train.mean(); train.fillna(f_mean)` pair was removed:
# `fillna` returns a new frame and the result was never assigned, so it
# changed nothing while paying for a mean over every column. Missing values
# are therefore still present in X_train after this cell, exactly as before.

# Select the feature columns, then release the full frame to free memory
# before training.
X_train = train.loc[:, features]
del train

print('Finished.')

In [None]:
# Hyper-parameters for the gradient-boosted classifier, kept in one mapping
# so they can be inspected or logged separately from the estimator.
xgb_params = {
    'n_estimators': 400,
    'max_depth': 7,
    'eta': 0.5,                  # learning rate
    'missing': None,
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'subsample': 0.8,
    'colsample_bytree': 1,
    'verbosity': 2,              # info-level logging
}
# Options tried but left disabled:
#   sampling_method='gradient_based'
#   eval_metric='logloss'

clf = xgb.XGBClassifier(**xgb_params)

In [None]:
f_mean = X_train.mean()
X_train.fillna(f_mean)

In [None]:
%time 
clf.fit(X_train, y_train)

In [None]:
print('Creating submissions file...', end='')
rcount = 0
for (test_df, prediction_df) in env.iter_test():
    X_test = test_df.loc[:, features]
    y_preds = clf.predict(X_test)
    prediction_df.action = y_preds
    env.predict(prediction_df)
    rcount += len(test_df.index)
print(f'Finished processing {rcount} rows.')