In [None]:
import multiprocessing
import gc

import pandas as pd
import xgboost as xgb

import janestreet


### Read in the training data

In [None]:
# Read the competition training CSV into a single in-memory DataFrame
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jane-street-market-prediction/train.csv',
)

### Drop the zero-weight rows, then split the data into the feature matrix and a binary target

In [None]:
# Keep only the rows whose weight is non-zero
data = data.loc[data['weight'].ne(0)]

# Settings: the label is derived from 'resp'; every column whose name
# contains 'feature' is used as a model input
target = 'resp'
features = [col for col in data.columns if 'feature' in col]

# X holds the input columns; y is 1 where the target is strictly positive, else 0
X = data[features]
y = data[target].gt(0).astype(int)

### The data is large, so delete the raw frame and run the garbage collector to free memory

In [None]:
# Remove the `data` name so the full training frame can be reclaimed;
# X and y built in the previous cell are kept.
del data
# Force a collection pass right away instead of waiting for the next automatic one.
gc.collect()

### Define and train an XGBoost classifier on the GPU. No imputation is needed because XGBoost handles NaN values natively

In [None]:
# Train model
#
# Gradient-boosted tree classifier fitted on the feature matrix X against the
# binary label y. No imputation is applied beforehand; missing values in X are
# passed to XGBoost as-is.
model = xgb.XGBClassifier(
    random_state=1337,  # fixed seed for reproducibility
    # GPU histogram tree builder.
    # NOTE(review): newer XGBoost releases spell this tree_method='hist' with
    # device='cuda' -- confirm against the installed version before changing.
    tree_method='gpu_hist',
    # `n_jobs` is the scikit-learn wrapper's thread-count parameter; it replaces
    # the deprecated `nthread` alias. Use every CPU core reported by the OS.
    n_jobs=multiprocessing.cpu_count()
)
model.fit(X, y)

### Create submission file

In [None]:
# Create submission
# Build the competition environment and obtain the iterator that yields
# (test_df, sample_prediction_df) pairs consumed by the prediction loop.
# NOTE(review): the janestreet API presumably allows make_env() only once per
# kernel session -- confirm; re-running this cell may require a kernel restart.
env = janestreet.make_env()
iter_test = env.iter_test()

In [None]:
# Stream through the test batches: for each one, predict the binary action from
# the same feature columns used in training and hand the filled-in prediction
# frame back to the environment.
for (test_df, sample_prediction_df) in iter_test:
    X_test = test_df.loc[:, features]
    # Item assignment always writes a DataFrame column. Attribute assignment
    # (`df.action = ...`) would silently set a plain Python attribute instead
    # if the 'action' column were ever missing.
    sample_prediction_df['action'] = model.predict(X_test)
    env.predict(sample_prediction_df)

print('Finished')