In [None]:
import gc

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
import janestreet

In [None]:
# Read the competition training set from the Kaggle input directory.
data = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv'
)

In [None]:
# Clean data: drop the rows whose weight is zero (treated as useless here).
data = data.loc[data['weight'] != 0]
print(f'Cleaned Weight Data equal Zero. Raw data shape is {data.shape}')
# Per-column medians of the remaining rows; reused later to impute NaNs in
# both the training features and the test batches.
train_median = data.median()
# Settings: the label column and every column whose name contains 'feature'.
target = 'resp'
features = [col for col in data.columns if 'feature' in col]
# Split data into X (feature matrix) and Y (1 where resp > 0, otherwise 0).
X = data.loc[:, features]
Y = (data.loc[:, target] > 0).astype(int)
print('Split data into X and Y - Finished')

In [None]:
# X, Y and train_median have already been derived from the raw frame, so the
# `data` name is dropped and a garbage-collection pass is requested.
del data
# Last expression of the cell: the notebook displays gc.collect()'s return value.
gc.collect()

In [None]:
# Replace every NaN in the feature matrix with the matching column median
# (fillna aligns the train_median Series on X's column labels).
X = X.fillna(value=train_median)
print('Fill data with NAN values correction - Finished')

In [None]:
# Before we perform PCA, we need to normalise the features so that they have
# zero mean and unit variance; fit_transform learns the statistics and applies
# them in one call, and `scaler` is kept for the test batches.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
# random_state pins the result when scikit-learn's default 'auto' solver picks
# the randomized SVD (the choice depends on data shape and library version),
# so repeated runs produce the same 60 components.
pca = PCA(n_components=60, random_state=0).fit(X_norm)
# Project with transform() rather than fit_transform() so the training matrix
# goes through exactly the same code path as the test batches at inference.
X_transform = pca.transform(X_norm)
print('PCA perform - Finished')

In [None]:
# Train model: gradient-boosted tree classifier on the PCA-projected features.
# NOTE(review): XGBoost >= 2.0 deprecates tree_method='gpu_hist' in favour of
# tree_method='hist' together with device='cuda' - switch once the runtime's
# XGBoost version is confirmed to support it.
model = xgb.XGBClassifier(
    tree_method='gpu_hist',  # GPU histogram tree construction
    n_jobs=4,  # documented scikit-learn-API parameter; `nthread` is the legacy alias
)
model.fit(X_transform, Y)
print('Train Model - Finished')

In [None]:
# We impute the missing values with the medians
def fillna_npwhere(array, values):
    """Return `array` with its NaN entries replaced by entries from `values`.

    Parameters
    ----------
    array : numpy.ndarray
        Numeric array that may contain NaN.
    values : array-like
        Replacement values; must broadcast against `array`
        (e.g. one value per column).

    Returns
    -------
    numpy.ndarray
        `array` itself (same object) when its sum is not NaN; otherwise a new
        array built with ``np.where``. The input is never modified in place.
    """
    # A NaN anywhere makes the total NaN, so one reduction is enough to decide
    # whether the element-wise replacement is needed at all.
    total = array.sum()
    if not np.isnan(total):
        return array
    missing = np.isnan(array)
    return np.where(missing, values, array)

In [None]:
# Create submission
env = janestreet.make_env()
iter_test = env.iter_test()
# The median vector does not change between batches: extract it once instead
# of re-indexing train_median on every iteration of the loop.
median_values = train_median[features].values
for (test_df, sample_prediction_df) in iter_test:
    # Same preprocessing chain as training:
    # median imputation -> standardisation -> PCA projection.
    batch = fillna_npwhere(test_df[features].values, median_values)
    batch = pca.transform(scaler.transform(batch))
    # Column indexing always targets the 'action' column, whereas attribute
    # assignment only does so if that column already exists.
    sample_prediction_df['action'] = model.predict(batch)
    env.predict(sample_prediction_df)