In [None]:
import numpy as np
import pandas as pd
import os

import janestreet
import xgboost as xgb
from sklearn.feature_selection import RFE

In [None]:
# Build the competition environment and obtain its test iterator.
# `iter_test` is consumed by the prediction loop in the final cell, and
# `env.predict(...)` is called there once per iteration.
env = janestreet.make_env()
iter_test = env.iter_test()

In [None]:
# Every competition file lives in the same directory. Build all four paths
# from one root so they cannot drift apart: previously train.csv used an
# absolute path while the other three were relative to the working directory.
DATA_DIR = '/kaggle/input/jane-street-market-prediction'

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
# NOTE(review): `features` and `example_test` are not referenced again in the
# visible cells, and `sample_prediction_df` is rebound by the prediction loop.
features = pd.read_csv(os.path.join(DATA_DIR, 'features.csv'))
example_test = pd.read_csv(os.path.join(DATA_DIR, 'example_test.csv'))
sample_prediction_df = pd.read_csv(os.path.join(DATA_DIR, 'example_sample_submission.csv'))

In [None]:
# Keep only rows whose weight is non-zero. `.copy()` detaches the result from
# the original frame so that adding the `action` column below writes to an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
train = train[train['weight'] != 0].copy()

# Binary target: 1 where weight * resp is strictly positive, otherwise 0.
train['action'] = ((train['weight'].values * train['resp'].values) > 0).astype('int')

# Renumber rows 0..n-1. The cross-validation cell addresses rows by position,
# so labels and positions must coincide. `drop=True` discards the old index
# directly rather than materialising it as an 'index' column and dropping it.
train = train.reset_index(drop=True)

# Model inputs are all columns whose name contains 'feature'; the label is `action`.
X_train = train.loc[:, train.columns.str.contains('feature')]
y_train = train.loc[:, 'action']

In [None]:
# Replace missing feature values with the sentinel -999. The same value is
# passed as 'missing' in the XGBoost params and is applied to each test row
# before prediction, so train and test are encoded identically.
X_train = X_train.fillna(-999)

In [None]:
# Drop the name `train`; X_train and y_train, derived from it above, are what
# the modelling cells use from here on.
del train

In [None]:
# Keyword arguments unpacked into xgb.XGBClassifier(**params) by the training cell.
params = dict(
    # Ensemble size, tree depth, and shrinkage.
    n_estimators=500,
    max_depth=11,
    learning_rate=0.05,
    # Row and column subsampling ratios.
    subsample=0.9,
    colsample_bytree=0.7,
    # Same sentinel that fillna(-999) writes into the feature frames.
    missing=-999,
    random_state=666,
    tree_method='gpu_hist',
)

In [None]:
from sklearn.model_selection import KFold

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Disabled experiment, kept for reference: recursive feature elimination down
# to 120 features before training. Not executed.
# rfe = RFE(
#     estimator=xgb.XGBClassifier(
#         random_state=666,
#         tree_method='gpu_hist'
#     ),
#     n_features_to_select=120
# )
# rfe.fit(X_train, y_train)
# X_transformed = rfe.transform(X_train)
# X_transformed = pd.DataFrame(X_transformed)

# 3-fold cross-validation. Each fold's model is kept in `models` for the
# prediction cell, and `res` collects the out-of-fold predictions so the final
# accuracy is measured only on rows the predicting model never saw.
models = []
res = y_train.copy()
splitter = KFold(n_splits=3, random_state=666, shuffle=True)
for n, (tr, te) in enumerate(splitter.split(y_train)):
    print(f'Fold {n}')

    # `tr` / `te` are positional row numbers produced by KFold, so rows are
    # selected and written back with .iloc. This stays correct whatever the
    # index labels are, and avoids rebuilding DataFrames from the full
    # `X_train.values` array on every fold.
    model = xgb.XGBClassifier(**params)
    model.fit(X_train.iloc[tr], y_train.iloc[tr])

    res.iloc[te] = model.predict(X_train.iloc[te])
    models.append(model)

# Out-of-fold accuracy over the whole training set.
print(accuracy_score(y_train, res))

In [None]:
# Stream the test set: for every (test_df, sample_prediction_df) pair, decide
# the action, write it into the submission frame and hand it to the environment.
for (test_df, sample_prediction_df) in iter_test:
    # Default to "no action"; only rows with positive weight are scored.
    # `.item()` requires test_df to hold exactly one row.
    action = 0
    if test_df['weight'].item() > 0:
        X_test = test_df.loc[:, test_df.columns.str.contains('feature')]
        # Same missing-value sentinel as used for the training features.
        X_test = X_test.fillna(-999)
#         X_test = rfe.transform(X_test)
#         X_test = pd.DataFrame(X_test)
        # Majority vote across all fold models. With the three models trained
        # above this is the same rule as "at least 2 of 3", but it no longer
        # hard-codes the number of models.
        votes = sum(int(model.predict(X_test)[0]) for model in models)
        if 2 * votes > len(models):
            action = 1
    sample_prediction_df.action = action
    env.predict(sample_prediction_df)