In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from xgboost import plot_importance

In [None]:
# Read the competition training data and the example test file into
# DataFrames (train_df first, then test_df), using pandas' default CSV options.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/jane-street-market-prediction/train.csv",
        "../input/jane-street-market-prediction/example_test.csv",
    )
)

In [None]:
# Plain-text overview of the training frame. Each entry pairs the heading to
# print with a callable producing the value, so every summary is computed
# right before it is printed (same order as listed here).
overview_sections = (
    ("Data shape : \n\n", lambda: train_df.shape),
    ("Data colname : \n\n", lambda: train_df.columns),
    ("Data head : \n\n", lambda: train_df.head()),
    ("Data describe : \n\n", lambda: train_df.describe()),
    ("Data types : \n\n", lambda: train_df.dtypes),
    ("Data Null count : \n\n", lambda: train_df.isnull().sum(axis=0)),
)
for section_label, compute_section in overview_sections:
    # print() joins the three arguments with single spaces.
    print(section_label, compute_section(), "\n\n")

In [None]:
# Model inputs: every column whose name contains 'feature'.
features = [c for c in train_df.columns if 'feature' in c]

# Replace missing values with the column mean. Columns are handled one at a
# time, so only a single column is copied per step rather than the whole
# feature block.
for feature_name in features:
    column_mean = train_df[feature_name].mean()
    train_df[feature_name] = train_df[feature_name].fillna(column_mean)

# Binary target: 1 when the row's `resp` is strictly positive, 0 otherwise.
train_df['action'] = (train_df['resp'].values > 0).astype(int)

X_train = train_df.loc[:, features]
# Pick the label by name instead of by position (`iloc[:, -1]`): the
# positional lookup only works while 'action' happens to be the last column
# and would silently return a different column otherwise.
y_train = train_df['action']

In [None]:
# Fit the scaler on the full feature frame, then overwrite X_train with the
# scaled array. `sc` is kept because the submission loop reuses it to scale
# incoming test rows.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)

In [None]:
# Hold out 30% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
(x_train_split, x_test_split,
 y_train_split, y_test_split) = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=777,
    stratify=y_train,
)

In [None]:
# Train the gradient-boosted classifier on the training split and evaluate it
# on the held-out split.
xgb = XGBClassifier(n_estimators=20, learning_rate=0.1, max_depth=50)
xgb.fit(x_train_split, y_train_split)
pred = xgb.predict(x_test_split)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and
# binary F1 are unaffected by swapping the two, but precision is not: with the
# arguments reversed, precision_score returns recall. Pass the ground truth
# first so the "Precision score" line reports precision.
Acc_score = accuracy_score(y_test_split, pred)
F1_score = f1_score(y_test_split, pred)
Precision_score = precision_score(y_test_split, pred)
print("Acc score : {:.2f}".format(Acc_score))
print("F1 score : {:.2f}".format(F1_score))
print("Precision score : {:.2f}".format(Precision_score))

In [None]:
import janestreet
from tqdm import tqdm

# Competition environment and its iterator over (test rows, prediction frame).
env = janestreet.make_env()
iter_test = env.iter_test()

# Training-set column means for every feature except the first one; used to
# impute missing values in incoming test rows.
# NOTE(review): the first feature is never imputed here although the training
# cell imputes all features -- presumably it has no missing values; confirm.
f_mean = np.mean(train_df[features[1:]].values, axis=0)

# The loop variable is named `test_batch` so it does not overwrite the
# `test_df` frame loaded from example_test.csv earlier in the notebook.
for (test_batch, pred_df) in tqdm(iter_test):
    # `.item()` requires the batch to hold exactly one row.
    if test_batch['weight'].item() > 0:
        # Work on an explicit copy: the array returned for a frame may alias
        # the frame's data (and is read-only under pandas Copy-on-Write), so
        # writing into it directly is unsafe.
        x_tt = test_batch.loc[:, features].to_numpy(copy=True)

        # Detect NaNs with a mask instead of summing the row: a sum is also
        # NaN for inf + -inf, and the mask lets only the NaN cells be replaced
        # while all other values stay untouched.
        nan_mask = np.isnan(x_tt[:, 1:])
        if nan_mask.any():
            x_tt[:, 1:] = np.where(nan_mask, f_mean, x_tt[:, 1:])

        # Rebuild a DataFrame with the feature names before scaling.
        x_tt = pd.DataFrame(x_tt, columns=features)
        x_tt = sc.transform(x_tt)
        pred = xgb.predict(x_tt)
        pred_df.action = pred.astype(int)
    else:
        # Rows whose weight is not positive always get action 0; the model is
        # not evaluated for them.
        pred_df.action = 0
    env.predict(pred_df)