In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import datatable as dt
import lightgbm as lgb
import gc
import pickle
from tqdm import tqdm
import os

In [None]:
# Read only the first 1000 rows of the competition training file with pandas.
# Alternative loader for this same file, via datatable:
#   dt.fread(train_csv_path).to_pandas()
train_csv_path = '../input/jane-street-market-prediction/train.csv'
data = pd.read_csv(train_csv_path, nrows=1000)

In [None]:
# Column roles: `target_cols` holds the column the labels are derived from,
# `useless_cols` the columns that are kept out of the model inputs, and every
# remaining column of `data` is treated as a feature.
target_cols = ['resp']
useless_cols = ['date', 'weight', 'ts_id', 'resp_1', 'resp_2', 'resp_3', 'resp_4']
# Drop the names from the column Index rather than from the frame itself, so
# no copy of the data is built just to read the remaining column names.
# Like DataFrame.drop, Index.drop raises KeyError if a listed name is absent.
feature_cols = data.columns.drop(useless_cols + target_cols).tolist()

In [None]:
# Histogram of the raw target column, using matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(data[target_cols[0]])
plt.show()

In [None]:
# Print selected percentiles of the target, one "<percentile> <value>" pair
# per line. All levels are evaluated in a single np.percentile call.
percentile_levels = [10, 25, 50, 75, 90]
percentile_values = np.percentile(data[target_cols[0]].values, percentile_levels)
for level, value in zip(percentile_levels, percentile_values):
    print(level, value)

In [None]:
# Bucket the target into six ordinal classes, stored in data['action']:
#   0: resp < -0.02            3: 0     <= resp < 0.01
#   1: -0.02 <= resp < -0.01   4: 0.01  <= resp < 0.02
#   2: -0.01 <= resp < 0       5: resp >= 0.02
# The band edges live in one list so the thresholds are not repeated.
action_edges = [-0.02, -0.01, 0.0, 0.01, 0.02]
resp_values = data[target_cols[0]].values
# With right=False, np.digitize returns i such that edges[i-1] <= x < edges[i]
# (0 below the first edge, len(edges) at or above the last), which is exactly
# the half-open banding listed above.
action_labels = np.digitize(resp_values, action_edges)
# A NaN target matches none of the bands; keep such rows in the default
# class 0 instead of the top class that digitize would assign to NaN.
data['action'] = np.where(pd.isna(resp_values), 0, action_labels)

In [None]:
# Class balance of the derived labels. `action` holds the integer classes
# 0..5, so use one unit-wide bin centred on each class. With the default 10
# equal-width bins the bars sit off-centre and, when all classes are present,
# the bars for classes 4 and 5 end up directly next to each other.
n_action_classes = 6
fig, ax = plt.subplots()
ax.hist(data['action'], bins=np.arange(n_action_classes + 1) - 0.5, rwidth=0.8)
ax.set_xticks(range(n_action_classes))
ax.set_xlabel('action class')
ax.set_ylabel('number of rows')
ax.set_title('Distribution of action labels')
plt.show()

In [None]:
# LightGBM settings for the six-class `action` model.
# NOTE(review): 'subsample', 'subsample_freq', 'n_estimators' and 'random_seed'
# are presumably LightGBM aliases (bagging fraction / bagging frequency /
# number of iterations / seed) - confirm against the LightGBM parameter docs.
params_k = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=6,                  # one class per `action` value 0..5
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2**11 - 1,         # 2047
    min_data_in_leaf=2**12 - 1,   # 4095
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=500,
    boost_from_average=False,
    random_seed=42,
)

In [None]:
# Split on the `date` column: rows with date >= 85 go to the training set,
# rows with date < 85 to the validation set. The full frame is dropped
# afterwards and a garbage-collection pass is triggered.
# NOTE(review): the validation rows carry smaller `date` values than the
# training rows - confirm this direction is intended.
day = data['date']
train_set = data.loc[day >= 85]
val_set = data.loc[day < 85]
del data, day
gc.collect()

In [None]:
def _as_lgb_dataset(frame):
    """Wrap the feature columns and the `action` labels of `frame` in an lgb.Dataset.

    The Dataset is created with free_raw_data=False.
    """
    return lgb.Dataset(data=frame[feature_cols],
                       label=frame['action'],
                       free_raw_data=False)


# Build each Dataset, then drop the name of its source frame and run a
# garbage-collection pass before moving on to the next one.
train_data = _as_lgb_dataset(train_set)
del train_set
gc.collect()

valid_data = _as_lgb_dataset(val_set)
del val_set
gc.collect()

In [None]:
# Use a previously trained model when one is available under ../input;
# otherwise train from scratch and pickle the result into the working
# directory (note: a different location from the one that is read).
pretrained_model_path = '../input/lgb-multi-class/model_lgb.pkl'

if not os.path.isfile(pretrained_model_path):
    # NOTE(review): params_k also sets 'n_estimators': 500. LightGBM documents
    # that name as an alias of the iteration count, so it may take precedence
    # over num_boost_round=2000 - confirm which cap is intended.
    model_gbm = lgb.train(
        params=params_k,
        train_set=train_data,
        valid_sets=[valid_data],
        num_boost_round=2000,
        early_stopping_rounds=25,
        verbose_eval=25,
    )
    with open('model_lgb.pkl', 'wb') as fout:
        pickle.dump(model_gbm, fout)
else:
    # SECURITY: unpickling executes arbitrary code contained in the file;
    # only load a model file that comes from a trusted source.
    with open(pretrained_model_path, 'rb') as fin:
        model_gbm = pickle.load(fin)

In [None]:
# Smoke test: score the first row of the example test file and print the
# position of the largest value in the model output.
test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv', nrows=1)
sample_features = test[feature_cols].values
pred = model_gbm.predict(sample_features)
best_class = np.argmax(pred)
print(best_class)

In [None]:
import janestreet

# Submission loop: for every (test_df, pred_df) pair yielded by the
# competition environment, fill in `action` and hand pred_df back.
env = janestreet.make_env()
for test_df, pred_df in tqdm(env.iter_test()):
    # Rows whose weight is not positive are never scored by the model and
    # always get action 0.
    action = 0
    if test_df['weight'].item() > 0:
        class_scores = model_gbm.predict(test_df[feature_cols])
        # Classes 3, 4 and 5 are the labels assigned to resp >= 0, so a
        # predicted class above 2 maps to action 1, anything else to 0.
        action = int(np.argmax(class_scores) > 2)
    pred_df.action = action
    env.predict(pred_df)