references

https://www.kaggle.com/kamalnaithani/lightgbm-stock-prediction-1-1

* version 12: without weight==0, fillna - None, cv: 0.6625 - time out
* version 13: without weight==0, fillna - Mean, cv: 0.6622 - time out
* version 15: with weight==0, fillna - None, cv: 0.6856 - time out
* version 16: ver15 + no memory reduction, num_leaves 450, n_estimators 1000, cv: 0.6764 - 5457.310
* version 17: ver16 + num leaves 300, learning rate 0.12, cv: 0.6799 - 5510.777
* version 18: ver17 + metric auc, max_bin 450, cv: 0.6807 - time out
* version 20: ver18 + RobustScaler, cv: 0.6822 - time out
* version 21: ver20 + modified params (learning rate, colsample_bytree, subsample), cv: 0.6865
* version 22: ver21 + boosting 'goss', cv: 0.6328
* version 23: ver21 + boosting 'dart'

# packages

In [None]:
import numpy as np
import pandas as pd
import datatable as dt

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler # ver 19

import gc
import janestreet
import warnings
warnings.filterwarnings(action='ignore')

# Defining functions

In [None]:
# Disabled helper, kept for reference only: every line below is commented out
# and its call in the preprocessing cell is commented out as well (the version
# log above notes "not reduce mem" from version 16 on).
#
# If re-enabled, it would downcast each non-object column of `df` in place:
# integer columns to the first of int8/int16/int32/int64 whose range strictly
# contains the column's min and max, other numeric columns to float32 when the
# values fit, and object columns to 'category'. It prints the memory usage
# before and after, and returns the same dataframe.
# def reduce_memory_usage(df):
    
#     start_memory = df.memory_usage().sum() / 1024**2
#     print(f"Memory usage of dataframe is {start_memory} MB")
    
#     for col in df.columns:
#         col_type = df[col].dtype
        
#         if col_type != 'object':
#             c_min = df[col].min()
#             c_max = df[col].max()
            
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)
            
#             else:
#                 if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     pass
#         else:
#             df[col] = df[col].astype('category')
    
#     end_memory = df.memory_usage().sum() / 1024**2
#     print(f"Memory usage of dataframe after reduction {end_memory} MB")
#     print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
#     return df

# Loading files

In [None]:
# Read the competition training CSV with datatable's fread, then convert the
# result to a pandas DataFrame for the rest of the notebook.
path = '/kaggle/input/jane-street-market-prediction/'
train = dt.fread(path + 'train.csv').to_pandas()

# Preprocessing data

In [None]:
# train = reduce_memory_usage(train)
# train.fillna(train.mean(),inplace=True)

In [None]:
# Replace every response column with a 0/1 label: 1 when resp * weight is
# strictly positive, 0 otherwise (so rows with weight == 0 are labelled 0).
for resp_col in ('resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'):
    weighted_resp = train[resp_col].values * train['weight']
    train[resp_col] = (weighted_resp > 0).astype(int)

In [None]:
#f_mean = train.mean()
# Names of the 130 model input columns: feature_0 ... feature_129.
features = [f'feature_{idx}' for idx in range(130)]
# Response columns used as training targets, one model per entry.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp']

In [None]:
# Keep only the rows whose 'date' value is greater than 85.
train_df = train.loc[train['date'] > 85]
# Optional filter (currently disabled): drop the zero-weight rows.
#train_df = train_df[train_df['weight'] != 0]

# Model inputs: the feature columns only.
train_data = train_df[features]
# Targets: one 0/1 column per entry of resp_cols, shape (n_rows, len(resp_cols)).
train_target = np.stack(
    [(train_df[col] > 0).astype('int') for col in resp_cols],
    axis=1,
)

# Drop the references to the full frames; only train_data/train_target are used below.
del train, train_df

In [None]:
# Notebook display only: preview the first rows of the selected feature columns.
train_data.head()

In [None]:
# Fit a RobustScaler on the training features; the fitted `rb` is reused on
# each test row in the submission loop.
rb = RobustScaler()
rb.fit(train_data)
# transform() returns a plain array, so rebuild a DataFrame with the same
# column names (the row index becomes a fresh default index).
train_data = pd.DataFrame(
    rb.transform(train_data),
    columns=train_data.columns,
)

In [None]:
# Notebook display only: show the feature frame after the RobustScaler transform.
train_data

# Modeling

In [None]:
# Fitted classifiers are appended here by the training loop, one per target column.
lgb_models = list()

In [None]:
# Hyper-parameters passed as keyword arguments to every LGBMClassifier.
# NOTE(review): per the LightGBM parameter docs, subsample (bagging_fraction)
# only takes effect when subsample_freq (bagging_freq) is > 0; none is set
# here, so confirm whether row subsampling is really active.
lgb_params = dict(
    n_jobs=-1,
    num_leaves=300,
    learning_rate=0.1,
    n_estimators=1500,
    objective='binary',
    subsample=0.52,
    colsample_bytree=0.52,
    metric='auc',
    max_bin=450,
    boosting='dart',
)

In [None]:
# Train one binary classifier per target column. Each target gets its own
# stratified 80/20 train/validation split, seeded with the column index.
# NOTE(review): LightGBM documents early stopping as unavailable in dart mode,
# so with boosting='dart' early_stopping_rounds likely has no effect — confirm.
# NOTE(review): early_stopping_rounds/verbose as fit() keyword arguments belong
# to the LightGBM 3.x API; newer releases expect callbacks instead.
for i, target_col in enumerate(train_target.T):
    x_tr, x_val, y_tr, y_val = train_test_split(
        train_data,
        target_col,
        test_size=0.2,
        stratify=target_col,
        random_state=i,
    )
    lgb_clf = LGBMClassifier(**lgb_params)
    # The validation split is the second eval_set entry; the score cell reads
    # it back under the 'valid_1' key.
    lgb_clf.fit(
        x_tr,
        y_tr,
        eval_set=[(x_tr, y_tr), (x_val, y_val)],
        eval_metric='auc',
        early_stopping_rounds=100,
        verbose=50,
    )
    lgb_models.append(lgb_clf)

In [None]:
# Average the validation AUC recorded by each fitted model (the 'valid_1'
# entry of best_score_) and print it.
val_aucs = [clf.best_score_['valid_1']['auc'] for clf in lgb_models]
print('average CV score:', np.mean(val_aucs))

# Submission

In [None]:
# Competition environment that drives the test iteration in the next cell.
env = janestreet.make_env()
# Decision threshold: a median probability >= th is submitted as action 1.
th = 0.5

In [None]:
# Submission loop: score each test batch from the environment and submit its action.
for test_df, pred_df in env.iter_test():
    # Default action is 0; only a strictly positive weight is scored by the models.
    action = 0
    if test_df['weight'].item() > 0:
        x_tt = test_df[features]
        #x_tt.fillna(f_mean, inplace=True)
        # Apply the scaler fitted on the training features.
        x_tt = pd.DataFrame(rb.transform(x_tt), columns=x_tt.columns)

        # Positive-class probability from every model, reduced to a single
        # median value (no axis is given, so the median is over all values).
        probs = [model.predict_proba(x_tt)[:, 1] for model in lgb_models]
        pred = np.median(probs)
        action = np.where(pred >= th, 1, 0).astype(int)
    pred_df.action = action
    env.predict(pred_df)