In [12]:
from fastai.tabular.all import *

from sklearn.model_selection import KFold, GroupKFold
import lightgbm as lgb

In [13]:
import pickle

In [14]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of `y_pred` against `y_true`.

    Each error is divided by the matching `y_true` value before it is squared,
    so a zero in `y_true` leaves the ratio undefined.
    """
    relative_error = (y_true - y_pred) / y_true
    squared_error = np.square(relative_error)
    return np.sqrt(np.mean(squared_error))
def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function that reports RMSPE.

    Args:
        y_pred: Predictions for the dataset being evaluated.
        lgb_train: Object whose `get_label()` returns the true targets.

    Returns:
        tuple: `('RMSPE', score, False)`; the trailing False is LightGBM's
        "is higher better" flag, i.e. a lower score is treated as better.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

In [15]:
# Load the training table from the working directory. Later cells read the
# row_id, time_id, stock_id and target columns from it.
train_df = pd.read_csv('train_with_features.csv')

In [26]:
def train_models(train, n_splits=5, seed=29, num_boost_round=3000, early_stopping_rounds=25):
    """Train one LightGBM model per GroupKFold fold and print the out-of-fold RMSPE.

    Rows are grouped by `time_id`, so all rows sharing a `time_id` fall into the
    same fold. Every sample is weighted by `1 / target**2`, which turns the
    weighted squared-error objective into a squared percentage error; `target`
    therefore must not contain zeros.

    Args:
        train: DataFrame holding `row_id`, `time_id`, `stock_id`, `target` and
            the feature columns. It is not modified.
        n_splits: Number of GroupKFold folds (one model is returned per fold).
        seed: Value used for every LightGBM seed parameter.
        num_boost_round: Maximum number of boosting rounds per fold.
        early_stopping_rounds: Number of rounds without validation improvement
            after which training of a fold stops.

    Returns:
        list: The trained models, one per fold, in fold order.
    """
    # Hyperparameters (optimized)
    # NOTE(review): with max_depth=4 a tree can have at most 2**4 = 16 leaves,
    # so num_leaves=800 is never the binding limit - confirm this is intended.
    params = {
        'learning_rate': 0.1,
        'lambda_l1': 2,
        'lambda_l2': 7,
        'num_leaves': 800,
        'min_sum_hessian_in_leaf': 20,
        'feature_fraction': 0.8,
        'feature_fraction_bynode': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 42,
        'min_data_in_leaf': 700,
        'max_depth': 4,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }

    # Split features and target; identifiers and the target are not model inputs.
    x = train.drop(['row_id', 'target', 'time_id'], axis=1)
    y = train['target']
    # stock_id is passed to LightGBM as a categorical feature, so store it as int.
    x['stock_id'] = x['stock_id'].astype(int)

    models = []
    # Out-of-fold predictions: each row is filled by the model that did not train on it.
    oof_predictions = np.zeros(x.shape[0])
    kfold = GroupKFold(n_splits=n_splits)

    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x, groups=train['time_id'])):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight=train_weights,
                                    categorical_feature=['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight=val_weights,
                                  categorical_feature=['stock_id'])
        # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
        # were removed from lgb.train in LightGBM 4.0 (replaced by callbacks);
        # they are kept here to match the version this notebook was run with.
        model = lgb.train(params=params,
                          train_set=train_dataset,
                          valid_sets=[train_dataset, val_dataset],
                          num_boost_round=num_boost_round,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=100,
                          feval=feval_rmspe)
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return the per-fold models (the out-of-fold score is only printed).
    return models

In [27]:
# Train the per-fold models on train_df; train_models prints the per-fold
# training log and the overall out-of-fold RMSPE.
models = train_models(train_df)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000468478	training's RMSPE: 0.217007	valid_1's rmse: 0.000505003	valid_1's RMSPE: 0.232905
[200]	training's rmse: 0.000450753	training's RMSPE: 0.208796	valid_1's rmse: 0.000501078	valid_1's RMSPE: 0.231095
[300]	training's rmse: 0.000438788	training's RMSPE: 0.203254	valid_1's rmse: 0.000499106	valid_1's RMSPE: 0.230185
Early stopping, best iteration is:
[310]	training's rmse: 0.00043731	training's RMSPE: 0.202569	valid_1's rmse: 0.000498997	valid_1's RMSPE: 0.230135
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000468578	training's RMSPE: 0.216426	valid_1's rmse: 0.000513493	valid_1's RMSPE: 0.23956
[200]	training's rmse: 0.000450836	training's RMSPE: 0.208232	valid_1's rmse: 0.000508127	valid_1's RMSPE: 0.237057
Early stopping, best iteration is:
[199]	training's rmse: 0.000451057	training's RMSPE: 0.208334	valid_1's rmse: 0.000508106	valid_1's RMSPE: 0.237047
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472582	training's RMSPE: 0.218412	valid_1's rmse: 0.000482166	valid_1's RMSPE: 0.224394
[200]	training's rmse: 0.000453994	training's RMSPE: 0.209821	valid_1's rmse: 0.000478207	valid_1's RMSPE: 0.222551
Early stopping, best iteration is:
[253]	training's rmse: 0.000447311	training's RMSPE: 0.206732	valid_1's rmse: 0.000477407	valid_1's RMSPE: 0.222179
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472537	training's RMSPE: 0.218653	valid_1's rmse: 0.000479732	valid_1's RMSPE: 0.222201
[200]	training's rmse: 0.000455613	training's RMSPE: 0.210822	valid_1's rmse: 0.000475906	valid_1's RMSPE: 0.220429
Early stopping, best iteration is:
[247]	training's rmse: 0.000450194	training's RMSPE: 0.208315	valid_1's rmse: 0.000474217	valid_1's RMSPE: 0.219646
Training fold 5




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472597	training's RMSPE: 0.219321	valid_1's rmse: 0.000479139	valid_1's RMSPE: 0.219312
[200]	training's rmse: 0.00045358	training's RMSPE: 0.210496	valid_1's rmse: 0.000475596	valid_1's RMSPE: 0.21769
Early stopping, best iteration is:
[249]	training's rmse: 0.000447797	training's RMSPE: 0.207812	valid_1's rmse: 0.000474627	valid_1's RMSPE: 0.217247
Our out of folds RMSPE is 0.22536971651368237


In [6]:
# Build the model-input matrix from train_df: drop the identifier/target columns
# and cast stock_id to int. Despite the name, these rows come from the training set.
x_test = (
    train_df
    .drop(columns=['row_id', 'time_id', 'target'])
    .assign(stock_id=lambda frame: frame['stock_id'].astype(int))
)

In [29]:
# Persist each fold's model to its own pickle file. The `with` block closes the
# file handle as soon as the dump finishes instead of leaving it open.
for idx, model in enumerate(models):
    filename = f'optiver-models-private/data/models/lgb_fold{idx}.pickle'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    

In [8]:
# Score ten pickled fold models against train_df.target using x_test.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): this reads 10 files from 'models/', while the save cell writes to
# 'optiver-models-private/data/models/' and train_models defaults to 5 folds -
# confirm which set of models is meant to be scored here.
# NOTE(review): x_test is derived from train_df, so presumably every model is
# scored partly on rows it was trained on - confirm before reading these numbers.
for idx in range(10):
    filename = f'models/lgb_fold{idx}.pickle'
    # Close the file handle as soon as the model is loaded.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    preds = model.predict(x_test)
    print(rmspe(train_df.target, preds))

0.18470989403729027
0.1737223782065638
0.19073378463864546
0.1880587200715555
0.17799874022536552
0.1725530401092998
0.1898247989760112
0.18514039121932407
0.18203909027786244
0.1981542175208046


## NEW features

In [34]:
from optiver_features import *

In [38]:
# Book features mapped to the list of aggregation functions applied to each.
book_feature_dict = {
    wap1: [np.mean, np.std],
    wap2: [np.mean, np.std],
    log_return1: [realized_volatility, np.mean, np.std],
    log_return2: [realized_volatility, np.mean, np.std],
    wap_balance: [np.mean, np.std],
    price_spread: [np.mean, np.std],
    bid_spread: [np.mean, np.std],
    ask_spread: [np.mean, np.std],
    total_volume: [np.mean, np.std],
    volume_imbalance: [np.mean, np.std],
}

# Trade features: keys are either a feature function or a column name.
trade_feature_dict = {
    log_return_price: [realized_volatility],
    'seconds_in_bucket': [np.size],
    'size': [np.sum],
    'order_count': [np.mean],
}

# (start, end) window bounds; they become the `_<start>_<end>` suffix of the
# feature names built below.
time_windows = [
    (0, 600),
    (150, 600),
    (300, 600),
    (450, 600),
]

cols = [
    'log_return_price_realized_volatility',
    'log_return1_realized_volatility',
    'log_return2_realized_volatility',
]
# One name per (window, column) pair, iterating windows in the outer loop.
time_id_features = [
    f'{col}_{start}_{end}'
    for start, end in time_windows
    for col in cols
]
time_id_aggregations = ['mean', 'std', 'max', 'min']
# The stock_id settings are the very same list objects as the time_id ones.
stock_id_features = time_id_features
stock_id_aggregations = time_id_aggregations

In [40]:
# Display the generated feature names (one per window/column combination).
time_id_features

['log_return_price_realized_volatility_0_600',
 'log_return1_realized_volatility_0_600',
 'log_return2_realized_volatility_0_600',
 'log_return_price_realized_volatility_150_600',
 'log_return1_realized_volatility_150_600',
 'log_return2_realized_volatility_150_600',
 'log_return_price_realized_volatility_300_600',
 'log_return1_realized_volatility_300_600',
 'log_return2_realized_volatility_300_600',
 'log_return_price_realized_volatility_450_600',
 'log_return1_realized_volatility_450_600',
 'log_return2_realized_volatility_450_600']

In [41]:
# Build the feature generator from the configuration defined in the cell above;
# arguments are passed positionally, in this exact order.
ofg = OptiverFeatureGenerator(
    book_feature_dict,
    trade_feature_dict,
    time_windows,
    time_id_features,
    time_id_aggregations,
    stock_id_features,
    stock_id_aggregations,
)

In [42]:
%%time
# Generate the training DataFrame with the new feature set. The recorded run
# took about 3.5 minutes of wall time, so avoid re-running this cell needlessly.
train_new = ofg.generate_train_df()

CPU times: user 2.68 s, sys: 1.12 s, total: 3.8 s
Wall time: 3min 33s


In [43]:
# Retrain on the regenerated feature set. This rebinds `models`, replacing the
# list returned by the earlier train_models(train_df) call.
models = train_models(train_new)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000470255	training's RMSPE: 0.21783	valid_1's rmse: 0.000506402	valid_1's RMSPE: 0.23355
[200]	training's rmse: 0.000452357	training's RMSPE: 0.209539	valid_1's rmse: 0.000501532	valid_1's RMSPE: 0.231304
Early stopping, best iteration is:
[232]	training's rmse: 0.000448597	training's RMSPE: 0.207798	valid_1's rmse: 0.000501053	valid_1's RMSPE: 0.231083
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000469125	training's RMSPE: 0.216679	valid_1's rmse: 0.000517973	valid_1's RMSPE: 0.24165
[200]	training's rmse: 0.000451591	training's RMSPE: 0.20858	valid_1's rmse: 0.000512826	valid_1's RMSPE: 0.239249
[300]	training's rmse: 0.000439905	training's RMSPE: 0.203183	valid_1's rmse: 0.000510238	valid_1's RMSPE: 0.238041
Early stopping, best iteration is:
[322]	training's rmse: 0.000438094	training's RMSPE: 0.202346	valid_1's rmse: 0.000509593	valid_1's RMSPE: 0.23774
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472924	training's RMSPE: 0.21857	valid_1's rmse: 0.000483791	valid_1's RMSPE: 0.22515
[200]	training's rmse: 0.000455384	training's RMSPE: 0.210463	valid_1's rmse: 0.000481298	valid_1's RMSPE: 0.22399
Early stopping, best iteration is:
[263]	training's rmse: 0.000447562	training's RMSPE: 0.206849	valid_1's rmse: 0.000480469	valid_1's RMSPE: 0.223604
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473982	training's RMSPE: 0.219322	valid_1's rmse: 0.000483915	valid_1's RMSPE: 0.224138
Early stopping, best iteration is:
[126]	training's rmse: 0.000468624	training's RMSPE: 0.216843	valid_1's rmse: 0.00048298	valid_1's RMSPE: 0.223705
Training fold 5




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472782	training's RMSPE: 0.219407	valid_1's rmse: 0.00048239	valid_1's RMSPE: 0.2208
Early stopping, best iteration is:
[151]	training's rmse: 0.000462582	training's RMSPE: 0.214673	valid_1's rmse: 0.000481369	valid_1's RMSPE: 0.220333
Our out of folds RMSPE is 0.22738050185706846


In [44]:
# Persist each fold's model to its own pickle file. These are the same file
# names the earlier save cell wrote, so those files are overwritten.
# The `with` block closes the file handle as soon as the dump finishes.
for idx, model in enumerate(models):
    filename = f'optiver-models-private/data/models/lgb_fold{idx}.pickle'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

In [46]:
# Re-assign this column on train_new as float32 (this mutates train_new in place,
# after the models above were already trained on it).
train_new['seconds_in_bucket_size_0_600']=train_new['seconds_in_bucket_size_0_600'].astype(np.float32)

In [51]:
# Boolean Series indexed by column name: True where the column has at least one NaN.
nacols = train_new.isna().any()

In [53]:
# Show only the columns flagged as containing missing values; `nacols` is
# already boolean, so it can serve directly as its own mask.
nacols[nacols]

seconds_in_bucket_size_0_600                    True
size_sum_0_600                                  True
order_count_mean_0_600                          True
log_return_price_realized_volatility_0_600      True
seconds_in_bucket_size_150_600                  True
size_sum_150_600                                True
order_count_mean_150_600                        True
log_return_price_realized_volatility_150_600    True
seconds_in_bucket_size_300_600                  True
size_sum_300_600                                True
order_count_mean_300_600                        True
log_return_price_realized_volatility_300_600    True
seconds_in_bucket_size_450_600                  True
size_sum_450_600                                True
order_count_mean_450_600                        True
log_return_price_realized_volatility_450_600    True
dtype: bool

In [48]:
# List every column of train_new together with its dtype as (name, dtype) pairs.
list(train_new.dtypes.items())

[('stock_id', dtype('int64')),
 ('time_id', dtype('int64')),
 ('target', dtype('float64')),
 ('row_id', dtype('O')),
 ('seconds_in_bucket_size_0_600', dtype('float32')),
 ('size_sum_0_600', dtype('float64')),
 ('order_count_mean_0_600', dtype('float32')),
 ('log_return_price_realized_volatility_0_600', dtype('float32')),
 ('seconds_in_bucket_size_150_600', dtype('float32')),
 ('size_sum_150_600', dtype('float64')),
 ('order_count_mean_150_600', dtype('float32')),
 ('log_return_price_realized_volatility_150_600', dtype('float32')),
 ('seconds_in_bucket_size_300_600', dtype('float32')),
 ('size_sum_300_600', dtype('float64')),
 ('order_count_mean_300_600', dtype('float32')),
 ('log_return_price_realized_volatility_300_600', dtype('float32')),
 ('seconds_in_bucket_size_450_600', dtype('float32')),
 ('size_sum_450_600', dtype('float64')),
 ('order_count_mean_450_600', dtype('float32')),
 ('log_return_price_realized_volatility_450_600', dtype('float32')),
 ('wap1_mean_0_600', dtype('float32