In [1]:
%load_ext autoreload
%autoreload 2

In [7]:
from fastai.tabular.all import *
from multiprocessing import Pool
from sklearn.model_selection import KFold
import lightgbm as lgb
from optiver_features import ffill

## Generate 5m dataset

In [3]:
data_dir = Path('../input/optiver-realized-volatility-prediction')

In [4]:
# Function to calculate first WAP
def calc_wap1(df):
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

# Function to calculate second WAP
def calc_wap2(df):
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    return np.log(series).diff()

# Calculate the realized volatility
def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

# Function to count unique elements of a series
def count_unique(series):
    return len(np.unique(series))

def book_preprocessor(df, stock_id):
    #df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate log returns
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    # Dict for aggregations
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std]
    }
    
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    #df_feature_450 = get_stats_window(seconds_in_bucket = 450, add_suffix = True)
    #df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
    #df_feature_150 = get_stats_window(seconds_in_bucket = 150, add_suffix = True)
    
    # Merge all
    #df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
    #df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    #df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    # Drop unnecesary time_ids
    #df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    # Create row_id so we can merge
    #stock_id = str(file_path).split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.rename(columns={"time_id_": "time_id"})
    
    df_feature['stock_id'] = stock_id
    return df_feature

In [5]:
def trade_preprocessor(df, stock_id):
    #df = pd.read_parquet(file_path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    
    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
    }
    
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
#     df_feature_450 = get_stats_window(seconds_in_bucket = 450, add_suffix = True)
#     df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
#     df_feature_150 = get_stats_window(seconds_in_bucket = 150, add_suffix = True)

    # Merge all
#     df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
#     df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
#     df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    # Drop unnecesary time_ids
#     df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    df_feature = df_feature.add_prefix('trade_')
    #stock_id = str(file_path).split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

In [6]:
def realized_volatility_per_time_id(df_book_data, stock_id):
    df_book_data['wap'] =(df_book_data['bid_price1'] * df_book_data['ask_size1']+df_book_data['ask_price1'] * df_book_data['bid_size1'])  / (
                                      df_book_data['bid_size1']+ df_book_data[
                                  'ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock =  pd.DataFrame(df_book_data.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns = {'log_return':'target'})
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x:f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id', 'target']]

In [8]:
def preprocess_one_stock(stock_id, typ='train'):
    book_df = pd.read_parquet(data_dir / f'book_{typ}.parquet/stock_id={stock_id}')
    book_df = ffill(book_df)
    trade_df = pd.read_parquet(data_dir / f'trade_{typ}.parquet/stock_id={stock_id}')
    book_5m  = book_preprocessor(book_df[book_df.seconds_in_bucket < 300].copy(), stock_id)
    trade_5m = trade_preprocessor(trade_df[trade_df.seconds_in_bucket < 300].copy(), stock_id)
    realized_vol = realized_volatility_per_time_id(book_df[book_df.seconds_in_bucket >= 300].copy(), stock_id)
    res = pd.merge(book_5m, trade_5m, on = 'row_id', how = 'left')
    res = pd.merge(res, realized_vol, on = 'row_id', how='left')
    return res

In [9]:
def preprocess_all(list_stock_ids, typ='train'):
    pool = Pool(16)
    df = pool.starmap(preprocess_one_stock, zip(list_stock_ids, [typ]*len(list_stock_ids)))
    df = pd.concat(df, ignore_index = True)
    return df


In [10]:
def generate_train_df():
    train_df = pd.read_csv(data_dir/'train.csv')
    train_stock_ids = train_df['stock_id'].unique()
    train_features = preprocess_all(train_stock_ids, 'train')
    return train_features

In [11]:
train_df = generate_train_df()


In [12]:
train_df.to_csv('train_5m.df', index=False)

In [13]:
len(train_df)

428932

In [14]:
train_df = train_df[train_df.target!=0]
len(train_df)

428931

## LGBM Baseline

In [15]:
def rmspe_np(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe_np(y_true, y_pred), False

def train_models(train):
    # Hyperparammeters (optimized)
    seed = 29
    params = {
        'learning_rate': 0.1,        
        'lambda_l1': 2,
        'lambda_l2': 7,
        'num_leaves': 800,
        'min_sum_hessian_in_leaf': 20,
        'feature_fraction': 0.8,
        'feature_fraction_bynode': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 42,
        'min_data_in_leaf': 700,
        'max_depth': 4,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }   
    
    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    models =[]
    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create a KFold object
    kfold = KFold(n_splits = 5, random_state = 1111, shuffle = True)
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        model = lgb.train(params = params, 
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, val_dataset], 
                          num_boost_round = 3000, 
                          early_stopping_rounds = 25, 
                          verbose_eval = 100,
                          feval = feval_rmspe)
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)
        # Predict the test set
        #test_predictions += model.predict(x_test) / 10
        
    rmspe_score = rmspe_np(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return models

In [16]:
_=train_models(train_df)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000360649	training's RMSPE: 0.247353	valid_1's rmse: 0.000370301	valid_1's RMSPE: 0.253292
[200]	training's rmse: 0.000352139	training's RMSPE: 0.241517	valid_1's rmse: 0.00036751	valid_1's RMSPE: 0.251383
Early stopping, best iteration is:
[208]	training's rmse: 0.000351711	training's RMSPE: 0.241223	valid_1's rmse: 0.000367395	valid_1's RMSPE: 0.251304
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000360059	training's RMSPE: 0.247387	valid_1's rmse: 0.000385922	valid_1's RMSPE: 0.262085
[200]	training's rmse: 0.000352039	training's RMSPE: 0.241877	valid_1's rmse: 0.000382626	valid_1's RMSPE: 0.259846
[300]	training's rmse: 0.0003472	training's RMSPE: 0.238551	valid_1's rmse: 0.000381338	valid_1's RMSPE: 0.258971
Early stopping, best iteration is:
[368]	training's rmse: 0.000344786	training's RMSPE: 0.236893	valid_1's rmse: 0.000380891	valid_1's RMSPE: 0.258668
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000361442	training's RMSPE: 0.245717	valid_1's rmse: 0.000433884	valid_1's RMSPE: 0.307054
Early stopping, best iteration is:
[106]	training's rmse: 0.000360679	training's RMSPE: 0.245198	valid_1's rmse: 0.000433256	valid_1's RMSPE: 0.30661
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000360735	training's RMSPE: 0.247391	valid_1's rmse: 0.000382665	valid_1's RMSPE: 0.261836
[200]	training's rmse: 0.000352754	training's RMSPE: 0.241918	valid_1's rmse: 0.000378777	valid_1's RMSPE: 0.259176
Early stopping, best iteration is:
[225]	training's rmse: 0.000351392	training's RMSPE: 0.240984	valid_1's rmse: 0.000378498	valid_1's RMSPE: 0.258985
Training fold 5




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000359133	training's RMSPE: 0.24739	valid_1's rmse: 0.000371712	valid_1's RMSPE: 0.249735
[200]	training's rmse: 0.000351016	training's RMSPE: 0.241799	valid_1's rmse: 0.000367894	valid_1's RMSPE: 0.24717
[300]	training's rmse: 0.000346282	training's RMSPE: 0.238538	valid_1's rmse: 0.000366285	valid_1's RMSPE: 0.246089
[400]	training's rmse: 0.000342881	training's RMSPE: 0.236195	valid_1's rmse: 0.000365247	valid_1's RMSPE: 0.245391
[500]	training's rmse: 0.000340189	training's RMSPE: 0.234341	valid_1's rmse: 0.000364664	valid_1's RMSPE: 0.245
[600]	training's rmse: 0.000337919	training's RMSPE: 0.232777	valid_1's rmse: 0.000364243	valid_1's RMSPE: 0.244717
[700]	training's rmse: 0.000335984	training's RMSPE: 0.231444	valid_1's rmse: 0.000363968	valid_1's RMSPE: 0.244532
Early stopping, best iteration is:
[717]	training's rmse: 0.000335638	training's RMSPE: 0.231206	valid_1's rmse: 0.000363884	valid_1

## Learning embeddings

In [17]:
cont_nn,cat_nn = cont_cat_split(train_df, max_card=9000, dep_var='target')
cat_nn.remove('row_id')

In [18]:
cat_nn

['time_id', 'stock_id']

In [19]:
procs_nn = [Categorify,FillMissing, Normalize]

splits = RandomSplitter()(train_df)

dls = TabularPandas(train_df, procs_nn, cat_nn, cont_nn,
                      splits=splits, 
                       
                      y_names='target').dataloaders(1024)

In [20]:
def rmspe(preds, targs):
    x = (targs-preds)/targs
    return (x**2).mean().sqrt()

config={'lin_first':True, 'ps':[.5,.2,0], 'embed_p':.25, }
learn = tabular_learner(dls, y_range=(0,.1), layers=[100,50,20], emb_szs={'stock_id':10, 'time_id':10}, 
                        n_out=1, loss_func = rmspe, metrics=AccumMetric(rmspe), config=config)

In [21]:
learn.fit_one_cycle(20, 5e-3)

epoch,train_loss,valid_loss,rmspe,time
0,13.700855,11.251608,11.255558,00:07
1,2.013942,0.636771,0.641814,00:03
2,0.416136,0.298455,0.299697,00:03
3,0.30659,0.26957,0.284681,00:03
4,0.338225,0.267941,0.270801,00:03
5,0.287679,0.254917,0.261571,00:03
6,0.272733,0.252641,0.257634,00:03
7,0.28398,0.250858,0.260573,00:03
8,0.269417,0.246369,0.253124,00:03
9,0.264217,0.243016,0.250573,00:03


In [22]:
torch.save(learn.model.embeds[0].weight.data, 'timeemb10.pt')

In [23]:
categorify = dls.procs[2]
len(categorify.classes['time_id'])

3831

In [24]:
time_id_embs = dict()
for idx, time_id in enumerate(categorify.classes['time_id']):
    time_id_embs[time_id] = learn.model.embeds[0].weight[idx].tolist()

stock_id_embs = dict()
for idx, stock_id in enumerate(categorify.classes['stock_id']):
    stock_id_embs[stock_id] = learn.model.embeds[1].weight[idx].tolist()

all_embs = []
for _, row in (train_df[['stock_id', 'time_id']].iterrows()):
    emb1 = stock_id_embs[row.stock_id]
    emb2 = time_id_embs[row.time_id]
    all_embs.append(emb1+emb2)

In [26]:
np.array(all_embs).shape

(428931, 20)

In [27]:
embs_df = pd.DataFrame(all_embs)

train_with_embs = pd.concat([train_df.reset_index(), embs_df], axis=1)

train_with_embs

Unnamed: 0,index,time_id,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,log_return1_sum,log_return1_realized_volatility,...,10,11,12,13,14,15,16,17,18,19
0,0,5,301.078967,1.003597,0.000851,301.053515,1.003512,0.001014,0.002134,0.003394,...,0.233299,-0.204058,-0.150260,0.259856,0.089705,0.161742,0.157481,0.161818,0.317646,-0.243916
1,1,11,299.996183,0.999987,0.000156,299.994795,0.999983,0.000182,0.000264,0.000699,...,-0.242039,0.098177,-0.031663,0.035412,-0.154435,-0.094839,-0.036259,-0.021471,-0.101952,0.222299
2,2,16,299.988131,0.999960,0.000444,300.032917,1.000110,0.000470,0.000517,0.001983,...,0.188573,-0.334327,-0.035679,0.115466,-0.243960,0.313850,0.162906,0.240454,0.155647,0.201758
3,3,31,299.765819,0.999219,0.000846,299.650614,0.998835,0.000762,-0.001649,0.001863,...,-0.049514,-0.020515,0.027231,0.091227,0.038912,0.032656,0.040256,0.089729,0.039902,-0.142195
4,4,62,299.930817,0.999769,0.000238,299.905133,0.999684,0.000335,-0.000647,0.001131,...,-0.106804,-0.049446,0.069812,-0.074556,-0.079364,0.084584,0.024387,-0.017761,0.026403,0.020131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428926,428927,32751,299.925384,0.999751,0.000534,299.982478,0.999942,0.000673,-0.000855,0.002284,...,0.156987,-0.354330,-0.268756,0.152665,-0.373841,0.228860,0.202575,0.215003,0.121716,-0.040777
428927,428928,32753,300.440929,1.001470,0.000507,300.453773,1.001513,0.000567,0.001099,0.002217,...,0.145648,-0.015200,-0.168361,0.104826,0.024862,0.089448,-0.038945,0.110650,0.089973,-0.071930
428928,428929,32758,300.196802,1.000656,0.000319,300.175681,1.000586,0.000396,-0.000729,0.001386,...,0.054450,-0.160985,-0.292313,0.040895,-0.161483,0.139058,0.137216,0.047032,0.133540,0.062504
428929,428930,32763,300.535968,1.001787,0.000449,300.538093,1.001794,0.000494,0.002014,0.002783,...,0.273045,-0.242523,-0.257313,0.305342,0.073577,0.004705,0.174177,0.236224,0.149944,-0.267228


## Train 5m LGBM with embeddings


In [28]:
_=train_models(train_with_embs)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000324031	training's RMSPE: 0.222238	valid_1's rmse: 0.000332081	valid_1's RMSPE: 0.227149
[200]	training's rmse: 0.00031552	training's RMSPE: 0.216401	valid_1's rmse: 0.000327265	valid_1's RMSPE: 0.223854
[300]	training's rmse: 0.000310753	training's RMSPE: 0.213132	valid_1's rmse: 0.000325976	valid_1's RMSPE: 0.222973
Early stopping, best iteration is:
[300]	training's rmse: 0.000310753	training's RMSPE: 0.213132	valid_1's rmse: 0.000325976	valid_1's RMSPE: 0.222973
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000322768	training's RMSPE: 0.221765	valid_1's rmse: 0.000347521	valid_1's RMSPE: 0.236006
[200]	training's rmse: 0.000314802	training's RMSPE: 0.216292	valid_1's rmse: 0.000342993	valid_1's RMSPE: 0.23293
Early stopping, best iteration is:
[256]	training's rmse: 0.000312153	training's RMSPE: 0.214472	valid_1's rmse: 0.000342132	valid_1's RMSPE: 0.232346
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000325286	training's RMSPE: 0.221138	valid_1's rmse: 0.000347615	valid_1's RMSPE: 0.246003
Early stopping, best iteration is:
[103]	training's rmse: 0.00032496	training's RMSPE: 0.220916	valid_1's rmse: 0.00034736	valid_1's RMSPE: 0.245822
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000323893	training's RMSPE: 0.222125	valid_1's rmse: 0.000343677	valid_1's RMSPE: 0.235159
[200]	training's rmse: 0.000315516	training's RMSPE: 0.216381	valid_1's rmse: 0.000338029	valid_1's RMSPE: 0.231295
[300]	training's rmse: 0.000310725	training's RMSPE: 0.213095	valid_1's rmse: 0.000334501	valid_1's RMSPE: 0.22888
Early stopping, best iteration is:
[347]	training's rmse: 0.000308923	training's RMSPE: 0.211859	valid_1's rmse: 0.00033413	valid_1's RMSPE: 0.228627
Training fold 5




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000322635	training's RMSPE: 0.222249	valid_1's rmse: 0.000336257	valid_1's RMSPE: 0.225914
[200]	training's rmse: 0.000314039	training's RMSPE: 0.216327	valid_1's rmse: 0.000332355	valid_1's RMSPE: 0.223293
[300]	training's rmse: 0.000309389	training's RMSPE: 0.213124	valid_1's rmse: 0.000330444	valid_1's RMSPE: 0.222009
[400]	training's rmse: 0.000306236	training's RMSPE: 0.210952	valid_1's rmse: 0.000329441	valid_1's RMSPE: 0.221335
[500]	training's rmse: 0.000303719	training's RMSPE: 0.209218	valid_1's rmse: 0.000328984	valid_1's RMSPE: 0.221028
[600]	training's rmse: 0.000301583	training's RMSPE: 0.207747	valid_1's rmse: 0.000328692	valid_1's RMSPE: 0.220832
[700]	training's rmse: 0.000299705	training's RMSPE: 0.206453	valid_1's rmse: 0.000328403	valid_1's RMSPE: 0.220638
Early stopping, best iteration is:
[710]	training's rmse: 0.000299511	training's RMSPE: 0.20632	valid_1's rmse: 0.000328358	val

## Baseline LGMB 10m

In [29]:
train_10m = pd.read_csv('train_with_features_NO_ST.csv')

In [30]:
from optiver_features import get_time_stock
train_10m_time_stock = get_time_stock(train_10m)

In [31]:
_=train_models(train_10m_time_stock)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472369	training's RMSPE: 0.218697	valid_1's rmse: 0.000483167	valid_1's RMSPE: 0.223294
[200]	training's rmse: 0.000456434	training's RMSPE: 0.211319	valid_1's rmse: 0.000471963	valid_1's RMSPE: 0.218117
[300]	training's rmse: 0.000444947	training's RMSPE: 0.206001	valid_1's rmse: 0.000463557	valid_1's RMSPE: 0.214231
[400]	training's rmse: 0.00043609	training's RMSPE: 0.201901	valid_1's rmse: 0.000457713	valid_1's RMSPE: 0.211531
[500]	training's rmse: 0.000428288	training's RMSPE: 0.198288	valid_1's rmse: 0.000452597	valid_1's RMSPE: 0.209167
[600]	training's rmse: 0.000422108	training's RMSPE: 0.195427	valid_1's rmse: 0.000449117	valid_1's RMSPE: 0.207558
[700]	training's rmse: 0.000416375	training's RMSPE: 0.192773	valid_1's rmse: 0.000445402	valid_1's RMSPE: 0.205841
[800]	training's rmse: 0.000410935	training's RMSPE: 0.190254	valid_1's rmse: 0.00044216	valid_1's RMSPE: 0.204343
[900]	trainin



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.00047405	training's RMSPE: 0.219247	valid_1's rmse: 0.000493132	valid_1's RMSPE: 0.228847
[200]	training's rmse: 0.000457643	training's RMSPE: 0.211659	valid_1's rmse: 0.000482029	valid_1's RMSPE: 0.223694
[300]	training's rmse: 0.000446028	training's RMSPE: 0.206287	valid_1's rmse: 0.000474004	valid_1's RMSPE: 0.21997
[400]	training's rmse: 0.000437256	training's RMSPE: 0.20223	valid_1's rmse: 0.000468424	valid_1's RMSPE: 0.217381
[500]	training's rmse: 0.000429788	training's RMSPE: 0.198776	valid_1's rmse: 0.000463568	valid_1's RMSPE: 0.215127
Early stopping, best iteration is:
[546]	training's rmse: 0.000426411	training's RMSPE: 0.197214	valid_1's rmse: 0.0004622	valid_1's RMSPE: 0.214492
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000471619	training's RMSPE: 0.21837	valid_1's rmse: 0.000478397	valid_1's RMSPE: 0.221005
[200]	training's rmse: 0.000456622	training's RMSPE: 0.211427	valid_1's rmse: 0.000466537	valid_1's RMSPE: 0.215526
[300]	training's rmse: 0.000445607	training's RMSPE: 0.206327	valid_1's rmse: 0.000458002	valid_1's RMSPE: 0.211583
[400]	training's rmse: 0.000436655	training's RMSPE: 0.202182	valid_1's rmse: 0.000451147	valid_1's RMSPE: 0.208416
[500]	training's rmse: 0.000429267	training's RMSPE: 0.198761	valid_1's rmse: 0.000445774	valid_1's RMSPE: 0.205934
[600]	training's rmse: 0.000422695	training's RMSPE: 0.195718	valid_1's rmse: 0.000441604	valid_1's RMSPE: 0.204008
[700]	training's rmse: 0.00041678	training's RMSPE: 0.192979	valid_1's rmse: 0.000438011	valid_1's RMSPE: 0.202348
[800]	training's rmse: 0.000411842	training's RMSPE: 0.190692	valid_1's rmse: 0.000434914	valid_1's RMSPE: 0.200917
[900]	trainin



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473051	training's RMSPE: 0.218659	valid_1's rmse: 0.000479689	valid_1's RMSPE: 0.223117
[200]	training's rmse: 0.000457686	training's RMSPE: 0.211557	valid_1's rmse: 0.000469326	valid_1's RMSPE: 0.218297
[300]	training's rmse: 0.000445643	training's RMSPE: 0.20599	valid_1's rmse: 0.000460368	valid_1's RMSPE: 0.214131
[400]	training's rmse: 0.000436376	training's RMSPE: 0.201707	valid_1's rmse: 0.000453278	valid_1's RMSPE: 0.210833
[500]	training's rmse: 0.000428336	training's RMSPE: 0.197991	valid_1's rmse: 0.0004482	valid_1's RMSPE: 0.208471
[600]	training's rmse: 0.000422644	training's RMSPE: 0.19536	valid_1's rmse: 0.000446044	valid_1's RMSPE: 0.207468
[700]	training's rmse: 0.000417281	training's RMSPE: 0.192881	valid_1's rmse: 0.000443371	valid_1's RMSPE: 0.206225
[800]	training's rmse: 0.00041215	training's RMSPE: 0.190509	valid_1's rmse: 0.000440851	valid_1's RMSPE: 0.205053
[900]	training's



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000471791	training's RMSPE: 0.218595	valid_1's rmse: 0.000485867	valid_1's RMSPE: 0.223856
[200]	training's rmse: 0.000456254	training's RMSPE: 0.211397	valid_1's rmse: 0.000472728	valid_1's RMSPE: 0.217803
[300]	training's rmse: 0.000444348	training's RMSPE: 0.20588	valid_1's rmse: 0.000463477	valid_1's RMSPE: 0.213541
[400]	training's rmse: 0.00043573	training's RMSPE: 0.201887	valid_1's rmse: 0.000457275	valid_1's RMSPE: 0.210683
[500]	training's rmse: 0.000428962	training's RMSPE: 0.198751	valid_1's rmse: 0.000453733	valid_1's RMSPE: 0.209051
[600]	training's rmse: 0.000422235	training's RMSPE: 0.195634	valid_1's rmse: 0.000448767	valid_1's RMSPE: 0.206763
[700]	training's rmse: 0.000416327	training's RMSPE: 0.192897	valid_1's rmse: 0.000445273	valid_1's RMSPE: 0.205153
[800]	training's rmse: 0.000411091	training's RMSPE: 0.190471	valid_1's rmse: 0.000442164	valid_1's RMSPE: 0.203721
[900]	trainin

## LGBM 10m with trained embeddings

In [32]:
train_10m = pd.read_csv('train_with_features_NO_ST.csv')

In [33]:
all_embs = []
for _, row in (train_10m[['stock_id', 'time_id']].iterrows()):
    emb1 = stock_id_embs[row.stock_id]
    emb2 = time_id_embs[row.time_id]
    all_embs.append(emb1+emb2)

In [34]:
embs_df = pd.DataFrame(all_embs)

train_10m_with_embs = pd.concat([train_10m.reset_index(), embs_df], axis=1)

In [35]:
train_10m_with_embs = train_10m_with_embs.drop('index', axis=1)

In [36]:
train_10m_with_embs.to_csv('train_10m_with_embs.csv', index=False)

In [37]:
_=train_models(train_10m_with_embs)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.00047901	training's RMSPE: 0.221771	valid_1's rmse: 0.000489841	valid_1's RMSPE: 0.226379
[200]	training's rmse: 0.000467092	training's RMSPE: 0.216254	valid_1's rmse: 0.000480855	valid_1's RMSPE: 0.222226
[300]	training's rmse: 0.000458438	training's RMSPE: 0.212247	valid_1's rmse: 0.000474623	valid_1's RMSPE: 0.219346
[400]	training's rmse: 0.000451401	training's RMSPE: 0.208989	valid_1's rmse: 0.000470069	valid_1's RMSPE: 0.217241
[500]	training's rmse: 0.000445753	training's RMSPE: 0.206374	valid_1's rmse: 0.000466887	valid_1's RMSPE: 0.215771
[600]	training's rmse: 0.000440729	training's RMSPE: 0.204048	valid_1's rmse: 0.000464151	valid_1's RMSPE: 0.214506
[700]	training's rmse: 0.000436425	training's RMSPE: 0.202055	valid_1's rmse: 0.000461894	valid_1's RMSPE: 0.213463
[800]	training's rmse: 0.000431708	training's RMSPE: 0.199872	valid_1's rmse: 0.000459095	valid_1's RMSPE: 0.21217
[900]	trainin



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000480762	training's RMSPE: 0.222351	valid_1's rmse: 0.00050041	valid_1's RMSPE: 0.232225
[200]	training's rmse: 0.000468734	training's RMSPE: 0.216789	valid_1's rmse: 0.000493312	valid_1's RMSPE: 0.228931
[300]	training's rmse: 0.000459816	training's RMSPE: 0.212664	valid_1's rmse: 0.000490282	valid_1's RMSPE: 0.227524
Early stopping, best iteration is:
[306]	training's rmse: 0.000459302	training's RMSPE: 0.212426	valid_1's rmse: 0.000489911	valid_1's RMSPE: 0.227352
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000478773	training's RMSPE: 0.221683	valid_1's rmse: 0.000486042	valid_1's RMSPE: 0.224537
[200]	training's rmse: 0.000467412	training's RMSPE: 0.216422	valid_1's rmse: 0.000478249	valid_1's RMSPE: 0.220936
[300]	training's rmse: 0.000459035	training's RMSPE: 0.212544	valid_1's rmse: 0.000473034	valid_1's RMSPE: 0.218527
[400]	training's rmse: 0.0004524	training's RMSPE: 0.209472	valid_1's rmse: 0.000469162	valid_1's RMSPE: 0.216738
[500]	training's rmse: 0.00044625	training's RMSPE: 0.206624	valid_1's rmse: 0.000465329	valid_1's RMSPE: 0.214968
[600]	training's rmse: 0.00044085	training's RMSPE: 0.204124	valid_1's rmse: 0.000462228	valid_1's RMSPE: 0.213535
[700]	training's rmse: 0.000436317	training's RMSPE: 0.202025	valid_1's rmse: 0.000459871	valid_1's RMSPE: 0.212446
[800]	training's rmse: 0.000432101	training's RMSPE: 0.200073	valid_1's rmse: 0.000457502	valid_1's RMSPE: 0.211352
[900]	training'



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000479458	training's RMSPE: 0.221621	valid_1's rmse: 0.000487176	valid_1's RMSPE: 0.2266
[200]	training's rmse: 0.000467345	training's RMSPE: 0.216022	valid_1's rmse: 0.000480197	valid_1's RMSPE: 0.223354
[300]	training's rmse: 0.000458639	training's RMSPE: 0.211998	valid_1's rmse: 0.000475215	valid_1's RMSPE: 0.221036
[400]	training's rmse: 0.000452067	training's RMSPE: 0.20896	valid_1's rmse: 0.000471546	valid_1's RMSPE: 0.21933
[500]	training's rmse: 0.000446169	training's RMSPE: 0.206233	valid_1's rmse: 0.000468179	valid_1's RMSPE: 0.217764
[600]	training's rmse: 0.000440701	training's RMSPE: 0.203706	valid_1's rmse: 0.000465578	valid_1's RMSPE: 0.216554
[700]	training's rmse: 0.000436238	training's RMSPE: 0.201643	valid_1's rmse: 0.000463425	valid_1's RMSPE: 0.215553
[800]	training's rmse: 0.000432022	training's RMSPE: 0.199694	valid_1's rmse: 0.000461691	valid_1's RMSPE: 0.214746
[900]	training'



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000478615	training's RMSPE: 0.221757	valid_1's rmse: 0.000493492	valid_1's RMSPE: 0.22737
[200]	training's rmse: 0.000466963	training's RMSPE: 0.216358	valid_1's rmse: 0.000485257	valid_1's RMSPE: 0.223575
[300]	training's rmse: 0.000458001	training's RMSPE: 0.212206	valid_1's rmse: 0.000479539	valid_1's RMSPE: 0.220941
[400]	training's rmse: 0.000451343	training's RMSPE: 0.209121	valid_1's rmse: 0.000475593	valid_1's RMSPE: 0.219123
[500]	training's rmse: 0.000445199	training's RMSPE: 0.206274	valid_1's rmse: 0.000473017	valid_1's RMSPE: 0.217936
[600]	training's rmse: 0.000440061	training's RMSPE: 0.203894	valid_1's rmse: 0.000471083	valid_1's RMSPE: 0.217045
Early stopping, best iteration is:
[588]	training's rmse: 0.000440504	training's RMSPE: 0.204099	valid_1's rmse: 0.000470597	valid_1's RMSPE: 0.216821
Our out of folds RMSPE is 0.21368934452167634


## LGBM with embeddings and time/stock features

In [38]:
train_10m_with_embs = get_time_stock(train_10m_with_embs)

In [None]:
_=train_models(train_10m_with_embs)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000469478	training's RMSPE: 0.217358	valid_1's rmse: 0.000479145	valid_1's RMSPE: 0.221435
[200]	training's rmse: 0.000453486	training's RMSPE: 0.209954	valid_1's rmse: 0.000466979	valid_1's RMSPE: 0.215813
[300]	training's rmse: 0.000441454	training's RMSPE: 0.204384	valid_1's rmse: 0.00045842	valid_1's RMSPE: 0.211858
[400]	training's rmse: 0.000432921	training's RMSPE: 0.200433	valid_1's rmse: 0.000452534	valid_1's RMSPE: 0.209137
[500]	training's rmse: 0.000425393	training's RMSPE: 0.196948	valid_1's rmse: 0.000448454	valid_1's RMSPE: 0.207252
[600]	training's rmse: 0.000419201	training's RMSPE: 0.194081	valid_1's rmse: 0.000444025	valid_1's RMSPE: 0.205205
[700]	training's rmse: 0.000413493	training's RMSPE: 0.191438	valid_1's rmse: 0.000440702	valid_1's RMSPE: 0.203669
[800]	training's rmse: 0.000408977	training's RMSPE: 0.189348	valid_1's rmse: 0.00043858	valid_1's RMSPE: 0.202689
[900]	trainin



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000470553	training's RMSPE: 0.21763	valid_1's rmse: 0.000491471	valid_1's RMSPE: 0.228076
[200]	training's rmse: 0.000453771	training's RMSPE: 0.209868	valid_1's rmse: 0.000479281	valid_1's RMSPE: 0.222419
[300]	training's rmse: 0.00044215	training's RMSPE: 0.204494	valid_1's rmse: 0.00047138	valid_1's RMSPE: 0.218753
[400]	training's rmse: 0.000433854	training's RMSPE: 0.200657	valid_1's rmse: 0.000466737	valid_1's RMSPE: 0.216598
[500]	training's rmse: 0.000425961	training's RMSPE: 0.197006	valid_1's rmse: 0.000461459	valid_1's RMSPE: 0.214149
[600]	training's rmse: 0.000419592	training's RMSPE: 0.194061	valid_1's rmse: 0.000458524	valid_1's RMSPE: 0.212786
Early stopping, best iteration is:
[597]	training's rmse: 0.000419692	training's RMSPE: 0.194107	valid_1's rmse: 0.000458129	valid_1's RMSPE: 0.212603
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000468438	training's RMSPE: 0.216897	valid_1's rmse: 0.000474751	valid_1's RMSPE: 0.21932
[200]	training's rmse: 0.000453264	training's RMSPE: 0.209872	valid_1's rmse: 0.000462788	valid_1's RMSPE: 0.213794
[300]	training's rmse: 0.000442085	training's RMSPE: 0.204695	valid_1's rmse: 0.00045425	valid_1's RMSPE: 0.20985
[400]	training's rmse: 0.000432756	training's RMSPE: 0.200376	valid_1's rmse: 0.000447251	valid_1's RMSPE: 0.206616
[500]	training's rmse: 0.000425562	training's RMSPE: 0.197045	valid_1's rmse: 0.000442327	valid_1's RMSPE: 0.204342
[600]	training's rmse: 0.000418904	training's RMSPE: 0.193963	valid_1's rmse: 0.00043767	valid_1's RMSPE: 0.20219
[700]	training's rmse: 0.000412951	training's RMSPE: 0.191206	valid_1's rmse: 0.000433831	valid_1's RMSPE: 0.200417
[800]	training's rmse: 0.000408257	training's RMSPE: 0.189032	valid_1's rmse: 0.000431084	valid_1's RMSPE: 0.199148
[900]	training's



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000469529	training's RMSPE: 0.217031	valid_1's rmse: 0.000476678	valid_1's RMSPE: 0.221717
[200]	training's rmse: 0.000453969	training's RMSPE: 0.209839	valid_1's rmse: 0.000466392	valid_1's RMSPE: 0.216933
[300]	training's rmse: 0.000442021	training's RMSPE: 0.204316	valid_1's rmse: 0.000457537	valid_1's RMSPE: 0.212814
[400]	training's rmse: 0.000433243	training's RMSPE: 0.200258	valid_1's rmse: 0.000451738	valid_1's RMSPE: 0.210117
[500]	training's rmse: 0.000425275	training's RMSPE: 0.196575	valid_1's rmse: 0.000446485	valid_1's RMSPE: 0.207673
[600]	training's rmse: 0.000419041	training's RMSPE: 0.193694	valid_1's rmse: 0.000442938	valid_1's RMSPE: 0.206024
[700]	training's rmse: 0.000413531	training's RMSPE: 0.191147	valid_1's rmse: 0.000439892	valid_1's RMSPE: 0.204607
[800]	training's rmse: 0.000408522	training's RMSPE: 0.188832	valid_1's rmse: 0.000437022	valid_1's RMSPE: 0.203272
[900]	train



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000468379	training's RMSPE: 0.217014	valid_1's rmse: 0.00048083	valid_1's RMSPE: 0.221536
[200]	training's rmse: 0.000452861	training's RMSPE: 0.209824	valid_1's rmse: 0.000468084	valid_1's RMSPE: 0.215663
[300]	training's rmse: 0.00044098	training's RMSPE: 0.20432	valid_1's rmse: 0.000458318	valid_1's RMSPE: 0.211164
[400]	training's rmse: 0.000432071	training's RMSPE: 0.200192	valid_1's rmse: 0.000451827	valid_1's RMSPE: 0.208173
[500]	training's rmse: 0.000424664	training's RMSPE: 0.19676	valid_1's rmse: 0.000447542	valid_1's RMSPE: 0.206199
