In [1]:
%load_ext autoreload
%autoreload 2

In [95]:
from fastai.tabular.all import *
from multiprocessing import Pool
from sklearn.model_selection import KFold
import lightgbm as lgb

## Generate 5m dataset

In [5]:
data_dir = Path('../input/optiver-realized-volatility-prediction')

In [82]:
# Function to calculate first WAP
def calc_wap1(df):
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

# Function to calculate second WAP
def calc_wap2(df):
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    return np.log(series).diff()

# Calculate the realized volatility
def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

# Function to count unique elements of a series
def count_unique(series):
    return len(np.unique(series))

def book_preprocessor(df, stock_id):
    #df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    # Calculate log returns
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    # Dict for aggregations
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std]
    }
    
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    #df_feature_450 = get_stats_window(seconds_in_bucket = 450, add_suffix = True)
    #df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
    #df_feature_150 = get_stats_window(seconds_in_bucket = 150, add_suffix = True)
    
    # Merge all
    #df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
    #df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    #df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    # Drop unnecesary time_ids
    #df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    # Create row_id so we can merge
    #stock_id = str(file_path).split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.rename(columns={"time_id_": "time_id"})
    
    df_feature['stock_id'] = stock_id
    return df_feature

In [83]:
def trade_preprocessor(df, stock_id):
    #df = pd.read_parquet(file_path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    
    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
    }
    
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
#     df_feature_450 = get_stats_window(seconds_in_bucket = 450, add_suffix = True)
#     df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
#     df_feature_150 = get_stats_window(seconds_in_bucket = 150, add_suffix = True)

    # Merge all
#     df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
#     df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
#     df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    # Drop unnecesary time_ids
#     df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    df_feature = df_feature.add_prefix('trade_')
    #stock_id = str(file_path).split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

In [84]:
def realized_volatility_per_time_id(df_book_data, stock_id):
    df_book_data['wap'] =(df_book_data['bid_price1'] * df_book_data['ask_size1']+df_book_data['ask_price1'] * df_book_data['bid_size1'])  / (
                                      df_book_data['bid_size1']+ df_book_data[
                                  'ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock =  pd.DataFrame(df_book_data.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns = {'log_return':'target'})
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x:f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id', 'target']]

In [85]:
def preprocess_one_stock(stock_id, typ='train'):
    book_df = pd.read_parquet(data_dir / f'book_{typ}.parquet/stock_id={stock_id}')
    trade_df = pd.read_parquet(data_dir / f'trade_{typ}.parquet/stock_id={stock_id}')
    book_5m  = book_preprocessor(book_df[book_df.seconds_in_bucket < 300].copy(), stock_id)
    trade_5m = trade_preprocessor(trade_df[trade_df.seconds_in_bucket < 300].copy(), stock_id)
    realized_vol = realized_volatility_per_time_id(book_df[book_df.seconds_in_bucket >= 300].copy(), stock_id)
    res = pd.merge(book_5m, trade_5m, on = 'row_id', how = 'left')
    res = pd.merge(res, realized_vol, on = 'row_id', how='left')
    return res

In [86]:
def preprocess_all(list_stock_ids, typ='train'):
    pool = Pool(16)
    df = pool.starmap(preprocess_one_stock, zip(list_stock_ids, [typ]*len(list_stock_ids)))
    df = pd.concat(df, ignore_index = True)
    return df


In [87]:
def generate_train_df():
    train_df = pd.read_csv(data_dir/'train.csv')
    train_stock_ids = train_df['stock_id'].unique()
    train_features = preprocess_all(train_stock_ids, 'train')
    return train_features

In [88]:
train_df = generate_train_df()


In [92]:
train_df.to_csv('train_5m.df', index=False)

In [122]:
len(train_df)

428932

In [123]:
train_df = train_df[train_df.target!=0]
len(train_df)

428931

## LGBM Baseline

In [242]:
def rmspe_np(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe_np(y_true, y_pred), False

def train_models(train):
    # Hyperparammeters (optimized)
    seed = 29
    params = {
        'learning_rate': 0.1,        
        'lambda_l1': 2,
        'lambda_l2': 7,
        'num_leaves': 800,
        'min_sum_hessian_in_leaf': 20,
        'feature_fraction': 0.8,
        'feature_fraction_bynode': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 42,
        'min_data_in_leaf': 700,
        'max_depth': 4,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }   
    
    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    models =[]
    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create a KFold object
    kfold = KFold(n_splits = 5, random_state = 1111, shuffle = True)
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        model = lgb.train(params = params, 
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, val_dataset], 
                          num_boost_round = 3000, 
                          early_stopping_rounds = 25, 
                          verbose_eval = 100,
                          feval = feval_rmspe)
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)
        # Predict the test set
        #test_predictions += model.predict(x_test) / 10
        
    rmspe_score = rmspe_np(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return models

In [129]:
_=train_models(train_df)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000356777	training's RMSPE: 0.245152	valid_1's rmse: 0.000365831	valid_1's RMSPE: 0.250684
[200]	training's rmse: 0.000348716	training's RMSPE: 0.239613	valid_1's rmse: 0.000362449	valid_1's RMSPE: 0.248367
[300]	training's rmse: 0.000343809	training's RMSPE: 0.236241	valid_1's rmse: 0.000360826	valid_1's RMSPE: 0.247254
[400]	training's rmse: 0.000340477	training's RMSPE: 0.233952	valid_1's rmse: 0.000359873	valid_1's RMSPE: 0.246601
[500]	training's rmse: 0.000337652	training's RMSPE: 0.23201	valid_1's rmse: 0.000359379	valid_1's RMSPE: 0.246263
Early stopping, best iteration is:
[500]	training's rmse: 0.000337652	training's RMSPE: 0.23201	valid_1's rmse: 0.000359379	valid_1's RMSPE: 0.246263
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000354494	training's RMSPE: 0.244011	valid_1's rmse: 0.000382615	valid_1's RMSPE: 0.260327
[200]	training's rmse: 0.000346692	training's RMSPE: 0.23864	valid_1's rmse: 0.000380447	valid_1's RMSPE: 0.258852
[300]	training's rmse: 0.000342182	training's RMSPE: 0.235536	valid_1's rmse: 0.00037942	valid_1's RMSPE: 0.258153
Early stopping, best iteration is:
[328]	training's rmse: 0.000341221	training's RMSPE: 0.234874	valid_1's rmse: 0.000378957	valid_1's RMSPE: 0.257838
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000357652	training's RMSPE: 0.243589	valid_1's rmse: 0.000406593	valid_1's RMSPE: 0.288275
Early stopping, best iteration is:
[113]	training's rmse: 0.000356311	training's RMSPE: 0.242675	valid_1's rmse: 0.000405175	valid_1's RMSPE: 0.287269
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000357161	training's RMSPE: 0.245391	valid_1's rmse: 0.000379726	valid_1's RMSPE: 0.26031
[200]	training's rmse: 0.000348714	training's RMSPE: 0.239588	valid_1's rmse: 0.000373891	valid_1's RMSPE: 0.256311
Early stopping, best iteration is:
[237]	training's rmse: 0.000346784	training's RMSPE: 0.238262	valid_1's rmse: 0.000373028	valid_1's RMSPE: 0.255719
Training fold 5




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000355604	training's RMSPE: 0.245413	valid_1's rmse: 0.000367851	valid_1's RMSPE: 0.247593
[200]	training's rmse: 0.000346949	training's RMSPE: 0.23944	valid_1's rmse: 0.000363801	valid_1's RMSPE: 0.244867
[300]	training's rmse: 0.000342167	training's RMSPE: 0.23614	valid_1's rmse: 0.000362306	valid_1's RMSPE: 0.243861
[400]	training's rmse: 0.000338851	training's RMSPE: 0.233851	valid_1's rmse: 0.000361438	valid_1's RMSPE: 0.243276
[500]	training's rmse: 0.000336158	training's RMSPE: 0.231992	valid_1's rmse: 0.000360872	valid_1's RMSPE: 0.242895
Early stopping, best iteration is:
[491]	training's rmse: 0.000336314	training's RMSPE: 0.2321	valid_1's rmse: 0.000360869	valid_1's RMSPE: 0.242893
Our out of folds RMSPE is 0.2584719789290689


## Learning embeddings

In [138]:
cont_nn,cat_nn = cont_cat_split(train_df, max_card=9000, dep_var='target')
cat_nn.remove('row_id')

In [139]:
cat_nn

['time_id', 'stock_id']

In [145]:
procs_nn = [Categorify,FillMissing, Normalize]

splits = RandomSplitter()(train_df)

dls = TabularPandas(train_df, procs_nn, cat_nn, cont_nn,
                      splits=splits, 
                       
                      y_names='target').dataloaders(1024)

In [279]:
def rmspe(preds, targs):
    x = (targs-preds)/targs
    return (x**2).mean().sqrt()

config={'lin_first':True, 'ps':[.5,.2,0], 'embed_p':.25, }
learn = tabular_learner(dls, y_range=(0,.1), layers=[1000,400,200], emb_szs={'stock_id':16, 'time_id':64}, 
                        n_out=1, loss_func = rmspe, metrics=AccumMetric(rmspe), config=config,wd=.2)

In [280]:
learn.fit_one_cycle(70, 5e-3)

epoch,train_loss,valid_loss,rmspe,time
0,4.492495,3.304515,3.333452,00:09
1,1.624791,1.407325,1.498507,00:05
2,0.786906,0.73548,0.927203,00:06
3,0.299071,0.270267,0.272632,00:06
4,0.267113,0.250551,0.25311,00:05
5,0.266898,0.251298,0.253329,00:05
6,0.350433,0.275506,0.277909,00:05
7,0.29588,0.257843,0.26185,00:06
8,0.262504,0.24626,0.249931,00:06
9,0.253144,0.250773,0.263207,00:05


In [283]:
torch.save(learn.model.embeds[0].weight.data, 'timeemb64.pt')

In [180]:
categorify = dls.procs[2]
len(categorify.classes['time_id'])

3831

In [213]:
time_id_embs = dict()
for idx, time_id in enumerate(categorify.classes['time_id']):
    time_id_embs[time_id] = learn.model.embeds[0].weight[idx].tolist()

stock_id_embs = dict()
for idx, stock_id in enumerate(categorify.classes['stock_id']):
    stock_id_embs[stock_id] = learn.model.embeds[1].weight[idx].tolist()

all_embs = []
for _, row in (train_df[['stock_id', 'time_id']].iterrows()):
    emb1 = stock_id_embs[row.stock_id]
    emb2 = time_id_embs[row.time_id]
    all_embs.append(emb1+emb2)

In [214]:
np.array(all_embs).shape

(428931, 185)

In [239]:
embs_df = pd.DataFrame(all_embs)

train_with_embs = pd.concat([train_df.reset_index(), embs_df], axis=1)

train_with_embs

## Train 5m LGBM with embeddings


In [243]:
_=train_models(train_with_embs)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000321247	training's RMSPE: 0.220738	valid_1's rmse: 0.000331815	valid_1's RMSPE: 0.227375
[200]	training's rmse: 0.000310803	training's RMSPE: 0.213562	valid_1's rmse: 0.000325983	valid_1's RMSPE: 0.223378
[300]	training's rmse: 0.000304356	training's RMSPE: 0.209132	valid_1's rmse: 0.000323741	valid_1's RMSPE: 0.221842
[400]	training's rmse: 0.000299905	training's RMSPE: 0.206073	valid_1's rmse: 0.000321768	valid_1's RMSPE: 0.22049
[500]	training's rmse: 0.000296391	training's RMSPE: 0.203659	valid_1's rmse: 0.000321026	valid_1's RMSPE: 0.219982
[600]	training's rmse: 0.000293156	training's RMSPE: 0.201436	valid_1's rmse: 0.000320485	valid_1's RMSPE: 0.219611
Early stopping, best iteration is:
[598]	training's rmse: 0.000293213	training's RMSPE: 0.201475	valid_1's rmse: 0.000320427	valid_1's RMSPE: 0.219571
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000319558	training's RMSPE: 0.219963	valid_1's rmse: 0.000346515	valid_1's RMSPE: 0.235765
[200]	training's rmse: 0.000309336	training's RMSPE: 0.212927	valid_1's rmse: 0.000341357	valid_1's RMSPE: 0.232255
[300]	training's rmse: 0.000303323	training's RMSPE: 0.208788	valid_1's rmse: 0.000338558	valid_1's RMSPE: 0.230351
[400]	training's rmse: 0.000299239	training's RMSPE: 0.205977	valid_1's rmse: 0.000336807	valid_1's RMSPE: 0.22916
[500]	training's rmse: 0.000295855	training's RMSPE: 0.203647	valid_1's rmse: 0.00033606	valid_1's RMSPE: 0.228651
[600]	training's rmse: 0.000293012	training's RMSPE: 0.201691	valid_1's rmse: 0.000335435	valid_1's RMSPE: 0.228226
Early stopping, best iteration is:
[577]	training's rmse: 0.000293639	training's RMSPE: 0.202122	valid_1's rmse: 0.000335384	valid_1's RMSPE: 0.228191
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000322604	training's RMSPE: 0.219718	valid_1's rmse: 0.000386358	valid_1's RMSPE: 0.273929
[200]	training's rmse: 0.000312497	training's RMSPE: 0.212835	valid_1's rmse: 0.000366235	valid_1's RMSPE: 0.259661
Early stopping, best iteration is:
[244]	training's rmse: 0.000309899	training's RMSPE: 0.211065	valid_1's rmse: 0.000359847	valid_1's RMSPE: 0.255132
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.0003212	training's RMSPE: 0.220684	valid_1's rmse: 0.00034142	valid_1's RMSPE: 0.234051
[200]	training's rmse: 0.000310534	training's RMSPE: 0.213356	valid_1's rmse: 0.000332464	valid_1's RMSPE: 0.227911
[300]	training's rmse: 0.000304149	training's RMSPE: 0.208969	valid_1's rmse: 0.000328246	valid_1's RMSPE: 0.22502
[400]	training's rmse: 0.000299562	training's RMSPE: 0.205817	valid_1's rmse: 0.000326137	valid_1's RMSPE: 0.223574
[500]	training's rmse: 0.000296156	training's RMSPE: 0.203477	valid_1's rmse: 0.000324966	valid_1's RMSPE: 0.222771
[600]	training's rmse: 0.000292991	training's RMSPE: 0.201303	valid_1's rmse: 0.000324561	valid_1's RMSPE: 0.222494
[700]	training's rmse: 0.000290421	training's RMSPE: 0.199537	valid_1's rmse: 0.000323758	valid_1's RMSPE: 0.221943
Early stopping, best iteration is:
[724]	training's rmse: 0.00028987	training's RMSPE: 0.199158	valid_1's rmse: 0.000323645	valid_1



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.00031992	training's RMSPE: 0.220786	valid_1's rmse: 0.000334967	valid_1's RMSPE: 0.225459
[200]	training's rmse: 0.000309178	training's RMSPE: 0.213373	valid_1's rmse: 0.00032915	valid_1's RMSPE: 0.221544
[300]	training's rmse: 0.000303029	training's RMSPE: 0.209129	valid_1's rmse: 0.000326102	valid_1's RMSPE: 0.219492
[400]	training's rmse: 0.000298654	training's RMSPE: 0.20611	valid_1's rmse: 0.000324334	valid_1's RMSPE: 0.218302
[500]	training's rmse: 0.000295011	training's RMSPE: 0.203596	valid_1's rmse: 0.000323338	valid_1's RMSPE: 0.217632
[600]	training's rmse: 0.000292249	training's RMSPE: 0.20169	valid_1's rmse: 0.000322698	valid_1's RMSPE: 0.217202
Early stopping, best iteration is:
[584]	training's rmse: 0.00029264	training's RMSPE: 0.201959	valid_1's rmse: 0.000322624	valid_1's RMSPE: 0.217151
Our out of folds RMSPE is 0.22880301189538851


## Baseline LGMB 10m

In [245]:
train_10m = pd.read_csv('train_with_features_NO_ST.csv')

In [248]:
from optiver_features import get_time_stock
train_10m_time_stock = get_time_stock(train_10m)

In [249]:
_=train_models(train_10m_time_stock)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472369	training's RMSPE: 0.218697	valid_1's rmse: 0.000483167	valid_1's RMSPE: 0.223294
[200]	training's rmse: 0.000456434	training's RMSPE: 0.211319	valid_1's rmse: 0.000471963	valid_1's RMSPE: 0.218117
[300]	training's rmse: 0.000444947	training's RMSPE: 0.206001	valid_1's rmse: 0.000463557	valid_1's RMSPE: 0.214231
[400]	training's rmse: 0.00043609	training's RMSPE: 0.201901	valid_1's rmse: 0.000457713	valid_1's RMSPE: 0.211531
[500]	training's rmse: 0.000428288	training's RMSPE: 0.198288	valid_1's rmse: 0.000452597	valid_1's RMSPE: 0.209167
[600]	training's rmse: 0.000422108	training's RMSPE: 0.195427	valid_1's rmse: 0.000449117	valid_1's RMSPE: 0.207558
[700]	training's rmse: 0.000416375	training's RMSPE: 0.192773	valid_1's rmse: 0.000445402	valid_1's RMSPE: 0.205841
[800]	training's rmse: 0.000410935	training's RMSPE: 0.190254	valid_1's rmse: 0.00044216	valid_1's RMSPE: 0.204343
[900]	trainin



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.00047405	training's RMSPE: 0.219247	valid_1's rmse: 0.000493132	valid_1's RMSPE: 0.228847
[200]	training's rmse: 0.000457643	training's RMSPE: 0.211659	valid_1's rmse: 0.000482029	valid_1's RMSPE: 0.223694
[300]	training's rmse: 0.000446028	training's RMSPE: 0.206287	valid_1's rmse: 0.000474004	valid_1's RMSPE: 0.21997
[400]	training's rmse: 0.000437256	training's RMSPE: 0.20223	valid_1's rmse: 0.000468424	valid_1's RMSPE: 0.217381
[500]	training's rmse: 0.000429788	training's RMSPE: 0.198776	valid_1's rmse: 0.000463568	valid_1's RMSPE: 0.215127
Early stopping, best iteration is:
[546]	training's rmse: 0.000426411	training's RMSPE: 0.197214	valid_1's rmse: 0.0004622	valid_1's RMSPE: 0.214492
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000471619	training's RMSPE: 0.21837	valid_1's rmse: 0.000478397	valid_1's RMSPE: 0.221005
[200]	training's rmse: 0.000456622	training's RMSPE: 0.211427	valid_1's rmse: 0.000466537	valid_1's RMSPE: 0.215526
[300]	training's rmse: 0.000445607	training's RMSPE: 0.206327	valid_1's rmse: 0.000458002	valid_1's RMSPE: 0.211583
[400]	training's rmse: 0.000436655	training's RMSPE: 0.202182	valid_1's rmse: 0.000451147	valid_1's RMSPE: 0.208416
[500]	training's rmse: 0.000429267	training's RMSPE: 0.198761	valid_1's rmse: 0.000445774	valid_1's RMSPE: 0.205934
[600]	training's rmse: 0.000422695	training's RMSPE: 0.195718	valid_1's rmse: 0.000441604	valid_1's RMSPE: 0.204008
[700]	training's rmse: 0.00041678	training's RMSPE: 0.192979	valid_1's rmse: 0.000438011	valid_1's RMSPE: 0.202348
[800]	training's rmse: 0.000411842	training's RMSPE: 0.190692	valid_1's rmse: 0.000434914	valid_1's RMSPE: 0.200917
[900]	trainin



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473051	training's RMSPE: 0.218659	valid_1's rmse: 0.000479689	valid_1's RMSPE: 0.223117
[200]	training's rmse: 0.000457686	training's RMSPE: 0.211557	valid_1's rmse: 0.000469326	valid_1's RMSPE: 0.218297
[300]	training's rmse: 0.000445643	training's RMSPE: 0.20599	valid_1's rmse: 0.000460368	valid_1's RMSPE: 0.214131
[400]	training's rmse: 0.000436376	training's RMSPE: 0.201707	valid_1's rmse: 0.000453278	valid_1's RMSPE: 0.210833
[500]	training's rmse: 0.000428336	training's RMSPE: 0.197991	valid_1's rmse: 0.0004482	valid_1's RMSPE: 0.208471
[600]	training's rmse: 0.000422644	training's RMSPE: 0.19536	valid_1's rmse: 0.000446044	valid_1's RMSPE: 0.207468
[700]	training's rmse: 0.000417281	training's RMSPE: 0.192881	valid_1's rmse: 0.000443371	valid_1's RMSPE: 0.206225
[800]	training's rmse: 0.00041215	training's RMSPE: 0.190509	valid_1's rmse: 0.000440851	valid_1's RMSPE: 0.205053
[900]	training's



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000471791	training's RMSPE: 0.218595	valid_1's rmse: 0.000485867	valid_1's RMSPE: 0.223856
[200]	training's rmse: 0.000456254	training's RMSPE: 0.211397	valid_1's rmse: 0.000472728	valid_1's RMSPE: 0.217803
[300]	training's rmse: 0.000444348	training's RMSPE: 0.20588	valid_1's rmse: 0.000463477	valid_1's RMSPE: 0.213541
[400]	training's rmse: 0.00043573	training's RMSPE: 0.201887	valid_1's rmse: 0.000457275	valid_1's RMSPE: 0.210683
[500]	training's rmse: 0.000428962	training's RMSPE: 0.198751	valid_1's rmse: 0.000453733	valid_1's RMSPE: 0.209051
[600]	training's rmse: 0.000422235	training's RMSPE: 0.195634	valid_1's rmse: 0.000448767	valid_1's RMSPE: 0.206763
[700]	training's rmse: 0.000416327	training's RMSPE: 0.192897	valid_1's rmse: 0.000445273	valid_1's RMSPE: 0.205153
[800]	training's rmse: 0.000411091	training's RMSPE: 0.190471	valid_1's rmse: 0.000442164	valid_1's RMSPE: 0.203721
[900]	trainin

## LGBM 10m with trained embeddings

In [272]:
train_10m = pd.read_csv('train_with_features_NO_ST.csv')

In [273]:
all_embs = []
for _, row in (train_10m[['stock_id', 'time_id']].iterrows()):
    emb1 = stock_id_embs[row.stock_id]
    emb2 = time_id_embs[row.time_id]
    all_embs.append(emb1+emb2)

In [274]:
embs_df = pd.DataFrame(all_embs)

train_10m_with_embs = pd.concat([train_10m.reset_index(), embs_df], axis=1)

In [275]:
train_10m_with_embs['total_volume_sum']

0          97696
1          82290
2          78274
3          52232
4          60407
           ...  
428927    125874
428928     54261
428929     89112
428930    170140
428931    115295
Name: total_volume_sum, Length: 428932, dtype: int64

In [276]:
train_10m_with_embs = train_10m_with_embs.drop('index', axis=1)

In [277]:
train_10m_with_embs.to_csv('train_10m_with_embs.csv', index=False)

In [278]:
_=train_models(train_10m_with_embs)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473355	training's RMSPE: 0.219154	valid_1's rmse: 0.000484163	valid_1's RMSPE: 0.223755
[200]	training's rmse: 0.000452656	training's RMSPE: 0.20957	valid_1's rmse: 0.000467218	valid_1's RMSPE: 0.215923
[300]	training's rmse: 0.000439053	training's RMSPE: 0.203272	valid_1's rmse: 0.000456849	valid_1's RMSPE: 0.211132
[400]	training's rmse: 0.000428527	training's RMSPE: 0.198399	valid_1's rmse: 0.000448811	valid_1's RMSPE: 0.207417
[500]	training's rmse: 0.000419537	training's RMSPE: 0.194237	valid_1's rmse: 0.000442528	valid_1's RMSPE: 0.204513
[600]	training's rmse: 0.000412516	training's RMSPE: 0.190986	valid_1's rmse: 0.000437815	valid_1's RMSPE: 0.202335
[700]	training's rmse: 0.000406529	training's RMSPE: 0.188215	valid_1's rmse: 0.000434523	valid_1's RMSPE: 0.200813
[800]	training's rmse: 0.000401461	training's RMSPE: 0.185868	valid_1's rmse: 0.000431864	valid_1's RMSPE: 0.199585
[900]	traini



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473481	training's RMSPE: 0.218984	valid_1's rmse: 0.000493451	valid_1's RMSPE: 0.228995
[200]	training's rmse: 0.000453699	training's RMSPE: 0.209835	valid_1's rmse: 0.000480195	valid_1's RMSPE: 0.222843
[300]	training's rmse: 0.000439207	training's RMSPE: 0.203133	valid_1's rmse: 0.000470383	valid_1's RMSPE: 0.21829
[400]	training's rmse: 0.000428393	training's RMSPE: 0.198131	valid_1's rmse: 0.000463588	valid_1's RMSPE: 0.215136
[500]	training's rmse: 0.000420321	training's RMSPE: 0.194398	valid_1's rmse: 0.000458937	valid_1's RMSPE: 0.212978
Early stopping, best iteration is:
[563]	training's rmse: 0.000415771	training's RMSPE: 0.192293	valid_1's rmse: 0.000456764	valid_1's RMSPE: 0.21197
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472517	training's RMSPE: 0.218786	valid_1's rmse: 0.00048034	valid_1's RMSPE: 0.221902
[200]	training's rmse: 0.000452674	training's RMSPE: 0.209599	valid_1's rmse: 0.00046345	valid_1's RMSPE: 0.2141
[300]	training's rmse: 0.000438596	training's RMSPE: 0.20308	valid_1's rmse: 0.000452447	valid_1's RMSPE: 0.209017
[400]	training's rmse: 0.000428032	training's RMSPE: 0.198189	valid_1's rmse: 0.000444419	valid_1's RMSPE: 0.205308
[500]	training's rmse: 0.000419785	training's RMSPE: 0.19437	valid_1's rmse: 0.000438992	valid_1's RMSPE: 0.202801
[600]	training's rmse: 0.000412835	training's RMSPE: 0.191152	valid_1's rmse: 0.000434654	valid_1's RMSPE: 0.200797
[700]	training's rmse: 0.000406894	training's RMSPE: 0.188402	valid_1's rmse: 0.000431387	valid_1's RMSPE: 0.199287
[800]	training's rmse: 0.000401619	training's RMSPE: 0.185959	valid_1's rmse: 0.000428921	valid_1's RMSPE: 0.198148
[900]	training's 



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472684	training's RMSPE: 0.218489	valid_1's rmse: 0.000479329	valid_1's RMSPE: 0.22295
[200]	training's rmse: 0.000453246	training's RMSPE: 0.209505	valid_1's rmse: 0.000465193	valid_1's RMSPE: 0.216375
[300]	training's rmse: 0.000439108	training's RMSPE: 0.20297	valid_1's rmse: 0.000454081	valid_1's RMSPE: 0.211206
[400]	training's rmse: 0.000428427	training's RMSPE: 0.198033	valid_1's rmse: 0.000447008	valid_1's RMSPE: 0.207916
[500]	training's rmse: 0.000419955	training's RMSPE: 0.194117	valid_1's rmse: 0.000441154	valid_1's RMSPE: 0.205194
[600]	training's rmse: 0.000413119	training's RMSPE: 0.190957	valid_1's rmse: 0.000437601	valid_1's RMSPE: 0.203541
[700]	training's rmse: 0.000407025	training's RMSPE: 0.18814	valid_1's rmse: 0.000434305	valid_1's RMSPE: 0.202008
[800]	training's rmse: 0.000401955	training's RMSPE: 0.185796	valid_1's rmse: 0.000432355	valid_1's RMSPE: 0.201101
[900]	training



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472842	training's RMSPE: 0.219082	valid_1's rmse: 0.000488016	valid_1's RMSPE: 0.224846
[200]	training's rmse: 0.000452134	training's RMSPE: 0.209488	valid_1's rmse: 0.000469583	valid_1's RMSPE: 0.216354
[300]	training's rmse: 0.000438077	training's RMSPE: 0.202974	valid_1's rmse: 0.000458693	valid_1's RMSPE: 0.211336
[400]	training's rmse: 0.000427546	training's RMSPE: 0.198095	valid_1's rmse: 0.00045069	valid_1's RMSPE: 0.207649
[500]	training's rmse: 0.000419032	training's RMSPE: 0.19415	valid_1's rmse: 0.000445894	valid_1's RMSPE: 0.205439
[600]	training's rmse: 0.000412001	training's RMSPE: 0.190893	valid_1's rmse: 0.000442128	valid_1's RMSPE: 0.203704
Early stopping, best iteration is:
[616]	training's rmse: 0.000410889	training's RMSPE: 0.190378	valid_1's rmse: 0.000441414	valid_1's RMSPE: 0.203375
Our out of folds RMSPE is 0.2005404468724854
