In [1]:
# Reload edited modules (e.g. the local optiver_features) automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GroupKFold
import lightgbm as lgb
from optiver_features import *

In [3]:
# Load the cached feature table when it exists; otherwise rebuild it with
# generate_train_df (from optiver_features) and cache it for the next run.
FEATURES_CSV = Path('train_ffill_feat.csv')
if FEATURES_CSV.exists():
    train_df = pd.read_csv(FEATURES_CSV)
else:
    train_df = generate_train_df(True, True)
    train_df.to_csv(FEATURES_CSV, index=False)

In [4]:
# Hold out 40% of the time_ids as a stand-in "test" set. Splitting on time_id
# (rather than on rows) keeps all rows of one time_id on the same side.
SPLIT_SEED = 42   # fixed so the split, and every score computed from it, is reproducible
TRAIN_FRAC = .6

time_ids = train_df.time_id.unique()
np.random.default_rng(SPLIT_SEED).shuffle(time_ids)

splt = int(len(time_ids) * TRAIN_FRAC)
t_ids, v_ids = time_ids[:splt], time_ids[splt:]

# NOTE: train_df is rebound to a subset of itself, so this cell is not
# idempotent -- re-run the loading cell before executing it a second time.
test_df = train_df[train_df.time_id.isin(v_ids)]
train_df = train_df[train_df.time_id.isin(t_ids)]

In [5]:
# Row counts of the training and held-out partitions.
len(train_df), len(test_df)

(257355, 171577)

In [6]:
# Keep the true targets of the held-out rows aside for scoring, then drop the
# column so test_df no longer carries labels.
real_test_targets = test_df.target.to_numpy()
test_df = test_df.drop('target',axis=1)

In [220]:
# Settings shared by the tabular neural nets built below (passed to tabular_learner).
emb_sizes = {'stock_id':16, 'time_id':16}  # passed as emb_szs: embedding size per categorical column
lin_sizes = [100, 50, 20]  # passed as layers
ps=0  # passed as config['ps']; a per-layer alternative that was tried: [.2,.1,0]

## Generate predictions for pseudo labels

In [8]:
def rmspe_np(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Works on NumPy arrays (or array-likes that support element-wise arithmetic).
    No guard against zeros in `y_true`.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(relative_error * relative_error)
    return np.sqrt(mean_squared)
def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    `lgb_train` is the dataset being evaluated; its labels are read with
    `get_label()`. Returns the tuple ('RMSPE', score, False), where the final
    False marks the metric as lower-is-better.
    """
    score = rmspe_np(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False
def train_models(train, n_splits=5, seed=29):
    """Train one LightGBM regressor per GroupKFold fold and print the out-of-fold RMSPE.

    Folds are grouped by `time_id`, so rows sharing a time_id never end up on
    both sides of a train/validation split.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'row_id', 'time_id', 'stock_id' and 'target'. All columns
        except 'row_id', 'time_id' and 'target' are used as features.
    n_splits : int, default 5
        Number of GroupKFold folds (5 matches the scikit-learn default that
        was previously used implicitly).
    seed : int, default 29
        Value used for every LightGBM seed parameter.

    Returns
    -------
    list
        The trained LightGBM models, one per fold, in fold order.
    """
    # Hyperparameters (optimized)
    params = {
        'learning_rate': 0.1,
        'lambda_l1': 2,
        'lambda_l2': 7,
        'num_leaves': 800,
        'min_sum_hessian_in_leaf': 20,
        'feature_fraction': 0.8,
        'feature_fraction_bynode': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 42,
        'min_data_in_leaf': 700,
        'max_depth': 4,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }

    # Split features and target; row_id and time_id are identifiers, not features
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    # Make sure stock_id is an integer so it can be declared categorical below
    x['stock_id'] = x['stock_id'].astype(int)
    models = []
    # Out-of-fold predictions, filled fold by fold (positional, like the fold indices)
    oof_predictions = np.zeros(x.shape[0])
    kfold = GroupKFold(n_splits = n_splits)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x, groups = train.time_id)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Weighting the squared error by 1 / y^2 turns the 'rmse' objective
        # into a squared *percentage* error, i.e. it optimizes RMSPE.
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        # NOTE(review): early_stopping_rounds / verbose_eval were removed from
        # lgb.train in LightGBM 4.0; on newer versions pass
        # callbacks=[lgb.early_stopping(25), lgb.log_evaluation(100)] instead.
        model = lgb.train(params = params,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          num_boost_round = 3000,
                          early_stopping_rounds = 25,
                          verbose_eval = 100,
                          feval = feval_rmspe)
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)

    rmspe_score = rmspe_np(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return the per-fold models (the OOF score is only printed)
    return models

In [9]:
# Fit the per-fold LightGBM models on the labeled partition only.
models = train_models(train_df)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472622	training's RMSPE: 0.215912	valid_1's rmse: 0.000518295	valid_1's RMSPE: 0.242341
Early stopping, best iteration is:
[143]	training's rmse: 0.000463083	training's RMSPE: 0.211554	valid_1's rmse: 0.000516522	valid_1's RMSPE: 0.241512
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000471277	training's RMSPE: 0.215766	valid_1's rmse: 0.000506455	valid_1's RMSPE: 0.234824
Early stopping, best iteration is:
[122]	training's rmse: 0.000465585	training's RMSPE: 0.21316	valid_1's rmse: 0.000505427	valid_1's RMSPE: 0.234347
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000471822	training's RMSPE: 0.217686	valid_1's rmse: 0.000507901	valid_1's RMSPE: 0.228253
Early stopping, best iteration is:
[135]	training's rmse: 0.000462135	training's RMSPE: 0.213217	valid_1's rmse: 0.000506469	valid_1's RMSPE: 0.227609
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473019	training's RMSPE: 0.215606	valid_1's rmse: 0.000492199	valid_1's RMSPE: 0.232096
[200]	training's rmse: 0.000448813	training's RMSPE: 0.204573	valid_1's rmse: 0.00049159	valid_1's RMSPE: 0.231809
Early stopping, best iteration is:
[183]	training's rmse: 0.000451977	training's RMSPE: 0.206015	valid_1's rmse: 0.000490872	valid_1's RMSPE: 0.231471
Training fold 5




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000467503	training's RMSPE: 0.21651	valid_1's rmse: 0.000510799	valid_1's RMSPE: 0.22586
[200]	training's rmse: 0.000445351	training's RMSPE: 0.206251	valid_1's rmse: 0.000507547	valid_1's RMSPE: 0.224422
Early stopping, best iteration is:
[197]	training's rmse: 0.000446096	training's RMSPE: 0.206596	valid_1's rmse: 0.000507269	valid_1's RMSPE: 0.2243
Our out of folds RMSPE is 0.231917674478357


In [10]:
def pred_lgb(test_df, models):
    """Return the average prediction of `models` for every row of `test_df`.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Feature frame that still contains the 'row_id' and 'time_id' columns;
        both are dropped before predicting. The caller's frame is not modified.
    models : sequence
        Objects exposing `predict(features)`, e.g. the list returned by
        `train_models`.

    Returns
    -------
    numpy.ndarray
        One averaged prediction per row. An empty `models` sequence yields
        all zeros.
    """
    features = test_df.drop(['row_id', 'time_id'], axis=1)
    res = np.zeros(len(features))
    # Average over however many models were passed in, not a hard-coded 5.
    n_models = len(models)
    for model in models:
        res += model.predict(features) / n_models
    return res

In [11]:
# LightGBM predictions for the held-out rows; used below as pseudo labels.
pseudo = pred_lgb(test_df, models)

In [12]:
# RMSPE of the LightGBM ensemble on the held-out rows; the reference score
# the neural net is compared against at the end of the notebook.
baseline = rmspe_np(real_test_targets, pseudo)
baseline

0.2246709295068485

## Train embeddings

In [221]:
# Attach the LightGBM predictions to the held-out rows as their 'target' and
# stack them under the truly labeled rows.
test_with_pseudo = test_df.copy()
test_with_pseudo['target'] = pseudo
train_pseudo = pd.concat([train_df, test_with_pseudo])

In [222]:
# row_id is an identifier, not a feature.
train_pseudo = train_pseudo.drop(['row_id'], axis=1)

In [223]:
# Split the columns into continuous and categorical ones; with max_card=9000
# the categorical list comes out as stock_id and time_id (see output).
cont_nn,cat_nn = cont_cat_split(train_pseudo, max_card=9000, dep_var='target')
cat_nn

['stock_id', 'time_id']

In [224]:
# Keep the Categorify instance in a variable: its fitted `classes` are reused
# later so the pretrained embedding rows line up with the same category codes.
categorify = Categorify()
procs_nn = [categorify, FillMissing, Normalize]
# NOTE(review): RandomSplitter is called without a seed, so the validation
# rows differ between runs.
splits = RandomSplitter()(train_pseudo)
to_nn = TabularPandas(train_pseudo, procs_nn, cat_nn, cont_nn,
                      splits=splits, y_names='target')

dls = to_nn.dataloaders(1024)

In [225]:
def rmspe(preds, targs):
    """Root mean squared percentage error for torch tensors.

    Computes sqrt(mean(((targs - preds) / targs) ** 2)) and returns a 0-dim
    tensor, so it can serve both as loss function and as metric. `preds` and
    `targs` are combined element-wise (normal broadcasting applies).
    """
    relative_error = (targs - preds) / targs
    return relative_error.pow(2).mean().sqrt()

# Tabular net trained directly on RMSPE (same function as loss and metric),
# with weight decay disabled (wd=0).
config={'lin_first':True, 'ps':ps, 'embed_p':0.5, }
learn = tabular_learner(dls, y_range=(0,.1), layers=lin_sizes, 
                        emb_szs=emb_sizes, 
                        n_out=1, loss_func = rmspe, metrics=AccumMetric(rmspe), config=config,wd=0)

In [226]:
# 8 epochs of one-cycle training on labeled + pseudo-labeled rows.
learn.fit_one_cycle(8, 5e-3)

epoch,train_loss,valid_loss,rmspe,time
0,2.909177,1.180899,1.188217,00:05
1,0.20808,0.185592,0.187282,00:05
2,0.174738,0.212384,0.383939,00:05
3,0.179518,0.182434,0.202771,00:05
4,0.163956,0.157282,0.157489,00:05
5,0.159554,0.153312,0.153502,00:05
6,0.154794,0.150156,0.150343,00:05
7,0.153365,0.149699,0.149872,00:05


In [227]:
# Persist the learned embedding matrices; train_fold loads them back into
# embeds[0] / embeds[1] of a fresh model.
torch.save(learn.model.embeds[0].weight.data, 'stock_embed.pt')
torch.save(learn.model.embeds[1].weight.data, 'time_embed.pt')

In [228]:
# Run the held-out rows through the same preprocessing as the training data.
test_dl = dls.test_dl(test_df)

In [229]:
# Predictions of the embedding-pretraining model for the held-out rows.
test_preds,_ = learn.get_preds(dl = test_dl)

In [230]:
# Error against the real held-out targets (flattened so both tensors are 1-D).
rmspe(test_preds.view(-1), torch.tensor(real_test_targets))

tensor(0.6033, dtype=torch.float64)

In [231]:
# Error against the pseudo labels this model was trained on.
rmspe(test_preds.view(-1), torch.tensor(pseudo))

tensor(0.3903, dtype=torch.float64)

## Train with pretrained embedding

In [232]:
class MyCategorify(Categorify):
    """Categorify variant whose setup step does nothing.

    Because `setups` is a no-op, the instance does not fit anything from the
    data it is applied to; its `classes` attribute is assigned externally
    after construction.
    """
    def setups(self, to):
        """Intentionally skip fitting; `to` is ignored."""
        # Deliberately left without a type annotation on `to`.
        return None
# Reuse the category vocabulary fitted on train_pseudo, so category codes match
# the rows of the embedding matrices saved to stock_embed.pt / time_embed.pt.
categorify2 = MyCategorify()
categorify2.classes = categorify.classes

In [233]:
def train_fold(splits):
    """Train the tabular net on one CV fold with frozen, pretrained embeddings.

    `splits` is a (train_indices, valid_indices) pair of positional indices
    into the notebook-level `train_df`. The function also reads the globals
    `test_df`, `real_test_targets`, `categorify2`, `ps`, `lin_sizes` and
    `emb_sizes`, and the files 'stock_embed.pt' / 'time_embed.pt'.

    The embedding weights are loaded from disk into embeds[0] and embeds[1]
    and excluded from gradient updates, so only the remaining layers train.
    Prints the RMSPE on the held-out rows and returns the raw prediction
    tensor from `get_preds`.

    NOTE(review): the code requests 20 epochs, while the recorded output of
    the calling cell lists 10 epochs per fold -- the saved output appears to
    come from an earlier version of this function.
    """
    cont_cols, cat_cols = cont_cat_split(train_df, max_card=9000, dep_var='target')
    cat_cols.remove('row_id')

    fold_to = TabularPandas(train_df, [categorify2, FillMissing, Normalize],
                            cat_cols, cont_cols,
                            splits=[list(splits[0]), list(splits[1])],
                            y_names='target')
    fold_dls = fold_to.dataloaders(1024)

    model_config = {'lin_first':True, 'ps':ps, 'embed_p':0.5}
    fold_learn = tabular_learner(fold_dls, y_range=(0,.1), layers=lin_sizes,
                                 emb_szs=emb_sizes, n_out=1,
                                 loss_func=rmspe, metrics=AccumMetric(rmspe),
                                 config=model_config, wd=0)

    # Overwrite each embedding matrix with its pretrained weights, then freeze it.
    for emb_idx, weight_file in enumerate(('stock_embed.pt', 'time_embed.pt')):
        embedding = fold_learn.model.embeds[emb_idx]
        embedding.weight.data[:,:] = torch.load(weight_file)
        embedding.requires_grad_(False)

    fold_learn.fit_one_cycle(20, 5e-3)

    holdout_dl = fold_dls.test_dl(test_df)
    test_preds, _ = fold_learn.get_preds(dl=holdout_dl)
    print(rmspe(test_preds.view(-1), torch.tensor(real_test_targets)))
    return test_preds

In [234]:
# One model per fold, grouped by time_id; collect each fold's predictions
# for the held-out rows.
kfold = GroupKFold(n_splits = 5)
preds = [train_fold(split)
         for split in kfold.split(train_df, groups=train_df.time_id)]

epoch,train_loss,valid_loss,rmspe,time
0,9.347883,6.597923,6.664089,00:03
1,2.795744,1.554739,1.562973,00:03
2,1.061178,0.536698,0.565613,00:03
3,0.789233,0.341136,0.448057,00:03
4,0.24343,0.201926,0.204889,00:03
5,0.306893,0.233807,0.240153,00:03
6,0.238646,0.208667,0.216151,00:03
7,0.227316,0.204898,0.209271,00:03
8,0.218624,0.197215,0.201488,00:03
9,0.215175,0.199942,0.203095,00:03


tensor(0.2220, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,8.971078,6.318727,6.404065,00:03
1,2.865655,1.304353,1.338313,00:03
2,0.960066,0.893656,0.908531,00:03
3,0.260857,0.216662,0.220812,00:03
4,0.225708,0.207939,0.211902,00:03
5,0.216992,0.214709,0.218727,00:03
6,0.228172,0.252215,0.26216,00:03
7,0.215989,0.205325,0.20943,00:03
8,0.209702,0.202746,0.206781,00:03
9,0.203854,0.193048,0.196501,00:03


tensor(0.2218, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,9.60502,7.138633,7.198597,00:03
1,2.789817,1.30117,1.352451,00:03
2,1.007821,0.429325,0.443874,00:03
3,0.424229,0.346194,0.367595,00:03
4,0.233216,0.217438,0.233004,00:03
5,0.216692,0.206241,0.209007,00:03
6,0.21766,0.2011,0.204767,00:03
7,0.215639,0.202247,0.205034,00:03
8,0.213305,0.193577,0.198098,00:03
9,0.207601,0.196065,0.201284,00:03


tensor(0.2234, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,10.936513,7.99562,8.073772,00:03
1,3.174969,1.184907,1.248181,00:03
2,1.371408,0.737983,0.877382,00:03
3,0.481981,0.294406,0.29859,00:03
4,0.240185,0.213382,0.217829,00:03
5,0.227674,0.209022,0.212193,00:03
6,0.227555,0.215756,0.217975,00:03
7,0.217888,0.211231,0.213747,00:03
8,0.211594,0.196112,0.198886,00:03
9,0.223228,0.209739,0.221576,00:03


tensor(0.2234, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,9.731443,7.884107,7.931853,00:03
1,3.038719,1.615034,1.665258,00:03
2,0.509527,0.24875,0.251453,00:03
3,0.231211,0.205593,0.207968,00:03
4,0.208365,0.216445,0.224988,00:03
5,0.23082,0.215485,0.22333,00:03
6,0.211077,0.195879,0.197682,00:03
7,0.204396,0.191683,0.193619,00:03
8,0.201782,0.19333,0.195815,00:03
9,0.207735,0.190188,0.193866,00:03


tensor(0.2222, dtype=torch.float64)


In [235]:
# Put the per-fold prediction tensors side by side (dim 1) and average across folds.
mp =torch.cat(preds, dim=1).mean(dim=1)

In [236]:
# RMSPE of the fold-averaged predictions against the real held-out targets.
score =rmspe(mp, torch.tensor(real_test_targets))
score

tensor(0.2218, dtype=torch.float64)

In [237]:
# NN score, LightGBM baseline, and the relative improvement over the baseline in percent.
score, baseline, 100* (baseline-score)/baseline

(tensor(0.2218, dtype=torch.float64),
 0.2246709295068485,
 tensor(1.2798, dtype=torch.float64))

In [238]:
# How far the fold-averaged predictions are from the LightGBM pseudo labels.
rmspe(mp, torch.tensor(pseudo))

tensor(0.0467, dtype=torch.float64)