In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GroupKFold
import lightgbm as lgb
from optiver_features import *

In [3]:
train_df = pd.read_csv('train_ffill_feat.csv')
#train_df = generate_train_df(True, True)

#train_df.to_csv('train_ffill_feat.csv', index=False)

In [4]:
time_ids = train_df.time_id.unique()

np.random.shuffle(time_ids)

splt = int(len(time_ids)*.6)
t_ids, v_ids = time_ids[:splt], time_ids[splt:]

test_df = train_df[train_df.time_id.isin( v_ids)]
train_df = train_df[train_df.time_id.isin( t_ids)]

In [5]:
len(train_df), len(test_df)

(257359, 171573)

In [6]:
real_test_targets = test_df.target.to_numpy()
test_df = test_df.drop('target',axis=1)

In [60]:
emb_sizes = {'stock_id':10, 'time_id':10}
lin_sizes = [100, 50, 20]

ps=0#[.2,.1,0]

## Generate predictions for pseudo labels

In [8]:
def rmspe_np(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe_np(y_true, y_pred), False
def train_models(train):
    # Hyperparammeters (optimized)
    seed = 29
    params = {
        'learning_rate': 0.1,        
        'lambda_l1': 2,
        'lambda_l2': 7,
        'num_leaves': 800,
        'min_sum_hessian_in_leaf': 20,
        'feature_fraction': 0.8,
        'feature_fraction_bynode': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 42,
        'min_data_in_leaf': 700,
        'max_depth': 4,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }   
    
    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    models =[]
    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create a KFold object
    kfold = GroupKFold()
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x, groups = train.time_id)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        model = lgb.train(params = params, 
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, val_dataset], 
                          num_boost_round = 3000, 
                          early_stopping_rounds = 25, 
                          verbose_eval = 100,
                          feval = feval_rmspe)
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)
        # Predict the test set
        #test_predictions += model.predict(x_test) / 10
        
    rmspe_score = rmspe_np(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return models

In [9]:
models = train_models(train_df)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.00046983	training's RMSPE: 0.216064	valid_1's rmse: 0.000484415	valid_1's RMSPE: 0.226707
Early stopping, best iteration is:
[141]	training's rmse: 0.000459417	training's RMSPE: 0.211276	valid_1's rmse: 0.000482476	valid_1's RMSPE: 0.225799
Training fold 2




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000462288	training's RMSPE: 0.215112	valid_1's rmse: 0.000512425	valid_1's RMSPE: 0.228527
Early stopping, best iteration is:
[120]	training's rmse: 0.000455945	training's RMSPE: 0.21216	valid_1's rmse: 0.00051163	valid_1's RMSPE: 0.228172
Training fold 3




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000468826	training's RMSPE: 0.214428	valid_1's rmse: 0.000487063	valid_1's RMSPE: 0.232669
Early stopping, best iteration is:
[132]	training's rmse: 0.000460842	training's RMSPE: 0.210776	valid_1's rmse: 0.000486162	valid_1's RMSPE: 0.232239
Training fold 4




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000468934	training's RMSPE: 0.216469	valid_1's rmse: 0.000514907	valid_1's RMSPE: 0.237413
Early stopping, best iteration is:
[150]	training's rmse: 0.000456775	training's RMSPE: 0.210857	valid_1's rmse: 0.00051316	valid_1's RMSPE: 0.236608
Training fold 5




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000463462	training's RMSPE: 0.214735	valid_1's rmse: 0.000509313	valid_1's RMSPE: 0.231309
[200]	training's rmse: 0.000440312	training's RMSPE: 0.20401	valid_1's rmse: 0.000506524	valid_1's RMSPE: 0.230042
Early stopping, best iteration is:
[234]	training's rmse: 0.000435972	training's RMSPE: 0.201999	valid_1's rmse: 0.000505486	valid_1's RMSPE: 0.22957
Our out of folds RMSPE is 0.23050988799649366


In [10]:
def pred_lgb(test_df, models):
    test_df = test_df.drop(['row_id', 'time_id'], axis=1)
    res = np.zeros(len(test_df))
    for model in models:
        preds = model.predict(test_df)
        res += preds / 5
    return res

In [11]:
pseudo = pred_lgb(test_df, models)

In [12]:
baseline = rmspe_np(real_test_targets, pseudo)
baseline

0.22569155599349788

## Train embeddings

In [13]:
train_pseudo = generate_train_df(False, False)

In [20]:
train_pseudo.loc[train_pseudo.time_id.isin( v_ids), 'target'] = pseudo

In [349]:
# test_with_pseudo = test_df.copy()
# test_with_pseudo['target'] = mp#pseudo
# train_pseudo = pd.concat([train_df, test_with_pseudo])

In [24]:
train_pseudo = train_pseudo.drop(['row_id'], axis=1)

In [25]:
cont_nn,cat_nn = cont_cat_split(train_pseudo, max_card=9000, dep_var='target')
cat_nn

['stock_id', 'time_id']

In [26]:
categorify = Categorify()
procs_nn = [categorify, FillMissing, Normalize]
splits = RandomSplitter()(train_pseudo)
to_nn = TabularPandas(train_pseudo, procs_nn, cat_nn, cont_nn,
                      splits=splits, y_names='target')

dls = to_nn.dataloaders(1024)

In [27]:
def rmspe(preds, targs):
    x = (targs-preds)/targs
    return (x**2).mean().sqrt()

config={'lin_first':True, 'ps':ps, 'embed_p':0.5, }
learn = tabular_learner(dls, y_range=(0,.1), layers=lin_sizes, 
                        emb_szs=emb_sizes, 
                        n_out=1, loss_func = rmspe, metrics=AccumMetric(rmspe), config=config,wd=0)

In [28]:
learn.fit_one_cycle(8, 5e-3)

epoch,train_loss,valid_loss,rmspe,time
0,2.776543,1.875637,1.882712,00:10
1,0.34672,0.2706,0.432115,00:05
2,0.231727,0.203746,0.20394,00:05
3,0.207089,0.28613,0.443295,00:05
4,0.179497,0.160414,0.160662,00:05
5,0.169046,0.154829,0.155008,00:05
6,0.164082,0.154424,0.154609,00:05
7,0.164113,0.153509,0.153691,00:05


In [29]:
torch.save(learn.model.embeds[0].weight.data, 'stock_embed.pt')
torch.save(learn.model.embeds[1].weight.data, 'time_embed.pt')

In [30]:
test_dl = dls.test_dl(test_df)

In [31]:
test_preds,_ = learn.get_preds(dl = test_dl)

In [32]:
rmspe(test_preds.view(-1), torch.tensor(real_test_targets))

tensor(0.2261, dtype=torch.float64)

In [33]:
rmspe(test_preds.view(-1), torch.tensor(pseudo))

tensor(0.0530, dtype=torch.float64)

## Train with pretrained embedding

In [34]:
class MyCategorify(Categorify):
    def setups(self, to):
        pass
categorify2 = MyCategorify()
categorify2.classes = categorify.classes

In [71]:
def train_fold(splits):
    #train_df = train_pseudo.loc[train_pseudo.time_id.isin( t_ids)]
    
    cont_nn,cat_nn = cont_cat_split(train_df, max_card=9000, dep_var='target')
    cat_nn.remove('row_id')
    procs_nn = [categorify2, FillMissing, Normalize]
    
    to_nn = TabularPandas(train_df, procs_nn, cat_nn, cont_nn,
                      splits=[list(splits[0]), list(splits[1])], y_names='target')

    dls = to_nn.dataloaders(1024)
    config={'lin_first':True, 'ps':ps, 'embed_p':0.5, }
    learn = tabular_learner(dls, y_range=(0,.1), layers=lin_sizes,
                        emb_szs=emb_sizes, 
                        n_out=1, loss_func = rmspe, metrics=AccumMetric(rmspe), config=config,wd=0)
    learn.model.embeds[0].weight.data[:,:]=torch.load( 'stock_embed.pt')
    learn.model.embeds[1].weight.data[:,:]=torch.load( 'time_embed.pt')
    learn.model.embeds[0].requires_grad_(False)
    learn.model.embeds[1].requires_grad_(False)
    learn.fit_one_cycle(20, 5e-3)
    test_dl = dls.test_dl(test_df)
    test_preds,_ = learn.get_preds(dl = test_dl)
    print(rmspe(test_preds.view(-1), torch.tensor(real_test_targets)))
    return test_preds

In [72]:
kfold = GroupKFold(n_splits = 5)
preds=[]
for split in kfold.split(train_df, groups=train_df.time_id):
    preds.append(train_fold(split))

epoch,train_loss,valid_loss,rmspe,time
0,7.260633,4.965076,5.054761,00:03
1,2.265727,1.333321,1.367008,00:03
2,0.467597,0.303635,0.336972,00:03
3,0.235599,0.214093,0.218488,00:03
4,0.219467,0.20625,0.210479,00:03
5,0.212929,0.205836,0.208615,00:03
6,0.211921,0.201236,0.203816,00:03
7,0.206616,0.193946,0.198114,00:03
8,0.228528,0.20528,0.209597,00:03
9,0.203995,0.191156,0.193691,00:03


tensor(0.2209, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,7.487132,5.413876,5.504634,00:03
1,2.136081,1.865705,1.877349,00:03
2,0.457945,0.281688,0.284188,00:03
3,0.248202,0.2136,0.220654,00:03
4,0.218973,0.201338,0.206023,00:03
5,0.215233,0.238621,0.319671,00:03
6,0.231235,0.205885,0.214453,00:03
7,0.219331,0.206491,0.21518,00:03
8,0.219069,0.201514,0.204642,00:03
9,0.215311,0.197294,0.200083,00:03


tensor(0.2254, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,7.146356,4.909788,4.997787,00:03
1,1.827811,1.009338,1.015407,00:03
2,0.515664,0.403151,0.408204,00:03
3,0.255362,0.255071,0.259369,00:03
4,0.228149,0.218264,0.223299,00:03
5,0.215151,0.202176,0.206876,00:03
6,0.203988,0.20536,0.209063,00:03
7,0.203365,0.20216,0.205379,00:03
8,0.203264,0.238867,0.420861,00:03
9,0.211262,0.19454,0.198197,00:03


tensor(0.3447, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,7.356498,5.051651,5.11518,00:03
1,1.946977,0.875455,0.897781,00:03
2,0.863812,0.503872,0.52849,00:03
3,0.334953,0.242866,0.249061,00:03
4,0.230773,0.218774,0.224861,00:03
5,0.279226,0.240502,0.24815,00:03
6,0.216067,0.212397,0.21727,00:03
7,0.205242,0.200619,0.208466,00:03
8,0.208118,0.189923,0.192089,00:03
9,0.257397,0.200944,0.210818,00:03


tensor(0.2222, dtype=torch.float64)


epoch,train_loss,valid_loss,rmspe,time
0,6.531056,4.574511,4.630472,00:03
1,1.577737,0.78498,0.80403,00:03
2,0.661034,0.417438,0.431712,00:03
3,0.240932,0.222068,0.228761,00:03
4,0.213508,0.208399,0.211297,00:03
5,0.279593,0.289253,0.299979,00:03
6,0.22343,0.20263,0.211454,00:03
7,0.211756,0.25177,0.258401,00:03
8,0.204665,0.216661,0.21956,00:03
9,0.20257,0.191898,0.195227,00:03


tensor(0.2214, dtype=torch.float64)


In [73]:
mp=torch.median(torch.cat(preds, dim=1), dim=1)[0]
#mp=torch.mean(torch.cat(preds, dim=1), dim=1)

In [74]:
score =rmspe(mp, torch.tensor(real_test_targets))
score

tensor(0.2211, dtype=torch.float64)

In [75]:
score, baseline, 100* (baseline-score)/baseline

(tensor(0.2211, dtype=torch.float64),
 0.22569155599349788,
 tensor(2.0509, dtype=torch.float64))

In [76]:
rmspe(mp, torch.tensor(pseudo))

tensor(0.0495, dtype=torch.float64)