In [1]:
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [2]:
PATH = Path('../input/optiver-realized-volatility-prediction')


In [3]:
df_train = pd.read_csv(PATH/'train_with_ftrs.csv').drop('row_id', axis=1)

In [43]:
def rmspe_np(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

In [5]:
X = df_train.drop(['target'],axis=1)
y = df_train['target']

In [6]:
kf = KFold(n_splits=5, random_state=19901028, shuffle=True)
trn_idx, val_idx = first(kf.split(X))

# Random Forest baseline

In [8]:

X_train, y_train = X.loc[trn_idx], y[trn_idx]
X_valid, y_valid = X.loc[val_idx], y[val_idx]
weights = 1/np.square(y_train)
    
model = RandomForestRegressor(min_samples_leaf=60,max_features='log2', n_jobs=-1)
model.fit(X_train, y_train, weights)

RandomForestRegressor(max_features='log2', min_samples_leaf=60, n_jobs=-1)

In [44]:
y_pred = model.predict(X_valid)
rmspe_np(y_true = y_valid, y_pred = y_pred)

0.23402246526135287

# Neural Net

In [63]:
df_train.stock_id = df_train.stock_id.astype('category')

In [92]:
df_train = df_train.drop('stock_id_target_enc', axis=1)

In [93]:
cont_nn,cat_nn = cont_cat_split(df_train, max_card=9000, dep_var='target')

In [94]:
procs_nn = [Categorify, FillMissing, Normalize]
to_nn = TabularPandas(df_train, procs_nn, cat_nn, cont_nn,
                      splits=[list(trn_idx), list(val_idx)], y_names='target')

In [95]:
dls = to_nn.dataloaders(2048)

In [96]:
def rmspe(preds, targs):
    x = (targs-preds)/targs
    return (x**2).mean().sqrt()

In [97]:
def rmspe_loss(preds, targs):
    x = (targs-preds)/targs
    return (x**2).mean().sqrt()

In [98]:
learn = tabular_learner(dls, y_range=(0,.1), layers=[400,200,100],
                        n_out=1, loss_func=rmspe_loss, metrics=AccumMetric(rmspe), wd=.1)

In [99]:
learn.fit_one_cycle(100, 5e-3)

epoch,train_loss,valid_loss,rmspe,time
0,7.489787,6.286733,6.329895,00:02
1,4.499174,3.556994,3.607786,00:02
2,2.614301,1.943685,1.997542,00:01
3,1.136932,0.659845,0.677282,00:01
4,0.549261,1.113331,1.159381,00:02
5,0.33404,0.261252,0.265284,00:02
6,0.259264,0.249437,0.254648,00:01
7,0.250774,0.258611,0.265679,00:01
8,0.290323,0.257747,0.260584,00:02
9,0.254938,0.244949,0.248769,00:01


In [100]:
preds, tars = learn.get_preds()

In [101]:
preds.shape

torch.Size([85787, 1])

In [102]:
preds_nn = preds.view(-1).numpy()

In [103]:
rmspe_np(y_true = y_valid, y_pred = preds_nn)

0.22675025973279012

In [104]:
preds_ens = (y_pred + preds_nn)/2

In [105]:
rmspe_np(y_true = y_valid, y_pred = preds_ens)

0.22713026508344736

In [106]:
learn.save('tabular226.pth')

Path('models/tabular226.pth.pth')