In [1]:
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [2]:
# Directory the training CSV is read from.
PATH = Path('../input/optiver-realized-volatility-prediction')


In [3]:
# Read the training CSV and discard its `row_id` column.
df_train = pd.read_csv(PATH/'train_with_ftrs.csv').drop(columns='row_id')

In [4]:
def rmspe_np(y_true, y_pred):
    """Root mean squared percentage error of `y_pred` relative to `y_true`.

    The per-element error is ``(y_true - y_pred) / y_true``; the result is the
    square root of the mean of its square. Zeros in `y_true` are not guarded
    against.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

In [5]:
# Target column, and the feature matrix made of every other column.
y = df_train['target']
X = df_train.drop(columns=['target'])

In [6]:
# Only the first of the five shuffled folds is used as the train/validation split.
kf = KFold(n_splits=5, shuffle=True, random_state=19901028)
trn_idx, val_idx = next(kf.split(X))

# Random Forest baseline

In [7]:

# KFold.split yields positional row numbers, so rows are selected with .iloc;
# .loc would look them up by index label, which only coincides with position
# while the frame keeps its default RangeIndex.
X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

# Weighting each row by 1/target^2 turns the squared error (y - y_hat)^2 into
# the squared relative error ((y - y_hat)/y)^2 that RMSPE is built on.
weights = 1/np.square(y_train)

# NOTE(review): no random_state is passed, so the fitted forest differs between runs.
model = RandomForestRegressor(min_samples_leaf=60,max_features='log2', n_jobs=-1)
model.fit(X_train, y_train, sample_weight=weights)

RandomForestRegressor(max_features='log2', min_samples_leaf=60, n_jobs=-1)

In [8]:
# Score the random forest on the validation fold.
y_pred = model.predict(X_valid)
rmspe_np(y_valid, y_pred)

0.2340761102454038

# Neural Net

In [9]:
# Cast stock_id to pandas' category dtype.
df_train['stock_id'] = df_train['stock_id'].astype('category')

In [10]:
# Remove the stock_id_target_enc column from the frame used for the neural net.
df_train = df_train.drop(columns='stock_id_target_enc')

In [11]:
# Split the non-target columns into continuous and categorical name lists.
cont_nn, cat_nn = cont_cat_split(df_train, dep_var='target', max_card=9000)

In [12]:
# Preprocessing steps handed to TabularPandas, in this order.
procs_nn = [Categorify, FillMissing, Normalize]

# Same train/validation row split as the random-forest baseline.
to_nn = TabularPandas(
    df_train,
    procs=procs_nn,
    cat_names=cat_nn,
    cont_names=cont_nn,
    y_names='target',
    splits=[list(trn_idx), list(val_idx)],
)

In [13]:
# Build the dataloaders with a batch size of 2048.
dls = to_nn.dataloaders(bs=2048)

In [14]:
def rmspe(preds, targs):
    """Root mean squared percentage error between predictions and targets.

    Both tensors are flattened first, so a column of predictions (shape
    ``(n, 1)``) paired with a flat target vector (shape ``(n,)``) is compared
    element by element instead of being broadcast to an ``(n, n)`` grid,
    which would silently produce a wrong score.

    Args:
        preds: tensor of predictions.
        targs: tensor of targets with the same number of elements as `preds`.
            Zeros in `targs` are not guarded against.

    Returns:
        Scalar tensor ``sqrt(mean(((targs - preds) / targs) ** 2))``.
    """
    preds, targs = preds.reshape(-1), targs.reshape(-1)
    x = (targs-preds)/targs
    return (x**2).mean().sqrt()

In [15]:
def rmspe_loss(preds, targs):
    """RMSPE training loss: ``sqrt(mean(((targs - preds) / targs) ** 2))``.

    Both tensors are flattened before the error is computed, so an ``(n, 1)``
    prediction column and an ``(n,)`` target vector are matched element by
    element rather than broadcast to ``(n, n)``, which would silently
    optimise the wrong quantity.

    Args:
        preds: tensor of model outputs.
        targs: tensor of targets with the same number of elements as `preds`.
            Zeros in `targs` are not guarded against.

    Returns:
        Scalar tensor holding the loss; it stays differentiable with respect
        to `preds` because only reshape and arithmetic ops are applied.
    """
    preds, targs = preds.reshape(-1), targs.reshape(-1)
    x = (targs-preds)/targs
    return (x**2).mean().sqrt()

In [16]:
# Tabular learner trained directly on the RMSPE loss, with RMSPE also reported as the metric.
learn = tabular_learner(
    dls,
    layers=[400,200,100],
    n_out=1,
    y_range=(0,.1),
    loss_func=rmspe_loss,
    metrics=AccumMetric(rmspe),
    wd=.1,
)

In [17]:
# NOTE(review): 100 epochs are requested here, but the log recorded below stops
# after epoch 9 — confirm which of the two is current.
learn.fit_one_cycle(n_epoch=100, lr_max=5e-3)

epoch,train_loss,valid_loss,rmspe,time
0,7.372415,6.275041,6.310737,00:05
1,4.524455,3.659428,3.70602,00:04
2,2.566679,1.865285,1.903317,00:04
3,1.134678,0.77121,0.798391,00:04
4,0.667999,0.705063,0.994441,00:04
5,0.382304,0.300373,0.306341,00:03
6,0.262088,0.249998,0.253496,00:03
7,0.253992,0.249048,0.251041,00:05
8,0.262106,0.261682,0.265814,00:02
9,0.25864,0.249595,0.253502,00:04


In [100]:
# Predictions and targets for dataset index 1 (get_preds' default, spelled out here).
preds, tars = learn.get_preds(ds_idx=1)

In [101]:
# Inspect the shape of the prediction tensor.
preds.size()

torch.Size([85787, 1])

In [102]:
# Flatten the (n, 1) prediction tensor into a 1-D NumPy array.
preds_nn = preds.reshape(-1).numpy()

In [103]:
# Score the neural net on the validation fold.
rmspe_np(y_valid, preds_nn)

0.22675025973279012

In [104]:
# Equal-weight average of the random-forest and neural-net predictions.
preds_ens = 0.5 * (y_pred + preds_nn)

In [105]:
# Score the averaged predictions on the validation fold.
rmspe_np(y_valid, preds_ens)

0.22713026508344736

In [106]:
# Learner.save appends '.pth' itself, so pass the bare name; passing
# 'tabular226.pth' produced models/tabular226.pth.pth.
learn.save('tabular226')

Path('models/tabular226.pth.pth')