In [22]:
from fastai.vision.all import *

from tqdm.notebook import  tqdm

# Root directory of the competition dataset.
PATH = Path('../input/optiver-realized-volatility-prediction')
# Directory whose entries are the per-stock order-book files listed with
# `data_dir.ls()` further down. It was previously used without ever being
# defined, which fails on a fresh kernel.
# NOTE(review): `book_train.parquet` is assumed from the `stock_id=<n>` naming
# parsed in load_data and the bid/ask columns it selects -- confirm.
data_dir = PATH/'book_train.parquet'

In [2]:
def ffill(data_df, n_seconds=600):
    """Expand every ``time_id`` bucket to one row per second, forward-filling gaps.

    Args:
        data_df: frame with ``time_id`` and ``seconds_in_bucket`` columns; each
            (time_id, seconds_in_bucket) pair must be unique.
        n_seconds: number of rows per bucket; seconds ``0 .. n_seconds - 1``
            are generated. Defaults to 600, the previously hard-coded value.

    Returns:
        A frame with ``time_id`` and ``seconds_in_bucket`` back as columns,
        sorted by ``time_id`` then second, holding exactly ``n_seconds`` rows
        per distinct ``time_id``.

    Filling is done per bucket: seconds that precede a bucket's first
    observation stay NaN instead of inheriting the last row of the previous
    ``time_id`` (which is what a plain ``reindex(method='ffill')`` over the
    flattened MultiIndex does). NaN values already present inside a bucket are
    forward-filled as well.
    """
    keys = ['time_id', 'seconds_in_bucket']
    indexed = data_df.set_index(keys)
    dense_index = pd.MultiIndex.from_product(
        [indexed.index.levels[0], np.arange(n_seconds)], names=keys)
    # Reindex without a fill method, then fill within each time_id so values
    # can never cross a bucket boundary.
    dense = indexed.reindex(dense_index).groupby(level='time_id').ffill()
    # Reindexing introduces NaN and upcasts integer columns; cast back every
    # column that ended up fully populated so dtypes match the input.
    restorable = {col: dtype for col, dtype in indexed.dtypes.items()
                  if dense[col].notna().all()}
    if restorable:
        dense = dense.astype(restorable)
    return dense.reset_index()

In [65]:
def fix_offsets(data_df):
    """Rebase ``seconds_in_bucket`` so every ``time_id`` starts at second 0.

    Args:
        data_df: frame with ``time_id`` and ``seconds_in_bucket`` columns.

    Returns:
        A new frame (the input is not modified) in which each row's
        ``seconds_in_bucket`` has had its bucket's minimum subtracted, plus an
        ``offset`` column holding that minimum.

    The function can be applied to its own output: an existing ``offset``
    column is overwritten (with 0 for already-rebased data) rather than
    causing a column-overlap error in a join.
    """
    # transform('min') broadcasts each bucket's minimum back onto its rows,
    # aligned on the original index, so no join is needed.
    offset = data_df.groupby('time_id')['seconds_in_bucket'].transform('min')
    return data_df.assign(
        offset=offset,
        seconds_in_bucket=data_df['seconds_in_bucket'] - offset,
    )

In [66]:
# Read the first entry listed under data_dir into a frame for interactive inspection.
first_book_path = first(data_dir.ls())
data_df = pd.read_parquet(first_book_path)


In [67]:
# Show the frame that was just read (rendered as the cell output).
data_df

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.000094,1.000281,0.999906,1.000468,300,900,815,917
1,5,1,1.000094,1.000281,0.999906,1.000468,530,402,815,800
2,5,2,1.000094,1.000281,0.999906,1.000468,430,601,815,1075
3,5,3,1.000094,1.000281,0.999906,1.000468,530,302,815,1075
4,5,4,1.000094,1.000281,0.999906,1.000468,530,400,815,901
...,...,...,...,...,...,...,...,...,...,...
1891259,32767,587,1.001343,1.001522,1.001164,1.001701,1270,1101,970,800
1891260,32767,589,1.001343,1.001522,1.001164,1.001701,900,1001,970,900
1891261,32767,590,1.001343,1.001522,1.001164,1.001701,900,1001,970,1000
1891262,32767,593,1.001343,1.001522,1.001164,1.001701,900,1101,970,1000


In [68]:
# Display the rebased copy returned by fix_offsets; the result is not assigned,
# so `data_df` itself keeps its current contents.
fix_offsets(data_df)

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,offset
0,5,0,1.000094,1.000281,0.999906,1.000468,300,900,815,917,0
1,5,1,1.000094,1.000281,0.999906,1.000468,530,402,815,800,0
2,5,2,1.000094,1.000281,0.999906,1.000468,430,601,815,1075,0
3,5,3,1.000094,1.000281,0.999906,1.000468,530,302,815,1075,0
4,5,4,1.000094,1.000281,0.999906,1.000468,530,400,815,901,0
...,...,...,...,...,...,...,...,...,...,...,...
1891259,32767,587,1.001343,1.001522,1.001164,1.001701,1270,1101,970,800,0
1891260,32767,589,1.001343,1.001522,1.001164,1.001701,900,1001,970,900,0
1891261,32767,590,1.001343,1.001522,1.001164,1.001701,900,1001,970,1000,0
1891262,32767,593,1.001343,1.001522,1.001164,1.001701,900,1101,970,1000,0


In [51]:
# Scratch check: overwrite seconds_in_bucket of the row labelled 0 with 5, in
# place, so that row's time_id no longer has its smallest second on that row.
# NOTE(review): this mutates data_df; re-read the parquet file to undo it.
data_df.at[0, 'seconds_in_bucket'] = 5


In [52]:
# Smallest seconds_in_bucket per time_id. The column is renamed to `offset`
# because the following join would otherwise collide with data_df's own
# seconds_in_bucket column, and `data_df.offset` would not exist afterwards.
offsets = (
    data_df.groupby(['time_id'])
    .agg({'seconds_in_bucket': 'min'})
    .rename(columns={'seconds_in_bucket': 'offset'})
)

In [59]:
# Attach the per-time_id columns of `offsets` to every row, then subtract the
# offset column from seconds_in_bucket.
data_df = data_df.join(offsets, on='time_id')
data_df['seconds_in_bucket'] = data_df['seconds_in_bucket'] - data_df['offset']

In [60]:
# Show data_df after the join and subtraction above (rendered as the cell output).
data_df

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,offset
0,5,4,1.000094,1.000281,0.999906,1.000468,300,900,815,917,1
1,5,0,1.000094,1.000281,0.999906,1.000468,530,402,815,800,1
2,5,1,1.000094,1.000281,0.999906,1.000468,430,601,815,1075,1
3,5,2,1.000094,1.000281,0.999906,1.000468,530,302,815,1075,1
4,5,3,1.000094,1.000281,0.999906,1.000468,530,400,815,901,1
...,...,...,...,...,...,...,...,...,...,...,...
1891259,32767,587,1.001343,1.001522,1.001164,1.001701,1270,1101,970,800,0
1891260,32767,589,1.001343,1.001522,1.001164,1.001701,900,1001,970,900,0
1891261,32767,590,1.001343,1.001522,1.001164,1.001701,900,1001,970,1000,0
1891262,32767,593,1.001343,1.001522,1.001164,1.001701,900,1101,970,1000,0


In [36]:
# Per-feature normalisation constants. load_data subtracts `means` from and
# divides by `stds` over its eight selected columns, so the positions here
# follow that column order (noted to the right of each row).
means = tensor([
    0.9997, 1.0003, 769.9902, 766.7346,  # bid_price1, ask_price1, bid_size1, ask_size1
    0.9995, 1.0005, 959.3417, 928.2203,  # bid_price2, ask_price2, bid_size2, ask_size2
])
stds = tensor([
    3.6881e-03, 3.6871e-03, 5.3541e+03, 4.9549e+03,  # bid_price1, ask_price1, bid_size1, ask_size1
    3.7009e-03, 3.6991e-03, 6.6838e+03, 5.7353e+03,  # bid_price2, ask_price2, bid_size2, ask_size2
])



In [17]:
def load_data(fname, min_time_id=26000):
    """Load one order-book file and turn it into normalised per-second features.

    Args:
        fname: path readable by ``pd.read_parquet``; its string form must
            contain ``=`` and the text after the first ``=`` is used as the
            stock id (presumably a ``stock_id=<n>`` partition -- confirm).
        min_time_id: only rows with ``time_id`` strictly greater than this are
            kept. Defaults to 26000, the previously hard-coded cutoff.

    Returns:
        ``(features, row_ids)``: ``features`` is a float32 tensor with 8
        columns (one row per second per bucket, as produced by ``ffill``),
        normalised with the module-level ``means`` / ``stds``. ``row_ids`` is
        a list of ``'<stock_id>-<time_id>'`` strings, one per bucket, in the
        same order as the buckets appear in ``features``.
    """
    book = pd.read_parquet(fname)
    book = book[book.time_id > min_time_id]
    stock_id = str(fname).split('=')[1]
    dense = ffill(book)
    # Read the ids from the densified frame, not the raw one: ffill returns
    # buckets sorted by time_id, so ids taken in raw appearance order could be
    # paired with the wrong bucket's predictions.
    row_ids = [f'{stock_id}-{time_id}' for time_id in dense['time_id'].unique()]
    feature_cols = ['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1',
                    'bid_price2', 'ask_price2', 'bid_size2', 'ask_size2']
    features = torch.tensor(dense[feature_cols].to_numpy().astype('float32'))
    # means / stds are ordered like feature_cols, so this broadcasts per column.
    return (features - means) / stds, row_ids

In [6]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    pct_error = (y_true - y_pred) / y_true
    mean_sq = np.mean(np.square(pct_error))
    return np.sqrt(mean_sq)

In [7]:
def get_preds(data, model, bs=512):
    """Run ``model`` over ``data`` in mini-batches and return the stacked outputs.

    Args:
        data: tensor whose elements can be arranged as ``(-1, 1, 600, 8)``,
            i.e. 600 rows of 8 features per sample.
        model: the ``nn.Module`` to evaluate.
        bs: batch size. Defaults to 512, the previously hard-coded value.

    Returns:
        The concatenation of the model outputs of all batches, on the model's
        device. For empty ``data`` an empty tensor is returned.

    The model is switched to eval mode for the duration of the call, so
    Dropout is disabled and BatchNorm uses its running statistics. It is put
    back into training mode afterwards if it was in that mode on entry.
    """
    # Follow the model's own device instead of assuming CUDA. A model without
    # parameters falls back to the previous behaviour (CUDA).
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cuda')

    # view/reshape with -1 is ambiguous for zero elements, and torch.cat([])
    # raises, so handle the empty case up front.
    if data.numel() == 0:
        return torch.empty(0, device=device)

    buckets = data.reshape(-1, 1, 600, 8)
    was_training = model.training
    model.eval()
    res = []
    try:
        with torch.no_grad():
            for idx in range(0, len(buckets), bs):
                # Slicing clamps at the end of the tensor on its own.
                res.append(model(buckets[idx:idx + bs].to(device)))
    finally:
        if was_training:
            model.train()
    return torch.cat(res)

In [8]:
class ResBlock(nn.Module):
    """Residual block: conv -> batch norm -> ReLU -> conv -> batch norm, plus an identity skip.

    Both convolutions use a (5, 1) kernel with replicate padding of (2, 0) and
    keep the channel count at ``ch``, so the output has the input's shape.
    """

    def __init__(self, ch):
        super().__init__()
        conv_args = dict(kernel_size=(5, 1), padding=(2, 0), padding_mode='replicate')
        self.layers = nn.Sequential(
            nn.Conv2d(ch, ch, **conv_args),
            nn.BatchNorm2d(ch),
            nn.ReLU(),
            nn.Conv2d(ch, ch, **conv_args),
            nn.BatchNorm2d(ch),
        )

    def forward(self, x):
        # Add the skip connection before the final non-linearity.
        return F.relu(self.layers(x) + x)

In [9]:
class ResnetRegression(nn.Module):
    """Residual CNN that regresses one value in (0, 0.1) per input sample.

    Args:
        chan: number of channels used throughout the convolutional stem.
        num_outputs: width of the linear layer that ends the stem and feeds
            the classifier. Defaults to ``6 * chan``, the input width the
            classifier was previously hard-coded to expect. The name used to
            be read from an undefined global, which made construction fail.
    """

    def __init__(self, chan, num_outputs=None):
        super().__init__()
        if num_outputs is None:
            num_outputs = 6 * chan
        # The (3, 8) kernel with no padding on the last axis collapses an
        # 8-wide input to width 1.
        layers = [
            nn.Conv2d(1, chan, kernel_size=(3, 8), padding=(1, 0)),
            nn.BatchNorm2d(chan),
            nn.ReLU(),
        ]
        # Eight stages of two residual blocks, each followed by a halving of
        # the first spatial axis.
        for _ in range(8):
            layers += [ResBlock(chan), ResBlock(chan), nn.AvgPool2d((2, 1))]
        # For the (1, 600, 8) samples built in get_preds, eight halvings leave
        # 2 rows (600 -> 300 -> ... -> 2), hence 2 * chan flattened features.
        layers += [Flatten(), nn.Dropout(), nn.Linear(2 * chan, num_outputs)]
        self.stem = nn.Sequential(*layers)
        # The classifier input width must match the stem output width.
        self.classifier = nn.Sequential(
            nn.Linear(num_outputs, 1),
            SigmoidRange(0, .1),
        )

    def forward(self, x):
        # Flatten the (batch, 1) classifier output to shape (batch,).
        return self.classifier(self.stem(x)).view(-1)

In [69]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints you trust. The loaded object is used as a
# module (.cuda() is called on it), so its classes, presumably ResBlock and
# ResnetRegression, must already be defined for unpickling to succeed.
# .eval() switches the model to inference behaviour (no dropout, BatchNorm
# running statistics) before it is used for prediction.
model = torch.load('res248.pth').cuda().eval()

In [14]:
# Targets keyed by '<stock_id>-<time_id>', the same row_id format load_data
# builds for predictions. The file is located via PATH rather than a second
# hard-coded copy of the dataset directory.
train = (
    pd.read_csv(PATH/'train.csv')
    .assign(row_id=lambda df: df['stock_id'].astype(str) + '-' + df['time_id'].astype(str))
    [['row_id', 'target']]
)

In [24]:
# Predict every file under data_dir and collect one (row_id, pred) frame per file.
all_preds = []
for fname in tqdm(data_dir.ls()):
    data, row_ids = load_data(fname)
    preds = get_preds(data, model).tolist()
    # zip() would silently drop the surplus if the two lengths disagreed,
    # hiding a misalignment between ids and predictions.
    if len(preds) != len(row_ids):
        raise ValueError(
            f'{fname}: got {len(preds)} predictions for {len(row_ids)} row ids')
    df_pred = pd.DataFrame({'row_id': row_ids, 'pred': preds})
    all_preds.append(df_pred)

  0%|          | 0/112 [00:00<?, ?it/s]

In [30]:
# Stack the per-file predictions, attach the targets by row_id (keeping every
# prediction row), and report the RMSPE rounded to three decimals.
df_pred = pd.concat(all_preds)
df_joined = pd.merge(train, df_pred, how='right', on=['row_id'])
score = rmspe(y_true=df_joined['target'], y_pred=df_joined['pred'])
RMSPE = round(score, 3)
print(RMSPE)