In [None]:
from fastai.vision.all import *
from tqdm.notebook import  tqdm

PATH = '../input/optiver-realized-volatility-prediction/'

# Awesome notebook previously at https://www.kaggle.com/slawekbiel/deep-learning-approach-with-a-cnn-inference

# Solution overview

### This notebook demonstrates an approach where a neural network is trained on the raw book data. I'm not adding any engineered features, so the network starts with no concept of prices, returns, volatility or logarithms - and still achieves a score comparable to other public notebooks at the time of writing.

### Each input sample is simply a 600x8 tensor representing the 8 numerical columns of the book data at each second of the 10 minute window.

## The model
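I'm using a convolutional neural network with an architecture inspired by ResNet: a total of 65 convolutional layers, followed by a single dense layer. (Note: the class definitions shown below build 33 convolutional layers - 1 stem convolution plus 8 stages x 2 residual blocks x 2 convolutions - and two dense layers, so the counts quoted here may describe a different version of the saved model.)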

With a small number of channels and 5x1 convolutions, it is still fairly lightweight and quick both to train and to run inference with.

In [None]:
class ResBlock(nn.Module):
    """Residual block: conv-BN-ReLU-conv-BN plus an identity shortcut.

    Both convolutions use a (5, 1) kernel with replicate padding of 2 along
    the first spatial axis and keep the channel count at ``ch``, so the block
    output has the same shape as its input and the input can be added to it
    directly.
    """

    def __init__(self, ch):
        super().__init__()

        def conv_bn():
            # One shape-preserving convolution followed by batch norm.
            return [
                nn.Conv2d(ch, ch, kernel_size=(5, 1), padding=(2, 0),
                          padding_mode='replicate'),
                nn.BatchNorm2d(ch),
            ]

        # Same module order (and therefore the same state_dict keys
        # ``layers.0`` .. ``layers.4``) as a literal five-item Sequential.
        self.layers = nn.Sequential(*conv_bn(), nn.ReLU(), *conv_bn())

    def forward(self, x):
        """Return ``relu(layers(x) + x)``."""
        return F.relu(self.layers(x) + x)

class ResnetRegression(nn.Module):
    """ResNet-style regressor over a single-channel 2-D input.

    Layout:
      * ``stem``: a (3, 8) convolution + BN + ReLU, then 8 stages of
        ``ResBlock, ResBlock, AvgPool2d((2, 1))``, then
        ``Flatten -> Dropout -> Linear(2 * chan, num_outputs)``.
        The ``2 * chan`` input width of that linear layer matches a
        (N, 1, 600, 8) input: the (3, 8) kernel collapses the width-8 axis to
        1 and eight halvings reduce 600 rows to 2.
      * ``classifier``: ``Linear(num_outputs, 1)`` followed by fastai's
        ``SigmoidRange(0, .1)``.

    Args:
        chan: number of channels used throughout the convolutional stem.
        num_outputs: width of the feature vector passed from ``stem`` to
            ``classifier``. Defaults to ``6 * chan``, which is the input
            width the classifier was originally hard-coded to expect.

    The attribute names ``stem`` and ``classifier`` are kept unchanged
    because ``forward`` is also used by model instances restored with
    ``torch.load``.
    """

    def __init__(self, chan, num_outputs=None):
        super().__init__()
        # `num_outputs` used to be an undefined name here (NameError on
        # construction). Default it to the only width compatible with the
        # original ``Linear(6 * chan, 1)`` classifier.
        if num_outputs is None:
            num_outputs = 6 * chan
        layers = [
            nn.Conv2d(1, chan, kernel_size=(3, 8), padding=(1, 0)),
            nn.BatchNorm2d(chan),
            nn.ReLU()
        ]
        for _ in range(8):
            layers += [ResBlock(chan), ResBlock(chan), nn.AvgPool2d((2, 1))]
        layers += [Flatten(), nn.Dropout(), nn.Linear(2 * chan, num_outputs)]
        self.stem = nn.Sequential(*layers)
        # Tie the classifier input width to the stem output width so the two
        # can never disagree.
        self.classifier = nn.Sequential(
            nn.Linear(num_outputs, 1),
            SigmoidRange(0, .1)
        )

    def forward(self, x):
        """Return one prediction per sample as a flat 1-D tensor."""
        return self.classifier(self.stem(x)).view(-1)

In [None]:
data_dir = PATH+'book_test.parquet'
model_file = '../input/resnetmodel/resnet_model.pth'
# NOTE(review): torch.load unpickles a whole module object here, which can
# execute arbitrary code - only load checkpoints from a trusted source.
# Newer PyTorch releases default to weights_only=True, which rejects
# full-module pickles; pass weights_only=False there if loading fails.
# map_location lets the checkpoint load on CPU-only machines as well.
model = torch.load(model_file, map_location='cuda' if torch.cuda.is_available() else 'cpu')
# Inference must run in eval mode: torch.no_grad() alone does not disable
# Dropout or switch BatchNorm to its running statistics.
model.eval()

### Stats from the train data used for normalization:

In [None]:
# Per-column normalisation statistics from the train data, listed in the
# column order that load_data selects before applying (data - means) / stds.
means = tensor([
    0.9997,      # bid_price1
    1.0003,      # ask_price1
    769.9902,    # bid_size1
    766.7346,    # ask_size1
    0.9995,      # bid_price2
    1.0005,      # ask_price2
    959.3417,    # bid_size2
    928.2203,    # ask_size2
])
stds = tensor([
    3.6881e-03,  # bid_price1
    3.6871e-03,  # ask_price1
    5.3541e+03,  # bid_size1
    4.9549e+03,  # ask_size1
    3.7009e-03,  # bid_price2
    3.6991e-03,  # ask_price2
    6.6838e+03,  # bid_size2
    5.7353e+03,  # ask_size2
])

### See the discussion [here](https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/251775)

In [None]:
def fix_offsets(data_df):
    """Shift ``seconds_in_bucket`` so every ``time_id`` starts at second 0.

    The smallest ``seconds_in_bucket`` of each ``time_id`` is attached as an
    ``offset`` column and subtracted from ``seconds_in_bucket``. A new frame
    is returned (the ``offset`` column is kept in it); the input frame is
    left untouched.
    """
    first_second = (
        data_df.groupby(['time_id'])['seconds_in_bucket']
        .min()
        .rename('offset')
    )
    shifted = data_df.join(first_second, on='time_id')
    shifted['seconds_in_bucket'] = shifted['seconds_in_bucket'] - shifted['offset']
    return shifted

### Explained [here](https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/251277)

In [None]:
def ffill(data_df):
    """Expand the frame to one row per (time_id, second) for seconds 0..599.

    The frame is re-indexed on the full product of its ``time_id`` values and
    ``np.arange(0, 600)``; seconds that have no row take the values of the
    closest preceding row (``method='ffill'``). The result is returned with
    ``time_id`` and ``seconds_in_bucket`` back as ordinary columns.
    """
    index_names = ['time_id', 'seconds_in_bucket']
    indexed = data_df.set_index(index_names)
    time_ids = indexed.index.levels[0]
    every_second = np.arange(0, 600)
    dense_index = pd.MultiIndex.from_product([time_ids, every_second], names=index_names)
    dense = indexed.reindex(dense_index, method='ffill')
    return dense.reset_index()

In [None]:
def load_data(fname):
    """Load one stock's book parquet and return normalised features and row ids.

    Args:
        fname: path of the form ``.../stock_id=<id>``; the stock id is taken
            from the text after the first ``=`` in the path.

    Returns:
        A ``(features, row_ids)`` pair. ``features`` is a float32 tensor with
        the 8 book columns, offset-fixed, forward-filled to 600 rows per
        ``time_id`` and normalised with the module-level ``means`` / ``stds``.
        ``row_ids`` is a list of ``"<stock_id>-<time_id>"`` strings.
    """
    book_columns = ['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1',
                    'bid_price2', 'ask_price2', 'bid_size2', 'ask_size2']

    book = pd.read_parquet(fname)
    stock_id = str(fname).split('=')[1]
    # NOTE(review): row ids follow the order in which time_ids appear in the
    # file, while ffill orders rows by its index levels - this presumably
    # relies on the parquet being sorted by time_id; confirm.
    row_ids = [f'{stock_id}-{time_id}' for time_id in book.time_id.unique()]

    dense_book = ffill(fix_offsets(book))
    features = torch.tensor(dense_book[book_columns].to_numpy().astype('float32'))
    return (features - means) / stds, row_ids

In [None]:
# Competition tables: training targets, test row listing, sample submission.
train, test, submi = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Store numpy files for further training

In [None]:
# for j in tqdm(train.stock_id.unique()):
#     fname=PATH + 'book_train.parquet/stock_id='+str(j)
#     data=pd.read_parquet(fname)
#     stock_id = str(fname).split('=')[1]
#     time_ids = data.time_id.unique()
#     row_ids = list(map(lambda x:f'{stock_id}-{x}', time_ids))
#     data = fix_offsets(data)
#     data = ffill(data)
#     data = data[['bid_price1', 'ask_price1', 'bid_size1', 'ask_size1','bid_price2', 'ask_price2', 'bid_size2', 'ask_size2']].to_numpy()
#     data = (data - np.array(means))/ np.array(stds)
#     data=data.reshape(-1, 20, 30,8)
#     indi=np.where(j in train.stock_id== True)
#     np.savez_compressed('stock_'+str(j), target=train.target[indi[0]], Images=data)
 

In [None]:
# Show 30 randomly chosen samples of stock 0 as small RGB images built from
# the first three feature channels.
Stock=np.load('../input/numpyfiles/stock_0.npz')
# Indexing an NpzFile reads and decompresses the member on every access, so
# pull the array out once instead of twice per loop iteration.
images = Stock['Images']
plt.figure(figsize=(20, 4))
for i in range(30):
    # Scalar draw: avoids int() on a 1-element array, which recent NumPy
    # versions deprecate.
    row = np.random.choice(images.shape[0])
    plt.subplot(3, 10, i+1)
    # Copy so the in-place rescaling below does not modify `images`.
    image = images[row, :, :, 0:3].copy()
    for k in range(3):
        # Rescale each channel to [0, 1] independently for display.
        image[:,:,k] = np.interp(image[:,:,k], (image[:,:,k].min(), image[:,:,k].max()), (0, 1))
    plt.imshow(image,cmap=plt.cm.binary)
    plt.axis('off')
plt.show()

In [None]:
def get_preds(data, model):
    """Run ``model`` on ``data`` in inference mode and return its output.

    Args:
        data: tensor that can be viewed as ``(-1, 1, 600, 8)``.
        model: ``torch.nn.Module`` to evaluate.

    Returns:
        Whatever ``model`` returns for the reshaped batch, computed without
        gradient tracking.

    The batch is moved to the device the model's parameters live on (instead
    of unconditionally to CUDA), so the function also works on CPU-only
    machines. The model is switched to eval mode for the forward pass -
    ``torch.no_grad()`` alone does not disable Dropout or make BatchNorm use
    its running statistics - and its previous train/eval mode is restored
    afterwards.
    """
    data = data.view(-1, 1, 600, 8)

    # Follow the model's device; fall back to the data's own device for a
    # parameter-free module.
    first_param = next(model.parameters(), None)
    device = data.device if first_param is None else first_param.device

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            preds = model(data.to(device))
    finally:
        model.train(was_training)

    return preds

In [None]:
%%time

# Predict every training stock from its pre-computed .npz file and keep one
# single-column ('target') frame per stock, in train.stock_id.unique() order.
all_preds = []
for stock_id in tqdm(train.stock_id.unique()):
    npz_path = f'../input/numpyfiles/stock_{stock_id}.npz'
    book = np.load(npz_path)['Images'].reshape((-1, 600, 8))
    batch = torch.tensor(book.astype('float32'))
    stock_preds = get_preds(batch, model)
    all_preds.append(pd.DataFrame(zip(stock_preds.tolist()), columns=['target']))

In [None]:
# Stack the per-stock prediction frames with a single concat; appending to a
# growing frame inside a loop copies it on every step (quadratic cost).
Predi = pd.concat(all_preds)

import matplotlib.pyplot as plt

yt=train.target.values
yh=Predi.target.values

# The comparison below pairs targets and predictions by position, so both
# must have the same number of rows.
assert len(yt) == len(yh), f'{len(yt)} targets vs {len(yh)} predictions'

plt.scatter(yt, yh, marker="o", s=0.1)
# Identity line: points on it are perfect predictions.
plt.plot(yt, yt, 'r')
plt.xlabel('target')
plt.ylabel('prediction')

# Root mean squared percentage error over the whole training set.
plt.title('RMSPE =' + str(np.round(np.sqrt(np.mean(((yt-yh)/yt)**2)),5)));

In [None]:
%%time
# Predict every stock of the test set straight from its book parquet and keep
# one (row_id, target) frame per stock.
all_preds = []

for stock_id in tqdm(test.stock_id.unique()):
    book_path = f'{PATH}book_test.parquet/stock_id={stock_id}'
    features, row_ids = load_data(book_path)
    stock_preds = get_preds(features, model)
    rows = zip(row_ids, stock_preds.tolist())
    all_preds.append(pd.DataFrame(rows, columns=['row_id', 'target']))

In [None]:
# Stack the per-stock frames row-wise and write them out as the submission
# file, without the pandas index column.
df_pred = pd.concat(all_preds, axis=0)
df_pred.to_csv('submission.csv', index=False)