In [None]:
import os
import gc
import json
import pickle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
import random
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

print(torch.__version__)

In [None]:
# Switch between the torch_xla (TPU) path and the regular CUDA/CPU path.
USE_TPU=False

if USE_TPU:
    # IPython shell escapes: download the PyTorch/XLA environment setup script and run it.
    !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
    !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev


    # torch_xla is imported only on this branch, after the setup script above has run.
    import torch_xla
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl
    import torch_xla.distributed.xla_multiprocessing as xmp   
    import torch_xla.debug.metrics as met

    device=xm.xla_device()
else:
    # Use CUDA when torch reports it as available, otherwise fall back to the CPU.
    device=torch.device( 'cuda' if torch.cuda.is_available() else 'cpu' )

# `device` is a notebook-level global read by later cells.
print(device)

In [None]:
def seed_everything(s):
    """Seed every random number generator the notebook uses.

    Seeds Python's ``random``, NumPy's global generator and torch (CPU and
    all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        s: integer seed applied to all generators.
    """
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)
    torch.cuda.manual_seed(s)
    torch.cuda.manual_seed_all(s)
    torch.backends.cudnn.deterministic=True
    # The cuDNN auto-tuner may pick different kernels from run to run; switch it
    # off explicitly so an earlier cell that enabled it cannot break determinism.
    torch.backends.cudnn.benchmark=False
    
seed_everything(10)

In [None]:
%%time
def load_numpy(filename, folder_path="../input/optiverdatasetmultitaskwitharrays"):
    """Load a ``.npy`` array from the dataset folder.

    Args:
        filename: file name inside ``folder_path`` (e.g. ``'bucket.npy'``).
        folder_path: directory holding the arrays; defaults to the input
            folder this notebook was written against.

    Returns:
        The array returned by ``np.load``.
    """
    data_path=os.path.join(folder_path, filename)
    return np.load(data_path)
# Load the three arrays consumed by OptiverDataset; all are indexed by the same sample index.
bucket=load_numpy('bucket.npy')
features=load_numpy('features.npy')
target=load_numpy('target.npy')

# Print the shapes as a quick sanity check of what was loaded.
print(bucket.shape)
print(features.shape)
print(target.shape)

In [None]:
class config:
    """Notebook-wide hyperparameters, read as class attributes (never instantiated)."""
    # Rows of the per-sample input tensor built by OptiverDataset (one row per bucket id).
    num_buckets= 60
    # Columns of the per-sample input tensor (length of one bucket's feature vector).
    num_features= 28
    # Mini-batch size.
    batch_size=128
    # Number of training epochs; also passed to the OneCycleLR schedule.
    epochs=10

# Dataset

In [None]:
class OptiverDataset(torch.utils.data.Dataset):
    """Map-style dataset that scatters per-bucket features into a dense tensor.

    Each sample becomes a zero-initialised ``(config.num_buckets,
    config.num_features)`` tensor whose row ``bucket_id`` holds the features
    of that bucket; bucket ids that do not occur for a sample stay all-zero.

    Args:
        indices: sample indices (into ``bucket``/``features``/``target``)
            that belong to this split.
        bucket: per-sample sequence of bucket ids.
        features: per-sample sequence of feature vectors, aligned with ``bucket``.
        target: per-sample primary target.
    """

    # Feature columns re-used as auxiliary targets (named bid1 / ask1 / wap1
    # by this notebook).
    BID1_COL=0
    ASK1_COL=1
    WAP1_COL=19

    def __init__(self, indices, bucket, features, target):
        self.indices=indices
        self.bucket=bucket
        self.features=features
        self.target=target

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        """Return the dict ``{'X', 'yprimary', 'ybid1', 'yask1', 'ywap1'}`` for one sample."""
        sample=self.indices[idx]

        bucket_ids=self.bucket[sample]
        # Convert the whole feature block once instead of once per bucket row.
        bucket_features=torch.tensor(self.features[sample], dtype=torch.float32)

        X=torch.zeros((config.num_buckets, config.num_features), dtype=torch.float32)
        # A dedicated loop variable: the sample index above is no longer shadowed.
        for pos, bucket_id in enumerate(bucket_ids):
            X[bucket_id]=bucket_features[pos]

        # The auxiliary targets are single columns of X (zero where the bucket is
        # absent), so slice them out instead of filling them element by element.
        ybid1=X[:, self.BID1_COL:self.BID1_COL+1].clone()
        yask1=X[:, self.ASK1_COL:self.ASK1_COL+1].clone()
        ywap1=X[:, self.WAP1_COL:self.WAP1_COL+1].clone()

        y=torch.tensor(self.target[sample], dtype=torch.float32)
        return {
            'X': X,
            'yprimary': y,
            'ybid1': ybid1,
            'yask1': yask1,
            'ywap1': ywap1
        }

In [None]:
# Shuffle the sample indices with a fixed seed, then cut them 80% / 20%.
random.seed(0)
indices=np.arange(bucket.shape[0])
random.shuffle(indices)

n_train=int(len(indices) * 0.8)
train_idx, valid_idx=indices[:n_train], indices[n_train:]

In [None]:
train_dataset=OptiverDataset(train_idx, bucket, features, target)
valid_dataset=OptiverDataset(valid_idx, bucket, features, target)


# Training loader: shuffled, incomplete last batch dropped.
# The batch size comes from `config` so it is defined in one place only.
train_dataloader=torch.utils.data.DataLoader(
    train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
    num_workers=2
)

# Validation loader: fixed order, every sample kept.
valid_dataloader=torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=config.batch_size,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
    num_workers=0
)

print(len(train_dataloader), len(valid_dataloader))

# Model

In [None]:
def get_activation_fn(activation):
    """Return a fresh activation module for the given name.

    Args:
        activation: ``'gelu'`` or ``'relu'``.

    Returns:
        ``nn.GELU()`` or ``nn.ReLU()``.

    Raises:
        ValueError: for any other name, instead of silently returning
            ``None`` and failing later when the activation is called.
    """
    if activation=='gelu':
        return nn.GELU()
    elif activation=='relu':
        return nn.ReLU()
    raise ValueError("Unsupported activation: {!r} (expected 'gelu' or 'relu')".format(activation))
    
def attention(query, key, value, dropout=None):
    """Causal scaled dot-product attention over the last two dimensions.

    Args:
        query: tensor ``(..., L_q, d_k)``.
        key: tensor ``(..., L_k, d_k)``.
        value: tensor ``(..., L_k, d_v)``.
        dropout: optional callable applied to the attended values.

    Returns:
        ``(p_attn, x_attn)``: attention weights ``(..., L_q, L_k)`` and the
        attended values ``(..., L_q, d_v)``.
    """
    d_k=query.size(-1)
    scores=torch.matmul( query, key.transpose(-1, -2) )/np.sqrt(d_k)

    # Build the causal mask from positions rather than from `scores == 0`:
    # a genuine zero score at an allowed position must stay attendable, and a
    # row of zero scores must not fall back to attending to the future.
    q_pos=torch.arange(scores.size(-2), device=scores.device).unsqueeze(-1)
    k_pos=torch.arange(scores.size(-1), device=scores.device).unsqueeze(0)
    scores=scores.masked_fill(k_pos > q_pos, -1e9)
    p_attn=torch.softmax(scores, dim=-1)

    x_attn=torch.matmul(p_attn, value)

    if dropout:
        x_attn=dropout(x_attn)
    return p_attn, x_attn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention followed by a residual connection and norm.

    Args:
        dmodel: feature size of the input and output.
        nhead: number of attention heads; must divide ``dmodel``.
        activation: stored on the module but not used in ``forward``.
        norm: normalisation module applied to ``x + attention(x)``.
        dropout: dropout module forwarded to ``attention``.

    Raises:
        ValueError: if ``dmodel`` is not divisible by ``nhead``.
    """
    def __init__(self, dmodel, nhead,activation,norm,dropout):
        super().__init__()
        if dmodel % nhead != 0:
            raise ValueError("dmodel ({}) must be divisible by nhead ({})".format(dmodel, nhead))
        self.dmodel=dmodel
        self.nhead=nhead
        self.d_k=dmodel//nhead #Size

        self.activation=activation
        self.norm=norm
        self.dropout=dropout

        self.Q=nn.Linear(dmodel, dmodel)
        self.K=nn.Linear(dmodel, dmodel)
        self.V=nn.Linear(dmodel, dmodel)
        self.W=nn.Linear(dmodel, dmodel)

    def _split_heads(self, t, bsize):
        """Reshape ``(batch, seq, dmodel)`` into ``(batch, nhead, seq, d_k)``."""
        return t.view(bsize, -1, self.nhead, self.d_k).transpose(1, 2)

    def forward(self, x):
        """Apply attention to ``x`` of shape ``(batch, seq, dmodel)``; output has the same shape."""
        bsize=x.size(0)
        # Heads must sit in front of the sequence axis so that `attention`
        # (which works on the last two dims) mixes information across time
        # steps, not across heads within a single time step.
        query=self._split_heads(self.Q(x), bsize)
        key=self._split_heads(self.K(x), bsize)
        value=self._split_heads(self.V(x), bsize)

        p_attn, x_attn=attention(query, key, value, self.dropout)
        # Undo the head split: (batch, nhead, seq, d_k) -> (batch, seq, dmodel).
        x_attn=x_attn.transpose(1, 2).contiguous().view(bsize, -1, self.nhead*self.d_k)
        x_attn=self.W(x_attn)
        x=self.norm(x+x_attn)
        return x

class TimeSeriesAttentionLayer(nn.Module):
    """One encoder layer: multi-head attention, then a position-wise feed-forward net.

    A single ``LayerNorm`` instance and a single ``Dropout`` instance are
    created here and shared between the attention sub-layer and the
    feed-forward sub-layer.

    Args:
        dmodel: feature size of the input and output.
        nhead: number of attention heads.
        dim_feed_forward: hidden width of the feed-forward net.
        activation: activation name understood by ``get_activation_fn``.
        dropout: dropout probability.
    """

    def __init__(self,
                 dmodel=128,
                 nhead=4,
                 dim_feed_forward=512,
                 activation='gelu',
                 dropout=0.1):
        super().__init__()
        self.dmodel, self.nhead, self.dim_feed_forward = dmodel, nhead, dim_feed_forward

        # Shared building blocks (also handed to the attention sub-layer).
        self.activation = get_activation_fn(activation)
        self.norm = nn.LayerNorm(dmodel)
        self.dropout = nn.Dropout(dropout)

        self.multihead_attn = MultiHeadAttention(
            dmodel, nhead, self.activation, self.norm, self.dropout
        )

        # Feed-forward net: dmodel -> dim_feed_forward -> dmodel.
        self.linear1 = nn.Linear(dmodel, dim_feed_forward)
        self.linear2 = nn.Linear(dim_feed_forward, dmodel)

    def forward(self, x):
        """Run attention, then the feed-forward net with a residual connection and norm."""
        attended = self.multihead_attn(x)
        hidden = self.activation(self.linear1(attended))
        ffn_out = self.linear2(self.dropout(hidden))
        return self.norm(attended + ffn_out)

class FeatureExtractor(nn.Module):
    """Two-layer MLP applied to every bucket's feature vector.

    Input ``(batch, config.num_buckets, config.num_features)``; output
    ``(batch, config.num_buckets, sz)``.

    The batch-norm layers treat the bucket axis (dim 1) as channels. They are
    built with the channel count only: the second positional argument of
    ``nn.BatchNorm1d`` is ``eps``, not a second dimension, so passing a
    feature size there would set a huge epsilon and largely disable the
    normalisation.

    Args:
        sz: output feature size; the hidden layer has ``2*sz`` units.
    """
    def __init__(self, sz):
        super().__init__()
        self.pre_bn=nn.BatchNorm1d(config.num_buckets)

        self.linear1=nn.Linear(config.num_features, 2*sz)
        self.bn1=nn.BatchNorm1d(config.num_buckets)

        self.linear2=nn.Linear(2*sz, sz)
        self.bn2=nn.BatchNorm1d(config.num_buckets)

        self.dropout=nn.Dropout(0.2)
        self.activation=nn.GELU()
    def forward(self, x):
        """Normalise the input, then apply linear -> activation -> batch-norm -> dropout twice."""
        x=self.pre_bn(x)
        x=self.dropout( self.bn1( self.activation(self.linear1(x)) ) )
        x=self.dropout( self.bn2( self.activation(self.linear2(x)) ) )
        return x
    
    
class FeatureExtractorWith1DConv(nn.Module):
    """Feature extractor that convolves along the bucket (time) axis.

    Input ``(batch, config.num_buckets, config.num_features)``; output
    ``(batch, config.num_buckets, sz)``.

    The batch-norm layers treat the bucket axis (dim 1) as channels. They are
    built with the channel count only: the second positional argument of
    ``nn.BatchNorm1d`` is ``eps``, not a second dimension, so passing a
    feature size there would set a huge epsilon and largely disable the
    normalisation.

    Args:
        sz: number of output channels of the convolution.
    """
    def __init__(self, sz):
        super().__init__()
        self.pre_bn=nn.BatchNorm1d(config.num_buckets)

        self.conv1=nn.Conv1d(config.num_features, sz, 3, padding=1)
        self.bn1=nn.BatchNorm1d(config.num_buckets)


        # conv2 / bn2 are not used by forward (the second stage is disabled
        # there); they stay registered so the module's parameter set is unchanged.
        self.conv2=nn.Conv1d(sz, sz, 3, padding=1)
        self.bn2=nn.BatchNorm1d(config.num_buckets)

        self.activation=nn.ReLU()

    def forward(self, x):
        """Normalise the input, then apply conv -> batch-norm -> activation once."""
        x=self.pre_bn(x)
        # Conv1d expects (batch, channels, length): move features to the channel
        # axis for the convolution and back afterwards.
        x=self.activation( self.bn1( self.conv1(x.transpose(1, 2)).transpose(1, 2) ))
        #x=self.activation( self.bn2( self.conv2(x.transpose(1, 2)).transpose(1, 2) ))

        return x

In [None]:
class Auxilary_FFN(nn.Module):
    """Per-bucket block: batch-norm -> GELU -> dropout -> linear.

    Works on ``(batch, config.num_buckets, sz)`` tensors; the batch-norm
    treats the bucket axis (dim 1) as channels. It is built with the channel
    count only: the second positional argument of ``nn.BatchNorm1d`` is
    ``eps``, so passing ``sz`` there would set a huge epsilon.

    Args:
        sz: feature size of the input and output.
    """
    def __init__(self, sz):
        super().__init__()
        self.linear=nn.Linear(sz, sz)
        self.bn=nn.BatchNorm1d(config.num_buckets)
        self.activation=nn.GELU()
        self.dropout=nn.Dropout(0.2)

    def forward(self, x):
        """Apply the block; the output has the same shape as ``x``."""
        x=self.bn(x)
        x=self.activation(x)
        x=self.dropout(x)
        x=self.linear(x)
        return x


class FFN(nn.Module):
    """Pre-activation block for ``(batch, sz)`` inputs: batch-norm, GELU, dropout, linear.

    Args:
        sz: feature size of the input and output.
    """

    def __init__(self, sz):
        super().__init__()
        self.linear = nn.Linear(in_features=sz, out_features=sz)
        self.bn = nn.BatchNorm1d(num_features=sz)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return ``linear(dropout(gelu(bn(x))))``."""
        normed = self.bn(x)
        return self.linear(self.dropout(self.activation(normed)))


class AttentionHead(nn.Module):
    """Attention pooling: collapse ``(batch, seq, dmodel)`` into ``(batch, dmodel)``.

    A linear layer scores every time step, the scores are soft-maxed over the
    sequence, and each sample is pooled with its own weights.

    Args:
        dmodel: feature size of the input.
        dropout: optional module applied to the attention weights (or ``None``).
    """
    def __init__(self, dmodel, dropout):
        super().__init__()
        self.dropout=dropout
        self.W=nn.Linear(dmodel, 1)
    def forward(self, x):
        """Return ``(p_attn, x_attn)``: weights ``(batch, seq)`` and pooled features ``(batch, dmodel)``."""
        scores=self.W(x).squeeze(-1)
        p_attn=torch.softmax(scores, dim=-1)

        if self.dropout:
            p_attn=self.dropout(p_attn)
        # Batched (1 x seq) @ (seq x dmodel) per sample. Multiplying the 2-D
        # weight matrix directly would broadcast to (batch, batch, dmodel) and
        # mix every sample with the attention weights of the whole batch.
        x_attn=torch.matmul(p_attn.unsqueeze(1), x).squeeze(1)
        return p_attn, x_attn

In [None]:
class PrimaryHead(nn.Module):
    """Head for the primary target: attention pooling, FFN stack, scalar output.

    Args:
        hsize: feature size of the incoming sequence representation.
    """

    def __init__(self, hsize):
        super().__init__()
        self.hsize = hsize
        self.attn_dropout = nn.Dropout(0.1)
        self.attn_head = AttentionHead(hsize, self.attn_dropout)
        self.ffn = nn.ModuleList(FFN(hsize) for _ in range(3))
        self.primary_out = nn.Linear(hsize, 1)

    def forward(self, x):
        """Return ``(p_attn, y)``: the pooling weights and the primary prediction."""
        p_attn, hidden = self.attn_head(x)

        # Walk over consecutive (previous, current) block pairs: the previous
        # block provides the residual branch that feeds the current block.
        blocks = list(self.ffn)
        for prev_block, block in zip(blocks, blocks[1:]):
            hidden = block(hidden + prev_block(hidden))

        return p_attn, self.primary_out(hidden)

class AuxilaryHead(nn.Module):
    """Head for an auxiliary target: FFN stack plus a per-position scalar output.

    Args:
        hsize: feature size of the incoming sequence representation.
    """

    def __init__(self, hsize):
        super().__init__()
        self.hsize = hsize
        self.ffn = nn.ModuleList(Auxilary_FFN(hsize) for _ in range(3))
        self.aux_out = nn.Linear(hsize, 1)

    def forward(self, x):
        """Return the auxiliary prediction, one value per sequence position."""
        # Walk over consecutive (previous, current) block pairs: the previous
        # block provides the residual branch that feeds the current block.
        blocks = list(self.ffn)
        for prev_block, block in zip(blocks, blocks[1:]):
            x = block(x + prev_block(x))
        return self.aux_out(x)

In [None]:
class OptiverModel(nn.Module):
    """Multi-task model: conv feature extractor, attention stack, four heads.

    ``forward`` returns a dict with the primary prediction, the primary
    head's pooling weights and three auxiliary per-position predictions.

    Args:
        model_size: feature size used throughout the model.
    """
    def __init__(self, model_size=128):
        super().__init__()
        self.model_size=model_size
        self.feature_extractor=FeatureExtractorWith1DConv(model_size)
        self.pos_embeddings=nn.Embedding(config.num_buckets, model_size, max_norm=1)
        # Registered as a (non-persistent) buffer so that `.to(device)` moves it
        # together with the parameters instead of pinning it to a global device
        # at construction time; non-persistent keeps it out of the state dict.
        self.register_buffer(
            'positions',
            torch.arange(config.num_buckets, dtype=torch.long),
            persistent=False,
        )
        # The attention layers must use the model's feature size, not their own default.
        self.attn_layers=nn.ModuleList(
            [TimeSeriesAttentionLayer(dmodel=model_size) for _ in range(5)]
        )

        self.primary_model=PrimaryHead(model_size)
        self.auxilary_ask1=AuxilaryHead(model_size)
        self.auxilary_bid1=AuxilaryHead(model_size)
        self.auxilary_wap1=AuxilaryHead(model_size)

    def forward(self, x):
        """Run the model on ``x`` of shape ``(batch, config.num_buckets, config.num_features)``."""
        x=self.feature_extractor(x)
        # Add the positional embeddings, scaled down by sqrt(model_size).
        x=x+(self.pos_embeddings(self.positions).unsqueeze(0)/np.sqrt(self.model_size))
        for attn_layer in self.attn_layers:
            x=attn_layer(x)

        p_attn, yprimary=self.primary_model(x)
        yaux_ask1=self.auxilary_ask1(x)
        yaux_bid1=self.auxilary_bid1(x)
        yaux_wap1=self.auxilary_wap1(x)

        return {
            'p_attn': p_attn,
            'yprimary': yprimary,
            'yask1': yaux_ask1,
            'ybid1': yaux_bid1,
            'ywap1': yaux_wap1
        }

# Custom losses

In [None]:
class CustomLosses:
    """Loss functions used for training and evaluation (all static)."""

    @staticmethod
    def MSE(y, yhat):
        """Mean squared error between ``y`` and ``yhat``."""
        return torch.square(y-yhat).mean()

    @staticmethod
    def RMSE(y, yhat):
        """Root mean squared error between ``y`` and ``yhat``."""
        err=(y-yhat)
        return torch.sqrt( torch.mean(err**2) )

    @staticmethod
    def RMSPE(y, yhat):
        """Root mean squared percentage error, relative to the target ``y``."""
        rel_err=(y-yhat)/y
        return torch.sqrt( torch.mean(torch.square(rel_err)) )


    @staticmethod
    def get_auxilary_loss(y, yhat, num_buckets=None, last_k=20):
        """Next-step RMSPE (in percent) over the last ``last_k`` positions.

        The prediction at position ``t`` is compared with the target at
        position ``t+1``. Positions whose target is exactly zero are left out
        of the average: a percentage error is undefined there and a single
        such position would otherwise turn the whole loss into inf/nan.

        Args:
            y: targets, ``(batch, seq, 1)``.
            yhat: predictions, ``(batch, seq, 1)``.
            num_buckets: sequence length to consider; defaults to
                ``config.num_buckets``.
            last_k: number of trailing positions that are scored.

        Returns:
            Scalar tensor; zero (still attached to ``yhat``'s graph) when no
            scored position has a non-zero target.
        """
        if num_buckets is None:
            num_buckets=config.num_buckets

        # Shift by one step: target t+1 is paired with prediction t.
        y=y[:, 1:num_buckets].squeeze(-1)
        yhat=yhat[:, 0: num_buckets-1].squeeze(-1)

        #Validating only the last `last_k` time-steps
        y=y[:, -last_k:]
        yhat=yhat[:, -last_k:]

        valid=(y != 0)
        if not bool(valid.any()):
            # Nothing to score; sqrt(0) would also yield nan gradients.
            return yhat.sum() * 0.0

        # Divide by 1 at masked positions so no inf/nan enters the graph,
        # then zero those positions out before averaging.
        safe_y=torch.where(valid, y, torch.ones_like(y))
        pct_err=100 * (y - yhat)/safe_y
        sq_err=torch.where(valid, torch.square(pct_err), torch.zeros_like(pct_err))
        mean_sq=sq_err.sum()/valid.sum()
        return torch.sqrt(mean_sq)

# Trainer

In [None]:
class Evaluator:
    """Computes RMSE and RMSPE of the primary prediction over a dataloader.

    Args:
        model: model whose ``forward`` returns a dict with a ``'yprimary'`` entry.
        val_dataloader: iterable of batches with ``'X'`` and ``'yprimary'`` entries.
    """
    def __init__(self, model, val_dataloader):
        self.model=model
        self.dataloader=val_dataloader

    def evaluate(self):
        """Run the model in eval mode over the dataloader.

        Returns:
            ``(rmse, rmspe)`` as Python floats, computed over all samples at once.
        """
        self.model.eval()
        print("Evaluating")
        yprimary_true=[]; yprimary_pred=[]
        with torch.no_grad():
            for batchid, data in enumerate(self.dataloader):
                if batchid%300 == 0:
                    print(batchid)
                X=data['X'].to(device)
                yprimary_true+=data['yprimary'].cpu().tolist()
                # Use the model this evaluator was built with, not whatever
                # global happens to be called `model`.
                outputs=self.model(X)
                yprimary_pred+=outputs['yprimary'].view(-1).cpu().tolist()

        yprimary_true=torch.tensor(yprimary_true, dtype=torch.float32)
        yprimary_pred=torch.tensor(yprimary_pred, dtype=torch.float32)

        rmse_loss=CustomLosses.RMSE(yprimary_true, yprimary_pred)
        rmspe_loss=CustomLosses.RMSPE(yprimary_true, yprimary_pred)
        return (rmse_loss.item(), rmspe_loss.item())
        
class Trainer:
    """Training loop for the multi-task model.

    The optimised loss is ``0.7 * RMSPE(primary) + 0.3 * mean(aux losses)``.

    Args:
        model: model returning a dict with ``'yprimary'``, ``'yask1'``,
            ``'ybid1'`` and ``'ywap1'`` entries.
        train_dataloader: batches used for optimisation.
        val_dataloader: batches used for the end-of-epoch evaluation.
        optimizer: optimizer over ``model``'s parameters.
        schedular: optional LR scheduler, stepped once per batch.
    """
    def __init__(self, model, train_dataloader, val_dataloader,
                 optimizer, schedular=None):
        self.best_rmse=None
        self.best_rmspe=None
        self.evaluator=Evaluator(model, val_dataloader)

        self.train_dataloader=train_dataloader
        self.val_dataloader=val_dataloader

        self.model=model
        self.optimizer=optimizer
        self.schedular=schedular

    def _compute_losses(self, data):
        """Move one batch to ``device``, run the model and return all loss tensors."""
        X=data['X'].to(device)
        yprimary=data['yprimary'].to(device)
        yask1=data['yask1'].to(device)
        ybid1=data['ybid1'].to(device)
        ywap1=data['ywap1'].to(device)

        outputs=self.model(X)
        # Flatten the primary prediction before comparing it with the targets;
        # a trailing singleton dimension would otherwise broadcast against them.
        yprimary_hat=outputs['yprimary'].view(-1)

        rmse_loss=CustomLosses.RMSE(yprimary, yprimary_hat)
        rmspe_loss=CustomLosses.RMSPE(yprimary, yprimary_hat)

        ask1_loss=CustomLosses.get_auxilary_loss(yask1, outputs['yask1'])
        bid1_loss=CustomLosses.get_auxilary_loss(ybid1, outputs['ybid1'])
        wap1_loss=CustomLosses.get_auxilary_loss(ywap1, outputs['ywap1'])

        loss= (0.7 * rmspe_loss) + (0.3*(ask1_loss + bid1_loss+wap1_loss)/3)
        return {
            'loss': loss,
            'rmse_loss': rmse_loss,
            'rmspe_loss': rmspe_loss,
            'ask1_loss': ask1_loss,
            'bid1_loss': bid1_loss,
            'wap1_loss': wap1_loss
        }

    def train_ops(self, data):
        """Run one optimisation step on a batch and return the losses as floats."""
        self.model.train()
        losses=self._compute_losses(data)

        self.optimizer.zero_grad(set_to_none=True)
        losses['loss'].backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()

        if self.schedular:
            self.schedular.step()

        return {name: value.item() for name, value in losses.items()}

    def train(self):
        """Train for ``config.epochs`` epochs, evaluating and checkpointing after each one."""
        loss_names=('loss', 'rmse_loss', 'rmspe_loss', 'ask1_loss', 'bid1_loss', 'wap1_loss')
        for e in range(config.epochs):
            history={name: [] for name in loss_names}

            self.model.train()
            for i, data in enumerate(self.train_dataloader):
                losses=self.train_ops(data)

                if i%300==0:
                    print("Iteration:{}|Loss:{:.3f}|RMSE:{:.3f}|RMSPE:{:.3f}".format(i, losses['loss'], losses['rmse_loss'], losses['rmspe_loss']))
                    print("ask1:{:.3f}|bid1:{:.3f}|wap1:{:.3f}".format(losses['ask1_loss'], losses['bid1_loss'], losses['wap1_loss']))

                for name in loss_names:
                    history[name].append(losses[name])

            (eval_rmse, eval_rmspe) = self.evaluator.evaluate()
            # Keep one checkpoint per metric; each is overwritten on improvement.
            if (self.best_rmse is None) or (self.best_rmse > eval_rmse):
                self.best_rmse=eval_rmse
                torch.save(self.model, 'best_rmse.pt')
            if (self.best_rmspe is None) or (self.best_rmspe > eval_rmspe):
                self.best_rmspe=eval_rmspe
                torch.save(self.model, 'best_rmspe.pt')

            print()
            print()
            print("***************End of Epoch{}***************".format(e))
            print("epoch:{}-LOSS:{:.4f}|RMSE Loss:{:.4f} |RMSPE Loss:{:.4f}".format(e,np.mean(history['loss']),
                                                                                    np.mean(history['rmse_loss']),
                                                                                    np.mean(history['rmspe_loss'])))

            print("Train BID1:{:.4f} | Train ASK1:{:.4f} | Train WAP1:{:.4f}".format(np.mean(history['bid1_loss']),
                                                                                     np.mean(history['ask1_loss']),
                                                                                     np.mean(history['wap1_loss'])))
            print("Val RMSE:{:.4f} | Val RMSPE:{:.4f}".format(eval_rmse, eval_rmspe))

    def lr_range_test(self):
        """Sweep the learning rate upwards and record the loss at every step.

        Uses its own AdamW optimizer starting at ``5e-7`` and multiplies the
        learning rate by 1.03 after every batch until it exceeds ``1e-3``
        (at most 50 passes over the training data). The model's weights are
        updated during the sweep.

        Returns:
            ``(lrs, losses)``: two lists with one entry per processed batch.
        """
        min_lr=5e-7
        max_lr=1e-3
        optimizer=torch.optim.AdamW(self.model.parameters(), lr=min_lr, weight_decay=1e-5)
        scheduler=torch.optim.lr_scheduler.StepLR(optimizer, 1, 1.03)

        losses=[]
        lrs=[]
        self.model.train()
        for _ in range(50):
            for i, data in enumerate(self.train_dataloader):
                # Same batch handling as training: tensors are moved to the
                # model's device and the primary prediction is flattened.
                loss=self._compute_losses(data)['loss']

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                optimizer.step()
                scheduler.step()

                losses.append(loss.item())
                lrs.append(scheduler.get_last_lr()[0])

                if i%10==0:
                    print(i, '-->', lrs[-1], losses[-1])
                if lrs[-1] > max_lr:
                    break
            # Also stop when the dataloader produced no batch at all.
            if not lrs or lrs[-1] > max_lr:
                break
        return lrs, losses

In [None]:
# Build the model on the selected device.
model=OptiverModel().to(device)

# Peak learning rate of the one-cycle schedule.
max_lr=4.2e-4
optimizer=torch.optim.AdamW(model.parameters(), lr=max_lr)

# The schedule is sized for one scheduler step per training batch.
schedular=torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=max_lr,
    epochs=config.epochs,
    steps_per_epoch=len(train_dataloader),
    pct_start=0.1,
    final_div_factor=1e3,
)

trainer=Trainer(model, train_dataloader, valid_dataloader, optimizer, schedular)
trainer.train()

In [None]:
#trainer=Trainer(model, train_dataloader, valid_dataloader, optimizer, schedular)
#lrs, losses=trainer.lr_range_test()

In [None]:
#lmt=250
#plt.plot(lrs[:lmt], losses[:lmt])