In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import json

In [21]:
bs = 128

#load just the first batch
X = torch.tensor(np.load("../data/input/dyconex_252901/X.npy",allow_pickle=True).astype(float),dtype=torch.float)[:bs]
Y = torch.tensor(np.load("../data/input/dyconex_252901/Y.npy",allow_pickle=True).astype(float))[:bs]

var_vocab_filename = "../data/input/dyconex_252901/variables_vocabulary.json"
pos_vocab_filename = "../data/input/dyconex_252901/position_vocabulary.json"

with open(var_vocab_filename) as f:
    var_vocab = json.load(f)
with open(pos_vocab_filename) as f:
    pos_vocab = json.load(f)

In [22]:
id_idx = 0
process_idx = 1
var_idx = 2
pos_idx = 3
val_idx = 4
time_idx = 5

## Embedding
$$\mathcal{\Phi}=\text{concat}(v,\phi_\text{v},\phi_\text{t},\phi_\text{p})$$

- idea: add also embedding for value with powers? e.g $av,bv^2,cv^3,dv^4$ where $a,b,c,d$ are learnable parameters


### Mask

In [36]:
mask = torch.logical_not(X[:,:,val_idx].isnan()).unsqueeze(-1)
print(mask.shape)
mask

torch.Size([128, 1453, 1])


tensor([[[ True],
         [ True],
         [ True],
         ...,
         [False],
         [False],
         [False]],

        [[ True],
         [ True],
         [ True],
         ...,
         [False],
         [False],
         [False]],

        [[ True],
         [ True],
         [ True],
         ...,
         [False],
         [False],
         [False]],

        ...,

        [[ True],
         [ True],
         [ True],
         ...,
         [False],
         [False],
         [False]],

        [[ True],
         [ True],
         [ True],
         ...,
         [False],
         [False],
         [False]],

        [[ True],
         [ True],
         [ True],
         ...,
         [False],
         [False],
         [False]]])

In [37]:
exp_mask = mask.expand(-1,1453,10)
print(exp_mask.shape)
exp_mask

torch.Size([128, 1453, 10])


tensor([[[ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         ...,
         [False, False, False,  ..., False, False, False],
         [

### Variable-Value Embedding

In [5]:
d_var_emb = 3
var_vocab_size = len(var_vocab)
var_emb = nn.Embedding(var_vocab_size,d_var_emb)
var_embed_idx = torch.nan_to_num(X[:,:,var_idx]).type(torch.int)
var_emb(var_embed_idx).shape

torch.Size([3358, 1453, 3])

In [6]:
embed = nn.Identity()
len(X[:,:,val_idx].shape)

2

In [7]:
var_val_emb = torch.cat((X[:,:,val_idx].unsqueeze(-1),var_emb(var_embed_idx)),dim=-1)
print(var_val_emb.shape)
var_val_emb

torch.Size([3358, 1453, 4])


tensor([[[ 0.3035,  0.2188,  0.3355,  0.4864],
         [ 0.6504, -1.5821, -1.4712,  0.5450],
         [ 0.6370, -0.2556, -0.6311,  2.2377],
         ...,
         [    nan,  0.2188,  0.3355,  0.4864],
         [    nan,  0.2188,  0.3355,  0.4864],
         [    nan,  0.2188,  0.3355,  0.4864]],

        [[ 0.3035,  0.2188,  0.3355,  0.4864],
         [ 0.6504, -1.5821, -1.4712,  0.5450],
         [ 0.6370, -0.2556, -0.6311,  2.2377],
         ...,
         [    nan,  0.2188,  0.3355,  0.4864],
         [    nan,  0.2188,  0.3355,  0.4864],
         [    nan,  0.2188,  0.3355,  0.4864]],

        [[ 0.3035,  0.2188,  0.3355,  0.4864],
         [ 0.6504, -1.5821, -1.4712,  0.5450],
         [ 0.6370, -0.2556, -0.6311,  2.2377],
         ...,
         [    nan,  0.2188,  0.3355,  0.4864],
         [    nan,  0.2188,  0.3355,  0.4864],
         [    nan,  0.2188,  0.3355,  0.4864]],

        ...,

        [[    nan,  0.2188,  0.3355,  0.4864],
         [    nan, -1.5821, -1.4712,  0.5450]

### Positional Encoding

##### Input - PaPos

In [8]:
d_pos_emb = 5

##### Target - sequence length

In [9]:
max_pos_trg = int(Y[:,:,pos_idx].flatten().nan_to_num().max()+1)
pos_trg_emb = nn.Embedding(max_pos_trg,d_pos_emb)
pos_embed_idx = torch.nan_to_num(Y[:,:,pos_idx]).type(torch.int)
phi_pos_trg = pos_trg_emb(pos_embed_idx)
phi_pos_trg


tensor([[[ 1.4385, -0.3589, -0.6868, -1.0939,  0.8427],
         [-0.7602,  1.0481, -0.5870, -0.1887,  1.2394],
         [ 0.3057,  0.6494, -0.8286,  0.0674,  0.1681],
         ...,
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315],
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315],
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315]],

        [[ 1.4385, -0.3589, -0.6868, -1.0939,  0.8427],
         [-0.7602,  1.0481, -0.5870, -0.1887,  1.2394],
         [ 0.3057,  0.6494, -0.8286,  0.0674,  0.1681],
         ...,
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315],
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315],
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315]],

        [[ 1.4385, -0.3589, -0.6868, -1.0939,  0.8427],
         [-0.7602,  1.0481, -0.5870, -0.1887,  1.2394],
         [ 0.3057,  0.6494, -0.8286,  0.0674,  0.1681],
         ...,
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315],
         [-0.6453,  1.5275,  1.9571,  0.9368, -0.3315],
  

### Time Embeddings
#### Time2Vec

In [10]:
class Time2Vec(nn.Module):
    def __init__(self, input_dim:int=6, embed_dim:int=512, activation=torch.sin):
        super(Time2Vec, self).__init__()
        
        assert embed_dim % input_dim == 0
        
        self.embed_dim = embed_dim // input_dim # so that the final dimension is embed_dim
        self.input_dim = input_dim
        self.activation = activation
        
        # initialize learnable weights and biases
        self.embed_weight = nn.parameter.Parameter(torch.rand(self.input_dim,self.embed_dim))
        self.embed_bias = nn.parameter.Parameter(torch.rand(self.input_dim,self.embed_dim))

    def forward(self, x: torch.Tensor):
            
        x_diag = torch.diag_embed(x)
        #print(f"Xdiag: {x_diag.shape} emb: {self.embed_weight.shape}")
        # x.shape = (bs, sequence_length, input_dim, input_dim)
        x_affine = torch.matmul(x_diag, self.embed_weight) + self.embed_bias
        # x_affine.shape = (bs, sequence_length, input_dim, time_embed_dim)
        x_affine_0, x_affine_remain = torch.split(x_affine, [1, self.embed_dim - 1], dim=-1)
        x_affine_remain = self.activation(x_affine_remain)
        x_out = torch.cat([x_affine_0, x_affine_remain], dim=-1)
        x_out = x_out.view(x_out.size(0), x_out.size(1), -1)
        return x_out

d_time = 5
time_emb_dim = 5
time2vec_emb = Time2Vec(d_time, embed_dim=time_emb_dim*d_time)

In [11]:
time_emb = time2vec_emb(X[:,:,time_idx:])
time_emb.shape

torch.Size([3358, 1453, 25])

In [12]:
torch.cat((var_val_emb,time_emb),dim=-1)

tensor([[[ 0.3035,  0.2188,  0.3355,  ..., -0.7933,  0.4403, -0.7082],
         [ 0.6504, -1.5821, -1.4712,  ..., -0.7933,  0.4403, -0.7082],
         [ 0.6370, -0.2556, -0.6311,  ..., -0.7933,  0.4403, -0.7082],
         ...,
         [    nan,  0.2188,  0.3355,  ...,     nan,     nan,     nan],
         [    nan,  0.2188,  0.3355,  ...,     nan,     nan,     nan],
         [    nan,  0.2188,  0.3355,  ...,     nan,     nan,     nan]],

        [[ 0.3035,  0.2188,  0.3355,  ..., -0.7933,  0.4403, -0.7082],
         [ 0.6504, -1.5821, -1.4712,  ..., -0.7933,  0.4403, -0.7082],
         [ 0.6370, -0.2556, -0.6311,  ..., -0.7933,  0.4403, -0.7082],
         ...,
         [    nan,  0.2188,  0.3355,  ...,     nan,     nan,     nan],
         [    nan,  0.2188,  0.3355,  ...,     nan,     nan,     nan],
         [    nan,  0.2188,  0.3355,  ...,     nan,     nan,     nan]],

        [[ 0.3035,  0.2188,  0.3355,  ..., -0.7933,  0.4403, -0.7082],
         [ 0.6504, -1.5821, -1.4712,  ..., -0

## Mask and Attention

In [13]:
n = 2
L = 5
d_emb = 3
K = torch.randn(n,L,d_emb)
Q = torch.randn(n,L,d_emb)


# NaNs are found along L
K[0,1,0]=torch.nan
K[1,4,0]=torch.nan
Q[0,0,0]=torch.nan
K

tensor([[[-0.6169, -0.2145,  1.9997],
         [    nan,  1.2196,  0.7723],
         [ 0.8318, -0.2092, -0.2938],
         [ 0.9720, -0.4684, -0.2061],
         [ 0.0520,  1.2877, -0.0369]],

        [[-0.7380,  0.1939, -0.9375],
         [-1.2621,  0.7015, -0.1555],
         [ 0.5386,  0.1562,  0.5686],
         [-1.9712, -1.3248, -1.1392],
         [    nan, -1.2711, -0.0214]]])

In [14]:
Q

tensor([[[    nan, -1.2260,  0.3419],
         [ 0.9186, -0.2038, -0.7597],
         [ 0.5567, -0.8945,  0.2760],
         [-1.7344, -0.2097, -1.0253],
         [-2.1004, -1.6605,  0.6282]],

        [[ 1.7459, -1.7729,  0.7039],
         [ 0.8620,  2.2945,  0.3033],
         [-1.4279,  0.2278, -0.2120],
         [-0.1705,  1.3434, -1.2678],
         [-1.1626,  0.8139,  0.3990]]])

In [15]:
# get NaNs index (note: we don't need the hidden dimension, just n and L)
K_nan_idx = torch.isnan(K).nonzero()[:,:-1]
Q_nan_idx = torch.isnan(Q).nonzero()[:,:-1]

# build M nxLXd
M = torch.zeros(n,L,L) 
M[K_nan_idx.squeeze()[0],:,K_nan_idx.squeeze()[1]] = -torch.inf # cols (keys)
M[Q_nan_idx.squeeze()[0],Q_nan_idx.squeeze()[1],:] = -torch.inf # rows (queries)
M

tensor([[[-inf, -inf, -inf, -inf, -inf],
         [0., -inf, 0., 0., 0.],
         [0., -inf, 0., 0., 0.],
         [0., -inf, 0., 0., 0.],
         [0., -inf, 0., 0., 0.]],

        [[0., 0., 0., 0., -inf],
         [0., 0., 0., 0., -inf],
         [0., 0., 0., 0., -inf],
         [0., 0., 0., 0., -inf],
         [0., 0., 0., 0., -inf]]])

In [16]:
# K = torch.nan_to_num(K)
# Q = torch.nan_to_num(Q)
A = torch.matmul(K,Q.view(n,d_emb,L))
A.masked_fill_(torch.isnan(A),-torch.inf)

tensor([[[   -inf, -1.4133, -4.2192, -3.9464,  1.7540],
         [   -inf,    -inf,    -inf,    -inf,    -inf],
         [   -inf, -0.8350,  1.0886,  1.1942,  0.0087],
         [   -inf, -1.2412,  1.1841,  1.1058,  0.4848],
         [   -inf,  0.6910, -1.0564,  0.4646, -2.2671]],

        [[-2.4890,  2.2201,  0.6146, -1.4403, -2.1004],
         [-2.1997,  1.4332, -0.5478, -1.3633, -3.0777],
         [ 1.7515, -1.8988, -0.2463,  0.8939,  1.4359],
         [-5.3736,  6.8307, -0.3649, -2.3456, -4.7514],
         [   -inf,    -inf,    -inf,    -inf,    -inf]]])

In [17]:
# using nan_to_num() after the softmax solves two problems
# 1) remove missing queries
# 2) forces to zero the output when the whole input is masked (i.e. missing)
score = F.softmax(A,dim=-1).nan_to_num()
score

tensor([[[0.0000e+00, 4.0189e-02, 2.4294e-03, 3.1912e-03, 9.5419e-01],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 5.6248e-02, 3.8505e-01, 4.2793e-01, 1.3077e-01],
         [0.0000e+00, 3.5241e-02, 3.9840e-01, 3.6838e-01, 1.9797e-01],
         [0.0000e+00, 4.9418e-01, 8.6100e-02, 3.9406e-01, 2.5658e-02]],

        [[7.2170e-03, 8.0075e-01, 1.6079e-01, 2.0598e-02, 1.0644e-02],
         [2.1385e-02, 8.0880e-01, 1.1157e-01, 4.9358e-02, 8.8884e-03],
         [4.3192e-01, 1.1224e-02, 5.8584e-02, 1.8322e-01, 3.1505e-01],
         [5.0046e-06, 9.9913e-01, 7.4922e-04, 1.0337e-04, 9.3235e-06],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]]])

In [18]:
torch.matmul(score,Q)

tensor([[[    nan, -1.5955,  0.5663],
         [    nan,  0.0000,  0.0000],
         [    nan, -0.6628, -0.2930],
         [    nan, -0.7695, -0.1701],
         [    nan, -0.3030, -0.7395]],

        [[ 0.4574,  1.8975,  0.1920],
         [ 0.5565,  1.9168,  0.1777],
         [ 0.2826, -0.2241,  0.1884],
         [ 0.8602,  2.2928,  0.3028],
         [ 0.0000,  0.0000,  0.0000]]])