In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the project sources under ../src importable from this notebook.
# Guarded so that re-running the cell does not keep appending duplicate
# entries to sys.path.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [3]:
from fluidvec import *

In [4]:
import torch
import pickle
from torch.optim import AdamW
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
from pathlib import Path
from fluidvec.dataset import TrainDataset, get_dataloader
# torch.autograd.set_detect_anomaly(True)

In [5]:
# Load the first shard of pre-built training items.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# from a trusted source (here: a file produced locally under ../data).
items = pickle.loads(Path("../data/train_items/train_items_001.pkl").read_bytes())

In [38]:
# Decode word id 60 (the first context word of items[100]) back to its string.
# NOTE: `vs` is only created by `VocabSet.load()` in a later cell (In [7]),
# so this cell fails on a fresh kernel unless that cell is run first.
vs.word_vocab.decode(60)

'由'

In [39]:
# Decode char id 97, the only entry in 'chars' for word id 60 in items[100].
# NOTE: `vs` is only created by `VocabSet.load()` in a later cell (In [7]),
# so this cell fails on a fresh kernel unless that cell is run first.
vs.char_vocab.decode(97)

'_由_'

In [6]:
# Peek at two training items. Each item is a (target, contexts) pair: the
# target is a dict with 'compos', 'chars' and 'word' id entries, and contexts
# is a list of dicts with the same keys.
items[100:102]

[({'compos': [67, 68, 69, 70, 48, 49], 'chars': [47, 125, 31], 'word': 77},
  [{'compos': [8], 'chars': [97], 'word': 60},
   {'compos': [119, 157, 107, 131, 8], 'chars': [122, 123, 124], 'word': 76},
   {'compos': [67, 68, 69, 70, 158, 159], 'chars': [47, 125, 126], 'word': 78},
   {'compos': [78, 160, 50, 52, 8], 'chars': [127, 35, 128], 'word': 79}]),
 ({'compos': [67, 68, 69, 70, 158, 159], 'chars': [47, 125, 126], 'word': 78},
  [{'compos': [119, 157, 107, 131, 8], 'chars': [122, 123, 124], 'word': 76},
   {'compos': [67, 68, 69, 70, 48, 49], 'chars': [47, 125, 31], 'word': 77},
   {'compos': [78, 160, 50, 52, 8], 'chars': [127, 35, 128], 'word': 79},
   {'compos': [55, 56, 8], 'chars': [37, 38], 'word': 20}])]

In [7]:
# Seed torch's global RNG before any stochastic step. Without this, the
# torch.multinomial noise draws in later cells (and, presumably, the random
# parameter initialisation inside FluidVecSG) differ on every kernel restart,
# so the tensors displayed in this notebook cannot be reproduced.
SEED = 0
torch.manual_seed(SEED)

# Load the vocabularies and build one sampling weight per word id:
# the word's frequency raised to the 0.75 power. The list is handed to the
# model through its `weights` argument.
vs = VocabSet.load()
word_weights = [vs.word_vocab.freq[idx]**0.75 for idx in range(len(vs.word_vocab))]

# `False and ...` short-circuits, so this notebook always runs on CPU;
# remove the `False and` to use CUDA when it is available.
use_cuda = False and torch.cuda.is_available()

# NOTE(review): the third positional argument (0) is presumably the size of
# the component ("compos") vocabulary, disabled here -- confirm against the
# FluidVecSG constructor.
model = FluidVecSG(len(vs.word_vocab), len(vs.char_vocab), 0,
                   dim=6, n_neg_sample=3, weights=word_weights, use_cuda=use_cuda)
optim = AdamW(model.parameters(), lr=1e-3)
if use_cuda:
    model.to(torch.device("cuda"))

device:  cpu
n_neg_sample:  3


## Try the loss function

In [8]:
# Turn a batch containing only item 100 into tensors. The result is a dict
# with 'tgt', 'ctx' and 'ctx_mask' entries (displayed a few cells below).
vec_dict = model.transform_batch_data(items[100:101])

In [9]:
# Sanity check: average the char embeddings of item 100's target chars
# [47, 125, 31]. The displayed result equals vec_dict['tgt'] for that item.
model.char_emb(torch.tensor([47,125,31])).mean(0)

tensor([ 0.8685,  0.4740, -0.5630,  0.0669, -0.2706, -0.3580],
       grad_fn=<MeanBackward1>)

In [10]:
# Show the transformed batch: target vector, context vectors and context mask.
vec_dict

{'tgt': tensor([[ 0.8685,  0.4740, -0.5630,  0.0669, -0.2706, -0.3580]],
        grad_fn=<StackBackward>),
 'ctx': tensor([[[ 0.1706, -1.1125, -1.5387,  2.0441, -0.2650,  0.7361],
          [ 0.2482,  1.1317, -0.7563, -0.9331, -0.2241,  0.2972],
          [-0.5176, -0.6282, -0.8034,  0.5513,  0.0451,  0.6017],
          [ 0.5939,  0.6047,  1.1013,  1.3819,  0.3913,  0.5915]]],
        grad_fn=<StackBackward>),
 'ctx_mask': tensor([[1, 1, 1, 1]])}

In [40]:
# Sanity check: the word embedding of word id 79 (last context word of
# item 100). The displayed result equals the last row of vec_dict['ctx'].
model.word_emb(torch.tensor(79))

tensor([0.5939, 0.6047, 1.1013, 1.3819, 0.3913, 0.5915],
       grad_fn=<EmbeddingBackward>)

In [14]:
# Display the target vectors. NOTE: `tgt` is assigned in the loss-computation
# cell (In [12]) that appears further down in this file; run that cell first.
tgt

tensor([[ 0.8685,  0.4740, -0.5630,  0.0669, -0.2706, -0.3580]],
       grad_fn=<StackBackward>)

In [15]:
# Display the context vectors. NOTE: `ctx` is assigned in the loss-computation
# cell (In [12]) that appears further down in this file; run that cell first.
ctx

tensor([[[ 0.1706, -1.1125, -1.5387,  2.0441, -0.2650,  0.7361],
         [ 0.2482,  1.1317, -0.7563, -0.9331, -0.2241,  0.2972],
         [-0.5176, -0.6282, -0.8034,  0.5513,  0.0451,  0.6017],
         [ 0.5939,  0.6047,  1.1013,  1.3819,  0.3913,  0.5915]]],
       grad_fn=<StackBackward>)

In [26]:
# Display the embeddings of the sampled noise words. NOTE: `noise_vec` is
# assigned in the loss-computation cell (In [12]) further down in this file,
# and its values depend on a random multinomial draw.
noise_vec

tensor([[[-1.6861, -0.1233, -0.4947, -1.4695, -1.4075, -0.9184],
         [ 0.0946,  1.8954,  1.0712,  1.1584,  0.7281,  0.4916],
         [-1.3748, -0.4124,  0.3517,  1.6456,  0.4747, -0.2277],
         [-1.5799, -1.9747,  1.0651,  2.0222, -0.5379, -1.1553],
         [ 1.1998, -0.8193,  1.0410, -1.2435,  0.1773,  0.4780],
         [ 1.2114, -1.1760, -0.5726,  0.9438,  1.4270, -0.7126],
         [ 0.1268,  0.9072, -0.4544,  1.9798,  0.1110, -0.7260],
         [-0.8510,  1.5819, -0.1254, -1.4437, -0.0067, -0.2119],
         [ 1.9090,  2.2520,  0.8745,  0.5411,  0.2419,  0.2300],
         [ 0.6196,  0.6806,  1.0948,  0.2976, -0.4565,  0.0359],
         [ 1.5301,  0.2887,  0.5422, -0.1980,  0.0766, -0.4669],
         [ 0.4405, -0.2741,  1.0912,  0.4701,  0.5924,  0.1188]]],
       grad_fn=<EmbeddingBackward>)

In [27]:
# Variant A of the negated target/noise dot product: negate the element-wise
# product, then sum over the embedding axis. This is the form used in the
# loss-computation cell.
(tgt.unsqueeze(1)*noise_vec).neg().sum(2)

tensor([[ 0.6331, -0.0819,  1.5244,  2.2135,  0.2346, -0.7493, -1.1583, -0.0623,
         -2.1215, -0.3749, -1.2937,  0.5331]], grad_fn=<SumBackward1>)

In [28]:
# Variant B: negate the noise vectors before multiplying. The displayed
# values are the same as for variant A above.
(tgt.unsqueeze(1)*noise_vec.neg()).sum(2)

tensor([[ 0.6331, -0.0819,  1.5244,  2.2135,  0.2346, -0.7493, -1.1583, -0.0623,
         -2.1215, -0.3749, -1.2937,  0.5331]], grad_fn=<SumBackward1>)

In [29]:
# Shape of the noise embeddings: (batch_size, win_size * n_neg_sample, dim).
noise_vec.shape

torch.Size([1, 12, 6])

In [30]:
# Shape of the target vectors: (batch_size, dim).
tgt.shape

torch.Size([1, 6])

In [33]:
# Variant C: the same negated dot products via a batched matrix multiply.
# The result carries a trailing singleton axis; the displayed values match
# variants A and B.
torch.bmm(noise_vec.neg(), tgt.unsqueeze(2))

tensor([[[ 0.6331],
         [-0.0819],
         [ 1.5244],
         [ 2.2135],
         [ 0.2346],
         [-0.7493],
         [-1.1583],
         [-0.0623],
         [-2.1215],
         [-0.3749],
         [-1.2937],
         [ 0.5331]]], grad_fn=<BmmBackward0>)

In [12]:
# Negative-sampling objective computed by hand for one transformed batch.

# Unpack the batch tensors built by model.transform_batch_data:
#   tgt  -> (batch_size, dim)
#   ctx  -> (batch_size, win_size, dim)
#   mask -> (batch_size, win_size)
tgt, ctx, mask = (vec_dict[key] for key in ("tgt", "ctx", "ctx_mask"))
batch_size, win_size = ctx.size(0), ctx.size(1)

# Draw n_neg_sample noise word ids per context slot, sampling with
# replacement according to model.weights, then look up their embeddings.
n_noise = batch_size * win_size * model.n_neg_sample
draw = torch.multinomial(model.weights, n_noise, replacement=True)
noise = draw.view(batch_size, win_size * model.n_neg_sample)
# No device transfer is done here; the model in this notebook is built with
# use_cuda=False.
noise_vec = model.word_emb(noise)  # (batch_size, win_size*n_neg, dim)

# Positive term: log(sigmoid(tgt . ctx) + 1e-5) for every context slot,
# with masked-out slots zeroed before summing.
log_target = (tgt.unsqueeze(1) * ctx).sum(dim=2).sigmoid().add(1e-5).log()
log_target = mask * log_target
log_target_val = log_target.sum()

# Noise term: log(sigmoid(-tgt . noise) + 1e-5) for every sampled word.
# The samples are regrouped per context slot so the same mask can be applied,
# then flattened back to (batch_size, win_size*n_neg).
sum_log_noise = (-(tgt.unsqueeze(1) * noise_vec)).sum(dim=2).sigmoid().add(1e-5).log()
sum_log_noise = sum_log_noise.view(batch_size, win_size, -1) * mask.unsqueeze(-1)
sum_log_noise = sum_log_noise.view(batch_size, -1)
sum_log_noise_val = sum_log_noise.sum()

# NOTE(review): this is a sum of log-probabilities, so it is (almost always)
# negative and is a quantity to maximise. If it is meant to be minimised by an
# optimizer it presumably needs to be negated -- confirm against the model's
# own loss implementation.
loss = log_target_val + sum_log_noise_val

In [13]:
# Display the hand-computed objective value (sum of the positive and noise
# log terms from the cell above).
loss

tensor(-12.9421, grad_fn=<AddBackward0>)