In [1]:
import torch
import torch.nn.functional as F
from src.layer import TransformerBlock, HierarchicalTransformerBlock

%load_ext autoreload
%autoreload 2

In [None]:
768 * 4 * 2/3
# d_ff = d_model * 4 * 2/3

In [None]:
768 / 8
# d_k = d_model / h

#### test transformer block

In [None]:
encoder = TransformerBlock(
    d_model=768,
    d_ff=2048,
    h=12,
    dropout=0,
    norm_type="layer",
    ffn_type="swiglu",
)

In [None]:
encoder = encoder.to("cuda")
encoder

In [None]:
inputs = torch.randn(2, 128, 512).to("cuda")
masks = torch.ones(2, 128).to("cuda")

In [None]:
encoder(inputs, masks).shape

#### test hierarchical transformer block

In [104]:
hat_block = HierarchicalTransformerBlock(
    d_model=512,
    d_ff=1024,
    h=4,
    dropout=0,
    norm_type="layer",
    ffn_type="swiglu",
).cuda()

In [105]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(hat_block) / 1e6

5.250048

In [106]:
(2e4 * 512 + 5.25e6 * 6) / 1e6


41.74

In [3]:
seg_hidden_state = torch.randn(32, 32, 128, 768).cuda()
token_mask = torch.rand(32, 32, 128).bernoulli().cuda()
seg_mask = torch.rand(32, 32).bernoulli().cuda()


In [4]:
hat_block(seg_hidden_state, token_mask, seg_mask).shape

torch.Size([32, 32, 128, 768])

#### RoPE

In [None]:
from src.layer import RoPE

rope = RoPE(d=256)
x = torch.randn(2, 12, 32, 256)

In [None]:
rope(x).shape

### test model

In [2]:
from src.fm import FMConfig, FMBase

In [3]:
cfg = FMConfig(d_model=512, n_heads=8, d_ff=1024, dropout=0.1,
               **{"lr": 1e-5, "batch_size": 8})
model = FMBase(cfg).to("cuda")
cfg.to_diff_dict()

{'pad_token_id': 0,
 'transformers_version': '4.55.2',
 'lr': 1e-05,
 'batch_size': 8,
 'vocab_size': 15000,
 'd_model': 512,
 'n_blocks': 6,
 'n_heads': 8,
 'd_ff': 1024,
 'dropout': 0.1,
 'norm_type': 'layer',
 'ffn_type': 'swiglu',
 'weight_tying': False,
 'attn_backend': 'base',
 'model_type': 'fm'}

#### simcse

In [8]:
optimizer = torch.optim.AdamW(model.parameters())

In [9]:
from src.loss import SimCSE
from src.utils.data_utils import random_masking
from tokenizers import Tokenizer

In [10]:
input_ids = torch.randint(0, 15000, (2, 2, 16)).to("cuda")
attention_mask = torch.full(input_ids.shape, True).to("cuda")
segment_attention_mask = torch.full((2, 2), True).to("cuda")


In [11]:
tk = Tokenizer.from_file('./test.json')

In [13]:
masked_input_ids, labels = random_masking(input_ids, tk)
logits = model(
    input_ids=masked_input_ids,
    attention_mask=attention_mask,
    segment_attention_mask=segment_attention_mask,
)

loss = torch.nn.CrossEntropyLoss(ignore_index=-100)(logits.view(-1, logits.size(-1)), labels.view(-1))
loss.backward()
optimizer.step()

In [14]:
optimizer.zero_grad()

In [16]:
simcse = SimCSE(model, temperature=1)

In [19]:
masked_input_ids, labels = random_masking(input_ids, tk)
logits = model(
    input_ids=masked_input_ids,
    attention_mask=attention_mask,
    segment_attention_mask=segment_attention_mask,
)

mlm_loss = torch.nn.CrossEntropyLoss(ignore_index=-100)(logits.view(-1, logits.size(-1)), labels.view(-1))
simcse_loss = simcse(input_ids, attention_mask, segment_attention_mask)
(mlm_loss + simcse_loss).backward()
optimizer.step()



tensor([[[-1.3865,  1.4914, -0.5007,  ...,  0.9131,  1.0111,  2.9045],
         [-1.9676,  3.1304, -0.4825,  ..., -0.5632,  1.6516,  3.4589]],

        [[-1.1420,  1.7328,  0.4275,  ..., -1.5038,  2.2406,  3.0776],
         [ 0.3998, -1.7918,  2.2506,  ..., -0.4982, -1.9180,  0.6739]]],
       device='cuda:0', grad_fn=<SliceBackward0>)
tensor([[[-0.4825,  3.0431,  0.5561,  ...,  0.4529,  1.8380,  5.0330],
         [-2.7726,  2.9079, -0.1113,  ...,  0.4662,  1.5168,  4.2401]],

        [[-1.0057,  1.0679,  0.2046,  ..., -0.8703,  2.2319,  2.3849],
         [ 0.7065, -3.0003,  1.0444,  ..., -1.4253, -2.3259,  1.7178]]],
       device='cuda:0', grad_fn=<SliceBackward0>)


In [18]:
simcse_loss

tensor(1.7569, device='cuda:0', grad_fn=<NllLossBackward0>)

In [20]:
d = {'a': 1, 'b': 2}
'a' in d

True