In [1]:
import os, sys
sys.path.append("../../../")

from src.core.module import Module
from src.core.losses import CrossEntropyWithLogits
from src.core.optim import AdamW
from src.core.tensor import Tensor
from src.utils.backend import xp

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math


# Fixed seed so a fresh kernel regenerates exactly the same toy corpus
# (and therefore comparable loss curves between runs).
SEED = 0
np.random.seed(SEED)

# 15 random token sequences of length 15 with ids drawn from [0, 16).
# NOTE(review): id 0 is also used as PAD_IDX further down (the torch model
# masks `idx == pad` and its loss ignores that index), so randomly drawn
# zeros are treated as padding — confirm this is intended.
src = np.random.randint(low=0, high=16, size=(15, 15))

# Next-token setup: inputs drop the last column, targets drop the first.
x = src[:, :-1]
y = src[:, 1:]

# Wrap in the project's Tensor type; neither inputs nor targets need gradients.
x_mine = Tensor(x, requires_grad=False)
y_mine = Tensor(y, requires_grad=False)

In [2]:


class Net(Module):
    """Embedding -> stack of transformer blocks -> linear projection to vocab logits.

    All layers are created through the ``embedding`` / ``transformer`` /
    ``linear`` methods available on ``self`` (not defined in this class, so
    presumably inherited from the project ``Module`` base).
    """

    def __init__(self, d_model, n_heads, vocab_size, max_seq_len, pad_idx=0, n_layers=8):
        """Create the embedding, ``n_layers`` transformer blocks and the output projection.

        Args:
            d_model: Model width passed to the embedding, every block and the projection.
            n_heads: Number of attention heads passed to every transformer block.
            vocab_size: Vocabulary size for the embedding and the projection output.
            max_seq_len: Maximum sequence length passed to the embedding.
            pad_idx: Padding token id passed to the embedding.
            n_layers: Number of transformer blocks. Defaults to 8, which yields
                the same ``head1`` .. ``head8`` attributes as before.
        """
        super().__init__()

        self.n_layers = n_layers
        self.e = self.embedding(vocab_size, d_model, max_seq_len, pad_idx, name="Embedding")

        # Blocks are created in order and stored as head1..headN so attribute
        # names and creation order match the original hand-written list.
        for i in range(1, n_layers + 1):
            setattr(self, f"head{i}", self.transformer(d_model=d_model, n_heads=n_heads))

        self.project = self.linear(d_model, vocab_size, name="project")

    def forward(self, idx):
        """Return the projection output for the token ids ``idx``."""
        x, padding_mask = self.e.get_sentence_embedding(idx)
        # Re-wrap the raw embedding data in a fresh Tensor with requires_grad=False.
        # NOTE(review): this looks like it cuts the embedding out of the autograd
        # graph, so its weights would receive no gradient — confirm this is intended.
        x = Tensor(x.data, requires_grad=False)
        for i in range(1, self.n_layers + 1):
            x = getattr(self, f"head{i}")(x, padding_mask)
        x = self.project(x)
        return x

    def train(self, x, y, epochs, optimizer):
        """Run ``epochs`` full-batch optimisation steps on ``(x, y)``.

        Each step does a forward pass, computes ``CrossEntropyWithLogits`` over
        the last axis, back-propagates, steps the optimizer and clears the
        gradients. The loss is printed every 5th epoch.
        """
        for epoch in range(epochs):
            y_hat = self.forward(x)
            loss = CrossEntropyWithLogits(y_hat, y, axis=-1)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if epoch % 5 == 0:
                print(f"Epoch {epoch}, Loss: {loss.data}")
                
if __name__ == "__main__":
    # Hyperparameters for the project-framework model.
    D_MODEL, VOCAB_SIZE, N_HEADS = 16, 20, 2
    MAX_SEQ_LEN, PAD_IDX = 16, 0

    model = Net(
        d_model=D_MODEL,
        n_heads=N_HEADS,
        vocab_size=VOCAB_SIZE,
        max_seq_len=MAX_SEQ_LEN,
        pad_idx=PAD_IDX,
    )
    # NOTE(review): the inputs fed to train() are (15, 14) after the shift,
    # while _build receives (15, 15) — confirm what shape _build expects.
    model._build((15, 15))

    optimizer = AdamW(
        model.parameters(),
        lr=0.01,
        precision=(xp.float32, xp.float32),
        clip_norm=1.0,
    )

    # Full-batch training on the toy corpus; loss is printed every 5 epochs.
    model.train(x_mine, y_mine, epochs=1000, optimizer=optimizer)


    
        

Epoch 0, Loss: 5.42578125
Epoch 5, Loss: 2.517578125
Epoch 10, Loss: 2.068359375
Epoch 15, Loss: 1.69140625
Epoch 20, Loss: 1.30078125
Epoch 25, Loss: 0.880859375
Epoch 30, Loss: 0.56982421875
Epoch 35, Loss: 0.412841796875
Epoch 40, Loss: 0.340576171875
Epoch 45, Loss: 0.274169921875
Epoch 50, Loss: 0.201171875
Epoch 55, Loss: 0.1375732421875
Epoch 60, Loss: 0.158447265625
Epoch 65, Loss: 0.1175537109375
Epoch 70, Loss: 0.091796875
Epoch 75, Loss: 0.08978271484375
Epoch 80, Loss: 0.079833984375
Epoch 85, Loss: 0.09771728515625


KeyboardInterrupt: 

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, d_model, n_heads, vocab_size, max_seq_len, num_layers=1, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.pos_embed = nn.Embedding(max_seq_len, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=n_heads, 
            dim_feedforward=d_model * 4, 
            batch_first=True,
            bias=False,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.project = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, idx):
        batch_size, seq_len = idx.size()
        pos = torch.arange(seq_len, device=idx.device).unsqueeze(0).expand(batch_size, seq_len)
        
        x = self.embedding(idx) + self.pos_embed(pos)
        padding_mask = (idx == 0)
        x = self.encoder(x, src_key_padding_mask=padding_mask)
        logits = self.project(x)
        return logits

    def train_model(self, x, y, epochs, optimizer, criterion):
        for epoch in range(epochs):
            optimizer.zero_grad()
            logits = self.forward(x)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()
            if epoch %  50== 0:
                print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# --- Config ---
D_MODEL = 16
VOCAB_SIZE = 20
N_HEADS = 2
MAX_SEQ_LEN = 16
PAD_IDX = 0
# NOTE(review): BATCH_SIZE is not referenced anywhere in the visible cells;
# the whole array is fed to the model as a single batch.
BATCH_SIZE = 16

# Seed torch so weight initialisation (and any stochastic layers) are
# repeatable across Restart & Run All.
torch.manual_seed(0)

# --- Model Training ---
model = Net(D_MODEL, N_HEADS, VOCAB_SIZE, MAX_SEQ_LEN, num_layers=8, pad_idx=PAD_IDX)
# NOTE(review): the earlier cell trains with AdamW(lr=0.01, clip_norm=1.0);
# plain SGD at lr=0.001 here means the two loss curves are not directly
# comparable — confirm this is intended.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# Targets equal to PAD_IDX do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Same toy corpus as the first cell, converted to int64 tensors.
x_pt = torch.tensor(x).long()
y_pt = torch.tensor(y).long()

model.train_model(x_pt, y_pt, epochs=1000, optimizer=optimizer, criterion=criterion)




Epoch 0, Loss: 3.0786
Epoch 50, Loss: 3.0600
Epoch 100, Loss: 3.0440
Epoch 150, Loss: 3.0137
Epoch 200, Loss: 2.9994
Epoch 250, Loss: 2.9729
Epoch 300, Loss: 2.9510
Epoch 350, Loss: 2.9523
Epoch 400, Loss: 2.9230
Epoch 450, Loss: 2.9120
Epoch 500, Loss: 2.9084
Epoch 550, Loss: 2.8781
Epoch 600, Loss: 2.8804
Epoch 650, Loss: 2.8725
Epoch 700, Loss: 2.8471
Epoch 750, Loss: 2.8459
Epoch 800, Loss: 2.8414
Epoch 850, Loss: 2.8311
Epoch 900, Loss: 2.8118
Epoch 950, Loss: 2.8179
