In [6]:
import os, sys
sys.path.append("../../../")

from src.core.module import Module
from src.core.losses import CrossEntropy
from src.core.optim import AdamW
from src.core.tensor import Tensor
from src.utils.backend import xp

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np




In [7]:
# Synthetic next-token data. Every row of `src` is a random token sequence;
# the inputs drop the last token and the targets drop the first one, so
# y[:, t] is the token that follows x[:, t].
# A dedicated, seeded Generator makes the data identical after every kernel
# restart without touching NumPy's global random state.
rng = np.random.default_rng(0)
src = rng.integers(low=1, high=16, size=(128, 16))  # `high` is exclusive: ids 1..15, 0 is never drawn
x = src[:, :-1]
y = src[:, 1:]

# Wrap both arrays as framework tensors (requires_grad=False is passed for each).
x_mine = Tensor(x, requires_grad=False)
y_mine = Tensor(y, requires_grad=False)

In [8]:



class Net(Module):
    """Embedding -> three transformer blocks -> projection onto the vocabulary.

    Every layer is created through a factory method inherited from ``Module``
    (``embedding``, ``transformer``, ``linear``).

    Args:
        d_model: Width handed to the embedding, to each transformer block and
            to the input side of the final projection.
        n_heads: Head count handed to each transformer block.
        vocab_size: Handed to the embedding and used as the projection's
            output width.
        max_seq_len: Forwarded to the embedding factory.
        pad_idx: Forwarded to the embedding factory. Defaults to 0.
    """

    def __init__(self, d_model, n_heads, vocab_size, max_seq_len, pad_idx=0):
        super().__init__()
        self.e = self.embedding(vocab_size, d_model, max_seq_len, pad_idx, name="Embedding")

        # The blocks are kept as individual attributes, created in this order.
        self.head1 = self.transformer(d_model=d_model, n_heads=n_heads)
        self.head2 = self.transformer(d_model=d_model, n_heads=n_heads)
        self.head3 = self.transformer(d_model=d_model, n_heads=n_heads)

        self.project = self.linear(d_model, vocab_size, name="project")

    def forward(self, idx):
        """Run ``idx`` through the embedding, the three blocks and the projection.

        Args:
            idx: Token ids, passed unchanged to ``self.e.get_sentence_embedding``.

        Returns:
            The output of ``self.project`` applied to the last block's output.
        """
        x, padding_mask = self.e.get_sentence_embedding(idx)
        # NOTE(review): re-wrapping the raw data in a fresh Tensor with
        # requires_grad=False looks like it detaches the embedding output from
        # the autograd graph, so the embedding would receive no gradient.
        # Confirm this is intentional.
        x = Tensor(x.data, requires_grad=False)
        for block in (self.head1, self.head2, self.head3):
            x = block(x, padding_mask)
        return self.project(x)

    def train(self, x, y, epochs, optimizer, log_every=1):
        """Full-batch training loop: forward, loss, backward, step, zero_grad.

        NOTE(review): if the ``Module`` base class also defines ``train`` (for
        example as a mode switch), this method replaces it - verify.

        Args:
            x: Model input, passed to ``forward``.
            y: Targets, passed to ``CrossEntropy`` together with the logits.
            epochs: Number of iterations over the same ``(x, y)`` batch.
            optimizer: Object providing ``step()`` and ``zero_grad()``.
            log_every: Print the loss whenever ``epoch`` is a multiple of this
                value. The default of 1 prints every epoch; 0 or ``None``
                disables printing.
        """
        for epoch in range(epochs):
            y_hat = self.forward(x)
            loss = CrossEntropy(y_hat, y, axis=-1)

            loss.backward()
            optimizer.step()
            # Gradients are cleared after the step so the next iteration starts clean.
            optimizer.zero_grad()
            if log_every and epoch % log_every == 0:
                print(f"Epoch {epoch}, Loss: {loss.data}")
                
if __name__ == "__main__":
    # Hyperparameters of the custom-framework model.
    PAD_IDX = 0
    MAX_SEQ_LEN = 32
    N_HEADS = 2
    VOCAB_SIZE = 20
    D_MODEL = 48

    model = Net(D_MODEL, N_HEADS, VOCAB_SIZE, MAX_SEQ_LEN, PAD_IDX)
    # (128, 15) equals the shape of the `x` slice built in the data cell;
    # presumably `_build` expects the input shape - TODO confirm.
    model._build((128, 15))

    optimizer = AdamW(
        model.parameters(),
        lr=0.001,
        precision=(xp.float32, xp.float32),
    )

    # Full-batch training on the tensors created in the data cell.
    model.train(x_mine, y_mine, epochs=1000, optimizer=optimizer)


    
        

Epoch 0, Loss: 4.942246417381714
Epoch 1, Loss: 4.597368511060105
Epoch 2, Loss: 4.316350560006901
Epoch 3, Loss: 4.092902537022834
Epoch 4, Loss: 3.9088633955914367
Epoch 5, Loss: 3.7525124249007047
Epoch 6, Loss: 3.6169669054783187
Epoch 7, Loss: 3.4976377500777565
Epoch 8, Loss: 3.3910307819509677
Epoch 9, Loss: 3.2951318378147656
Epoch 10, Loss: 3.209537837078543
Epoch 11, Loss: 3.1345608690868167
Epoch 12, Loss: 3.0700026870043375
Epoch 13, Loss: 3.0144319330515525
Epoch 14, Loss: 2.965474099527802
Epoch 15, Loss: 2.920931483682663
Epoch 16, Loss: 2.879727777553396
Epoch 17, Loss: 2.84185556167119
Epoch 18, Loss: 2.807610832358027
Epoch 19, Loss: 2.7769319213105748
Epoch 20, Loss: 2.7492484621049766
Epoch 21, Loss: 2.7237519859220654
Epoch 22, Loss: 2.6997845098523197
Epoch 23, Loss: 2.6770407309302953
Epoch 24, Loss: 2.655556546957777
Epoch 25, Loss: 2.6356526048559252
Epoch 26, Loss: 2.6177603391206987
Epoch 27, Loss: 2.601982577841769
Epoch 28, Loss: 2.587749287047771
Epoch 29,

KeyboardInterrupt: 

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class SinusoidalPositionalEncoding(nn.Module):
    """Fixed sine/cosine positional table stored as a module buffer.

    Even feature columns hold ``sin(position * div_term)`` and odd feature
    columns hold ``cos(position * div_term)``, where
    ``div_term = exp(arange(0, d_model, 2) * (-ln(10000) / d_model))``.

    Args:
        d_model: Number of feature columns in the table. Odd values are
            supported: the cosine half then has one column fewer than the
            sine half.
        max_len: Number of positions (rows) in the table.
    """

    def __init__(self, d_model, max_len):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there are only d_model // 2 odd columns, so the last
        # frequency is dropped here; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # A buffer (not a parameter): it is part of the module state but is
        # not returned by parameters().
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the first ``x.size(1)`` rows of the table with a leading axis of size 1.

        Only the size of ``x`` along dimension 1 is read; its values are not used.
        """
        seq_len = x.size(1)
        return self.pe[:seq_len].unsqueeze(0)

class Net(nn.Module):
    def __init__(self, d_model, n_heads, vocab_size, max_seq_len, num_layers=1, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.pe = SinusoidalPositionalEncoding(d_model, max_seq_len)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=n_heads, 
            dim_feedforward=d_model * 4, 
            batch_first=True,
            bias=False,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.project = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, idx):
        x = self.embedding(idx) + self.pe(idx)
        padding_mask = (idx == 0)
        x = self.encoder(x, src_key_padding_mask=padding_mask)
        logits = self.project(x)
        return logits

    def train_model(self, x, y, epochs, optimizer, criterion):
        for epoch in range(epochs):
            optimizer.zero_grad()
            logits = self.forward(x)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()
            if epoch % 1 == 0:
                print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# --- Config ---
# Shape of the random benchmark batch: B sequences of T tokens each.
B, T = 128, 15
# Model hyperparameters.
D_MODEL = 48
N_HEADS = 2
NUM_LAYERS = 2  # number of stacked encoder layers; adjust freely
MAX_SEQ_LEN = 32
VOCAB_SIZE = 20
PAD_IDX = 0

# --- Model Training ---
model = Net(
    d_model=D_MODEL,
    n_heads=N_HEADS,
    vocab_size=VOCAB_SIZE,
    max_seq_len=MAX_SEQ_LEN,
    num_layers=NUM_LAYERS,
    pad_idx=PAD_IDX,
)
# Targets equal to PAD_IDX are skipped by the loss (ignore_index).
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Inputs are drawn from [1, VOCAB_SIZE) and targets independently from
# [0, VOCAB_SIZE); the upper bound of torch.randint is exclusive.
x = torch.randint(low=1, high=VOCAB_SIZE, size=(B, T))
y = torch.randint(low=0, high=VOCAB_SIZE, size=(B, T))

model.train_model(x, y, epochs=1000, optimizer=optimizer, criterion=criterion)




Epoch 0, Loss: 3.1529
Epoch 1, Loss: 3.0984
Epoch 2, Loss: 3.0595
Epoch 3, Loss: 3.0340
Epoch 4, Loss: 3.0041
Epoch 5, Loss: 2.9938
Epoch 6, Loss: 2.9768
Epoch 7, Loss: 2.9614
Epoch 8, Loss: 2.9550
Epoch 9, Loss: 2.9471
Epoch 10, Loss: 2.9372
Epoch 11, Loss: 2.9307
Epoch 12, Loss: 2.9292
Epoch 13, Loss: 2.9238
Epoch 14, Loss: 2.9240
Epoch 15, Loss: 2.9193
Epoch 16, Loss: 2.9165
Epoch 17, Loss: 2.9145
Epoch 18, Loss: 2.9127
Epoch 19, Loss: 2.9053
Epoch 20, Loss: 2.9057
Epoch 21, Loss: 2.8959
Epoch 22, Loss: 2.8958
Epoch 23, Loss: 2.8890
Epoch 24, Loss: 2.8915
Epoch 25, Loss: 2.8841
Epoch 26, Loss: 2.8799
Epoch 27, Loss: 2.8781
Epoch 28, Loss: 2.8695
Epoch 29, Loss: 2.8734
Epoch 30, Loss: 2.8602
Epoch 31, Loss: 2.8602
Epoch 32, Loss: 2.8567
Epoch 33, Loss: 2.8524
Epoch 34, Loss: 2.8471
Epoch 35, Loss: 2.8456
Epoch 36, Loss: 2.8325
Epoch 37, Loss: 2.8338
Epoch 38, Loss: 2.8332
Epoch 39, Loss: 2.8240
Epoch 40, Loss: 2.8180
Epoch 41, Loss: 2.8080
Epoch 42, Loss: 2.8065
Epoch 43, Loss: 2.802

KeyboardInterrupt: 