In [1]:
import torch
import numpy as np
import torch.nn as nn

In [2]:
## parameters to define first

# X is an input matrix, having shape (seq_len, dmodel)
# Wq is query trainable, having shape (dmodel, dk)
# Wk is key trainable, having shape (dmodel, dk)
# Wv is value trainable, having shape (dmodel, dk), dv = dk
# Wo trainable, having shape (dk*h, dmodel)
# Q = X @ Wq, shape (seq_len, dk)
# K = X @ Wk, shape (seq_len, dk)
# V  = X @ Wv, shape (seq_len, dk), dk= dv
# A = (Q @ K.t()) / sqrt(dk): scaled attention scores, shape (seq_len, seq_len)
# Masking -- causal mask of shape (seq_len, seq_len): scores above the diagonal (future positions) are set to -inf
# then  Softmax(A), shape (seq_len, seq_len)
# second_last => Softmax(A) @ V, shape (seq_len, dk)
# lastly,  (Softmax(A) @ V) @ Wo, shape (seq_len, dmodel)

In [3]:
X = torch.tensor([
    [0.72,0.45,0.31], # Dream
    [0.75,0.20,0.55], # big
    [0.30,0.80,0.40], # and
    [0.85,0.35,0.60], # work
    [0.55,0.15,0.75], # for
    [0.25,0.20,0.85] # it
])

In [4]:
## implementation of Scaled Dot Product Attention using Class





class CausalAttentionSingleHead(nn.Module):
    """Single-head causal (masked) scaled dot-product self-attention.

    Args:
        dk: Width of the query/key/value projections.
        dmodel: Width of the input features (last dimension of ``X``).
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, dk, dmodel, dropout):
        super(CausalAttentionSingleHead, self).__init__()
        self.dk = dk
        self.dmodel = dmodel
        # Projections are created in the original order (query, value, key) so
        # that seeded initialisation stays reproducible.
        self.weight_query = nn.Linear(self.dmodel, self.dk)
        self.weight_value = nn.Linear(self.dmodel, self.dk)
        self.weight_key = nn.Linear(self.dmodel, self.dk)
        # Softmax over the key axis, so every query row sums to 1.
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        """Apply causal self-attention.

        Args:
            X: Tensor of shape ``(seq_len, dmodel)`` or, with leading batch
                dimensions, ``(..., seq_len, dmodel)``.

        Returns:
            A tuple ``(context, A)`` where ``context`` has shape
            ``(..., seq_len, dk)`` and ``A`` holds the attention weights after
            dropout, shape ``(..., seq_len, seq_len)``.
        """
        Q = self.weight_query(X)
        K = self.weight_key(X)
        V = self.weight_value(X)

        # transpose(-2, -1) equals .t() for 2-D input and also works for batches.
        attn_score = (Q @ K.transpose(-2, -1)) / (self.dk ** 0.5)

        # Build the mask on X's device so the module also works on GPU input.
        seq_len = X.shape[-2]
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=X.device), diagonal=1
        ).bool()
        # Future positions get -inf so softmax assigns them zero weight.
        masked_attn_score = attn_score.masked_fill(mask, -torch.inf)

        A = self.dropout(self.softmax(masked_attn_score))
        return A @ V, A
    





In [5]:
obj = CausalAttentionSingleHead(dk = 2, dropout=0.2, dmodel= X.shape[1])

In [6]:
obj.forward(X)

(tensor([[0.2055, 1.1526],
         [0.2465, 1.2458],
         [0.2133, 1.1259],
         [0.1262, 0.6373],
         [0.1801, 0.9241],
         [0.1382, 0.6207]], grad_fn=<MmBackward0>),
 tensor([[1.2500, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6336, 0.6164, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4398, 0.4349, 0.3753, 0.0000, 0.0000, 0.0000],
         [0.3227, 0.3166, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2613, 0.2496, 0.2490, 0.0000, 0.2349, 0.0000],
         [0.2226, 0.0000, 0.0000, 0.2145, 0.0000, 0.1886]],
        grad_fn=<MulBackward0>))

In [7]:
import torch
X = torch.tensor([
    [0.72,0.45,0.31], # Dream
    [0.75,0.20,0.55], # big
    [0.30,0.80,0.40], # and
    [0.85,0.35,0.60], # work
    [0.55,0.15,0.75], # for
    [0.25,0.20,0.85] # it
])

In [8]:
from model.attention import CausalAttentionSingleHead

In [9]:
causal_attn = CausalAttentionSingleHead(dk = 512, dmodel = 3, dropout=0.2)

In [10]:
causal_attn.forward(X)

tensor([[-0.2411, -0.9794,  1.2128,  ...,  0.5255, -0.1772, -0.1950],
        [-0.1152, -0.4679,  0.5794,  ...,  0.2511, -0.0847, -0.0931],
        [-0.3026, -0.9404,  1.1810,  ...,  0.6016, -0.1390, -0.2028],
        [-0.1690, -0.4631,  0.6101,  ...,  0.3045, -0.0395, -0.1343],
        [-0.2308, -1.0082,  1.1772,  ...,  0.6957, -0.0901, -0.1754],
        [-0.2192, -0.7978,  0.9430,  ...,  0.5812, -0.0740, -0.1420]],
       grad_fn=<MmBackward0>)

## Masked MultiHead Attention Implementation 

In [11]:
import torch
import torch.nn as nn
import numpy as np
import math


In [12]:
class MultiHeadAttention(nn.Module):
    """Causal (masked) multi-head self-attention.

    Args:
        h: Number of attention heads; must divide ``dmodel``.
        dmodel: Model width (input and output feature size).
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``dmodel`` is not divisible by ``h``.
    """

    def __init__(self, h, dmodel, dropout):
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.dmodel = dmodel
        assert self.dmodel % self.h == 0, "head and dmodel configuration failed"
        self.dk = self.dmodel // self.h
        self.weight_query = nn.Linear(dmodel, dmodel)
        self.weight_key = nn.Linear(dmodel, dmodel)
        self.weight_value = nn.Linear(dmodel, dmodel)
        self.w_o = nn.Linear(dmodel, dmodel)
        # Softmax over the key axis.
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, T):
        """Reshape ``(..., seq_len, dmodel)`` into ``(..., h, seq_len, dk)``."""
        return T.reshape(*T.shape[:-1], self.h, self.dk).transpose(-3, -2)

    def forward(self, X):
        """Apply causal multi-head self-attention.

        Args:
            X: Tensor of shape ``(seq_len, dmodel)`` or, with leading batch
                dimensions, ``(..., seq_len, dmodel)``.

        Returns:
            Tensor with the same shape as ``X``.
        """
        seq_len = X.shape[-2]
        Q_head = self._split_heads(self.weight_query(X))
        K_head = self._split_heads(self.weight_key(X))
        V_head = self._split_heads(self.weight_value(X))

        # Scaled dot-product scores per head: (..., h, seq_len, seq_len).
        attn_score = torch.matmul(Q_head, K_head.transpose(-1, -2)) / math.sqrt(self.dk)

        # Causal mask built on X's device; it broadcasts over heads and batch.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=X.device), diagonal=1
        ).bool()
        attn_score_masked = attn_score.masked_fill(mask, -torch.inf)

        attn_weights = self.dropout(self.softmax(attn_score_masked))
        A = torch.matmul(attn_weights, V_head)

        # Put the head axis next to dk and merge them back into dmodel.
        concate_heads = A.transpose(-3, -2).reshape(*X.shape[:-1], self.dmodel)
        return self.w_o(concate_heads)
        







In [13]:
MHA = MultiHeadAttention(h = 4, dmodel= 512, dropout=0.2)

In [14]:
sentences = 10
features = 512

X = torch.randn(sentences, features)


In [15]:
X.shape

torch.Size([10, 512])

In [16]:
MHA.forward(X).shape

torch.Size([10, 512])

## Transformer Block Complete


In [1]:
import torch
import torch.nn as nn
import numpy as np
import math

In [2]:
class TransformerBlock(nn.Module):
    """Pre-LayerNorm decoder block: causal multi-head self-attention plus a
    position-wise feed-forward network, each wrapped in a residual connection.

    Computes::

        h   = X + W_o(CausalMHA(LayerNorm1(X)))
        out = h + FFN(LayerNorm2(h))

    Args:
        heads: Number of attention heads; must divide ``d_model``.
        d_model: Model width.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, heads, d_model, dropout):
        super(TransformerBlock, self).__init__()
        self.heads = heads
        self.d_model = d_model
        assert self.d_model % self.heads == 0, "Choose Correct Head Number"
        self.dk = self.d_model // self.heads
        self.dropout = nn.Dropout(dropout)
        # Softmax over the key axis.
        self.softmax = nn.Softmax(dim=-1)
        # Attention projections.
        self.weight_query = nn.Linear(self.d_model, self.d_model)
        self.weight_key = nn.Linear(self.d_model, self.d_model)
        self.weight_value = nn.Linear(self.d_model, self.d_model)
        self.w_o = nn.Linear(self.d_model, self.d_model)
        # Feed-forward network with a 4x hidden expansion.
        self.ffn1 = nn.Linear(self.d_model, self.d_model * 4)
        self.ffn2 = nn.Linear(self.d_model * 4, self.d_model)
        self.layer_norm_1 = nn.LayerNorm(self.d_model)
        self.layer_norm_2 = nn.LayerNorm(self.d_model)
        self.relu = nn.ReLU()

    def _split_heads(self, T, batch_size, seq_len):
        """Reshape ``(B, seq_len, d_model)`` into ``(B, heads, seq_len, dk)``."""
        return T.reshape(batch_size, seq_len, self.heads, self.dk).permute(0, 2, 1, 3)

    def forward(self, X):
        """Run the block.

        Args:
            X: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        batch_size, seq_len = X.shape[0], X.shape[1]

        # --- attention sub-layer (pre-norm) ---
        layer_norm1_output = self.layer_norm_1(X)

        # Each projection maps (B, seq_len, d_model) -> (B, seq_len, d_model),
        # which is then split into heads.
        Q_heads = self._split_heads(self.weight_query(layer_norm1_output), batch_size, seq_len)
        K_heads = self._split_heads(self.weight_key(layer_norm1_output), batch_size, seq_len)
        V_heads = self._split_heads(self.weight_value(layer_norm1_output), batch_size, seq_len)

        # Scaled dot-product scores: (B, heads, seq_len, seq_len).
        attn_score = torch.matmul(Q_heads, K_heads.transpose(-1, -2)) / math.sqrt(self.dk)

        # Causal mask built on X's device so the block also works on GPU input.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=X.device), diagonal=1
        ).bool()
        attn_score_with_mask = attn_score.masked_fill(mask, -torch.inf)
        attn_wgts_mask_drpt = self.dropout(self.softmax(attn_score_with_mask))

        A = torch.matmul(attn_wgts_mask_drpt, V_heads)
        # Merge heads back: (B, heads, seq_len, dk) -> (B, seq_len, d_model).
        concat_A = A.permute(0, 2, 1, 3).reshape(batch_size, seq_len, self.d_model)
        output_masked_attn = self.w_o(concat_A)

        # Residual connection 1.
        residual_connection_1 = X + output_masked_attn

        # --- feed-forward sub-layer (pre-norm) ---
        layer_norm_2_output = self.layer_norm_2(residual_connection_1)
        ffn_output = self.ffn2(self.relu(self.ffn1(layer_norm_2_output)))

        # Residual connection 2: add to the residual stream, not to its
        # normalised copy, so the block keeps an identity path from X to output.
        return residual_connection_1 + ffn_output




        



In [3]:
block = TransformerBlock(heads = 8, d_model=512, dropout=0.2)

In [4]:
X = torch.randn(32, 20, 512)

In [5]:
block.forward(X)

tensor([[[ 7.5563e-02,  1.4333e+00, -1.6501e+00,  ...,  1.3692e+00,
           1.7790e+00,  1.5419e-01],
         [-1.2688e+00,  1.3711e+00, -1.1390e+00,  ..., -6.4300e-01,
          -8.2388e-01,  3.6798e-01],
         [-1.3032e+00, -1.1401e-02,  2.5159e-01,  ...,  1.4954e+00,
           7.1369e-01,  9.5313e-03],
         ...,
         [ 1.8574e-01,  1.4225e+00,  1.0207e+00,  ...,  1.0022e+00,
          -5.7860e-02, -3.5444e-02],
         [-2.4432e+00, -1.0321e+00,  8.7815e-01,  ...,  1.4494e+00,
           3.3746e-01,  6.7837e-01],
         [-4.7686e-03, -1.8188e-01,  9.3257e-02,  ..., -6.7405e-01,
          -4.3429e-03, -1.0248e+00]],

        [[ 4.3783e-01,  2.6260e-01,  5.4969e-01,  ..., -9.9060e-01,
          -6.9231e-01,  7.0725e-01],
         [-3.2725e-01, -1.5383e+00,  3.0135e-01,  ...,  8.6273e-01,
           4.0771e-02, -2.9588e+00],
         [ 9.5258e-01,  8.3034e-01,  1.3435e+00,  ...,  7.9139e-01,
           1.7850e+00,  7.1305e-01],
         ...,
         [-5.2161e-02,  9

In [None]:
## class for the point-wise feed-forward neural network

import torch
import torch.nn as nn
import numpy as np


class PointWiseFeedForward(nn.Module):
    """Implements Point Wise Feedforward Neural Network.

    Computes ``dropout(w_2(relu(w_1(X))))`` independently for every position.

    Args:
        d_model: Input and output feature size.
        dffn: Hidden width of the feed-forward layer.
        dropout: Dropout probability applied to the output.
    """

    def __init__(self, d_model, dffn, dropout):
        super(PointWiseFeedForward, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Expansion and projection layers.
        self.w_1 = nn.Linear(d_model, dffn)
        self.w_2 = nn.Linear(dffn, d_model)

    # forward must be a method of the class (not nested in __init__) so that
    # nn.Module.__call__ can dispatch to it.
    def forward(self, X):
        """Apply the feed-forward network to ``X`` of shape ``(..., d_model)``.

        Returns:
            Tensor with the same shape as ``X``.
        """
        # torch.relu applies the activation; nn.ReLU(tensor) would build a module.
        return self.dropout(self.w_2(torch.relu(self.w_1(X))))

## Positional Encoding Implementation -  sin, cos

In [4]:
import math
import torch

# Embedding width used by the positional-encoding helpers below.
d_model = 512


def sin(pos, i):
    """Sine term of the sinusoidal encoding for position `pos`, pair index `i`."""
    exponent = 2 * i / d_model
    return math.sin(pos / 10000 ** exponent)


def cos(pos, i):
    """Cosine term of the sinusoidal encoding for position `pos`, pair index `i`."""
    exponent = 2 * i / d_model
    return math.cos(pos / 10000 ** exponent)



In [5]:
seq_len = 20
random_embeddings = torch.randn(seq_len, d_model)

In [7]:
# Build the positional-encoding table: one row per position, with the
# sin/cos pair for each index i interleaved as [sin_0, cos_0, sin_1, cos_1, ...].
pe_rows = [
    [term(pos, i) for i in range(d_model // 2) for term in (sin, cos)]
    for pos in range(seq_len)
]

postional_encoding_vector = torch.tensor(pe_rows)

postional_encoding_vector

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00],
        ...,
        [-9.6140e-01, -2.7516e-01, -6.3753e-01,  ...,  1.0000e+00,
          1.7623e-03,  1.0000e+00],
        [-7.5099e-01,  6.6032e-01, -9.9638e-01,  ...,  1.0000e+00,
          1.8659e-03,  1.0000e+00],
        [ 1.4988e-01,  9.8870e-01, -4.9773e-01,  ...,  1.0000e+00,
          1.9696e-03,  1.0000e+00]])

In [9]:
# Flat list of [sin, cos] pairs. The pair index is the outer loop and the
# position the inner one, so this has d_model//2 * seq_len entries of length 2
# rather than a (seq_len, d_model) table.
positional = [
    [sin(position, pair_idx), cos(position, pair_idx)]
    for pair_idx in range(d_model // 2)
    for position in range(seq_len)
]

In [12]:
torch.tensor(positional).shape

torch.Size([5120, 2])

In [None]:
## implement positional encoding using torch

import math

class PositionalEncoding:
    """Builds a sinusoidal positional-encoding table.

    Args:
        seq_len: Number of positions (rows) to generate.
        d_model: Embedding width; each pair index ``i`` in ``range(d_model // 2)``
            contributes a sine and a cosine column.
    """

    def __init__(self, seq_len, d_model):
        super(PositionalEncoding, self).__init__()
        self.seq_len = seq_len
        self.d_model = d_model

    def sin(self, pos, i):
        """Sine term for position ``pos`` and pair index ``i``."""
        return math.sin(pos/10000**(2*i/self.d_model))

    def cos(self, pos, i):
        """Cosine term for position ``pos`` and pair index ``i``."""
        return math.cos(pos/10000**(2*i/self.d_model))

    def calculate_pe(self):
        """Return the encoding table as a tensor.

        Returns:
            Tensor of shape ``(seq_len, 2 * (d_model // 2))`` whose rows are
            ``[sin_0, cos_0, sin_1, cos_1, ...]`` for each position.
        """
        positional_encoding_vector = []
        for pos in range(self.seq_len):
            temp = []
            for i in range(self.d_model // 2):
                # Use the instance helpers so self.d_model is honoured; the
                # bare sin/cos names are not defined by this class.
                temp.append(self.sin(pos, i))
                temp.append(self.cos(pos, i))
            positional_encoding_vector.append(temp)
        # The explicit reshape keeps the result 2-D even when seq_len is 0.
        return torch.tensor(positional_encoding_vector).reshape(
            self.seq_len, 2 * (self.d_model // 2)
        )
        
                
    


In [3]:
from model.positional_encoding import PositionalEncoding

pe = PositionalEncoding(seq_len=1024, d_model=512)

## Decoder full Implementation

In [1]:
## class for the full transformer decoder (stack of transformer blocks)

import torch
import torch.nn as nn
import numpy as np
import math
from model.positional_encoding import PositionalEncoding
from model.transformer_block import TransformerBlock


In [2]:
class TransformerDecoder(nn.Module):
    """Stack of transformer blocks followed by LayerNorm and a vocabulary projection.

    Args:
        d_model: Model width.
        h: Number of attention heads passed to every ``TransformerBlock``.
        dropout: Dropout probability passed to every ``TransformerBlock``.
        blocks: Number of transformer blocks to stack.
        vocab_size: Output size of the final linear layer.
    """

    def __init__(self, d_model, h, dropout, blocks, vocab_size):
        super(TransformerDecoder, self).__init__()
        self.d_model = d_model
        self.h = h
        self.vocab_size = vocab_size
        self.layernorm = nn.LayerNorm(self.d_model)
        self.linear = nn.Linear(self.d_model, self.vocab_size)
        self.dropout = dropout
        self.blocks = blocks
        # nn.ModuleList registers every block's parameters with this module.
        self.blocks_list = nn.ModuleList(
            [
                TransformerBlock(heads=self.h, d_model=d_model, dropout=self.dropout)
                for _ in range(self.blocks)
            ]
        )
        # Positional encoding helper (imported from model.positional_encoding).
        self.positional_encoding = PositionalEncoding(d_model=self.d_model)

    # forward, assuming X (B, Seq_len, d_model)
    def forward(self, X):
        """Run the decoder.

        Args:
            X: Input tensor, assumed to be ``(B, seq_len, d_model)``.

        Returns:
            Output of the final linear layer, last dimension ``vocab_size``.
        """
        # NOTE(review): assumes calculate_pe(X) returns a tensor broadcastable
        # to X -- confirm against model.positional_encoding.
        pe = self.positional_encoding.calculate_pe(X)
        # Move the encoding to X's device so the addition also works on GPU.
        # X + pe already allocates a new tensor, so no extra clone is needed.
        x = X + pe.to(X.device)

        # Pass the hidden state through every transformer block in order.
        for block in self.blocks_list:
            x = block(x)

        # Final layer norm, then projection to vocabulary size.
        return self.linear(self.layernorm(x))






In [3]:
decoder = TransformerDecoder(d_model= 512, h = 8, dropout=0.1, vocab_size=50247, blocks = 6)

In [4]:
X = torch.randn(32, 20, 512)

In [6]:
decoder.forward(X).shape

torch.Size([32, 20, 50247])

In [1]:
from model.decoder import TransformerDecoder

In [2]:
decdr = TransformerDecoder(d_model=512, h = 8, dropout=0.2, blocks=6,vocab_size=50247)

In [4]:
import torch
X = torch.randn(64, 121, 512)

In [6]:
decdr.forward(X).shape

torch.Size([64, 121, 50247])

## Embeddings

In [7]:
import torch
import torch.nn as nn

In [9]:
vocab_size = 50247
d_model = 512

embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

In [14]:
x = torch.LongTensor([[1, 2, 4], [0, 9, 3]])

In [16]:
embedding_layer(x).shape

torch.Size([2, 3, 512])

## Tokenizer

In [8]:
from utils.tokenizer import CharTokenizer

In [9]:
character_tokenizer = CharTokenizer(file = r"C:\Users\amanm\Desktop\learning\transformer_from_scratch\data\tiny-shakespeare.txt")

In [3]:
encoded_list = character_tokenizer.encode(file_path= "testing.txt")[0:5]

In [4]:
encoded_list

[25, 63, 1, 52, 39]

In [5]:
decode = character_tokenizer.decode(encoded_list)

In [6]:
decode

'My na'

## Dataset Class 

In [1]:
from training.dataset import CustomTextData
from utils.tokenizer import CharTokenizer

In [2]:
## reading the entire file

# An explicit encoding keeps the decoded text identical across platforms
# (the default encoding of open() depends on the OS locale).
with open(r"C:\Users\amanm\Desktop\learning\transformer_from_scratch\data\tiny-shakespeare.txt", encoding="utf-8") as f:
    full_text = f.read()

In [3]:
N = len(full_text)

In [4]:
train_text = full_text[:int((0.9)*N)]
val_text = full_text[int((0.9)*N):]

In [5]:
# preparing tokenizer
tokenizer = CharTokenizer(train_text)

In [6]:
train_custom = CustomTextData(text=train_text, tokenizer=tokenizer, seq_len=20)

In [7]:
val_custom = CustomTextData(text=val_text, tokenizer=tokenizer, seq_len=20)

In [8]:
## Dataloader
from torch.utils.data import DataLoader

train_loader = DataLoader(train_custom, batch_size=4, shuffle = True)
val_loader = DataLoader(val_custom, batch_size=4, shuffle = False)


In [9]:
print(f"Training Set has data instances of {len(train_custom)}")
print(f"Testing Set has data instances of {len(val_custom)}")

Training Set has data instances of 1003834
Testing Set has data instances of 111520


## Training 

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# --------------------------------
# 1. Device
# --------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# --------------------------------
# 2. Hyperparameters
# --------------------------------
epochs = 10
learning_rate = 1e-3
d_model = 512

# --------------------------------
# 3. Vocabulary size
# --------------------------------
vocab_size = len(tokenizer.stoi)

# --------------------------------
# 4. Model
# --------------------------------
from model.decoder import TransformerDecoder

model = TransformerDecoder(
    d_model=d_model,
    h=8,
    dropout=0.1,
    blocks=6,
    vocab_size=vocab_size
).to(device)

# --------------------------------
# 5. Loss & Optimizer
# --------------------------------
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate
)

# --------------------------------
# 6. Training loop
# --------------------------------
model.train()

for epoch in range(epochs):

    total_loss = 0.0
    num_batches = 0

    for x, y in train_loader:

        # move data to device
        x = x.to(device)   # (B, T)
        y = y.to(device)   # (B, T)

        # zero gradients
        optimizer.zero_grad()

        # forward pass: call the module itself (not .forward) so that
        # nn.Module.__call__ runs any registered hooks
        logits = model(x)  # (B, T, vocab_size)

        # compute loss over all positions; reshape (unlike view) also
        # accepts non-contiguous logits
        loss = loss_fn(
            logits.reshape(-1, vocab_size),
            y.reshape(-1)
        )

        # backward pass
        loss.backward()

        # update parameters
        optimizer.step()

        # bookkeeping
        total_loss += loss.item()
        num_batches += 1

    # guard against an empty loader, as the later training cell does
    avg_loss = total_loss / max(1, num_batches)

    print(f"Epoch [{epoch+1}/{epochs}] - Training Loss: {avg_loss:.4f}")


KeyboardInterrupt: 

In [3]:
from model.decoder import TransformerDecoder

model = TransformerDecoder(
    d_model=512,
    h=8,
    dropout=0.1,
    blocks=6,
    vocab_size=65
)

In [4]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)


In [5]:
trainable_params

18981953

## Training - Device Agnostic

In [8]:
import torch
import torch.nn as nn
from training.dataset import CustomTextData
from utils.tokenizer import CharTokenizer
from torch.utils.data import DataLoader
from model.decoder import TransformerDecoder



## reading the entire file

# An explicit encoding keeps the decoded text identical across platforms.
data_path = r"C:\Users\amanm\Desktop\learning\transformer_from_scratch\data\tiny-shakespeare.txt"
with open(data_path, encoding="utf-8") as f:
    full_text = f.read()


N = len(full_text)


## Splitting the text into train (first 90%) and validation (last 10%)

train_text = full_text[:int((0.9)*N)]
val_text = full_text[int((0.9)*N):]


# preparing tokenizer (built from the training text only)
tokenizer = CharTokenizer(train_text)

train_custom = CustomTextData(text=train_text, tokenizer=tokenizer, seq_len=20)
val_custom = CustomTextData(text=val_text, tokenizer=tokenizer, seq_len=20)


train_loader = DataLoader(train_custom, batch_size=2, shuffle = True)
val_loader = DataLoader(val_custom, batch_size=2, shuffle = False)


# --------------------------------
# 1. Device
# --------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device, flush=True)

# --------------------------------
# 2. Hyperparameters
# --------------------------------
epochs = 10
learning_rate = 1e-3
d_model = 512
# Print progress every `log_every` batches. 1 reproduces the per-batch log;
# raise it (e.g. 100) to avoid flooding the notebook output.
log_every = 1

# --------------------------------
# 3. Vocabulary size
# --------------------------------
vocab_size = len(tokenizer.stoi)

# --------------------------------
# 4. Model
# --------------------------------
model = TransformerDecoder(
    d_model=d_model,
    h=8,
    dropout=0.1,
    blocks=6,
    vocab_size=vocab_size
).to(device)

# Sanity check: confirm the parameters live on the selected device
print("Model device:", next(model.parameters()).device, flush=True)

# --------------------------------
# 5. Loss & Optimizer
# --------------------------------
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate
)

# --------------------------------
# 6. Training loop
# --------------------------------
import time

model.train()

for epoch in range(epochs):
    total_loss = 0.0
    num_batches = 0
    epoch_start = time.time()

    for x, y in train_loader:
        batch_start = time.time()

        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        optimizer.zero_grad()
        logits = model(x)

        # Flatten all positions for the loss; reshape (unlike view) also
        # accepts non-contiguous logits.
        loss = loss_fn(
            logits.reshape(-1, vocab_size),
            y.reshape(-1)
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Progress log (every batch while log_every == 1)
        if num_batches % log_every == 0 or num_batches == len(train_loader):
            print(
                f"[Epoch {epoch+1}] "
                f"Batch {num_batches}/{len(train_loader)} | "
                f"Loss {loss.item():.4f} | "
                f"Batch time {time.time() - batch_start:.2f}s | "
                f"Epoch time {time.time() - epoch_start:.1f}s",
                flush=True
            )

    # max(1, ...) guards against an empty loader
    avg_loss = total_loss / max(1, num_batches)
    print(
        f"Epoch [{epoch+1}/{epochs}] DONE | "
        f"Avg Loss {avg_loss:.4f} | "
        f"Total Epoch Time {time.time() - epoch_start:.1f}s",
        flush=True
    )



Using device: cpu
Model device: cpu
[Epoch 1] Batch 1/501917 | Loss 4.3881 | Batch time 0.23s | Epoch time 0.4s
[Epoch 1] Batch 2/501917 | Loss 3.8391 | Batch time 0.14s | Epoch time 0.5s
[Epoch 1] Batch 3/501917 | Loss 4.6828 | Batch time 0.15s | Epoch time 0.7s
[Epoch 1] Batch 4/501917 | Loss 4.3010 | Batch time 0.15s | Epoch time 0.8s
[Epoch 1] Batch 5/501917 | Loss 4.5986 | Batch time 0.15s | Epoch time 1.0s
[Epoch 1] Batch 6/501917 | Loss 3.6799 | Batch time 0.14s | Epoch time 1.1s
[Epoch 1] Batch 7/501917 | Loss 3.6179 | Batch time 0.14s | Epoch time 1.3s
[Epoch 1] Batch 8/501917 | Loss 3.9406 | Batch time 0.15s | Epoch time 1.4s
[Epoch 1] Batch 9/501917 | Loss 4.1029 | Batch time 0.14s | Epoch time 1.5s
[Epoch 1] Batch 10/501917 | Loss 3.8688 | Batch time 0.15s | Epoch time 1.7s
[Epoch 1] Batch 11/501917 | Loss 3.3396 | Batch time 0.16s | Epoch time 1.9s
[Epoch 1] Batch 12/501917 | Loss 4.2421 | Batch time 0.17s | Epoch time 2.0s
[Epoch 1] Batch 13/501917 | Loss 3.7271 | Batch t

KeyboardInterrupt: 