In [7]:
# clone the BLT repository
!git clone https://github.com/sathishkumar67/Byte-Latent-Transformer.git
# move the files to the current directory
!mv /kaggle/working/Byte-Latent-Transformer/* /kaggle/working/
# upgrade pip
!pip install --upgrade pip
# install latest version pytorch
# install the required packages
!pip install -r requirements.txt

Cloning into 'Byte-Latent-Transformer'...
remote: Enumerating objects: 169, done.[K
remote: Counting objects: 100% (169/169), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 169 (delta 96), reused 113 (delta 47), pack-reused 0 (from 0)[K
Receiving objects: 100% (169/169), 55.13 KiB | 5.51 MiB/s, done.
Resolving deltas: 100% (96/96), done.


In [8]:
# import os
# import shutil

# # Get current working directory
# current_dir = os.getcwd()

# for item in os.listdir(current_dir):
#     item_path = os.path.join(current_dir, item)
#     try:
#         if os.path.isfile(item_path) or os.path.islink(item_path):
#             os.remove(item_path)  # remove file or symlink
#         elif os.path.isdir(item_path):
#             shutil.rmtree(item_path)  # remove folder and its contents
#     except Exception as e:
#         print(f"Failed to delete {item_path}. Reason: {e}")

In [9]:
import torch
import torch.nn as nn
import numpy as np
import lightning as L
from lightning.pytorch import Trainer
from BLT.entropy import EntropyModel, EntropyConfig
from BLT.dataset import TokenDataset
from huggingface_hub import hf_hub_download

In [10]:
# download the tokenized text
hf_hub_download(repo_id="pt-sk/Text_Bytes_Tokens",
                filename="wikipedia_512_pretraining/tokenized_text1.npy",
                repo_type="dataset",
                local_dir="/kaggle/working/")

# load the tokenized text
tokens = np.load("/kaggle/working/wikipedia_512_pretraining/tokenized_text1.npy", allow_pickle=True)

wikipedia_512_pretraining/tokenized_text(…):   0%|          | 0.00/146M [00:00<?, ?B/s]

In [11]:
# Initialize model and config
config = EntropyConfig()
model = EntropyModel(config)

# count the number of parameters in the model
num_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters in the model: {num_params/1e6}M")

def configure_optimizer(model: nn.Module) -> torch.optim.Optimizer:
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in model.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for _, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for _, p in param_dict.items() if p.dim() < 2]

        # Create AdamW optimizer and use the fused version if available
        return torch.optim.AdamW([{'params': decay_params, 'weight_decay': 0.1},
                                {'params': nodecay_params, 'weight_decay': 0.0}],
                                lr=0.0001,
                                betas=(0.9, 0.999),
                                eps=1e-8,
                                fused=True)

optimizer = configure_optimizer(model)

Number of parameters in the model: 35.800064M


In [12]:
class GPTWrapper(L.LightningModule):
    def __init__(self, config: EntropyConfig, model: EntropyModel) -> None:
        super().__init__()
        self.config = config
        self.model = model
        self.optimizer = self.configure_optimizers()
    
    def training_step(self, batch, batch_idx):
        self.model.train()
        optimizer = self.optimizers()
        optimizer.zero_grad()
        
        inputs, targets = batch
        _, loss = self.model(inputs, targets)
        self.log("Train_Loss", loss, prog_bar=True)

        return loss
    
    def configure_optimizers(self):
        optimizer = configure_optimizer(self.model)
        return optimizer

In [None]:
dataset = TokenDataset(block_size=2048, input_ids=tokens)
dataloader = torch.utils.data.DataLoader(dataset, 
                                        batch_size=8, 
                                        shuffle=True,
                                        pin_memory=True,
                                        pinmemory_device='cuda',)

model_wrapper = GPTWrapper(config, model)

In [14]:
trainer = Trainer(max_epochs=1,
                  accelerator="cuda",
                  devices=1)

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs


In [15]:
trainer.fit(model_wrapper, dataloader)

2025-09-01 05:46:11.394096: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756705571.417240     204 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756705571.424231     204 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type         | Params | Mode 
-----------------------------------------------
0 | model | EntropyModel | 35.8 M | train
-----------------------------------------------
35.8 M    Trainable params
0         Non-trainable params
35.8 M    Total params
143.200   Total estimated model params size (MB)
118       Modules in train mode
0         Modules in eval mode
/usr/loca

Training: |          | 0/? [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 189.12 MiB is free. Process 10485 has 15.70 GiB memory in use. Of the allocated memory 15.16 GiB is allocated by PyTorch, and 256.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)