In [1]:
# clone the BLT repository
!git clone https://github.com/sathishkumar67/Byte-Latent-Transformer.git
# move the files to the current directory
!mv /kaggle/working/Byte-Latent-Transformer/* /kaggle/working/
# upgrade pip
!pip install --upgrade pip
# install latest version pytorch
# install the required packages
!pip install -r requirements.txt

Cloning into 'Byte-Latent-Transformer'...
remote: Enumerating objects: 154, done.[K
remote: Counting objects: 100% (154/154), done.[K
remote: Compressing objects: 100% (108/108), done.[K
remote: Total 154 (delta 86), reused 102 (delta 41), pack-reused 0 (from 0)[K
Receiving objects: 100% (154/154), 47.70 KiB | 5.96 MiB/s, done.
Resolving deltas: 100% (86/86), done.
Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting lightning==2.5.3 (from -r requirements.txt (line 1))
  Downloading lightning-2.5.3-py3-none-any.whl.metadata (39 kB)
Collecting nvidia-cuda-nvrtc-

In [2]:
# Disabled utility: when uncommented, this deletes EVERY file, symlink and folder
# in the current working directory (failures are printed, not raised).
# Keep it commented out so that "Run All" never wipes the cloned repository.
# import os
# import shutil

# # Get current working directory
# current_dir = os.getcwd()

# for item in os.listdir(current_dir):
#     item_path = os.path.join(current_dir, item)
#     try:
#         if os.path.isfile(item_path) or os.path.islink(item_path):
#             os.remove(item_path)  # remove file or symlink
#         elif os.path.isdir(item_path):
#             shutil.rmtree(item_path)  # remove folder and its contents
#     except Exception as e:
#         print(f"Failed to delete {item_path}. Reason: {e}")

In [3]:
import torch
import torch.nn as nn
import numpy as np
import lightning as L
from BLT.entropy import EntropyModel, EntropyConfig
from huggingface_hub import hf_hub_download

In [4]:
# Download the tokenized text. hf_hub_download returns the local path of the
# downloaded file, so keep it instead of hard-coding the location a second time.
tokens_path = hf_hub_download(repo_id="pt-sk/Text_Bytes_Tokens",
                              filename="wikipedia_512_pretraining/tokenized_text1.npy",
                              repo_type="dataset",
                              local_dir="/kaggle/working/")

# Load the tokenized text from the path that was actually written.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects
# from a downloaded file, which can execute code. If the file stores a plain
# numeric array, switch to allow_pickle=False -- TODO confirm the on-disk format.
tokens = np.load(tokens_path, allow_pickle=True)

wikipedia_512_pretraining/tokenized_text(…):   0%|          | 0.00/146M [00:00<?, ?B/s]

In [5]:
# Build the entropy model from its default configuration.
config = EntropyConfig()
model = EntropyModel(config)

# Tally every parameter element in the model and report the total in millions.
num_params = sum(map(torch.numel, model.parameters()))
print(f"Number of parameters in the model: {num_params/1e6}M")

def configure_optimizer(model: nn.Module,
                        lr: float = 0.0001,
                        weight_decay: float = 0.1,
                        betas: tuple[float, float] = (0.9, 0.999),
                        eps: float = 1e-8) -> torch.optim.Optimizer:
    """Create an AdamW optimizer with weight decay applied only to >=2-D tensors.

    Trainable parameters are split into two groups: tensors with two or more
    dimensions (matmul weights, embeddings) receive ``weight_decay``; tensors
    with fewer dimensions (biases, norm scales) receive none.

    Args:
        model: Module whose trainable parameters will be optimized.
        lr: Learning rate.
        weight_decay: Decay coefficient for the >=2-D parameter group.
        betas: AdamW running-average coefficients.
        eps: AdamW numerical-stability term.

    Returns:
        A ``torch.optim.AdamW`` instance. The fused implementation is used when
        the installed PyTorch build and the parameters' device support it;
        otherwise the default implementation is used.
    """
    # Only parameters that require gradients are handed to the optimizer.
    trainable = [p for p in model.parameters() if p.requires_grad]
    decay_params = [p for p in trainable if p.dim() >= 2]
    nodecay_params = [p for p in trainable if p.dim() < 2]

    def build_groups() -> list[dict]:
        # The optimizer constructor writes its defaults (including `fused`) into
        # the group dicts it receives, so every attempt needs fresh dicts.
        return [{'params': decay_params, 'weight_decay': weight_decay},
                {'params': nodecay_params, 'weight_decay': 0.0}]

    try:
        # Prefer the fused kernel; construction fails if it is unavailable.
        return torch.optim.AdamW(build_groups(), lr=lr, betas=betas, eps=eps, fused=True)
    except (RuntimeError, TypeError):
        # RuntimeError: parameters live on a device without fused support.
        # TypeError: this PyTorch version has no `fused` argument for AdamW.
        return torch.optim.AdamW(build_groups(), lr=lr, betas=betas, eps=eps)

# Build the AdamW optimizer for `model` with the helper defined in this cell.
optimizer = configure_optimizer(model)

Number of parameters in the model: 35.800064M


In [None]:
class GPTWrapper(L.LightningModule):
    """Lightning module that trains and validates an ``EntropyModel``.

    This class was previously commented out while a later cell still
    instantiates it, so a fresh-kernel run failed with ``NameError``.

    Compared with the commented-out draft:
      * the optimizer is no longer built eagerly in ``__init__``; Lightning
        calls ``configure_optimizers`` itself when fitting starts;
      * ``training_step`` no longer calls ``optimizer.zero_grad()`` and the
        steps no longer call ``train()`` / ``eval()``; under Lightning's
        automatic optimization the trainer handles gradient zeroing and
        train/eval mode switching.

    Args:
        config: Configuration object the model was built from (stored only).
        model: The model to optimize. Assumes ``model(inputs, targets)``
            returns a pair whose second element is the scalar loss, as the
            draft code unpacked it -- TODO confirm against ``BLT.entropy``.
    """

    def __init__(self, config: "EntropyConfig", model: "EntropyModel") -> None:
        super().__init__()
        self.config = config
        self.model = model

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        _, loss = self.model(inputs, targets)
        self.log("Train_Loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        _, loss = self.model(inputs, targets)
        self.log("Val_Loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return the optimizer built by the notebook's ``configure_optimizer`` helper."""
        return configure_optimizer(self.model)


In [26]:
# Wrap the config and model for training.
# NOTE(review): GPTWrapper must be defined (its class cell active and executed)
# before this cell runs, otherwise this line raises NameError.
model_wrapper = GPTWrapper(config, model)

In [28]:
# Random 1-D tensor of 2048 integer ids drawn from [0, 256).
input_tensor = torch.randint(low=0, high=256, size=(2048,))
# Independent draw with the same shape and range, used as targets.
targets = torch.randint(low=0, high=256, size=(2048,))

In [9]:
class TokenDataset(torch.utils.data.Dataset):
    def __init__(self, block_size: int, input_ids: list[int] | np.ndarray) -> None:
        self.block_size = block_size
        self.input_ids = input_ids

    def __len__(self) -> int:
        return (len(self.input_ids) - 1) // self.block_size

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        start_idx = idx * self.block_size
        end_idx = start_idx + self.block_size
        return torch.tensor(self.input_ids[start_idx:end_idx], dtype=torch.long), torch.tensor(self.input_ids[start_idx+1:end_idx+1], dtype=torch.long)

In [10]:
# Split the loaded token array into 2048-token windows for training.
dataset = TokenDataset(block_size=2048, input_ids=tokens)