In [30]:
# clone the BLT repository
!git clone https://github.com/sathishkumar67/Byte-Latent-Transformer.git
# move the files to the current directory
!mv /kaggle/working/Byte-Latent-Transformer/* /kaggle/working/
# upgrade pip
!pip install --upgrade pip
# install latest version pytorch
!pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
# install the required packages
!pip install -r requirements.txt

Cloning into 'Byte-Latent-Transformer'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 177 (delta 101), reused 118 (delta 49), pack-reused 0 (from 0)[K
Receiving objects: 100% (177/177), 66.33 KiB | 5.10 MiB/s, done.
Resolving deltas: 100% (101/101), done.
Looking in indexes: https://download.pytorch.org/whl/cu126


In [31]:
import os
import torch
import torch.nn as nn
import numpy as np
import lightning as L
from lightning.pytorch import Trainer
from BLT.entropy import EntropyModel, EntropyConfig
from BLT.dataset import TokenDataset
from BLT.utils import clear_directory
from huggingface_hub import hf_hub_download

In [32]:
# clear_directory(os.getcwd())

In [33]:
# Download the model checkpoint into the working directory.
# hf_hub_download returns the local path of the fetched file; keep it instead
# of discarding it so later code does not have to re-type the location.
checkpoint_path = hf_hub_download(repo_id="pt-sk/BLT_Entropy_Checkpoints",
                                  filename="entropy_ckpt_1.ckpt",
                                  repo_type="model",
                                  local_dir="/kaggle/working/")

# Download the tokenized text into the working directory.
tokens_path = hf_hub_download(repo_id="pt-sk/Text_Bytes_Tokens",
                              filename="wikipedia_512_pretraining/tokenized_text2.npy",
                              repo_type="dataset",
                              local_dir="/kaggle/working/")

# Load the tokenized text from the path reported by the download call.
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects from a file fetched over the network. If the file holds a plain
# numeric array, allow_pickle=False is the safer choice - TODO confirm the
# array's dtype before tightening this.
tokens = np.load(tokens_path, allow_pickle=True)

entropy_ckpt_1.ckpt:   0%|          | 0.00/438M [00:00<?, ?B/s]

wikipedia_512_pretraining/tokenized_text(…):   0%|          | 0.00/146M [00:00<?, ?B/s]

In [34]:
# Build the entropy model from a default-constructed configuration
config = EntropyConfig()
model = EntropyModel(config)

# Tally the element count of every parameter tensor and report it in millions
num_params = 0
for parameter in model.parameters():
    num_params += parameter.numel()
print(f"Number of parameters in the model: {num_params/1e6}M")

def configure_optimizer(model: nn.Module) -> torch.optim.Optimizer:
    """Build the AdamW optimizer used to train ``model``.

    Trainable parameters are split into two groups: tensors with two or more
    dimensions receive weight decay 0.1, tensors with fewer than two
    dimensions receive none. Parameters with ``requires_grad=False`` are
    excluded.

    The fused AdamW implementation is requested first. If the installed
    PyTorch build or the parameters' device rejects ``fused=True``, the
    standard implementation is used instead with the same hyper-parameters.

    Args:
        model: Module whose trainable parameters will be optimized.

    Returns:
        A ``torch.optim.AdamW`` instance with two parameter groups
        (decayed parameters first, non-decayed parameters second).
    """
    # Only parameters that require gradients are candidates for optimization.
    trainable = [p for p in model.parameters() if p.requires_grad]
    decay_params = [p for p in trainable if p.dim() >= 2]
    nodecay_params = [p for p in trainable if p.dim() < 2]

    def make_param_groups():
        # Fresh dicts per attempt: the optimizer constructor writes its
        # defaults (including `fused`) into the group dicts it is given, so a
        # failed fused attempt must not leak settings into the fallback.
        return [{'params': decay_params, 'weight_decay': 0.1},
                {'params': nodecay_params, 'weight_decay': 0.0}]

    hyperparams = dict(lr=0.0001, betas=(0.9, 0.999), eps=1e-8)

    # Create AdamW optimizer and use the fused version if available.
    try:
        return torch.optim.AdamW(make_param_groups(), fused=True, **hyperparams)
    except (RuntimeError, TypeError):
        # RuntimeError: fused kernels unsupported for these parameters/devices.
        # TypeError: this PyTorch version has no `fused` argument.
        return torch.optim.AdamW(make_param_groups(), **hyperparams)

# Build a module-level optimizer over `model`'s trainable parameters.
# NOTE(review): in the visible cells this `optimizer` is never handed to the
# Trainer; EntropyWrapper.configure_optimizers() calls configure_optimizer()
# on its own. Confirm whether this instance is still needed.
optimizer = configure_optimizer(model)

Number of parameters in the model: 35.800064M


In [35]:
# EntropyWrapper: PyTorch Lightning wrapper for the EntropyModel
class EntropyWrapper(L.LightningModule):
    """LightningModule that trains an ``EntropyModel`` with automatic optimization.

    The wrapper only computes and logs the training loss. Zeroing gradients,
    the backward pass, the optimizer step and switching to train mode are
    left to the Lightning Trainer.

    Args:
        config: Configuration object associated with ``model``.
        model: The entropy model to train. It is called as
            ``model(inputs, targets)`` and must return a pair whose second
            element is the loss.
    """

    def __init__(self, config: EntropyConfig, model: EntropyModel) -> None:
        super().__init__()
        self.config = config
        self.model = model
        # No optimizer is created here: the Trainer calls
        # configure_optimizers() itself, so an optimizer built in __init__
        # would be a second, never-stepped instance.

    def training_step(self, batch, batch_idx):
        """Run one forward pass and return the loss for Lightning to back-propagate.

        Args:
            batch: Pair ``(inputs, targets)`` produced by the dataloader.
            batch_idx: Index of the batch within the epoch (unused).

        Returns:
            The loss returned by the wrapped model.
        """
        # Gradients are deliberately not zeroed here: under automatic
        # optimization Lightning does that, and a manual zero_grad() would
        # discard gradients accumulated via accumulate_grad_batches.
        inputs, targets = batch
        _, loss = self.model(inputs, targets)
        self.log("Train_Loss", loss, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Return the optimizer for the wrapped model (see ``configure_optimizer``)."""
        return configure_optimizer(self.model)

In [38]:
# Create dataset and dataloader
dataset = TokenDataset(block_size=2048, input_ids=tokens)

# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to loading in the main process (0 workers) in that case.
num_workers = os.cpu_count() or 0

dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    pin_memory=True,
    pin_memory_device='cuda',
    num_workers=num_workers,
    # prefetch_factor may only be specified when worker processes are used.
    prefetch_factor=2 if num_workers > 0 else None,
)

# Initialize model wrapper from the downloaded checkpoint; `config` and
# `model` are forwarded to EntropyWrapper's constructor.
model_wrapper = EntropyWrapper.load_from_checkpoint("/kaggle/working/entropy_ckpt_1.ckpt", config=config, model=model)

In [39]:
# Set up a single-device CUDA trainer that runs for two epochs
trainer = Trainer(
    accelerator="cuda",
    devices=1,
    max_epochs=2,
)

# Launch training of the wrapped model on the training dataloader
trainer.fit(model=model_wrapper, train_dataloaders=dataloader)

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
2025-09-02 09:13:42.075591: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756804422.233682      76 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756804422.281345      76 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type         | Params | M

Training: |          | 0/? [00:00<?, ?it/s]

INFO: 
Detected KeyboardInterrupt, attempting graceful shutdown ...


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/call.py", line 49, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/trainer.py", line 599, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/trainer.py", line 1012, in _run
    results = self._run_stage()
              ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/trainer.py", line 1056, in _run_stage
    self.fit_loop.run()
  File "/usr/local/lib/python3.11/dist-packages/lightning/pytorch/loops/fit_loop.py", line 216, in run
    self.advance()
  File "/usr/local/lib/python3.11/dist-packages/lightning/pytorch/loops/fit_loop.py", line 455, in advance
    self.epoch_loop.run(self._data_fetcher)
  File "/usr/local/lib/python3.11

TypeError: object of type 'NoneType' has no len()