In [1]:
!pip install -U "huggingface_hub[cli]"
!pip install transformers datasets tqdm numpy torch
!pip install mambapy

from IPython.display import clear_output
clear_output(wait=False)

In [2]:
# Should get access to llama 3.2 1B
import os
os.environ['HF_TOKEN'] = 'ENTER YOUR TOKEN'
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `598Project` has been saved to /home/sbhushan/.cache/huggingface/stored_tokens
Your token has been saved to /home/sbhushan/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
import torch
def print_gpu_memory():
    """Print a per-device memory report for every visible CUDA GPU.

    For each device the report shows the total memory, the memory currently
    allocated by tensors, the memory reserved by PyTorch (labelled "Cached"),
    and "Free" memory computed as total minus allocated. All values are in GB
    (bytes / 1024**3). When CUDA is unavailable a single notice is printed
    instead.

    Returns:
        None. The report is written to stdout.
    """
    if not torch.cuda.is_available():
        print("No GPU available")
        return

    for i in range(torch.cuda.device_count()):
        total_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        allocated_memory = torch.cuda.memory_allocated(i) / 1024**3
        cached_memory = torch.cuda.memory_reserved(i) / 1024**3
        # "Free" here ignores the reserved-but-unallocated pool on purpose:
        # it is simply total minus what tensors currently occupy.
        free_memory = total_memory - allocated_memory

        report_lines = (
            f"GPU {i} - {torch.cuda.get_device_name(i)}",
            f"Total Memory: {total_memory:.2f} GB",
            f"Allocated Memory: {allocated_memory:.2f} GB",
            f"Cached Memory: {cached_memory:.2f} GB",
            f"Free Memory: {free_memory:.2f} GB",
            "-" * 50,
        )
        for line in report_lines:
            print(line)


# Report the memory state once at notebook start.
print_gpu_memory()

GPU 0 - Tesla V100-PCIE-16GB
Total Memory: 15.77 GB
Allocated Memory: 0.00 GB
Cached Memory: 0.00 GB
Free Memory: 15.77 GB
--------------------------------------------------


In [4]:
def memory_check(step=""):
    """Print a labelled GPU memory report.

    Args:
        step: Free-form label identifying the point in the notebook at which
            the check is made; it is shown in the header line.

    Returns:
        None. A header followed by the output of ``print_gpu_memory`` is
        written to stdout.
    """
    header = f"\nMemory Check - {step}"
    print(header)
    print_gpu_memory()

In [5]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from mambapy.mamba import Mamba, MambaConfig

In [6]:
class LLMDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned collections.

    Args:
        input_ids: Indexable collection of input token sequences.
        attention_masks: Indexable collection of masks, aligned with
            ``input_ids``.
        labels: Indexable collection of target sequences, aligned with
            ``input_ids``.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids, self.attention_masks, self.labels = (
            input_ids,
            attention_masks,
            labels,
        )

    def __len__(self):
        """Number of examples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of its three fields."""
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.labels[idx],
        )

In [7]:
# Llama 3.2 1B tokenizer; its EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

# Open the "sample-10BT" configuration of FineWeb-Edu (train split) in
# streaming mode so samples can be iterated one at a time.
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name="sample-10BT",
    split="train",
    streaming=True,
)

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

In [8]:
# Token budget and running state for the collection pass.
target_tokens = 5_000_000
total_tokens = 0
collected_tokens = []

# Tokenize streamed documents back to back into one flat token list until the
# budget is reached.
for sample in dataset:
    # Full document, no truncation or padding.
    tokens = tokenizer(sample['text'], truncation=False, padding=False)['input_ids']
    collected_tokens += tokens

    tokens_before = total_tokens
    total_tokens = tokens_before + len(tokens)

    # Report whenever this document pushed the count across a 100k boundary.
    if tokens_before // 100_000 < total_tokens // 100_000:
        print(f"Collected {total_tokens:,} tokens")

    if total_tokens >= target_tokens:
        break

# The last document usually overshoots the budget; cut back to exactly
# `target_tokens` and store as a numpy array.
collected_tokens = np.array(collected_tokens[:target_tokens])
print(f"\nFinal token count: {len(collected_tokens):,}")

Collected 101,374 tokens
Collected 201,135 tokens
Collected 301,190 tokens
Collected 400,573 tokens
Collected 500,666 tokens
Collected 600,893 tokens
Collected 701,230 tokens
Collected 800,062 tokens
Collected 900,056 tokens
Collected 1,000,303 tokens
Collected 1,104,261 tokens
Collected 1,200,768 tokens
Collected 1,300,278 tokens
Collected 1,400,226 tokens
Collected 1,500,380 tokens
Collected 1,600,015 tokens
Collected 1,700,608 tokens
Collected 1,810,446 tokens
Collected 1,902,046 tokens
Collected 2,002,909 tokens
Collected 2,100,052 tokens
Collected 2,200,058 tokens
Collected 2,303,549 tokens
Collected 2,401,422 tokens
Collected 2,500,434 tokens
Collected 2,600,557 tokens
Collected 2,700,392 tokens
Collected 2,801,133 tokens
Collected 2,900,204 tokens
Collected 3,001,947 tokens
Collected 3,101,877 tokens
Collected 3,200,123 tokens
Collected 3,300,891 tokens
Collected 3,400,373 tokens
Collected 3,500,236 tokens
Collected 3,600,082 tokens
Collected 3,700,818 tokens
Collected 3,802,260

In [9]:
sequence_length = 512

# Keep only whole sequences; any trailing remainder of the token stream is dropped.
n_sequences = len(collected_tokens) // sequence_length
usable_tokens = n_sequences * sequence_length
sequences = collected_tokens[:usable_tokens].reshape(n_sequences, sequence_length)

# Next-token pairs: the input drops the last token of each sequence and the
# target drops the first, so target[t] is the token that follows input[t].
input_sequences, target_sequences = sequences[:, :-1], sequences[:, 1:]

# Torch tensors; the mask is all ones because no position is padding.
inputs = torch.tensor(input_sequences)
targets = torch.tensor(target_sequences)
masks = torch.ones_like(inputs)

print(f"Input shape: {inputs.shape}")
print(f"Target shape: {targets.shape}")

Input shape: torch.Size([9765, 511])
Target shape: torch.Size([9765, 511])


In [10]:
# Contiguous split: the first (1 - val_split) of the sequences train, the rest validate.
val_split = 0.1
val_idx = int(len(inputs) * (1 - val_split))

train_inputs, val_inputs = inputs[:val_idx], inputs[val_idx:]
train_masks, val_masks = masks[:val_idx], masks[val_idx:]
train_targets, val_targets = targets[:val_idx], targets[val_idx:]

# Wrap each split in a dataset.
train_dataset = LLMDataset(train_inputs, train_masks, train_targets)
val_dataset = LLMDataset(val_inputs, val_masks, val_targets)

# Both loaders share one batch size; only the training loader shuffles.
batch_size = 2

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

In [11]:
# Loader sizes (number of batches, not number of sequences).
print("\nDataset Statistics:")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

# Draw one training batch and list the shape of every field in it.
sample_batch = next(iter(train_loader))
print("\nBatch shapes:")
for k, v in sample_batch.items():
    print(f"{k}: {v.shape}")

# Decode the first 10 tokens of the first sequence as a readability check.
first_sequence = sample_batch['input_ids'][0]
sample_seq = first_sequence[:10].tolist()
decoded = tokenizer.decode(sample_seq)
print(f"\nSample decoded text:\n{decoded}")


Dataset Statistics:
Training batches: 4394
Validation batches: 489

Batch shapes:
input_ids: torch.Size([2, 511])
attention_mask: torch.Size([2, 511])
labels: torch.Size([2, 511])

Sample decoded text:
, is one that has a high level of viscosity


In [12]:
# Load Llama 3.2 1B. The half-precision cast (`model.half()`) is left disabled.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

# Move the model to the GPU when one is present and report which device is used.
use_cuda = torch.cuda.is_available()
if use_cuda:
    model = model.cuda()
print("Using CUDA" if use_cuda else "Using CPU")

Using CUDA


In [13]:
# One Mamba block per selected Llama decoder layer, keyed by layer index.
mamba_blocks = {}

# Width is read from the loaded model so the blocks always match Llama's hidden
# size (2048 for Llama 3.2 1B) instead of repeating the number by hand.
hidden_size = model.config.hidden_size

# Configuration shared by all blocks: a single Mamba layer of that width.
mamba_config = MambaConfig(
    d_model=hidden_size,
    n_layers=1
)

# Place the blocks on whatever device the model ended up on, so the notebook
# also works on the CPU fallback instead of failing on an unconditional .cuda().
mamba_device = next(model.parameters()).device

# Create blocks for layers 0,2,4,6,8,10,12,14
# (an alternative selection tried earlier was [1, 3, 5, 7, 8, 10, 12, 14]).
for layer_idx in [0, 2, 4, 6, 8, 10, 12, 14]:
    mamba_blocks[layer_idx] = Mamba(mamba_config).to(mamba_device)

# Shape/dtype smoke test on a tiny random input. Run under no_grad because
# this forward pass is only inspected, never back-propagated.
test_input = torch.randn(1, 5, hidden_size, device=mamba_device)
with torch.no_grad():
    for layer_idx, block in mamba_blocks.items():
        test_output = block(test_input)
        print(f"Layer {layer_idx} - Input shape: {test_input.shape}, Output shape: {test_output.shape}")
        print(f"Layer {layer_idx} - Input dtype: {test_input.dtype}, Output dtype: {test_output.dtype}")

Layer 0 - Input shape: torch.Size([1, 5, 2048]), Output shape: torch.Size([1, 5, 2048])
Layer 0 - Input dtype: torch.float32, Output dtype: torch.float32
Layer 2 - Input shape: torch.Size([1, 5, 2048]), Output shape: torch.Size([1, 5, 2048])
Layer 2 - Input dtype: torch.float32, Output dtype: torch.float32
Layer 4 - Input shape: torch.Size([1, 5, 2048]), Output shape: torch.Size([1, 5, 2048])
Layer 4 - Input dtype: torch.float32, Output dtype: torch.float32
Layer 6 - Input shape: torch.Size([1, 5, 2048]), Output shape: torch.Size([1, 5, 2048])
Layer 6 - Input dtype: torch.float32, Output dtype: torch.float32
Layer 8 - Input shape: torch.Size([1, 5, 2048]), Output shape: torch.Size([1, 5, 2048])
Layer 8 - Input dtype: torch.float32, Output dtype: torch.float32
Layer 10 - Input shape: torch.Size([1, 5, 2048]), Output shape: torch.Size([1, 5, 2048])
Layer 10 - Input dtype: torch.float32, Output dtype: torch.float32
Layer 12 - Input shape: torch.Size([1, 5, 2048]), Output shape: torch.Size

In [14]:
# Freeze every LLaMA parameter; LLaMA only provides hidden states to imitate.
model.requires_grad_(False)

# Sanity check: no LLaMA parameter should still require gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
print(f"LLaMA trainable parameters: {len(trainable_params)} (should be 0)")

# The Mamba blocks are the only trainable modules.
for mamba in mamba_blocks.values():
    mamba.requires_grad_(True)

LLaMA trainable parameters: 0 (should be 0)


In [15]:
# One independent AdamW optimizer per Mamba block, keyed by the same layer
# index as `mamba_blocks`. Deriving the keys from `mamba_blocks` (instead of
# repeating the layer list by hand) guarantees every block has an optimizer
# and that the two dictionaries can never drift apart.
optimizers = {
    layer_idx: torch.optim.AdamW(block.parameters(), lr=1e-4)
    for layer_idx, block in mamba_blocks.items()
}

In [16]:
from tqdm.notebook import tqdm
import time
from datetime import timedelta
import torch.nn.functional as F

In [17]:
# NOTE(review): this value is not used by anything below; train_loader was
# built in an earlier cell with its own batch size.
batch_size = 4
# Number of micro-batches whose gradients are summed before each optimizer
# step (effective batch = loader batch size * accumulation_steps).
accumulation_steps = 32
num_epochs = 1
start_time = time.time()

# Train exactly the layers that own a Mamba block (single source of truth).
layer_indices = list(mamba_blocks)
# Follow the frozen model's device so the loop also runs on the CPU fallback.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    epoch_start = time.time()

    # Running sum of the *unscaled* per-layer MSE over the current
    # accumulation window, and the number of micro-batches in that window.
    layer_losses = {idx: 0.0 for idx in layer_indices}
    pending = 0

    # Start the epoch from clean gradients. Gradients are cleared only here
    # and right after an optimizer step; clearing them on every micro-batch
    # would throw away everything accumulated so far.
    for idx in layer_indices:
        optimizers[idx].zero_grad()

    # Progress bar for this epoch
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader),
                       desc=f'Epoch {epoch+1}',
                       leave=True)

    for i, batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Frozen LLaMA forward pass, used only to harvest hidden states.
        # `labels` is deliberately not passed: the language-model loss is
        # never used here, and the stored labels are already shifted by one
        # while the HF causal-LM head shifts them again internally.
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True
            )

        hidden_states = outputs.hidden_states

        pending += 1
        is_update_step = pending == accumulation_steps

        # Train each Mamba block independently
        for layer_idx in layer_indices:
            # hidden_states[k] is the input to decoder layer k and
            # hidden_states[k + 1] is that layer's output, so each block
            # learns to reproduce the whole decoder layer's transformation.
            layer_input = hidden_states[layer_idx]
            layer_target = hidden_states[layer_idx + 1]

            mamba_output = mamba_blocks[layer_idx](layer_input)
            layer_loss = F.mse_loss(mamba_output, layer_target)

            # Track the true (unscaled) loss for reporting.
            layer_losses[layer_idx] += layer_loss.item()

            # Scale only the gradient contribution so that the accumulated
            # gradient is the mean over the window.
            (layer_loss / accumulation_steps).backward()

            del mamba_output, layer_loss

            if is_update_step:
                optimizers[layer_idx].step()
                optimizers[layer_idx].zero_grad()

        # Memory cleanup
        del outputs, hidden_states
        torch.cuda.empty_cache()

        # Update progress bar with the window-average losses after each step
        if is_update_step:
            avg_losses = {
                layer_idx: layer_losses[layer_idx] / pending
                for layer_idx in layer_losses
            }

            loss_str = " ".join([f"L{idx}: {loss:.4f}" for idx, loss in avg_losses.items()])
            progress_bar.set_description(
                f"Epoch {epoch+1} | {loss_str}"
            )

            # Reset the window
            layer_losses = {idx: 0.0 for idx in layer_indices}
            pending = 0

    # Flush a trailing partial window (when the number of batches is not a
    # multiple of accumulation_steps) so those micro-batches are not wasted.
    # Gradients were scaled by 1/accumulation_steps, so rescale them to be
    # the mean over the `pending` micro-batches actually seen.
    if pending:
        rescale = accumulation_steps / pending
        for layer_idx in layer_indices:
            for param in mamba_blocks[layer_idx].parameters():
                if param.grad is not None:
                    param.grad.mul_(rescale)
            optimizers[layer_idx].step()
            optimizers[layer_idx].zero_grad()

    # End of epoch timing (measured from the start of this epoch)
    epoch_time = time.time() - epoch_start
    print(f"\nEpoch {epoch+1} completed in: {timedelta(seconds=int(epoch_time))}")

total_time = time.time() - start_time
print(f"\nTraining completed in: {timedelta(seconds=int(total_time))}")


Epoch 1/1


Epoch 1:   0%|          | 0/4394 [00:00<?, ?it/s]

KeyboardInterrupt: 