# Phase 1: Text Generation Inference

Generate text samples from the pretrained LLM.

In [1]:
import sys
from pathlib import Path

# Project root is taken to be two directory levels above the current working
# directory (the notebook is expected to be launched from its own folder).
PROJECT_ROOT = Path.cwd().parent.parent

# Put the project root at the front of the import path so project packages
# can be imported. The membership check keeps the cell idempotent: re-running
# it does not pile up duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

In [2]:
import torch
import tomllib
from tokenizers import Tokenizer
from text_pretraining.model import Llama

# Read the hyperparameters from the project-level TOML file.
# tomllib.load requires a file opened in binary mode.
config_path = PROJECT_ROOT / "config.toml"
with config_path.open("rb") as config_file:
    config = tomllib.load(config_file)

# Restore the tokenizer from its JSON file; the path is passed as a plain string.
tokenizer_path = PROJECT_ROOT / "text_pretraining" / "bpe_tokenizer.json"
tokenizer = Tokenizer.from_file(str(tokenizer_path))
print(f"Vocab size: {tokenizer.get_vocab_size()}")

Vocab size: 32000


In [3]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Derive the per-head dimensions from d_model.
# NOTE(review): kv_d_head is d_model // n_kv_heads, so it differs from d_head
# whenever n_kv_heads != n_heads - confirm this is what Llama expects.
d_head = config["d_model"] // config["n_heads"]
kv_d_head = config["d_model"] // config["n_kv_heads"]

# Hyperparameters that are forwarded unchanged from the config to the
# constructor under the same keyword name.
llama_config_keys = (
    "vocab_size",
    "n_layers",
    "d_model",
    "n_heads",
    "n_kv_heads",
    "d_ff_standard",
    "num_experts",
    "num_experts_per_tok",
    "d_expert",
    "rope_layers_ratio",
    "chunk_size",
    "rope_theta",
)
model = Llama(
    **{key: config[key] for key in llama_config_keys},
    d_head=d_head,
    kv_d_head=kv_d_head,
)

# Restore the trained weights. The checkpoint is deserialized on the CPU
# first and the model is moved to the target device afterwards.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code; only load checkpoints from a trusted source.
ckpt_path = PROJECT_ROOT / "text_pretraining" / "checkpoints" / "best.pt"
ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
model.load_state_dict(ckpt["model_state_dict"])
model = model.to(device)
model = model.eval()

# Report where the checkpoint came from and how large the model is.
epoch = ckpt.get('epoch', 'N/A')
n_params = sum(p.numel() for p in model.parameters())
print(f"Loaded checkpoint from epoch {epoch}")
print(f"Parameters: {n_params:,}")

Device: cuda
Loaded checkpoint from epoch 1
Parameters: 379,547,916


In [4]:
@torch.no_grad()
def generate(prompt, max_tokens=100, temperature=0.8, top_k=50):
    """Sample a continuation of ``prompt`` from the model.

    At every step the model is run over the current context, the logits of
    the last position are divided by ``temperature``, optionally restricted
    to the ``top_k`` largest values, and the next token is drawn from the
    resulting softmax distribution. Generation ends after ``max_tokens`` new
    tokens or as soon as the ``</s>`` token is drawn.

    Args:
        prompt: Text to continue. Only its last ``config["max_seq_len"]``
            tokens are fed to the model.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Divisor applied to the logits before sampling. A value
            ``<= 0`` switches to argmax (greedy) selection instead of
            dividing by zero.
        top_k: Number of highest-scoring tokens kept for sampling. ``<= 0``
            disables the filter; values above the vocabulary size are clamped.

    Returns:
        The decoded text of the generated tokens only (the prompt and the
        end-of-sequence token are not included).

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    prompt_ids = tokenizer.encode(prompt).ids
    if not prompt_ids:
        # An empty id list would become a float tensor of shape (1, 0).
        raise ValueError("prompt must encode to at least one token")
    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    # NOTE(review): presumably None when "</s>" is not in the vocabulary, in
    # which case the stop check below never fires - verify against tokenizer.
    eos_token_id = tokenizer.token_to_id("</s>")
    max_seq_len = config["max_seq_len"]

    generated = []
    for _ in range(max_tokens):
        # Clip the context before every forward pass so that an over-long
        # prompt is bounded too, not just the context grown during generation.
        input_ids = input_ids[:, -max_seq_len:]

        # The whole context is re-run each step; only the last position's
        # logits are used to choose the next token.
        logits = model(input_ids)
        next_logits = logits[0, -1, :]

        if temperature <= 0:
            # Non-positive temperature: pick the most likely token directly.
            next_token = next_logits.argmax().item()
        else:
            next_logits = next_logits / temperature

            if top_k > 0:
                # torch.topk raises when k exceeds the last dimension.
                k = min(top_k, next_logits.size(-1))
                kth_best = torch.topk(next_logits, k).values[-1]
                # Everything strictly below the k-th best score gets zero
                # probability; ties with the k-th best are kept.
                next_logits = next_logits.masked_fill(
                    next_logits < kth_best, float("-inf")
                )

            probs = torch.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()

        if next_token == eos_token_id:
            break

        generated.append(next_token)
        input_ids = torch.cat(
            [input_ids, torch.tensor([[next_token]], device=device)], dim=1
        )

    return tokenizer.decode(generated)

## Sample Generations

In [5]:
SEED = 42  # arbitrary value; fixed so the sampling below starts from a known RNG state

prompts = [
    "The capital of France is",
    "Machine learning is",
    "def fibonacci(n):",
    "The solar system",
    "In the year 2050,",
]

# generate() draws tokens with torch.multinomial, so seed torch's RNG before
# sampling; re-running this cell then starts from the same random state.
torch.manual_seed(SEED)

banner = "=" * 60
for prompt in prompts:
    print(f"\n{banner}")
    print(f"PROMPT: {prompt}")
    print(banner)
    output = generate(prompt, max_tokens=80, temperature=0.7)
    # generate() returns only the continuation, so prepend the prompt for display.
    print(f"{prompt}{output}")


PROMPT: The capital of France is
The capital of France is the Andes, which means 'the town of Anhalt.'

But why does this matter to us? Well, understanding how different cultures come together can help us appreciate our shared history today. For instance, did you know that many different languages have been spoken around the globe since ancient times? That means there are countless languages spoken across the world!

So, let's dive into

PROMPT: Machine learning is
Machine learning is a powerful tool that allows us to learn from data and make predictions with new knowledge. In this course unit, we will explore how to train a neural network using the OpenRAP library in Python. We will cover the following topics:

1. The `ML` class
2. Understanding the `neural_neural_neural_neural_neural_neural_

PROMPT: def fibonacci(n):
def fibonacci(n):
    n = len(N)
    if len(N) > 1 and len(N) < 1 and len(N) >= 1 and len(N) != 1:
        all = False
        for i in range(1, N):
            if i >

## Greedy Decoding (Deterministic)

In [6]:
@torch.no_grad()
def generate_greedy(prompt, max_tokens=100):
    """Continue ``prompt`` by always taking the highest-scoring next token.

    At every step the model is run over the current context and the argmax of
    the last position's logits is appended. Generation ends after
    ``max_tokens`` new tokens or as soon as the ``</s>`` token is selected.

    Args:
        prompt: Text to continue. Only its last ``config["max_seq_len"]``
            tokens are fed to the model.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the generated tokens only (the prompt and the
        end-of-sequence token are not included).

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    prompt_ids = tokenizer.encode(prompt).ids
    if not prompt_ids:
        # An empty id list would become a float tensor of shape (1, 0).
        raise ValueError("prompt must encode to at least one token")
    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    # NOTE(review): presumably None when "</s>" is not in the vocabulary, in
    # which case the stop check below never fires - verify against tokenizer.
    eos_token_id = tokenizer.token_to_id("</s>")
    max_seq_len = config["max_seq_len"]

    generated = []
    for _ in range(max_tokens):
        # Clip the context before every forward pass so that an over-long
        # prompt is bounded too, not just the context grown during generation.
        input_ids = input_ids[:, -max_seq_len:]

        logits = model(input_ids)
        next_token = logits[0, -1, :].argmax().item()

        if next_token == eos_token_id:
            break

        generated.append(next_token)
        input_ids = torch.cat(
            [input_ids, torch.tensor([[next_token]], device=device)], dim=1
        )

    return tokenizer.decode(generated)

# Run greedy decoding on the first three prompts and show prompt + continuation.
print("Greedy generation:")
for prompt in prompts[:3]:
    print(f"\nPROMPT: {prompt}")
    completion = generate_greedy(prompt, max_tokens=200)
    print(prompt + completion)

Greedy generation:

PROMPT: The capital of France is
The capital of France is the city of Paris, France, known for its stunning architecture, rich culture, and delicious food. But did you know that there are also some amazing places in France? Today, we will explore one such place called the "Les Rivier de L'Ouverture," which is located in the heart of the city of Paris.

Now, let's imagine you are a little girl living in Paris, France. You love playing soccer, going to school, and going to school. But one day, you realize that you don't like your favorite sports team or food. Your parents tell you that you can't play soccer anymore and you can't play with your friends anymore. What would you do?

This is where the L'Ouverture comes in. It's a special place for people who love playing soccer, but they don't like their country. They want to make sure that everyone in Paris is happy and healthy. So, they set a

PROMPT: Machine learning is
Machine learning is a powerful tool for enhancing