In [None]:
!pip install torch tiktoken blobfile

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting blobfile
  Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==

In [None]:
# Checkpoint file to download and load; keep exactly one line uncommented.
# Later cells inspect this name: "1B"/"3B" selects the model configuration and
# "instruct" switches on the chat prompt, chat formatting and output cleaning.
# MODEL_FILE = "llama3.2-1B-instruct.pth"
# MODEL_FILE = "llama3.2-1B-base.pth"
MODEL_FILE = "llama3.2-3B-instruct.pth"
# MODEL_FILE = "llama3.2-3B-base.pth"


In [None]:
# Context length written into the model configuration before the model is
# built (supports up to 131_072)
MODEL_CONTEXT_LENGTH = 8192

# Text generation settings
# Instruct checkpoints are asked a question; base checkpoints get a sentence
# to continue.
PROMPT = "What do llamas eat?" if "instruct" in MODEL_FILE else "Llamas eat"

MAX_NEW_TOKENS = 150
TEMPERATURE = 0.0
TOP_K = 1


In [None]:
import os
import urllib.request


def download_if_missing(url, filename):
    """Download `url` to `filename` unless `filename` already exists.

    The data is first written to `<filename>.part` and only renamed to
    `filename` once the transfer has finished. An interrupted download
    therefore never leaves a truncated file behind that the existence check
    would mistake for a complete one on the next run.

    Args:
        url: Address to fetch.
        filename: Local path to store the downloaded file at.

    Raises:
        Whatever `urllib.request.urlretrieve` raises on a failed transfer;
        the partial file is removed before the error propagates.
    """
    if os.path.exists(filename):
        return

    print(f"Downloading {filename}...")
    partial_filename = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_filename)
        os.replace(partial_filename, filename)
    finally:
        # After a successful rename the partial file no longer exists; it is
        # only still present here if the download failed or was interrupted.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
    print(f"Downloaded to {filename}")


# URL of the model weights selected via MODEL_FILE
url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}"
download_if_missing(url, MODEL_FILE)


# URL of the model.py file
model_url = "https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/model.py"
model_filename = "model.py"  # Local filename to save the downloaded file
download_if_missing(model_url, model_filename)


# URL of the tokenizer.py file
tokenizer_url = "https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/tokenizer.py"
tokenizer_filename = "tokenizer.py"  # Local filename to save the downloaded file
download_if_missing(tokenizer_url, tokenizer_filename)



Downloading llama3.2-3B-instruct.pth...
Downloaded to llama3.2-3B-instruct.pth


In [None]:
import torch
from model import Llama3Model
# Alternatively:
# from llms_from_scratch.llama3 import Llama3Model

# MODEL_FILE and MODEL_CONTEXT_LENGTH must already be defined when this cell
# runs, for example:
#   MODEL_FILE = "llama3-1B.pth"
#   MODEL_CONTEXT_LENGTH = 2048

# Pick the configuration that matches the size named in the checkpoint file
if "1B" in MODEL_FILE:
    from model import LLAMA32_CONFIG_1B as LLAMA32_CONFIG
elif "3B" in MODEL_FILE:
    from model import LLAMA32_CONFIG_3B as LLAMA32_CONFIG
else:
    raise ValueError("Incorrect model file name")

# Override the configured context length with the value chosen above
LLAMA32_CONFIG["context_length"] = MODEL_CONTEXT_LENGTH

model = Llama3Model(LLAMA32_CONFIG)

# Prefer CUDA, then Apple MPS, and fall back to the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Read the checkpoint with its tensors mapped to the chosen device and copy
# them into the model. The call stays nested so that no extra reference to the
# loaded state dict outlives this statement.
model.load_state_dict(
    torch.load(MODEL_FILE, map_location=device, weights_only=True)
)

# Last expression of the cell: moves the model to the device and shows it
model.to(device)


Llama3Model(
  (tok_emb): Embedding(128256, 3072)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_key): Linear(in_features=3072, out_features=1024, bias=False)
        (W_value): Linear(in_features=3072, out_features=1024, bias=False)
        (W_query): Linear(in_features=3072, out_features=3072, bias=False)
        (out_proj): Linear(in_features=3072, out_features=3072, bias=False)
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=3072, out_features=8192, bias=False)
        (fc2): Linear(in_features=3072, out_features=8192, bias=False)
        (fc3): Linear(in_features=8192, out_features=3072, bias=False)
      )
      (norm1): RMSNorm((3072,), eps=1e-05, elementwise_affine=True)
      (norm2): RMSNorm((3072,), eps=1e-05, elementwise_affine=True)
    )
  )
  (final_norm): RMSNorm((3072,), eps=1e-05, elementwise_affine=True)
  (out_head): Linear(in_features=3072, out_features=128256, bias=False)
)

In [None]:
from tokenizer import Llama3Tokenizer, ChatFormat, clean_text
# Alternatively:
# from llms_from_scratch.llama3 import Llama3Tokenizer, ChatFormat, clean_text

TOKENIZER_FILE = "tokenizer.model"

url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{TOKENIZER_FILE}"

if not os.path.exists(TOKENIZER_FILE):
    # Download to a temporary name and rename afterwards, so an interrupted
    # transfer cannot leave a truncated file that this check would later
    # accept as complete.
    partial_file = f"{TOKENIZER_FILE}.part"
    try:
        urllib.request.urlretrieve(url, partial_file)
        os.replace(partial_file, TOKENIZER_FILE)
    finally:
        # Only still present if the download failed before the rename
        if os.path.exists(partial_file):
            os.remove(partial_file)
    print(f"Downloaded to {TOKENIZER_FILE}")

# Load from the same path that was just checked/downloaded
tokenizer = Llama3Tokenizer(TOKENIZER_FILE)

# Instruct checkpoints use the tokenizer wrapped in ChatFormat
if "instruct" in MODEL_FILE:
    tokenizer = ChatFormat(tokenizer)


In [None]:
import time

from model import (
    generate,
    text_to_token_ids,
    token_ids_to_text
)
# Alternatively:
# from llms_from_scratch.ch05 import (
#    generate,
#    text_to_token_ids,
#    token_ids_to_text
# )

# Fixed seed so repeated runs of this cell start from the same RNG state
torch.manual_seed(123)

start = time.time()

# Encoding the prompt is part of the timed section
token_ids = generate(
    model=model,
    idx=text_to_token_ids(PROMPT, tokenizer).to(device),
    max_new_tokens=MAX_NEW_TOKENS,
    context_size=LLAMA32_CONFIG["context_length"],
    top_k=TOP_K,
    temperature=TEMPERATURE,
)

print(f"Time: {time.time() - start:.2f} sec")

# Report the peak CUDA allocation recorded so far, when a GPU is in use
if torch.cuda.is_available():
    max_mem_bytes = torch.cuda.max_memory_allocated()
    max_mem_gb = max_mem_bytes / (1024 ** 3)
    print(f"Max memory allocated: {max_mem_gb:.2f} GB")

# Decode the tokens; instruct output is additionally passed through clean_text
decoded_text = token_ids_to_text(token_ids, tokenizer)
output_text = clean_text(decoded_text) if "instruct" in MODEL_FILE else decoded_text

print("\n\nOutput text:\n\n", output_text)


Time: 5.47 sec
Max memory allocated: 15.42 GB


Output text:

 Llamas are herbivores, which means they primarily eat plants and plant-based foods. Their diet consists of:

1. **Grasses**: Llamas love to graze on various types of grasses, including timothy grass, orchard grass, and brome grass.
2. **Hay**: Llamas also eat hay, such as alfalfa hay, oat hay, and clover hay.
3. **Browse**: Browse is a type of leafy vegetation that llamas enjoy eating. Examples of browse include shrubs, small trees, and certain types of grasses.
4. **Fruits and vegetables**: Llamas may also eat fruits and vegetables, such as apples, carrots, and sweet


In [None]:

# tabulate renders the results table at the end of this cell. It is imported
# here because no other cell imports it, so a fresh kernel would otherwise
# fail with a NameError.
from tabulate import tabulate

# Chain of Thought prompts: every question ends with a cue asking the model
# to reason step by step.
examples = [
    {
        "task": "Math Word Problem",
        "prompt": "Q: If there are 3 apples and you eat one, how many are left?\nA: Let's think step by step.",
    },
    {
        "task": "Logical Deduction",
        "prompt": "Q: John is taller than Mary. Mary is taller than Sam. Who is the shortest?\nA: Let's think step by step.",
    },
    {
        "task": "Symbolic Reasoning",
        "prompt": "Q: If A = 2, B = A + 3, and C = B * 2, what is the value of C?\nA: Let's think step by step.",
    },
    {
        "task": "Commonsense Reasoning",
        "prompt": "Q: You drop a glass on a concrete floor. It breaks. Why?\nA: Let's think step by step.",
    },
    {
        "task": "Simple Arithmetic",
        "prompt": "Q: What is 12 + 23 - 5?\nA: Let's think step by step.",
    },
]

# One row per example: [task, question line, model response]
results = []

for ex in examples:
    print(f"🔍 Generating for: {ex['task']}")

    start = time.time()
    input_ids = text_to_token_ids(ex["prompt"], tokenizer).to(device)

    token_ids = generate(
        model=model,
        idx=input_ids,
        max_new_tokens=MAX_NEW_TOKENS,
        context_size=LLAMA32_CONFIG["context_length"],
        top_k=TOP_K,
        temperature=TEMPERATURE
    )

    output_text = token_ids_to_text(token_ids, tokenizer)
    output_text = clean_text(output_text)

    # Drop the echoed prompt only when the text really starts with it. The
    # cleaned instruct output printed earlier in this notebook does not begin
    # with the prompt, and slicing unconditionally would then cut off the
    # first characters of the answer.
    if output_text.startswith(ex["prompt"]):
        response = output_text[len(ex["prompt"]):].strip()
    else:
        response = output_text.strip()

    # Only the question line (text before the first newline) goes in the table
    results.append([ex["task"], ex["prompt"].split('\n')[0], response])

    print(f"⏱️ Time: {time.time() - start:.2f} sec\n")

# Show results in a table
headers = ["Task", "Question", "Model Output"]
print(tabulate(results, headers=headers, tablefmt="grid"))


🔍 Generating for: Math Word Problem
⏱️ Time: 5.38 sec

🔍 Generating for: Logical Deduction
⏱️ Time: 5.36 sec

🔍 Generating for: Symbolic Reasoning
⏱️ Time: 5.38 sec

🔍 Generating for: Commonsense Reasoning
⏱️ Time: 5.38 sec

🔍 Generating for: Simple Arithmetic
⏱️ Time: 5.37 sec

+-----------------------+----------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Task                  | Question                                                                   | Model Output                                                                                                                                                                                                                           |
| Math Word Problem     | Q: If there are 3 appl