In [7]:
pip install -U transformers datasets peft trl accelerate bitsandbytes

Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.28.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.1 kB)
Downloading datasets-4.5.0-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.28.0-py3-none-any.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.5/540.5 kB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pyarrow-23.0.1-c

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER = "Siddharth466/tinyllama-gsm8k-math-lora"

# Load tokenizer (can load from adapter repo OR base)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER, use_fast=False)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    BASE,
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
)

# Load LoRA adapter from Hub
model = PeftModel.from_pretrained(
    model,
    ADAPTER,
)

model.eval()

prompt = """You are a helpful math tutor.

Question:
If John has 28 apples and gives 20 away, how many apples remain?

Answer:
Let's solve step by step.
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.3,
        top_p=0.9,
    )

print(tokenizer.decode(output[0], skip_special_tokens=True))

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

You are a helpful math tutor.

Question:
If John has 28 apples and gives 20 away, how many apples remain?

Answer:
Let's solve step by step.
John gives 20 apples away, so he has 28 - 20 = <<28-20=8>>8 apples left.
#### 8


In [3]:
#EVAL/COMPARISION
import re
import torch
import random
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


# =========================
# CONFIG
# =========================
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER_PATH = "Siddharth466/tinyllama-gsm8k-math-lora"

MAX_NEW_TOKENS = 128        # reduced from 200
BATCH_SIZE = 8              # increase if VRAM allows
NUM_SAMPLES = None
SEED = 42


# =========================
# Reproducibility
# =========================
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.benchmark = True


# =========================
# Extract GSM8K Final Number
# =========================
def extract_number(text):
    match = re.findall(r"####\s*(-?\d+\.?\d*)", text)
    if match:
        return match[-1]

    numbers = re.findall(r"-?\d+\.?\d*", text)
    if numbers:
        return numbers[-1]

    return None


# =========================
# Load Dataset
# =========================
print("Loading GSM8K test set...")
dataset = load_dataset("gsm8k", "main", split="test")

if NUM_SAMPLES:
    dataset = dataset.select(range(NUM_SAMPLES))


def build_prompt(question):
    return f"""You are a helpful math tutor.

Question:
{question}

Answer:
Let's solve step by step.
"""


# =========================
# Load Tokenizer
# =========================
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# =========================
# Load Base Model
# =========================
print("Loading Base Model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto"
)
base_model.eval()


# =========================
# Load + Merge LoRA Model
# =========================
print("Loading and merging LoRA model...")
lora_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto"
)

lora_model = PeftModel.from_pretrained(lora_model, ADAPTER_PATH)
lora_model = lora_model.merge_and_unload()  # IMPORTANT SPEED BOOST
lora_model.eval()


# =========================
# Batched Evaluation
# =========================
def evaluate(model, model_name="Model"):
    correct = 0
    total = 0

    print(f"\nEvaluating {model_name}...\n")

    prompts = [build_prompt(example["question"]) for example in dataset]
    answers = [example["answer"] for example in dataset]

    for i in tqdm(range(0, len(prompts), BATCH_SIZE)):
        batch_prompts = prompts[i:i+BATCH_SIZE]
        batch_answers = answers[i:i+BATCH_SIZE]

        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(model.device)

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False
            )

        decoded_outputs = tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True
        )

        for pred_text, true_text in zip(decoded_outputs, batch_answers):
            pred_number = extract_number(pred_text)
            true_number = extract_number(true_text)

            if pred_number and true_number:
                if pred_number.strip() == true_number.strip():
                    correct += 1

            total += 1

    accuracy = correct / total
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    return accuracy


# =========================
# Run Evaluation
# =========================
base_acc = evaluate(base_model, "Base TinyLlama")
lora_acc = evaluate(lora_model, "Fine-Tuned (Merged)")

print("\n==============================")
print("GSM8K FINAL RESULTS")
print("==============================")
print(f"Base Model Accuracy:       {base_acc * 100:.2f}%")
print(f"Fine-Tuned Model Accuracy: {lora_acc * 100:.2f}%")
print("==============================")

Loading GSM8K test set...
Loading tokenizer...
Loading Base Model...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading and merging LoRA model...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]


Evaluating Base TinyLlama...



100%|██████████| 165/165 [12:35<00:00,  4.58s/it]


Base TinyLlama Accuracy: 1.29%

Evaluating Fine-Tuned (Merged)...



100%|██████████| 165/165 [12:40<00:00,  4.61s/it]

Fine-Tuned (Merged) Accuracy: 1.90%

GSM8K FINAL RESULTS
Base Model Accuracy:       1.29%
Fine-Tuned Model Accuracy: 1.90%



