<a href="https://colab.research.google.com/github/srikrish2812/info621_project/blob/mah_issn/src/notebooks/gemma_3_final_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gemma 3 — Inference

Use the "Open In Colab" badge at the top of this notebook to run it on Google Colab.

## Framework / Library Installation

In [5]:
%%capture
# Installs unsloth and other dependencies optimized for colab
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --upgrade transformers accelerate bitsandbytes datasets asteval GPUtil

In [6]:
# Imports
import os
import re
import pdb

import unsloth
from unsloth import FastLanguageModel
from datasets import load_dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [7]:
# Use helper functions from repository

colab = True
if colab:
  if not os.path.exists("info621/"):
    !git clone https://github.com/srikrish2812/info621_project info621

Cloning into 'info621'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 49 (delta 12), reused 39 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (49/49), 11.71 KiB | 1.95 MiB/s, done.
Resolving deltas: 100% (12/12), done.


## Load the final model and dataset

In [8]:
def load_model(model_name="abhay2812/gemma-3-1b-4bit-grpo"):
  """
  Load a trained Gemma 3 1B checkpoint with Unsloth and prepare it for inference.

  Args:
    model_name: Identifier of the checkpoint passed to
      `FastLanguageModel.from_pretrained`. Defaults to the checkpoint this
      notebook evaluates.

  Returns:
    The `(model, tokenizer)` pair returned by
    `FastLanguageModel.from_pretrained`, after `for_inference` has been
    applied to the model.
  """
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name,
      load_in_4bit=True,
      device_map="auto"
  )
  # Switch the loaded model to Unsloth's inference mode before returning it.
  FastLanguageModel.for_inference(model)

  return model, tokenizer

model, tokenizer = load_model()

==((====))==  Unsloth 2025.4.8: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [9]:
# GSM8kTask comes from the project repository cloned into `info621/` earlier in this notebook.
from info621.src.tasks.gsm8k import GSM8kTask

# Build the task helper and fetch its questions; the recorded output of this
# cell shows train and test splits being downloaded and mapped.
gsm8k = GSM8kTask()
dataset = gsm8k.get_questions()

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]



Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

## Inference
- Run inference on the final model.

In [10]:
from transformers import TextStreamer

In [11]:
# Draw one sample from the test split and display its chat-formatted prompt.
# NOTE(review): no seed is set in the visible notebook — presumably the sample
# differs between runs; confirm in GSM8kTask.
random_sample = gsm8k.__getsamples__(n_samples=1, split="test")
random_sample['prompt']

[[{'content': '\nRespond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': 'If Marcy works for the same company for 40 years, she gets an annual pension of $50,000/year. Starting after 20 years, she becomes entitled to 5% of the value of the pension per year. If she quits after 30 years, what will her annual pension be?',
   'role': 'user'}]]

In [16]:
def run_model(sample, measure_p=False, max_new_tokens=128):
  """
  Generate a completion for a chat-formatted prompt with the global `model` and `tokenizer`.

  Args:
    sample: Chat messages in the form accepted by
      `tokenizer.apply_chat_template` (the notebook passes
      `random_sample['prompt']`).
    measure_p: When True, generate silently (no streaming to stdout) so that
      printing does not interfere with performance measurements.
    max_new_tokens: Upper bound on the number of newly generated tokens.

  Returns:
    The tensor returned by `model.generate`.
  """
  text = tokenizer.apply_chat_template(
      sample,
      add_generation_prompt = True,
      tokenize = False,
  )

  # Only build a streamer when the output is actually streamed.
  streamer = None if measure_p else TextStreamer(tokenizer, skip_prompt=True)

  inputs = tokenizer(text, return_tensors = "pt").to("cuda")
  tensor = model.generate(
      **inputs,
      max_new_tokens = max_new_tokens,
      temperature = 1.0, top_p = 0.95, top_k = 64,
      streamer = streamer,
  )
  return tensor

In [27]:
# Stream the model's response for the sampled question and keep the tensor returned by generate.
model_output = run_model(random_sample['prompt'])

<reasoning>
Marcy’s pension is calculated as follows: she receives a fixed annual pension of $50,000 for 40 years, and then receives 5% of the *remaining* value of her pension each year after 20 years.  This means the annual pension payment is 5% of the current value of her pension each year.  We need to calculate this value after 30 years, knowing she initially received a $50,000 pension for 40 years.

</reasoning>
<answer>
$52,500


In [28]:
# Decode the first returned sequence, extract its final number and compare it
# with the reference answer of the sample.
# NOTE(review): `extract_final_number` is not defined or imported anywhere in the
# visible notebook — presumably a helper from the cloned repository; add the
# import so this cell survives Restart & Run All.
decoded_output = tokenizer.decode(model_output[0])
y = extract_final_number(decoded_output)
random_sample['answer'][0] == y

False

## How Fragile is Mathematical Reasoning in Large Language Models?



In [59]:
import datasets

In [63]:
# Draw 100 test-split samples for each perturbation configuration.
# NOTE(review): both configurations are produced by the same call, so no
# name/number perturbation is visibly applied here — confirm whether that step
# is still to be added.
samples_names = gsm8k.__getsamples__(n_samples=100, split="test")
samples_numbers = gsm8k.__getsamples__(n_samples=100, split="test")

# Group the two sample sets under their configuration names.
ds_pert = datasets.DatasetDict(
    {"names": samples_names,
     "numbers": samples_numbers})

In [None]:
def run_perturbation_experiment(dataset_dict):
  """
  Run the model on every sample of every configuration in `dataset_dict`.

  Args:
    dataset_dict: Mapping from configuration name (e.g. "names", "numbers")
      to an iterable of samples.

  Returns:
    A dict mapping each configuration name to the list of `run_model` outputs
    for its samples, in iteration order.
  """
  experiment = {}
  for config in dataset_dict:
    outputs = []
    # Iterate over this configuration's own samples; the loop previously
    # referenced an undefined name.
    for sample in dataset_dict[config]:
      # A row that is a mapping with a 'prompt' entry is not valid chat input
      # by itself, so hand its prompt to the model; anything else is passed
      # through unchanged.
      if isinstance(sample, dict) and "prompt" in sample:
        prompt = sample["prompt"]
      else:
        prompt = sample
      outputs.append(run_model(prompt))
    experiment[config] = outputs

  return experiment

In [None]:
# experiment_output = run_perturbation_experiment(ds_pert)

In [None]:
# NOTE(review): this definition reuses the name of the experiment runner defined
# earlier in the notebook and replaces it once this cell runs; a distinct name
# would keep both usable.
def run_perturbation_experiment(experiment_output):
  """Placeholder for computing results from the experiment output (not implemented yet)."""
  #TODO compute results from experiment
  pass

## Measure Gemma 3 Performance — Latency & Throughput on GPU

In [None]:
import time
import GPUtil
import numpy as np
import torch

1. Measure latency (i.e., amount of time it takes the model to produce a prediction for a single input sample) for *n* iterations.

In [None]:
def test_latency(sample, iterations=1):
  """
  Measure wall-clock generation latency for a single sample.

  Args:
    sample: Object whose 'prompt' entry is passed to `run_model`.
    iterations: Number of timed generations to run.

  Returns:
    A list with one latency (in seconds) per iteration.

  Raises:
    ValueError: If GPUtil reports no GPU.
  """
  if not GPUtil.getGPUs():
    raise ValueError("No GPUs found")

  latencies = []
  for _ in range(iterations):
    # Drain pending GPU work so the timer brackets only this generation.
    torch.cuda.synchronize()
    # perf_counter is monotonic and higher resolution than time.time().
    start_time = time.perf_counter()
    run_model(sample['prompt'], measure_p=True)
    # Wait for the GPU to finish before stopping the clock.
    torch.cuda.synchronize()
    latencies.append(time.perf_counter() - start_time)

  return latencies

In [None]:
# Time 10 silent generations on the sample drawn earlier in the notebook.
iterations = 10
latencies = test_latency(random_sample, iterations=iterations)

In [None]:
# Summarise the measured latencies: mean, worst case and best case, in seconds.
print(f"Average latencies per {iterations} iterations: {np.mean(latencies):.4f} seconds")
print(f"Maximum latency per {iterations} iterations: {np.max(latencies):.4f} seconds")
print(f"Minimum latency per {iterations} iterations: {np.min(latencies):.4f} seconds")

Latency is about 7 seconds per prediction — is that too expensive?

2. Measure throughput (i.e., number of generated tokens per second) over *n* runs.

In [37]:
def benchmark_throughput(sample, runs=5):
  """
  Measure generation throughput in generated tokens per second.

  Args:
    sample: Chat-formatted prompt passed to `run_model`.
    runs: Number of timed generations.

  Returns:
    A `(throughput, runs)` tuple, where `throughput` is the total number of
    generated tokens divided by the elapsed wall-clock seconds.
  """
  #TODO: can do batch size = 4 and 256-token generations, if time permits.

  # Warm-up generations so one-off start-up costs are not timed.
  for _ in range(3):
    run_model(sample, measure_p=True)

  # Prompt length for the sample actually being benchmarked, tokenised the same
  # way run_model does so the count matches what generate receives. It is
  # loop-invariant, so compute it once outside the timed region.
  prompt_text = tokenizer.apply_chat_template(
      sample,
      add_generation_prompt = True,
      tokenize = False,
  )
  input_len = tokenizer(prompt_text, return_tensors = "pt")["input_ids"].shape[-1]

  total_gen_tokens = 0

  torch.cuda.synchronize()
  start_time = time.perf_counter()
  for _ in range(runs):
    output_tensor = run_model(sample, measure_p=True)
    # The output holds prompt + generated tokens; count only the generated ones.
    total_gen_tokens += output_tensor.shape[-1] - input_len
  # Make sure all GPU work has finished before stopping the clock.
  torch.cuda.synchronize()
  elapsed = time.perf_counter() - start_time
  throughput = total_gen_tokens / elapsed

  return throughput, runs

In [38]:
# Benchmark throughput on the prompt of the sample drawn earlier (default number of runs).
throughput, runs = benchmark_throughput(sample=random_sample["prompt"])

In [39]:
# Report the measured throughput in generated tokens per second.
print(f"Throughput: {throughput:.2f} tokens/sec over {runs} runs")

Throughput: 10.37 tokens/sec over 5 runs
