<a href="https://colab.research.google.com/github/srikrish2812/info621_project/blob/mah_issn/src/notebooks/gemma_3_final_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gemma 3 — Inference

Use the **Open In Colab** badge at the top of this notebook to run it in Google Colab.

## Framework / Library Installation

In [None]:
%%capture
# Install Unsloth (from its GitHub repository, using the `colab-new` extra)
# and the other listed packages. `%%capture` hides the installer output.
# NOTE(review): nothing here is version-pinned and the second command passes
# --upgrade, so the installed versions can change from one run to the next.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --upgrade transformers accelerate bitsandbytes datasets asteval GPUtil

In [None]:
# Imports
import os
import re
import pdb

import unsloth
from unsloth import FastLanguageModel
from datasets import load_dataset

In [None]:
# Fetch the project repository so its helper modules can be imported.
# The clone is skipped when `colab` is False or `info621/` already exists.

colab = True
if colab and not os.path.exists("info621/"):
  !git clone https://github.com/srikrish2812/info621_project info621

## Load the final model and dataset

In [None]:
def load_model(model_name="abhay2812/gemma-3-1b-4bit-grpo"):
  """
  Load a trained Gemma 3 1B checkpoint with Unsloth and prepare it for inference.

  Args:
    model_name: Identifier passed to `FastLanguageModel.from_pretrained`.
      Defaults to the checkpoint used throughout this notebook.

  Returns:
    The `(model, tokenizer)` pair produced by `FastLanguageModel.from_pretrained`.
  """
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name,
      load_in_4bit=True,
      device_map="auto"
  )
  # Switch the loaded model to Unsloth's inference mode before handing it out.
  FastLanguageModel.for_inference(model)

  return model, tokenizer

model, tokenizer = load_model()

In [None]:
# This import resolves against the repository cloned into `info621/`.
from info621.src.tasks.gsm8k import GSM8kTask

# Task helper that later cells use to request GSM8K samples.
gsm8k = GSM8kTask()
# NOTE(review): `dataset` is not referenced by any later cell shown here.
dataset = gsm8k.get_questions()

## Inference
- Run inference on the final model.

In [None]:
from transformers import TextStreamer

In [None]:
# Request a single sample from the "test" split; the last line displays its prompt.
# NOTE(review): no seed is set in the visible cells, so if the sampling is
# random this sample will differ between runs — confirm.
random_sample = gsm8k.__getsamples__(n_samples=1, split="test")
random_sample['prompt']

In [None]:
def run_model(sample, measure_p=False, max_new_tokens=128):
  """
  Generate a completion for a chat-formatted prompt with the loaded model.

  Args:
    sample: Chat messages accepted by `tokenizer.apply_chat_template`.
    measure_p: When True, generation runs without a streamer, so no text is
      echoed while generating (used by the performance measurements).
    max_new_tokens: Upper bound on the number of newly generated tokens.

  Returns:
    The token-id tensor returned by `model.generate`.
  """
  # Render the chat messages to a prompt string that ends with the generation marker.
  text = tokenizer.apply_chat_template(
      sample,
      add_generation_prompt = True,
      tokenize = False,
  )

  # Build a streamer only when output should be echoed; skip_prompt keeps the
  # prompt itself out of the streamed text.
  streamer = None if measure_p else TextStreamer(tokenizer, skip_prompt=True)

  inputs = tokenizer(text, return_tensors = "pt").to("cuda")
  return model.generate(
      **inputs,
      max_new_tokens = max_new_tokens,
      temperature = 1.0, top_p = 0.95, top_k = 64,
      streamer = streamer,
  )

In [None]:
# Generate for the sampled prompt; measure_p keeps its default (False), so a
# TextStreamer is attached to the generation call.
model_output = run_model(random_sample['prompt'])

In [None]:
# Decode the first returned sequence back to text.
decoded_output = tokenizer.decode(model_output[0])
# NOTE(review): `extract_final_number` is neither defined nor imported in any
# cell shown in this notebook — confirm where it comes from; otherwise this
# cell raises NameError on a fresh kernel.
y = extract_final_number(decoded_output)
# Cell output: whether the extracted value equals the first reference answer.
random_sample['answer'][0] == y

## How Fragile is Mathematical Reasoning in Large Language Models?



In [None]:
import datasets

In [None]:
# NOTE(review): both sample sets come from the same call with the same
# arguments; no name/number perturbation is applied in this cell — confirm
# that this is intended.
samples_names = gsm8k.__getsamples__(n_samples=100, split="test")
samples_numbers = gsm8k.__getsamples__(n_samples=100, split="test")

# Group the two sample sets under the configuration names "names" and "numbers".
ds_pert = datasets.DatasetDict(
    {"names": samples_names,
     "numbers": samples_numbers})

In [None]:
def run_perturbation_experiment(dataset_dict):
  """
  Run the model on every sample of every perturbation configuration.

  Args:
    dataset_dict: Mapping from configuration name (e.g. "names", "numbers")
      to an iterable of samples, each exposing its chat prompt under the
      "prompt" key.

  Returns:
    A dict mapping each configuration name to the list of `run_model`
    outputs, in sample order.
  """
  experiment = {}
  for config in dataset_dict:
    # Walk the samples that belong to this configuration and hand run_model
    # the prompt only, matching how it is called elsewhere in the notebook.
    experiment[config] = [
        run_model(sample['prompt']) for sample in dataset_dict[config]
    ]

  return experiment

In [None]:
# experiment_output = run_perturbation_experiment(ds_pert)

In [None]:
def compute_perturbation_results(experiment_output):
  """
  Placeholder for summarising the outputs of a perturbation experiment.

  Named differently from `run_perturbation_experiment` so that defining this
  stub does not replace the function that actually runs the experiment.

  Args:
    experiment_output: The per-configuration outputs to be summarised.

  Returns:
    None; the computation is not implemented yet.
  """
  #TODO compute results from experiment
  pass

## Measure Gemma 3 Performance — Latency & Throughput on GPU

In [None]:
import time
import GPUtil
import numpy as np
import torch

1. Measure latency (i.e., amount of time it takes the model to produce a prediction for a single input sample) for *n* iterations.

In [None]:
def test_latency(sample, iterations=1):
  latencies = []
  gpus = GPUtil.getGPUs()

  if not gpus:
    raise ValueError("No GPUs found")
  gpu = gpus[0]

  for _ in range(iterations):
    start_time = time.time()
    run_model(sample['prompt'], measure_p=True)
    end_time = time.time()
    latencies.append(end_time-start_time)

  return latencies

In [None]:
# Number of timed generations; the same value is reused in the printed summary.
iterations = 10
latencies = test_latency(random_sample, iterations=iterations)

In [None]:
# Summarise the collected per-generation latencies: mean, maximum and minimum.
print(f"Average latencies per {iterations} iterations: {np.mean(latencies):.4f} seconds")
print(f"Maximum latency per {iterations} iterations: {np.max(latencies):.4f} seconds")
print(f"Minimum latency per {iterations} iterations: {np.min(latencies):.4f} seconds")

About 7 seconds per prediction — is that too expensive?

2. Measure throughput (i.e., the number of generated tokens per second) over *n* timed runs.

In [None]:
def benchmark_throughput(sample, runs=5, warmup=3):
  """
  Measure generation throughput in generated tokens per second.

  Args:
    sample: Chat prompt passed to `run_model`; it is also tokenized here to
      obtain the prompt length.
    runs: Number of timed generations.
    warmup: Number of untimed generations performed before timing starts.

  Returns:
    A `(throughput, runs)` tuple, where throughput is the total number of
    generated tokens divided by the elapsed seconds.
  """
  # TODO: could use batch size = 4 and 256 generated tokens, if time permits.
  for _ in range(warmup):
    run_model(sample, measure_p=True)

  # The prompt length is the same for every run, so tokenize once, outside the
  # timed region, and from the `sample` argument rather than a notebook global.
  # NOTE(review): run_model tokenizes the rendered text in a separate step —
  # confirm both paths yield the same prompt length.
  input_len = len(tokenizer.apply_chat_template(
      sample,
      add_generation_prompt = True,
      tokenize = True)[0])

  total_gen_tokens = 0

  # Let the warm-up work finish before the clock starts.
  torch.cuda.synchronize()
  start_time = time.perf_counter()
  for _ in range(runs):
    output_tensor = run_model(sample, measure_p=True)
    total_gen_tokens += output_tensor.shape[-1] - input_len
  # Wait for any queued GPU work so it is included in the measurement.
  torch.cuda.synchronize()
  elapsed = time.perf_counter() - start_time

  throughput = total_gen_tokens / elapsed

  return throughput, runs

In [None]:
# Pass the prompt itself (not the whole sample): benchmark_throughput forwards
# its `sample` argument straight to run_model.
throughput, runs = benchmark_throughput(sample=random_sample["prompt"])

In [None]:
# Report the measured generation rate and the number of timed runs.
print(f"Throughput: {throughput:.2f} tokens/sec over {runs} runs")