<a href="https://colab.research.google.com/github/saai07/LLM_finetuning/blob/main/gemma-3-270m-it.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# Install the extra libraries this notebook needs. `%pip` (instead of `!pip`)
# installs into the environment of the running kernel, and `-q` keeps the
# download progress bars out of the saved output.
# NOTE(review): versions are unpinned; the recorded run resolved trl 0.28.0.
%pip install -q trl accelerate gradio

Collecting trl
  Downloading trl-0.28.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.28.0-py3-none-any.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.5/540.5 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.28.0


In [2]:
import transformers
import trl
import datasets
import accelerate
import torch
import gradio as gr

In [3]:
# Report which CUDA device is visible to PyTorch and how much memory it offers.
if not torch.cuda.is_available():
  print("no gpu available")
else:
  device = torch.cuda.current_device()
  device_props = torch.cuda.get_device_properties(device)
  gpu_name = torch.cuda.get_device_name(device)

  # `total_memory` is reported in bytes; it is shown below in both MB and GB.
  total_memory = device_props.total_memory
  print(f"GPU: {gpu_name}")
  print(f"Total Memory:     {total_memory / 1e6:.2f} MB | {total_memory / 1e9:.2f} GB")

GPU: Tesla T4
Total Memory:     15637.09 MB | 15.64 GB


# Setup of Base model

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub id of the checkpoint used throughout this notebook.
model_name = "google/gemma-3-270m-it"

# Loading options, collected in one place so they are easy to tweak.
model_load_kwargs = {
    "device_map": "auto",
    "attn_implementation": "eager",
}

# Downloads the checkpoint on first use (see the progress bars in the output).
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/236 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Gemma requires numbers (token IDs) as input.

We turn strings into tokens with a tokenizer.

In [8]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Report where the model was placed and which dtype its weights use.
for info_line in (
    f"[INFO] Model on device: {model.device}",
    f"[INFO] Model using dtype: {model.dtype}",
):
    print(info_line)

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

[INFO] Model on device: cuda:0
[INFO] Model using dtype: torch.bfloat16


In [9]:
# Sanity check: calling the tokenizer on a string returns a mapping with the
# token ids (`input_ids`) and a matching `attention_mask`.
tokenizer(" hello i am sai ")

{'input_ids': [2, 29104, 858, 1006, 44951, 236743], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [10]:
# Tokenize straight to PyTorch tensors (a batch of one) and move them to the
# device the model actually lives on, instead of hard-coding "cuda" — the
# model was placed with `device_map="auto"`, so this also works without a GPU.
inputs = tokenizer("Hello my name is Sai", return_tensors="pt").to(model.device)

# This is only an inspection of the forward pass, so skip autograd bookkeeping.
with torch.no_grad():
  outputs = model(**inputs)

# Show which fields the model output exposes.
outputs.keys()


odict_keys(['logits', 'past_key_values'])

# Loading dataset

In [11]:
from datasets import load_dataset

# Fetch the dataset from the Hugging Face Hub.
dataset = load_dataset("mrdbourke/FoodExtract-1k")

# Compute the size outside the f-string: reusing double quotes inside a
# replacement field (f"...{d["train"]}...") is a SyntaxError before Python 3.12.
num_train_samples = len(dataset["train"])
print(f"[INFO] Number of sample  in the datasets :{num_train_samples}")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/616k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1420 [00:00<?, ? examples/s]

[INFO] Number of sample  in the datasets :1420


In [15]:
import random
def get_random_id(dataset, rng=None):
  """Return a random valid index into ``dataset``.

  Args:
    dataset: Any sized container (anything supporting ``len()``).
    rng: Optional ``random.Random`` instance to draw from. Pass a seeded
      instance to make the pick reproducible; when omitted, the module-level
      ``random`` generator is used.

  Returns:
    An integer in ``[0, len(dataset) - 1]``.

  Raises:
    ValueError: If ``dataset`` is empty.
  """
  num_rows = len(dataset)
  if num_rows == 0:
    # Fail with a clear message instead of randint's generic "empty range".
    raise ValueError("cannot pick a random index from an empty dataset")
  source = random if rng is None else rng
  return source.randint(0, num_rows - 1)


# Draw one row from the training split to use as the running example below.
train_split = dataset["train"]
random_id = get_random_id(train_split)
random_sample = train_split[random_id]

In [20]:
import ast

# Pull the three fields of interest out of the sampled record.
example_input = random_sample['sequence']
example_output = random_sample['gpt-oss-120b-label']
example_output_condensed = random_sample['gpt-oss-120b-label-condensed']

print(f"[INFO] Input:\n{example_input}")
print()
print("[INFO] Example structured output (what we want our model to learn to predict):")
# The label is a string that comes from a downloaded dataset, so parse it with
# `ast.literal_eval` (literals only) rather than `eval`, which would execute
# arbitrary code embedded in the data.
# NOTE(review): assumes the label is a plain Python literal — confirm.
print(ast.literal_eval(example_output))
print()
print("[INFO] Example output condensed (we'll train our model to predict the condensed output since it uses less tokens than JSON):")
print(example_output_condensed)

[INFO] Input:
From a crisp white plate, a top‑down view captures each element as a distinct component: a small porcelain ramekin in the upper left corner holds glossy dark soy sauce, its surface reflecting the light; beside it, a neat mound of al dente pasta risoni is arranged in a loose nest, lightly tossed with a shimmering veil of extra‑virgin olive oil that creates tiny pearlescent droplets; a delicate dusting of fine white coconut flour rests in a shallow circle next to the pasta, its powdery texture contrasting with the glossy oil; finely chopped bright green chives are scattered in a fine garnish pattern over the olive‑oil drizzle, adding a burst of color; a miniature clear soup bowl contains steaming pho broth, visible strands of rice noodles, thin slices of beef and a garnish of fresh herbs, the steam hinted by a faint mist in the photograph; a copper‑colored spoon holds a rich amber chicken curry, speckled with red chili flakes and flecks of turmeric; a crisp golden hash brow

In [21]:
# Two-letter tag codes mapped to their long names. The fine-tuned model will
# assign these tags to text so samples can be filtered by type later on.
tags_dict = {
    "np": "nutrition_panel",
    "il": "ingredient list",
    "me": "menu",
    "re": "recipe",
    "fi": "food_items",
    "di": "drink_items",
    "fa": "food_advertistment",
    "fp": "food_packaging",
}

# Format the dataset into LLM-style inputs/outputs

In [23]:
# Display the sampled record in full to see its raw fields before formatting.
random_sample

{'sequence': 'From a crisp white plate, a top‑down view captures each element as a distinct component: a small porcelain ramekin in the upper left corner holds glossy dark soy sauce, its surface reflecting the light; beside it, a neat mound of al dente pasta risoni is arranged in a loose nest, lightly tossed with a shimmering veil of extra‑virgin olive oil that creates tiny pearlescent droplets; a delicate dusting of fine white coconut flour rests in a shallow circle next to the pasta, its powdery texture contrasting with the glossy oil; finely chopped bright green chives are scattered in a fine garnish pattern over the olive‑oil drizzle, adding a burst of color; a miniature clear soup bowl contains steaming pho broth, visible strands of rice noodles, thin slices of beef and a garnish of fresh herbs, the steam hinted by a faint mist in the photograph; a copper‑colored spoon holds a rich amber chicken curry, speckled with red chili flakes and flecks of turmeric; a crisp golden hash brow