In [1]:
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [None]:
"""
Full precision fine-tuning script for Google Gemma-3-4B-IT model
This script provides a complete implementation to:
1. Set up the environment
2. Load and prepare the dataset
3. Configure the model for full precision training
4. Fine-tune the model
5. Evaluate the fine-tuned model
6. Save and export the model
"""

import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score
import evaluate

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set your Hugging Face token
os.environ["HF_TOKEN"] = ""  # Replace with your actual token

# Parameters for fine-tuning
MODEL_NAME = "google/gemma-3-1b-it"
OUTPUT_DIR = "./gemma-3-1b-it-finetuned"
DATASET_NAME = "/content/data"  # Replace with your dataset
MAX_LENGTH = 512
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100
LOGGING_STEPS = 100
EVAL_STEPS = 500
SAVE_STEPS = 1000
FP16 = False  # Set to False for full precision training



Using device: cuda


In [4]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    token=os.environ["HF_TOKEN"]
)
tokenizer.pad_token = tokenizer.eos_token

# Load model in full precision
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,  # Full precision (FP32)
    trust_remote_code=True,
    token=os.environ["HF_TOKEN"]
)

print(f"Model loaded: {MODEL_NAME}")
print(f"Model parameters: {model.num_parameters()}")

# Load dataset (replace with your own dataset)
# This is a placeholder - adjust according to your dataset format
dataset = load_dataset(DATASET_NAME)

# Define prompt template for instruction tuning
def create_prompt_format(sample):
    """
    Format the instruction and response into a prompt template.
    Adjust this based on your specific dataset format and requirements.
    """
    # Extract user and assistant messages from the 'messages' list
    user_message = next((msg["content"] for msg in sample["messages"] if msg["role"] == "user"), None)
    assistant_message = next((msg["content"] for msg in sample["messages"] if msg["role"] == "assistant"), None)

    # Handle cases where user or assistant message is not found
    if user_message is None or assistant_message is None:
        return {"formatted_prompt": ""}  # or handle it differently

    # Create the prompt
    prompt = f"""<start_of_turn>user
{user_message}
<end_of_turn>
<start_of_turn>model
{assistant_message}
<end_of_turn>"""

    return {"formatted_prompt": prompt}


Model loaded: google/gemma-3-1b-it
Model parameters: 999885952


In [5]:

# Apply formatting to the dataset
formatted_dataset = dataset.map(create_prompt_format)

# Tokenize function
def tokenize_function(examples):
    return tokenizer(
        examples["formatted_prompt"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )

# Tokenize the dataset
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_dataset["train"].column_names,
)

# Split dataset
if "validation" not in tokenized_dataset:
    # Create a train/validation split if not already present
    tokenized_dataset = tokenized_dataset["train"].train_test_split(
        test_size=0.1, seed=42
    )
    train_dataset = tokenized_dataset["train"]
    eval_dataset = tokenized_dataset["test"]
else:
    train_dataset = tokenized_dataset["train"]
    eval_dataset = tokenized_dataset["validation"]

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Not using masked language modeling
)



In [10]:
!pip install accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [7]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,  # Reduced batch size
    eval_steps=EVAL_STEPS,
    save_steps=SAVE_STEPS,
    warmup_steps=WARMUP_STEPS,
    logging_steps=LOGGING_STEPS,
    # The argument `evaluation_strategy` was renamed to `eval_strategy` in newer versions
    # of the transformers library.
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    fp16=FP16,  # Set to False for full precision
    gradient_accumulation_steps=2,  # Reduced accumulation steps
    logging_dir="./logs",
    report_to="tensorboard",
    save_total_limit=3,
)

In [10]:
# Add this function definition before initializing the Trainer
def compute_metrics(eval_pred):
    """
    Computes and returns a dictionary of metrics (e.g., accuracy, loss)
    based on the model's predictions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
print("Starting training...")
trainer.train()

# Save the model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

# Evaluate the model on the test set
results = trainer.evaluate()
print(f"Evaluation results: {results}")


  trainer = Trainer(
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Starting training...


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 32.12 MiB is free. Process 54315 has 14.71 GiB memory in use. Of the allocated memory 14.49 GiB is allocated by PyTorch, and 91.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)