In [None]:
# 1. INSTALL LIBRARIES
# --------------------
# Install necessary Hugging Face libraries for training and datasets.
!pip install -q -U torch tensorboard
!pip install -q -U transformers datasets accelerate evaluate trl sentencepiece

# 2. SETUP AND AUTHENTICATION
# ---------------------------
import torch
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, pipeline
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from google.colab import userdata # Use this if you're storing the token as a Colab secret

# Log in to the Hugging Face Hub to access the model.
# The token is deliberately NOT written into the notebook. It is read from the
# HF_TOKEN environment variable or, when that is unset or empty, from a Colab
# secret named "HF_TOKEN" (add it in the Colab "Secrets" panel and grant this
# notebook access to it).
# You can create a token here: https://huggingface.co/settings/tokens
HF_TOKEN = os.environ.get("HF_TOKEN") or userdata.get("HF_TOKEN")
login(token=HF_TOKEN)

# 3. CONFIGURE MODEL AND DIRECTORIES
# ----------------------------------
# Checkpoint that will be fine-tuned.
base_model = "google/gemma-3-1b-it"
# Directory the fine-tuned model is written to; it doubles as the model's name.
output_dir = "./gemma-natural-farming-qa"

# 4. LOAD AND PREPARE THE DATASET
# -------------------------------
# Location of the local JSONL file holding the training records.
data_file = "/content/nf_dataset_augmented.jsonl"

# Read the whole file with the "json" loader as a single "train" split.
dataset = load_dataset(
    path="json",
    data_files=data_file,
    split="train",
)

# Define a function to format your data into the required conversational format.
# The Gemma instruction-tuned model expects a specific chat structure.
def format_dataset(sample):
    """
    Wrap one question/answer record in the conversational "messages" structure.

    Args:
        sample: A mapping with "question" and "answer" entries.

    Returns:
        A dict whose single "messages" key holds a two-turn conversation:
        the question as the "user" turn, then the answer as the "assistant" turn.
    """
    user_turn = {"role": "user", "content": sample["question"]}
    assistant_turn = {"role": "assistant", "content": sample["answer"]}
    return {"messages": [user_turn, assistant_turn]}

# Apply the formatting function to the entire dataset. All original columns are
# dropped, so each row keeps only what format_dataset returns.
chat_dataset = dataset.map(format_dataset, remove_columns=dataset.column_names)

# Split the dataset into a training set (80%) and a test set (20%).
# The fixed seed makes the shuffle, and therefore the split, reproducible
# across kernel restarts.
formatted_dataset = chat_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

# Print an example to verify the format
print("Example of a formatted data sample:")
print(formatted_dataset["train"][0]["messages"])

# 5. LOAD MODEL AND TOKENIZER
# ---------------------------
# Fetch the tokenizer and the pre-trained weights for `base_model` from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # "auto" takes the dtype from the checkpoint (its config / stored weights),
    # not from the GPU that happens to be attached.
    torch_dtype="auto",
    # Let the library decide where to place the weights.
    device_map="auto",
    # Use the eager attention implementation.
    attn_implementation="eager",
)

# Report where the model ended up and in which precision.
print(f"Model loaded on device: {model.device}")
print(f"Model data type: {model.dtype}")

In [None]:
# 6. CONFIGURE THE TRAINING PROCESS
# ---------------------------------
# Use SFTConfig to set up all the training hyperparameters.
sft_config = SFTConfig(
    output_dir=output_dir,                  # Where checkpoints and the final model are written
    num_train_epochs=3,                     # Number of times to train on the whole dataset
    max_length=512,                         # Maximum tokenized sequence length per example
    per_device_train_batch_size=2,          # Batch size for training
    gradient_accumulation_steps=4,          # Effective batch size will be 4 * 2 = 8 per device
    optim="adamw_torch_fused",              # Use the fused AdamW optimizer for better performance
    logging_steps=1,                        # Log metrics every step
    save_strategy="epoch",                  # Save a model checkpoint at the end of each epoch
    eval_strategy="epoch",                  # Evaluate at the end of each epoch; without this the
                                            # trainer's eval dataset is never used (default: "no")
    learning_rate=2e-5,                     # The learning rate for the optimizer
    lr_scheduler_type="cosine",             # Cosine learning-rate schedule
    bf16=torch.cuda.is_bf16_supported(),    # Use bfloat16 precision if available
    fp16=False,                             # float16 mixed precision is not used
    push_to_hub=False,                      # Set to True to automatically push to Hugging Face Hub
    report_to="tensorboard",                # Log results for visualization in TensorBoard
)

# 7. CREATE AND START THE TRAINER
# -------------------------------
# SFTTrainer drives the whole supervised fine-tuning loop.
train_split = formatted_dataset["train"]
eval_split = formatted_dataset["test"]  # Held-out split, passed as the eval dataset

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_split,
    eval_dataset=eval_split,
    processing_class=tokenizer,
)

# Kick off fine-tuning.
print("Starting the fine-tuning process...")
trainer.train()

# Persist the final model and tokenizer to the configured output directory.
trainer.save_model()
print(f"Model successfully saved to {output_dir}")

In [None]:


# 8. TEST THE FINE-TUNED MODEL
# ----------------------------
print("\n--- Testing the Fine-Tuned Model ---")

# Load the fine-tuned model from output_dir for inference, reusing the tokenizer
pipe = pipeline("text-generation", model=output_dir, tokenizer=tokenizer)

while True:
    # Take user input
    question = input("\nEnter your question (or type 'exit' to quit): ").strip()

    # Exit condition
    if question.lower() == "exit":
        print("Exiting...")
        break

    # Nothing to answer: ask again instead of generating from an empty turn
    if not question:
        continue

    # Hand the conversation itself to the pipeline so that it applies the
    # model's chat template (including the generation prompt) while tokenizing.
    # Rendering the template to a string first and passing that string would
    # make the pipeline tokenize it with special tokens enabled, putting a
    # second <bos> in front of the one Gemma's chat template already emits.
    test_messages = [{"role": "user", "content": question}]
    outputs = pipe(test_messages, max_new_tokens=256)

    # For chat input, "generated_text" is the conversation with the newly
    # generated assistant message appended as its last element.
    answer = outputs[0]["generated_text"][-1]["content"]

    # Print the generated answer
    print(f"\nGenerated Answer:\n{answer.strip()}")
