In [None]:
# ============================================
# 1. INSTALL LIBRARIES
# ============================================
#!pip uninstall -y torch torchvision torchaudio transformers trl accelerate datasets huggingface_hub
#!pip install -q torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
#!pip install -q transformers==4.45.2 trl==0.11.6 accelerate==1.1.1 datasets==3.1.0 huggingface_hub==0.28.1 sentencepiece pyarrow==18.0.0 evaluate tensorboard
!pip uninstall -y torch torchvision torchaudio transformers trl accelerate datasets huggingface_hub
!pip install -q torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q transformers==4.45.2 trl==0.11.6 accelerate==1.1.1 datasets==3.1.0 huggingface_hub==0.28.1 sentencepiece pyarrow==18.0.0
!pip install -q datasets trl sentencepiece huggingface_hub
!pip install evaluate
# ============================================
# 2. SETUP AND AUTHENTICATION
# ============================================
import torch
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
import evaluate

# Login to Hugging Face
# The access token must never be written into the notebook: it would be saved with
# the file and leak to anyone the notebook is shared with. Read it from the HF_TOKEN
# environment variable, and fall back to a hidden interactive prompt when it is unset.
from getpass import getpass

HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=HF_TOKEN)

# ============================================
# 3. CONFIGURE MODEL AND DIRECTORIES
# ============================================
# Hub id of the checkpoint that is fine-tuned; passed to from_pretrained() for both
# the model and the tokenizer.
base_model = "google/gemma-3-1b-it"
# Directory that receives the training checkpoints and the final saved model; the
# evaluation and interactive pipelines load the model back from this path.
output_dir = "./gemma-natural-farming-qa"

# ============================================
# 4. LOAD AND PREPARE THE DATASET
# ============================================
# JSON-lines file on the Colab filesystem. Each record is read via its "question"
# and "answer" fields when the dataset is formatted.
data_file = "/content/natural_farming_dataset_perplexity.jsonl"
# A single data file is exposed as the "train" split; the train/validation/test
# partition is created afterwards in this notebook.
dataset = load_dataset("json", data_files=data_file, split="train")

def format_dataset(sample):
    """Turn one raw Q&A record into a single-turn chat example.

    Args:
        sample: Mapping with a "question" entry and an "answer" entry.

    Returns:
        A dict with one key, "messages", holding two role/content dicts:
        the question as the "user" turn followed by the answer as the
        "assistant" turn.

    Raises:
        KeyError: If "question" or "answer" is missing from ``sample``.
    """
    # (role, source field) pairs in conversation order.
    turn_layout = (("user", "question"), ("assistant", "answer"))
    conversation = [
        {"role": role, "content": sample[field]}
        for role, field in turn_layout
    ]
    return {"messages": conversation}

# Convert every record to chat format and drop the raw columns so that only
# "messages" remains. `column_names` is the documented list-of-names form expected
# by `remove_columns` (the Features mapping only worked incidentally).
formatted_dataset = dataset.map(format_dataset, remove_columns=dataset.column_names)

# Split into 80% train, 10% validation, 10% test
# First carve off 20% as a hold-out, then halve the hold-out. Both splits use a fixed
# seed so the partition is reproducible across runs.
split_dataset = formatted_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
val_test_split = split_dataset["test"].train_test_split(test_size=0.5, shuffle=True, seed=42)

dataset_dict = {
    "train": split_dataset["train"],
    "validation": val_test_split["train"],
    "test": val_test_split["test"]
}

# Invariant: the three splits partition the formatted data (no record lost or duplicated
# by the two-stage split).
assert sum(len(part) for part in dataset_dict.values()) == len(formatted_dataset), \
    "train/validation/test sizes do not add up to the full dataset"

print("‚úÖ Dataset Split Summary:")
print(f"Train: {len(dataset_dict['train'])} | Validation: {len(dataset_dict['validation'])} | Test: {len(dataset_dict['test'])}")
print("\nExample data sample:")
print(dataset_dict["train"][0]["messages"])

# ============================================
# 5. LOAD MODEL AND TOKENIZER
# ============================================
# Load the base checkpoint for causal language modelling.
#   torch_dtype="auto"          -> dtype is chosen by the library rather than fixed here
#                                  (the recorded run reports torch.bfloat16).
#   device_map="auto"           -> device placement is delegated to the library
#                                  (the recorded run reports cuda:0).
#   attn_implementation="eager" -> explicitly requests the eager attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager"
)
# Tokenizer comes from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Report where the model ended up and in which dtype, as a quick sanity check.
print(f"‚úÖ Model loaded on {model.device} | dtype: {model.dtype}")



Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Found existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
[0mFound existing installation: accelerate 1.11.0
Uninstalling accelerate-1.11.0:
  Successfully uninstalled accelerate-1.11.0
Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Found existing installation: huggingface-hub 0.35.3
Uninstalling huggingface-hub-0.35.3:
  Successfully uninstalled huggingface-hub-0.35.3
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4001 [00:00<?, ? examples/s]

‚úÖ Dataset Split Summary:
Train: 3200 | Validation: 400 | Test: 401

Example data sample:
[{'content': 'How deep should irrigation water penetrate?', 'role': 'user'}, {'content': 'Irrigation should wet soil to root depth: shallow-rooted crops 15-20 cm, medium 30-40 cm, deep-rooted 50-60 cm. Light frequent irrigation promotes shallow roots; deep infrequent irrigation encourages deep root systems making plants more drought-tolerant. Check with probe after irrigation.', 'role': 'assistant'}]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

‚úÖ Model loaded on cuda:0 | dtype: torch.bfloat16


In [None]:
# ============================================
# 6. CONFIGURE THE TRAINING PROCESS
# ============================================
# Evaluation is enabled explicitly: without `eval_strategy` the validation split handed
# to the trainer is never evaluated and only the training loss is logged. Evaluating
# and saving on the same schedule ("epoch") is what allows `load_best_model_at_end`
# to pick the checkpoint with the lowest validation loss.
sft_config = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=3,
    # No explicit sequence-length cap is set; the trainer's default is used.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,      # keep evaluation memory in line with training
    gradient_accumulation_steps=4,     # 2 x 4 = 8 sequences per optimizer step per device
    optim="adamw_torch_fused",
    logging_steps=10,
    eval_strategy="epoch",             # run validation at the end of every epoch
    save_strategy="epoch",             # must match eval_strategy for best-model tracking
    load_best_model_at_end=True,       # restore the best checkpoint once training ends
    metric_for_best_model="eval_loss", # lower validation loss is better
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    bf16=torch.cuda.is_bf16_supported(),  # use bf16 autocast only where the GPU supports it
    fp16=False,
    push_to_hub=False,
    report_to="tensorboard"
)

# ============================================
# 7. TRAIN AND VALIDATE MODEL
# ============================================
# Supervised fine-tuning on the chat-formatted "messages" examples.
# NOTE(review): eval_dataset is supplied here, but whether validation actually runs
# during training is decided by the evaluation settings inside sft_config — confirm.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    processing_class=tokenizer,
)

print("üöÄ Starting fine-tuning...")
# Runs the full training loop configured by sft_config.
trainer.train()

print("üíæ Saving final model...")
# Write the trainer's current model to output_dir, the path the later pipeline
# cells load from.
trainer.save_model(output_dir)



Tokenizing train dataset:   0%|          | 0/3200 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


üöÄ Starting fine-tuning...


Step,Training Loss
10,3.5205
20,2.0289
30,1.7345
40,1.4556
50,1.2015
60,0.8399
70,0.854
80,0.7048
90,0.6827
100,0.534


üíæ Saving final model...


In [None]:
!pip install rouge_score
# ============================================
# 8. VALIDATE MODEL PERFORMANCE
# ============================================
print("\nüîç Validating model on validation set...")
model.eval()

# Use a simple text-generation pipeline
val_pipe = pipeline("text-generation", model=output_dir, tokenizer=tokenizer)

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

generated_texts = []
reference_texts = []

for i, sample in enumerate(dataset_dict["validation"]):
    user_msg = [{"role": "user", "content": sample["messages"][0]["content"]}]
    prompt = tokenizer.apply_chat_template(user_msg, tokenize=False, add_generation_prompt=True)
    output = val_pipe(prompt, max_new_tokens=128, num_return_sequences=1)[0]["generated_text"][len(prompt):].strip()
    generated_texts.append(output)
    reference_texts.append(sample["messages"][1]["content"])
    if i < 3:
        print(f"\nExample {i+1}:")
        print(f"Q: {sample['messages'][0]['content']}")
        print(f"Model: {output}")
        print(f"Ref: {sample['messages'][1]['content']}")

# Compute metrics
bleu_score = bleu.compute(predictions=generated_texts, references=reference_texts)
rouge_score = rouge.compute(predictions=generated_texts, references=reference_texts)

print("\nüìä Validation Metrics:")
print(f"BLEU Score: {bleu_score['bleu']:.4f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b606ff1e798a57e692bcd985472110f75a5a0c212c44d7cac4f725af1c719bda
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2

üîç Validating model on validation set...


Device set to use cuda:0


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]


Example 1:
Q: How does soil testing help farmers?
Model: Soil testing helps farmers understand pH, nutrient levels, organic matter content, and potential problems, guiding amendment decisions, preventing over-application, tracking improvements over time, identifying deficiencies early, and optimizing inputs for crop needs and economic efficiency.
Ref: Degraded soil can be restored through adding organic matter consistently, eliminating tillage, growing diverse crops including deep-rooted species, using cover crops and green manures, applying bio-inputs to rebuild microbial populations, controlling erosion, and allowing natural regeneration over 3-7 years.

Example 2:
Q: How can farmers optimize crop maturity?
Model: crop maturity significantly affects farm productivity by influencing resource use efficiency, determining competition among plants, affecting pest and disease pressure, and impacting overall yield and quality. Natural farming emphasizes optimal crop maturity based on local

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



üìä Validation Metrics:
BLEU Score: 0.8518
ROUGE-L: 0.8591


In [None]:
!pip install bert_score
bertscore = evaluate.load("bertscore")
results = bertscore.compute(predictions=generated_texts, references=reference_texts, lang="en")
print(sum(results["f1"]) / len(results["f1"]))

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.1/61.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.9777223162353039


In [None]:

# ============================================
# 9. TEST INTERACTIVELY
# ============================================
print("\n--- Interactive Testing ---")
test_pipe = pipeline("text-generation", model=output_dir, tokenizer=tokenizer)

while True:
    try:
        question = input("\nEnter your question (or type 'exit' to quit): ").strip()
    except (KeyboardInterrupt, EOFError):
        # Stopping the cell (or a closed stdin) while waiting for input is a normal
        # way to leave the loop; exit cleanly instead of leaving a traceback behind.
        print("üëã Exiting...")
        break

    if question.lower() == "exit":
        print("üëã Exiting...")
        break
    if not question:
        # Nothing was typed; ask again rather than sending an empty turn to the model.
        continue

    # Hand the conversation to the pipeline directly so it applies the chat template and
    # tokenizes in one step. Pre-rendering the prompt to a string and letting the
    # pipeline tokenize it again can insert the special tokens (e.g. BOS) twice.
    messages = [{"role": "user", "content": question}]
    outputs = test_pipe(messages, max_new_tokens=256)

    # For chat input the returned conversation ends with the generated assistant turn.
    answer = outputs[0]["generated_text"][-1]["content"].strip()
    print(f"\nüß† Answer:\n{answer}")


--- Interactive Testing ---


Device set to use cuda:0



Enter your question (or type 'exit' to quit): what is natural farming?

üß† Answer:
Natural farming is a sustainable agricultural method that avoids chemical fertilizers, pesticides, and intensive tillage, relying on ecological processes and indigenous seeds. It focuses on soil health, biodiversity, and long-term productivity without external chemical inputs.


KeyboardInterrupt: Interrupted by user

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
import shutil, os

# Directory the trainer wrote its checkpoint-<step> folders into.
checkpoint_root = "/content/gemma-natural-farming-qa"

# Path to the checkpoint to export. The step number depends on dataset size, batch
# size and epoch count, so it is discovered instead of hard-coded (previously
# "checkpoint-1200"): this resolves to the highest-numbered checkpoint-* folder.
best_ckpt = get_last_checkpoint(checkpoint_root)
if best_ckpt is None:
    raise FileNotFoundError(f"No checkpoint-* directory found in {checkpoint_root}")

# Where to save clean offline model
save_dir = "./best_model"

# Load model and tokenizer from the checkpoint.
# torch_dtype="auto" keeps the dtype stored in the checkpoint, so the export is not
# silently up-cast to a larger default precision.
model = AutoModelForCausalLM.from_pretrained(best_ckpt, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(best_ckpt)

# Save only what's needed for inference (weights, config, tokenizer files), leaving
# the optimizer/scheduler state of the training checkpoint behind.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"‚úÖ Saved minimal offline model to {save_dir}")


‚úÖ Saved minimal offline model to ./best_model


In [None]:
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
import shutil

# 1. Authenticate the Colab user and build a Drive v3 client.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# 2. Zip the exported model folder.
# make_archive appends ".zip" to the base name and returns the final archive path,
# so the path is taken from its return value instead of being string-edited.
local_folder = "/content/best_model"
zip_path = shutil.make_archive("/content/best_model", 'zip', local_folder)
print(f"‚úÖ Zipped folder to {zip_path}")

# 3. Upload the zip to a specific Drive folder by ID.
folder_id = "1v7wyPcLmawtlKgFsOqoMdcB8qMZ7fqPj"
file_metadata = {
    'name': 'best_model.zip',
    'parents': [folder_id]
}
# The archive holds full model weights, far too large for a single-request upload.
# A resumable upload sends it in chunks and can continue after a transient failure.
media = MediaFileUpload(zip_path, mimetype='application/zip', resumable=True)
upload_request = drive_service.files().create(body=file_metadata, media_body=media, fields='id')

file = None
while file is None:
    # next_chunk() returns (status, None) while in progress and (None, response) when done.
    status, file = upload_request.next_chunk()
    if status is not None:
        print(f"Upload progress: {int(status.progress() * 100)}%")

print(f"‚úÖ Uploaded zip to Drive folder! File ID: {file.get('id')}")


‚úÖ Zipped folder to /content/best_model.zip
