<a href="https://colab.research.google.com/github/pradraju/re/blob/main/ganesha_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Uninstall potentially conflicting packages
# !pip uninstall -y sentence-transformers \
#                   torchvision \
#                   torchaudio \
#                   google-cloud-aiplatform \
#                   gcsfs \
#                   fsspec \
#                   google-cloud-storage

# Install the latest versions of the core training stack
!pip install torch \
            transformers \
            accelerate \
            peft \
            trl \
            bitsandbytes \
            datasets \
            psutil \
            google-cloud-storage \
            gcsfs \
            fsspec \
            numpy






In [2]:
import torch, transformers, accelerate, peft, trl, bitsandbytes as bnb
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("peft:", peft.__version__)
print("trl:", trl.__version__)
print("bnb:", bnb)

torch: 2.7.1+cu126
transformers: 4.53.1
accelerate: 1.8.1
peft: 0.16.0
trl: 0.19.1
bnb: <module 'bitsandbytes' from '/usr/local/lib/python3.11/dist-packages/bitsandbytes/__init__.py'>


In [3]:
# 📌 Step 1: Authenticate Colab with Google Cloud
from google.colab import auth
auth.authenticate_user()

# 📌 Step 2: Download the file from GCS
from google.cloud import storage
import os, json

# ✅ Set your actual values here
BUCKET_NAME = "ganesha-training-20250714"
BLOB_PATH = "data/training_data.jsonl"  # ✅ UPDATE THIS if your file is not at root
DEST_PATH = "/content/training_data.jsonl"

print(f"\n📥 Downloading '{BLOB_PATH}' from bucket '{BUCKET_NAME}'...")
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(BLOB_PATH)

if not blob.exists():
    raise FileNotFoundError(f"❌ Blob '{BLOB_PATH}' does not exist in bucket '{BUCKET_NAME}'")

blob.download_to_filename(DEST_PATH)
print(f"✅ Download complete. File saved to: {DEST_PATH}")

# 📌 Step 3: Preview a few examples
print("\n🔍 Verifying structure of first 3 examples:")
with open(DEST_PATH, "r") as f:
    examples = [json.loads(line.strip()) for _, line in zip(range(3), f)]

for i, ex in enumerate(examples):
    print(f"\n🧪 Example {i+1}:")
    print("🔤 OCR Text:", ex["instruction"]["ocr_text"][:200].replace("\n", " "), "...")
    print("👤 User Profile:", json.dumps(ex["instruction"]["user_profile"], indent=2))
    print("📦 Output (truncated):", ex["output"][:300], "...")



📥 Downloading 'data/training_data.jsonl' from bucket 'ganesha-training-20250714'...
✅ Download complete. File saved to: /content/training_data.jsonl

🔍 Verifying structure of first 3 examples:

🧪 Example 1:
🔤 OCR Text: NUTRITION INFORMATION (per 100g): Energy 240 kcal Fat 2O.0g   of which saturates 6.67g Carbohydrate 0.0g Protein 16.0g Sa1t 0.8175g ...
👤 User Profile: {
  "dietary_restrictions": [
    "halal"
  ],
  "health_conditions": [
    "celiac disease",
    "kidney disease"
  ],
  "allergies": [
    "fish",
    "sesame"
  ]
}
📦 Output (truncated): {"ingredient_flags":[],"nutrition_flags":[],"processing_flags":[],"claim_flags":[],"nutrition_values":{"Energy":"240 kcal","Fat":"20.0g","Saturates":"6.67g","Carbohydrate":"0.0g","Protein":"16.0g","Salt":"0.8175g"},"nutrient_dv":{},"allergens":[],"dietary_restrictions":[],"raw_ingredients_text":""," ...

🧪 Example 2:
🔤 OCR Text: NUTRITION INFORMATION (per 100g): Energy 0 kcal Fat 0.0g   of which saturates 0.0g Carbohydrate 0.0g   of w

In [4]:
from transformers import AutoTokenizer
from datasets import Dataset
import json

# === Step 1: Load Data ===
DATA_PATH = "/content/training_data.jsonl"
try:
    with open(DATA_PATH) as f:
        raw_data = [json.loads(line) for line in f]
except FileNotFoundError:
    print(f"ERROR: The data file was not found at {DATA_PATH}")
    print("Please make sure you have uploaded 'training_data.jsonl' to your Colab session.")
    # Create a dummy file to prevent further errors
    with open(DATA_PATH, "w") as f:
        f.write("")
    raw_data = []


# === Step 2: Initialize Tokenizer ===
# Using Phi-3 as the target model
model_id = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Set pad token to eos token for open-end generation
tokenizer.pad_token = tokenizer.eos_token


# === Step 3: Build Prompt Function ===
# This function creates the full conversational prompt
def format_example(ex):
    # Ensure all parts of the instruction exist
    ocr = ex.get("instruction", {}).get("ocr_text", "").strip()
    profile_data = ex.get("instruction", {}).get("user_profile", {})
    profile = json.dumps(profile_data, indent=2)
    response = ex.get("output", "").strip()

    # Construct the prompt using the model's required chat template structure
    prompt = f"""<|user|>
Expert food analyst. OCR packaging text - try to fix OCR errors.

Contextual health analysis: identify genuine concerns appropriate for this product type. Expert judgment over mechanical rules.

OCR Text:
{ocr}

User Profile:
{profile}
<|end|>
<|assistant|>
{response}<|end|>"""
    return {"prompt": prompt}

# === Step 4: Format Prompts and Create Dataset ===
if raw_data:
    formatted_prompts = [format_example(ex) for ex in raw_data]
    dataset = Dataset.from_list(formatted_prompts)
else:
    dataset = Dataset.from_list([{"prompt": ""}]) # Handle empty data case


# === Step 5: Correct Tokenization Function with Label Masking ===
# This is the magic token that separates the user prompt from the assistant response.
# We will mask everything up to and including this token.
assistant_marker = "<|assistant|>"
assistant_token_ids = tokenizer.encode(assistant_marker, add_special_tokens=False)

def tokenize_and_mask(example):
    # Tokenize the full prompt
    tokenized = tokenizer(
        example["prompt"],
        truncation=True,
        padding=False,  # Padding is handled by the data collator
        max_length=2048, # Increased for potentially long JSON outputs
    )

    # Create a labels copy from the input_ids
    labels = tokenized["input_ids"].copy()

    # Find the start of the assistant's response
    # We search for the sequence of tokens representing "<|assistant|>"
    for i in range(len(labels) - len(assistant_token_ids) + 1):
        if labels[i:i+len(assistant_token_ids)] == assistant_token_ids:
            response_start_index = i + len(assistant_token_ids)

            # Mask all tokens from the beginning of the prompt to the start of the response
            for j in range(response_start_index):
                labels[j] = -100 # -100 is the standard ignore_index for loss functions

            tokenized["labels"] = labels
            return tokenized

    # If the assistant marker isn't found (e.g., due to truncation), we skip the example
    return {}

# === Step 6: Tokenize the Dataset ===
if raw_data:
    tokenized_dataset = dataset.map(tokenize_and_mask, batched=False).filter(
        lambda example: len(example) > 0 and "labels" in example
    )
    print(f"✅ Successfully tokenized and masked {len(tokenized_dataset)} examples.")
    # You can inspect an example to see the -100 masking
    print("\nSample labels (first 100 tokens):")
    print(tokenized_dataset[0]['labels'][:100])
else:
    tokenized_dataset = dataset # Keep it empty if no data
    print("⚠️ No data was processed. The 'tokenized_dataset' is empty.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Filter:   0%|          | 0/46 [00:00<?, ? examples/s]

✅ Successfully tokenized and masked 46 examples.

Sample labels (first 100 tokens):
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [17]:
import torch
import gc
import os
import json
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
# Correctly import the PEFT utility
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training
from trl import SFTTrainer

# === 0️⃣ SETUP & MEM MANAGEMENT ===
gc.collect()
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# === 1️⃣ LOAD TOKENIZER + MODEL + LoRA ===
model_id = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
# Load the base model, DO NOT wrap it with PEFT yet.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb,
    device_map="auto",
    trust_remote_code=True,
)

# ✅ FIX: Prepare the quantized model for training. This is the best practice.
model = prepare_model_for_kbit_training(model)

# Define the LoRA config, but DO NOT apply it here.
peft_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.05,
)

# ❌ REMOVED: Do not manually wrap the model with get_peft_model here.
# model = get_peft_model(model, peft_cfg)


# === 2️⃣ LOAD & FORMAT DATA ===
DATA_PATH = "/content/training_data.jsonl"
raw_ds = load_dataset("json", data_files=DATA_PATH, split="train")

def format_chat_template(ex):
    ocr     = ex["instruction"].get("ocr_text","").strip()
    profile = json.dumps(ex["instruction"].get("user_profile",{}))
    resp    = ex.get("output","").strip()
    return {
        "messages": [
            {
                "role": "user",
                "content": f"Expert food analyst…\n\nOCR Text:\n{ocr}\n\nUser Profile:\n{profile}"
            },
            {
                "role": "assistant",
                "content": resp
            }
        ]
    }

formatted_ds = raw_ds.map(format_chat_template, remove_columns=raw_ds.column_names)

# === 3️⃣ CONFIGURE & TRAIN with SFTTrainer ===
training_args = TrainingArguments(
    output_dir="./phi3-ganesha-analyst",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    save_strategy="epoch",
    logging_steps=10,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,                  # Pass the unprepared base model
    args=training_args,
    train_dataset=formatted_ds,
    peft_config=peft_cfg,         # Let the trainer apply PEFT
    processing_class=tokenizer,          # ✅ FIX: Use the correct 'tokenizer' argument
)

print("🚀 Starting SFTTrainer…")
trainer.train()
print("✅ SFTTrainer done.")

trainer.save_model("./phi3-ganesha-analyst-final")
print("✅ Final adapter saved.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/46 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/46 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🚀 Starting SFTTrainer…


  return fn(*args, **kwargs)


Step,Training Loss
10,1.3596


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


✅ SFTTrainer done.
✅ Final adapter saved.


In [1]:
!nvidia-smi

import gc, torch
gc.collect()
torch.cuda.empty_cache()

# 1. Ensure the latest libraries are installed
!pip install --upgrade -q accelerate transformers peft trl

# 2. Aggressively clear the entire Hugging Face cache
# This will force a fresh download of the official model files
!rm -rf ~/.cache/huggingface/

# 3. Verify the installation
import transformers
print(f"✅ Transformers version: {transformers.__version__}")

print("\n‼️ IMPORTANT: Environment is clean. Please RESTART YOUR SESSION now before running Step 2.")


Wed Jul 16 22:07:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   31C    P0             45W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
import gc

# Clean up memory
gc.collect()
torch.cuda.empty_cache()

# Define paths
base_model_id = "microsoft/Phi-3-mini-4k-instruct"
adapter_path = "./phi3-ganesha-analyst-final" # Make sure this is the correct path to your adapter

# Load the base model and tokenizer
print("Loading base model...")
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
except Exception as e:
    print(f"Error loading base model: {e}")
    # Stop execution if model fails to load
    raise

# Load the PEFT model
print(f"Loading adapter from: {adapter_path}")
try:
    model = PeftModel.from_pretrained(base_model, adapter_path)
except Exception as e:
    print(f"Error loading adapter. Is the path '{adapter_path}' correct? Error: {e}")
    # Stop execution if adapter fails to load
    raise

# Merge the adapter into the base model for faster inference
print("Merging adapter and base model...")
model = model.merge_and_unload()
print("Merge complete.")

model.eval() # Set the model to evaluation mode

# --- Create a Test Case ---
messy_ocr_text = """
INGREDIENTS: WH0LE GRAIN ROLLED 0ATS, SUGAR, PEANUT BUTTER (PEANUTS, DEXTR0SE,
HYDROGENATED VEGETABLE 0lL [C0TT0NSEED, S0YBEAN], SALT), SEMl-SWEET CH0C0LATE
CHlPS (SUGAR, CH0C0LATE LIQU0R, C0C0A BUTTER, S0Y LECITHIN), C0RN SYRUP.
C0NTAINS 2% 0R LESS 0F: S0DIUM BICARB0NATE, NATURAL FLAV0RS.
"""
user_profile = {"allergies": ["soy", "dairy"], "diet": "vegetarian"}

# --- Format the Prompt ---
prompt_list = [
    {
        "role": "user",
        "content": f"Expert food analyst…\n\nOCR Text:\n{messy_ocr_text}\n\nUser Profile:\n{json.dumps(user_profile)}"
    },
    {
        "role": "assistant",
        "content": ""
    }
]

# Step 1: Create the formatted prompt string from the chat template.
formatted_prompt = tokenizer.apply_chat_template(
    prompt_list,
    tokenize=False,
    add_generation_prompt=True
)

# Step 2: Pass the string to the tokenizer to get a dictionary.
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt",
    return_attention_mask=True
).to(model.device)

# --- Run Inference ---
print("\nGenerating response...")

outputs = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_new_tokens=512,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=False,
)

# Decode and print only the newly generated text
response_ids = outputs[0][inputs['input_ids'].shape[1]:]
response_text = tokenizer.decode(response_ids, skip_special_tokens=True)

print("\n--- MODEL OUTPUT ---")
print(response_text)

Loading base model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Loading adapter from: ./phi3-ganesha-analyst-final
Error loading adapter. Is the path './phi3-ganesha-analyst-final' correct? Error: Can't find 'adapter_config.json' at './phi3-ganesha-analyst-final'


ValueError: Can't find 'adapter_config.json' at './phi3-ganesha-analyst-final'