In [None]:
# =================================================================
# Cell 1: Force Clear Cache (RUN THIS ONCE)
# =================================================================
# Removes the Hugging Face `datasets` cache directory under the current user's
# home directory, then prints a confirmation.
# NOTE(review): the setup cell that follows removes all of ~/.cache/huggingface/,
# which contains this directory as well.
!rm -rf ~/.cache/huggingface/datasets
print("✅ Hugging Face datasets cache has been forcefully cleared.")

✅ Hugging Face datasets cache has been forcefully cleared.


In [None]:
# =================================================================
# FINAL, DEFINITIVE SETUP CELL: Replicating the Official Environment
# =================================================================

# --- Step 1: Standard Setup ---
# Mounts Google Drive and makes the project folder the working directory.
from google.colab import drive
import os

print("--- Step 1: Mounting Google Drive ---")
drive.mount('/content/drive')

PROJECT_DIR = '/content/drive/MyDrive/dolphin-vlm-finetuning' # Make sure this path is correct
# Create the project folder if it is missing, then run everything relative to it.
os.makedirs(PROJECT_DIR, exist_ok=True)
os.chdir(PROJECT_DIR)
print(f"Working directory set to: {PROJECT_DIR}")


# --- Step 2: Clear any old cache ---
# This is a safety measure to ensure we download fresh files.
# NOTE(review): this deletes everything under ~/.cache/huggingface/, not only
# datasets; presumably that includes downloaded model files and any saved Hub
# login token, so run it before authenticating — confirm.
print("\n--- Step 2: Clearing Hugging Face cache ---")
!rm -rf ~/.cache/huggingface/
print("Cache cleared.")

--- Step 1: Mounting Google Drive ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Working directory set to: /content/drive/MyDrive/dolphin-vlm-finetuning

--- Step 2: Clearing Hugging Face cache ---
Cache cleared.


In [None]:
# Pin pip to a fixed version so dependency resolution is the same on every run.
!pip install pip==25.1.1

# Print the pip version together with the Python interpreter it is bound to.
!pip --version
# NOTE(review): an earlier note here read "Python 3.8.16", but the recorded output of
# this cell reports python 3.11 — confirm which interpreter the notebook targets.

Collecting pip==25.1.1
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m37.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
pip 25.1.1 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)


In [None]:
!pip install transformers==4.47.0
!pip install datasets==3.4.0
!pip install langgraph
!pip install bitsandbytes==0.43.2
!pip install numpy==1.26.4
!pip install peft==0.11.1
!pip install torchaudio --index-url https://download.pytorch.org/whl/cu121 accelerate>=0.21.0 psutil langchain sentence_transformers tokenizers -q

Collecting bitsandbytes==0.43.2
  Using cached bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.2
Collecting peft==0.11.1
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1
[31mERROR: Could not find a version that satisfies the requirement langchain (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for langchain[0m[31m
[0m

In [None]:
# =================================================================
# FINAL SCRIPT: Handling the Multi-Lingual Nested QA Structure
# =================================================================

import os
from datasets import load_dataset, Dataset, DatasetDict, Image
from tqdm.auto import tqdm # For a progress bar

# --- Configuration ---
# Upper bound on the number of flattened Q&A rows kept after shuffling.
SAMPLE_SIZE = 2000
# Seed passed to both the shuffle and the train/test split below.
RANDOM_SEED = 42
PROJECT_DIR = '/content/drive/MyDrive/dolphin-vlm-finetuning'
# Directory the processed dataset is saved to (and later loaded from).
OUTPUT_DIR = os.path.join(PROJECT_DIR, "processed_data")
# Using the cmarkea dataset as specified
DATASET_NAME = "cmarkea/doc-vqa"

# --- BEST PRACTICE: Check if data is already processed ---
# The existence of "dataset_dict.json" inside OUTPUT_DIR is the marker for a finished
# run (presumably written by `save_to_disk` — confirm); when it is present the
# whole pipeline in the `else` branch is skipped.
if os.path.exists(os.path.join(OUTPUT_DIR, "dataset_dict.json")):
    print(f"✅ Processed dataset already found at '{OUTPUT_DIR}'.")
    print("Skipping data preparation. Ready for fine-tuning!")
else:
    print(f"Processed dataset not found. Starting data preparation pipeline...")

    def load_and_flatten_dataset(dataset_name, split, sample_size, seed, language='en'):
        """
        Load a dataset and flatten its nested, per-language Q&A lists into rows.

        Each source record is read as one `image` plus a `qa` mapping keyed by
        language code. Every Q&A pair found under `language` becomes its own
        {'image', 'question', 'answer'} row. The flattened rows are shuffled,
        capped at `sample_size`, and split 80/20 into train and test.

        Args:
            dataset_name: Hub identifier passed to `load_dataset`.
            split: Name of the split to load.
            sample_size: Maximum number of flattened rows to keep.
            seed: Seed used for both the shuffle and the train/test split.
            language: Key of the `qa` mapping to read Q&A pairs from.

        Returns:
            The object returned by `train_test_split` (with 'train' and 'test'
            entries), or None if any step failed; in that case the error and
            its traceback are printed instead of raised.
        """
        print(f"\n--- Loading and Processing '{dataset_name}' ({split} split) ---")
        try:
            # Step 1: Load the raw dataset
            print("Step 1: Loading raw dataset from Hugging Face Hub...")
            dataset = load_dataset(dataset_name, split=split)
            print(f"Successfully loaded {len(dataset)} raw records.")

            # Step 2: Flatten the multi-lingual data structure
            print(f"Step 2: Flattening nested Q&A pairs for language='{language}'...")
            flattened_data = []
            skipped_records = 0
            for record in tqdm(dataset):
                # A language entry can be absent, None, or an empty list. All three
                # mean "nothing to flatten for this record"; a None entry used to
                # raise inside the inner loop and abort the whole pipeline.
                qa_list = (record['qa'] or {}).get(language)
                if not qa_list:
                    skipped_records += 1
                    continue
                flattened_data.extend(
                    {
                        'image': record['image'],
                        'question': qa_pair['question'],
                        'answer': qa_pair['answer'],  # This dataset has 'answer', not 'answers'
                    }
                    for qa_pair in qa_list
                )

            # One summary line instead of one warning per record keeps the cell output readable.
            if skipped_records:
                print(f"Warning: {skipped_records} record(s) had no Q&A pairs for language '{language}' and were skipped.")

            print(f"Flattening complete. Created {len(flattened_data)} individual Q&A records.")

            # Step 3: Create a new Dataset object from the flattened data
            if not flattened_data:
                raise ValueError("No data was processed. Check the dataset structure and parsing logic.")

            flat_dataset = Dataset.from_list(flattened_data).cast_column("image", Image(decode=True))

            # Step 4: Sub-sample the flattened dataset
            print(f"Step 4: Taking a random sample of {sample_size} records...")
            # Never ask for more rows than exist.
            final_sample_size = min(sample_size, len(flat_dataset))
            sampled_dataset = flat_dataset.shuffle(seed=seed).select(range(final_sample_size))
            print(f"Sub-sampling complete. Using {len(sampled_dataset)} records.")

            # Step 5: Split the data into training and testing sets
            print("Step 5: Splitting the data into training and testing sets...")
            split_dataset = sampled_dataset.train_test_split(test_size=0.2, seed=seed)
            print(f"Data split into {len(split_dataset['train'])} training and {len(split_dataset['test'])} testing samples.")

            return split_dataset

        except Exception as e:
            # Best-effort by design: report the failure and let the caller see None.
            print(f"An error occurred during dataset processing: {e}")
            import traceback
            traceback.print_exc()
            return None

    def save_dataset_to_disk(dataset_dict, output_dir):
        """
        Persist `dataset_dict` under `output_dir`, reporting the outcome on stdout.

        Nothing is written when `dataset_dict` is None. Any exception raised
        while creating the directory or saving is printed rather than propagated.
        Always returns None.
        """
        if dataset_dict is not None:
            print(f"\n--- Saving processed dataset to '{output_dir}' ---")
            try:
                # Make sure the target directory exists before handing it to the dataset.
                os.makedirs(output_dir, exist_ok=True)
                dataset_dict.save_to_disk(output_dir)
                print("✅ Dataset saved successfully to your Google Drive!")
            except Exception as save_error:
                print(f"Failed to save dataset: {save_error}")
        else:
            print("No dataset to save.")

    # --- Execute the entire pipeline ---
    # Flatten the English Q&A pairs of the 'train' split (the split that carries the
    # ground-truth answers), then persist the resulting train/test splits.
    final_dataset = load_and_flatten_dataset(
        DATASET_NAME, 'train', SAMPLE_SIZE, RANDOM_SEED, language='en'
    )
    save_dataset_to_disk(final_dataset, output_dir=OUTPUT_DIR)

✅ Processed dataset already found at '/content/drive/MyDrive/dolphin-vlm-finetuning/processed_data'.
Skipping data preparation. Ready for fine-tuning!


## Fine-tuning

# New Section

In [None]:
import os

from huggingface_hub import login

# Never hard-code the token in the notebook. It is read from the HF_TOKEN
# environment variable; when that is unset, `token=None` makes `login` fall
# back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Upgrades torch and torchvision to the newest available releases (no version pin).
# NOTE(review): the install cell above pulls torchaudio from the cu121 wheel index;
# an unpinned upgrade here may leave torch/torchvision/torchaudio on mismatched
# versions — verify and consider pinning.
!pip install --upgrade torch torchvision

In [None]:
# =================================================================
# Cell 2: Load Data, Processor, and Pre-process
# =================================================================
# Re-import libraries after installation
from datasets import load_from_disk
from transformers import AutoProcessor
import os

# --- Configuration ---
BASE_MODEL_ID = "ByteDance/Dolphin"
# PROJECT_DIR is not defined in this cell; it must already exist from an earlier cell.
PROCESSED_DATA_PATH = os.path.join(PROJECT_DIR, "processed_data")

# --- Load Processor ---
print("--- Loading Processor ---")
# With the correct libraries, this will now succeed.
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
# Padding needs a pad token; reuse the end-of-sequence token when none is configured.
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token
print("✅ Processor loaded successfully!")


# --- Load Dataset ---
# The "train" and "test" sub-directories are the splits written by the data preparation cell.
print("\n--- Loading Pre-processed Data ---")
train_dataset = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "train"))
eval_dataset = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "test"))
print(f"✅ Loaded {len(train_dataset)} training and {len(eval_dataset)} evaluation samples.")


# --- Pre-processing Function ---
def apply_chat_template(examples):
    """
    Turn a batch of {question, answer, image} examples into model inputs.

    Args:
        examples: Batch dict with parallel lists under "question", "answer"
            and "image".

    Returns:
        The processor output with an added "labels" entry: a copy of
        `input_ids` in which padded positions are set to -100 so they are
        ignored by the loss.
    """
    # 1. Build one two-turn conversation (user question, assistant answer) per example.
    messages = [
        [
            {"role": "user", "content": f"<image>\n{q}"},
            {"role": "assistant", "content": a}
        ] for q, a in zip(examples["question"], examples["answer"])
    ]
    # 2. Render each conversation to a plain string first. Handing the nested
    #    message dicts straight to the processor as `text` is what produces the
    #    "excessive nesting" tensor error. This mirrors the call used by the
    #    training cell further down.
    #    NOTE(review): assumes the tokenizer ships a chat template — confirm.
    prompts = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,  # the assistant turn is already part of the text
    )
    # 3. Tokenize text and process images. `padding="max_length"` only takes effect
    #    when a max_length is supplied, so it is given explicitly.
    model_inputs = processor(
        text=prompts,
        images=examples["image"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    # 4. Labels are a COPY of input_ids (not an alias), with padding excluded from the loss.
    labels = model_inputs["input_ids"].clone()
    if "attention_mask" in model_inputs:
        labels[model_inputs["attention_mask"] == 0] = -100
    model_inputs['labels'] = labels
    return model_inputs

# --- Apply Transformation ---
# Use set_transform for memory efficiency. The function is applied on the fly.
train_dataset.set_transform(apply_chat_template)
eval_dataset.set_transform(apply_chat_template)
print("\n✅ Dataset transformation is set. Ready for training.")

--- Loading Processor ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/277 [00:00<?, ?B/s]

✅ Processor loaded successfully!

--- Loading Pre-processed Data ---
✅ Loaded 1600 training and 400 evaluation samples.

✅ Dataset transformation is set. Ready for training.


In [None]:
# Removes the torch_xla package from the runtime without asking for confirmation (-y).
# NOTE(review): presumably done so torch/accelerate do not try to use an XLA device
# on a GPU runtime — confirm the reason and whether a runtime restart is needed.
!pip uninstall torch_xla -y

[0m

In [None]:
# =================================================================
# FINAL, DEFINITIVE TRAINING CELL V7: Adding Explicit Max Length
# =================================================================

import torch
import os
from transformers import AutoModelForVision2Seq, TrainingArguments, Trainer, BitsAndBytesConfig, AutoProcessor
from peft import LoraConfig, get_peft_model
from datasets import load_from_disk

# --- Load Data and Processor ---
print("--- Loading Data and Processor ---")
BASE_MODEL_ID = "ByteDance/Dolphin"
# Ensure your PROJECT_DIR points to the correct location in your environment
PROJECT_DIR = '/content/drive/MyDrive/dolphin-vlm-finetuning'
# Input: the train/test splits written by the data preparation cell.
PROCESSED_DATA_PATH = os.path.join(PROJECT_DIR, "processed_data")
# Output: where the Trainer writes checkpoints and the final adapter.
MODEL_OUTPUT_PATH = os.path.join(PROJECT_DIR, "dolphin-vlm-docvqa-finetuned")

processor = AutoProcessor.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
# Padding needs a pad token; reuse the end-of-sequence token when none is configured.
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# Load the raw dataset
# (reloaded from disk here, so these objects are independent of any transform an earlier cell set)
train_dataset = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "train"))
eval_dataset = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "test"))


# === THIS IS THE CORRECTED BATCH-COMPATIBLE FUNCTION ===
def preprocess_function_for_map(examples, max_length=512):
    """
    Convert a BATCH of examples into model inputs, for use with `.map(batched=True)`.

    Args:
        examples: Batch dict with parallel lists under "question", "answer"
            and "image".
        max_length: Length every text sequence is padded/truncated to.
            Defaults to 512.

    Returns:
        The processor output with an added "labels" entry: a copy of
        `input_ids` in which padded positions are set to -100 so they do not
        contribute to the loss.
    """
    # 1. Create the structured message format for the batch
    messages_batch = [
        [
            {"role": "user", "content": f"<image>\n{q}"},
            {"role": "assistant", "content": a}
        ] for q, a in zip(examples["question"], examples["answer"])
    ]

    # 2. Render each conversation to a plain string (no tokenization yet).
    #    No generation prompt is appended because the assistant turn is already
    #    part of the training text.
    text_inputs = processor.tokenizer.apply_chat_template(
        messages_batch,
        tokenize=False,
        add_generation_prompt=False
    )

    # 3. Process the batch of images together with the rendered strings.
    #    An explicit `max_length` is required for `padding="max_length"` to
    #    produce equally sized sequences that can be stacked into one tensor.
    model_inputs = processor(
        text=text_inputs,
        images=examples["image"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

    # 4. Labels start as a copy of input_ids. Positions the attention mask marks
    #    as padding are set to -100, the index the loss ignores; otherwise the
    #    model is trained to predict padding tokens.
    labels = model_inputs["input_ids"].clone()
    if "attention_mask" in model_inputs:
        labels[model_inputs["attention_mask"] == 0] = -100
    model_inputs["labels"] = labels

    return model_inputs

# --- Apply the Pre-processing using .map() ---
# The original columns are dropped so that only the processor outputs and labels remain.
print("\n--- Pre-processing dataset with .map() - this may take a moment... ---")
processed_train_dataset = train_dataset.map(preprocess_function_for_map, batched=True, remove_columns=train_dataset.column_names)
processed_eval_dataset = eval_dataset.map(preprocess_function_for_map, batched=True, remove_columns=eval_dataset.column_names)
print("✅ Dataset pre-processing complete.")


# --- Define Quantization and LoRA Configs ---
# 4-bit NF4 weights with float16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
# NOTE(review): `target_modules` must match layer names that actually exist in the
# loaded model, and `task_type="CAUSAL_LM"` is used with a Vision2Seq model class —
# confirm both against the model's module names / architecture.
LORA_CONFIG = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "k_proj", "v_proj", "dense"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")


# --- Load Model with Quantization Config ---
print("\n--- Loading Model with BitsAndBytesConfig ---")
model = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
# Keep the model config in step with the pad token chosen on the tokenizer above.
model.config.pad_token_id = processor.tokenizer.pad_token_id


# --- Apply LoRA ---
# NOTE(review): PEFT documents `prepare_model_for_kbit_training` as a step before
# `get_peft_model` for quantized models; it is not called here — verify whether it
# is needed (and supported) for this architecture.
print("\n--- Applying LoRA Adapters ---")
model = get_peft_model(model, LORA_CONFIG)
model.print_trainable_parameters()


# --- Training Arguments ---
# Effective batch size per device is 2 * 8 = 16 (batch size x gradient accumulation).
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # `eval_strategy` is the current name of this option; `evaluation_strategy`
    # is the deprecated spelling in the transformers version pinned above.
    eval_strategy="epoch",
    # Must match the evaluation schedule for `load_best_model_at_end` to work.
    save_strategy="epoch",
    learning_rate=1e-4,
    fp16=True,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    optim="paged_adamw_8bit",
    report_to="none",
)

# --- Instantiate and Run Trainer with the PROCESSED datasets ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    # The tokenizer is part of the processor, so we pass it explicitly.
    tokenizer=processor.tokenizer
)

print("\n--- Starting Fine-Tuning ---")
trainer.train()
print("\n✅ Training complete.")

# With no argument, the model is written to `output_dir` (MODEL_OUTPUT_PATH).
trainer.save_model()
print(f"✅ Best model adapter saved to {MODEL_OUTPUT_PATH}")


--- Loading Data and Processor ---

--- Pre-processing dataset with .map() - this may take a moment... ---


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
class BudgetAdvisorAgent:
    """Gives a short piece of advice on how a price relates to a user's budget."""

    def __init__(self):
        print("Initializing Budget Advisor Agent.")

    def check_budget(self, price: float, user_budget: float):
        """
        Classify `price` against `user_budget`.

        The price is reported as within budget when it does not exceed the
        budget, as slightly over when it does not exceed 120% of the budget,
        and as significantly over otherwise.

        Returns:
            A dict with a single "advice" key holding the message.
        """
        print("Budget Advisor Agent checking budget...")

        if price <= user_budget:
            return {"advice": f"This item is within your budget of ${user_budget:.2f}."}

        # Up to 20% above the budget still counts as "slightly over".
        stretch_limit = user_budget * 1.2
        if price <= stretch_limit:
            return {"advice": f"This item is slightly over your budget of ${user_budget:.2f}, but might be worth considering."}

        return {"advice": f"This item is significantly over your budget of ${user_budget:.2f}."}

# Singleton instance
budget_agent = BudgetAdvisorAgent()

Initializing Budget Advisor Agent.


In [None]:
# =================================================================
# Cell 4: Instantiate Trainer and Start Training
# =================================================================
# NOTE(review): this repeats the trainer setup at the end of the training cell
# above; running both trains the model twice. Keep whichever one is intended.

# The Trainer class orchestrates the entire fine-tuning process.
# It uses the names the training cell actually defines: `training_args`
# (there is no `TRAINING_ARGS` in the visible notebook) and the tokenized
# `processed_*` datasets rather than the raw question/answer/image datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
)

# Start training! This will take some time depending on the dataset size and epochs.
print("Starting model training...")
trainer.train()
print("Training complete.")

# Save the final, best-performing LoRA adapter to your Google Drive
print(f"Saving the best model to {MODEL_OUTPUT_PATH}...")
trainer.save_model()
print("Model saved successfully!")

## Inference and Evaluation (the `inference.py` logic)

In [None]:
# =================================================================
# Cell 1: Setup and Model Loading for Inference
# =================================================================

# Install required packages
!pip install transformers==4.41.2 accelerate==0.30.1 bitsandbytes==0.43.1 peft==0.11.1 datasets==2.19.1 pandas -q

import os
import torch
import pandas as pd
from datasets import load_from_disk
from transformers import AutoModelForVision2Seq, AutoProcessor, pipeline
from peft import PeftModel

# --- Configuration ---
PROJECT_DIR = '/content/drive/MyDrive/dolphin-vlm-finetuning'
os.chdir(PROJECT_DIR)

BASE_MODEL_ID = "ByteDance/Dolphin-v1.5-phi-2"
# This MUST be the path where your trained adapter was saved
ADAPTER_PATH = os.path.join(PROJECT_DIR, "dolphin-vlm-docvqa-finetuned")

# --- Load Models and Processor ---

# Load the base model in 4-bit
base_model = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float16,
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True,
)

# Load the fine-tuned model by applying the PEFT adapter to the base model
# This is the correct and efficient way to load a LoRA-tuned model
tuned_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

# Load the processor
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

print("Base model and fine-tuned model loaded successfully.")

In [None]:
# =================================================================
# Cell 2: Inference Function
# =================================================================

def run_inference(model, processor, sample):
    """
    Answer the question in `sample` about its image using `model`.

    Args:
        model: Object exposing `.device` and `.generate(...)`.
        processor: Callable that builds model inputs from text and an image,
            and exposes `batch_decode`.
        sample: Mapping with an 'image' and a 'question' entry.

    Returns:
        The text following the first "ASSISTANT:" marker in the decoded output,
        stripped of surrounding whitespace, or a fixed failure message when the
        marker is absent.
    """
    image, question = sample['image'], sample['question']

    # Hand-written prompt that ends where the assistant is expected to continue.
    # NOTE(review): training formats text via the tokenizer's chat template;
    # confirm this prompt matches that format.
    model_inputs = processor(
        text=f"USER: <image>\n{question}\nASSISTANT:",
        images=image,
        return_tensors="pt",
    )
    model_inputs = model_inputs.to(model.device)

    # Greedy decoding, capped at 100 new tokens.
    output_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)

    full_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

    # Keep only what comes after the first assistant marker (and before any second one).
    segments = full_text.split("ASSISTANT:")
    if len(segments) > 1:
        return segments[1].strip()
    return "Failed to generate a valid answer."

print("Inference function is ready.")

In [None]:
# =================================================================
# Cell 3: Run Comparison and Display Results
# =================================================================

# Load the test dataset
PROCESSED_DATA_PATH = os.path.join(PROJECT_DIR, "processed_data")
test_dataset = load_from_disk(os.path.join(PROCESSED_DATA_PATH, "test"))

# Select a few samples to test on (fixed seed so the same samples are shown every run)
num_samples_to_show = 5
test_samples = test_dataset.shuffle(seed=42).select(range(num_samples_to_show))

results_list = []

print("Running comparison on test samples...")
print("-" * 50)

for i, sample in enumerate(test_samples):
    print(f"SAMPLE #{i+1}")
    print(f"❓ Question: {sample['question']}")

    # Get ground truth
    ground_truth = sample['answer']
    print(f"✅ Ground Truth: {ground_truth}")

    # Get base model's prediction.
    # Applying the PEFT adapter modifies the underlying base model in place, so
    # calling `base_model` directly would also run the adapter. Temporarily
    # disabling the adapter on `tuned_model` gives the real baseline.
    with tuned_model.disable_adapter():
        base_model_answer = run_inference(tuned_model, processor, sample)
    print(f"🤖 Base Model: {base_model_answer}")

    # Get fine-tuned model's prediction (adapter active again outside the context)
    tuned_model_answer = run_inference(tuned_model, processor, sample)
    print(f"🚀 Fine-Tuned Model: {tuned_model_answer}")
    print("-" * 50)

    results_list.append({
        "Question": sample['question'],
        "Ground Truth": ground_truth,
        "Base Model Answer": base_model_answer,
        "Fine-Tuned Answer": tuned_model_answer
    })

# Display the results in a clean pandas DataFrame for your showcase
print("\n--- Summary of Results ---")
results_df = pd.DataFrame(results_list)
display(results_df)

# Save the DataFrame to a CSV file for your records
# (relative path: written to the current working directory)
results_df.to_csv("inference_comparison_results.csv", index=False)

In [None]:
# Diagnostic: check that the training-related classes can be imported from transformers,
# and print recovery hints instead of raising when they cannot.
try:
    from transformers import (
        AutoModelForVision2Seq,
        TrainingArguments,
        Trainer,
        BitsAndBytesConfig,
    )
    print("Successfully imported: AutoModelForVision2Seq, TrainingArguments, Trainer, BitsAndBytesConfig")
except ImportError as import_error:
    for message in (
        f"ImportError: {import_error}",
        "Please ensure the transformers library is correctly installed with all necessary dependencies.",
        "You might need to restart the runtime and re-run the installation cells.",
    ):
        print(message)

ImportError: Could not import module 'TrainingArguments'. Are this object's requirements defined correctly?
Please ensure the transformers library is correctly installed with all necessary dependencies.
You might need to restart the runtime and re-run the installation cells.
