In [1]:
import json
import random
import numpy as np
from pathlib import Path
from PIL import Image
from datasets import load_from_disk
import mlx.core as mx
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config
from evaluaion_utils import evaluate_sample

In [2]:
# Identifier of the checkpoint to evaluate; presumably a Hub repo id, since the
# recorded cell output shows files being fetched — confirm if running offline.
model_path = "mlx-community/Qwen2.5-VL-3B-Instruct-bf16"
# `load` yields the model and its processor; both are handed to `generate`
# in the evaluation loop.
model, processor = load(model_path)
# The config is loaded separately because `apply_chat_template` takes it as
# its own argument when the prompt is built.
config = load_config(model_path)



Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

In [3]:
# System message sent with every sample. It asks for a single raw JSON object and
# describes three answer layouts; the bounding-box keys it names end in `_bbox` /
# `_bboxes`, the same suffixes `scale_bounding_boxes` looks for in the ground truth.
# NOTE(review): the prompt asks for `[x1, y1, x2, y2]` but does not say whether the
# values are pixels or normalized — confirm this matches what the scorer expects.
system_prompt = """You are an expert Visual Question Answering (VQA) system specializing in analyzing financial documents. Your task is to answer a question by generating a JSON object with the precise answer, its components, and its location.

**Output Rules:**
1. Your response MUST be a single, raw JSON object. Do not include explanations or markdown formatting.
2. Analyze the question and the image to determine the type of answer required, then generate a JSON object using **only the keys appropriate for that answer type**, as defined below.
3. All bounding box coordinates must be in `[x1, y1, x2, y2]`.
4. All numeric answers must be returned as JSON numbers (floats or integers), not strings.

**JSON Output Structures:**

* **1. For Simple Extractive Answers**:
    * `"answer"`: (float or string) The value found in the single cell.
    * `"answer_bbox"`: (list[float]) The bounding box of the answer cell.

* **2. For Answers Requiring Calculation/Aggregation**:
    * `"answer"`: (float) The final **calculated** numeric value.
    * `"individual_answers"`: (list[float]) The list of component numeric values used in the calculation.
    * `"individual_answers_bboxes"`: (list[list[float]]) The bounding boxes for each component value.

* **3. For Key-Value Identification**:
    * `"answer"`: (float) The final value part of the pair that answers the question.
    * `"answer_key"`: (string) The key part of the pair (e.g., the row header).
    * `"answer_key_bbox"`: (list[float]) The bounding box for the `answer_key` text.
    * `"individual_answers"`: (list[float]) The list of all numeric values considered to find the answer.
    * `"individual_answers_bboxes"`: (list[list[float]]) The bounding boxes for each value in `individual_answers`.
"""

In [4]:
# Number of validation samples to evaluate (clamped below to the dataset size).
n = 10
# Fixed seed so that Restart Kernel -> Run All evaluates the same samples and
# the reported scores can be reproduced and compared between runs.
SEED = 42
dataset = load_from_disk('EviFiVQA_val_dataset')


if n > len(dataset):
    print(f"Warning: Requested {n} samples, but dataset only has {len(dataset)} samples.")
    n = len(dataset)

# A dedicated generator makes the draw deterministic without reseeding the
# global `random` state that other code may rely on.
sample_indices = random.Random(SEED).sample(range(len(dataset)), n)
samples = dataset.select(sample_indices)

In [5]:
def scale_bounding_boxes(gt_dict: dict, width: int, height: int) -> dict:
    """
    Recursively finds and scales all normalized bounding boxes in the ground truth
    dictionary to absolute pixel coordinates and converts them to integers.

    A key ending in ``_bbox`` is treated as one ``[x1, y1, x2, y2]`` box and a key
    ending in ``_bboxes`` as a list of such boxes. Any other value that is a dict
    (or a list containing dicts) is searched for further boxes. Falsy values
    (``None``, empty lists, ...) are left untouched.

    Args:
        gt_dict (dict): The ground truth dictionary with normalized bboxes.
            It must be JSON-serializable, because the copy is made through a
            JSON round trip.
        width (int): The original width of the image.
        height (int): The original height of the image.

    Returns:
        dict: A new dictionary with bboxes scaled to image dimensions; the
        input dictionary is not modified.
    """
    def _to_pixels(box):
        # x coordinates scale with the width, y coordinates with the height;
        # int() truncates toward zero.
        return [
            int(box[0] * width),
            int(box[1] * height),
            int(box[2] * width),
            int(box[3] * height),
        ]

    def _scale_in_place(node):
        if isinstance(node, dict):
            # Only values of existing keys are reassigned, so iterating over
            # items() while updating is safe.
            for key, value in node.items():
                if not value:
                    continue
                if key.endswith('_bbox'):
                    node[key] = _to_pixels(value)
                elif key.endswith('_bboxes'):
                    node[key] = [_to_pixels(box) for box in value]
                else:
                    _scale_in_place(value)
        elif isinstance(node, list):
            for item in node:
                _scale_in_place(item)

    # Create a deep copy to avoid modifying the original data
    scaled_gt = json.loads(json.dumps(gt_dict))
    _scale_in_place(scaled_gt)
    return scaled_gt


In [6]:
# Run the model on every selected sample and score its JSON answer against the
# ground truth. `all_scores` gets exactly one score dict per sample; samples that
# cannot be parsed or evaluated are recorded with zeros instead of being dropped.
all_scores = []
# Score recorded for a sample whose prediction cannot be parsed or evaluated.
ZERO_SCORES = {"EvidenceF1": 0.0, "AnswerScore_avg": 0.0, "TotalScore": 0.0}
# Derive the count from the data actually iterated rather than from a variable
# set in another cell.
num_samples = len(samples)
print(f"\n--- Processing {num_samples} random samples from dataset ---")
for i, sample in enumerate(samples):
    print(f"\n--- Sample {i+1}/{num_samples} ---")
    question = sample['question']
    image_path = sample['image']
    ground_truth_str = sample['ground_truth']
    # convert() returns a fully loaded copy, so the file handle can be released
    # as soon as the conversion is done.
    with Image.open(image_path) as image_file:
        image = [image_file.convert("RGB")]
    width, height = image[0].size
    user_prompt = f"Based on the provided image and question, generate the detailed JSON answer.\n\n## Question:\n{question}\n\n## Image:"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": user_prompt}]}
    ]
    prompt = apply_chat_template(processor, config, messages, num_images=len(image))

    print("  Generating response...")
    response = generate(model, processor, prompt, image, verbose=False, max_tokens=2048)
    # NOTE(review): assumes `generate` returns a sequence whose first item is the
    # generated text — confirm against the installed mlx_vlm version.
    prediction_str = response[0]
    print(f"  Model Raw Output:\n{prediction_str}")

    # Keep only the span from the first '{' to the last '}' so that markdown code
    # fences around the JSON are dropped. If no braces are present the slice is
    # empty and json.loads raises JSONDecodeError below.
    json_part = prediction_str[prediction_str.find('{'):prediction_str.rfind('}')+1]
    try:
        prediction_dict = json.loads(json_part)
    except json.JSONDecodeError:
        print("  ERROR: Failed to decode JSON from model output. Assigning score of 0.")
        all_scores.append(dict(ZERO_SCORES))
        continue

    # Ground-truth parsing is kept out of the handler above so that a malformed
    # ground truth is not reported as a model-output failure.
    try:
        ground_truth_dict = json.loads(ground_truth_str)
        scaled_ground_truth_dict = scale_bounding_boxes(ground_truth_dict, width, height)
        # NOTE(review): the recorded output shows EvidenceF1 == 0.0 for every sample.
        # The ground truth is scaled to the original image size here; the model may be
        # predicting boxes in a different coordinate space (e.g. the processor-resized
        # image) — confirm both sides agree before trusting the evidence score.
        # Use the imported evaluation function
        scores = evaluate_sample(prediction_dict, scaled_ground_truth_dict)
        all_scores.append(scores)
        print(f"  Scores: {scores}")
    except Exception as e:
        # Deliberate best-effort: one bad sample must not abort the whole run.
        print(f"  An unexpected error occurred during evaluation: {e}")
        all_scores.append(dict(ZERO_SCORES))




--- Processing 10 random samples from dataset ---

--- Sample 1/10 ---
  Generating response...
  Model Raw Output:
```json
{
  "answer": 100,
  "answer_key": "Cash and cash equivalents at end of year",
  "answer_key_bbox": [1000, 2200, 1050, 2220],
  "individual_answers": [100, 2200, 1050, 2220],
  "individual_answers_bboxes": [[1000, 2200, 1050, 2220]]
}
```
  Scores: {'EvidenceF1': 0.0, 'AnswerScore_avg': 0.058808509943736174, 'TotalScore': 0.0}

--- Sample 2/10 ---
  Generating response...
  Model Raw Output:
```json
{
  "answer": 209,
  "answer_bbox": [100, 270, 120, 280]
}
```
  Scores: {'EvidenceF1': 0.0, 'AnswerScore_avg': 1.0, 'TotalScore': 0.0}

--- Sample 3/10 ---
  Generating response...
  Model Raw Output:
```json
{
  "answer": 1505,
  "answer_bbox": [1000, 100, 1050, 110]
}
```
  Scores: {'EvidenceF1': 0.0, 'AnswerScore_avg': 0.4249182927993987, 'TotalScore': 0.0}

--- Sample 4/10 ---
  Generating response...
  Model Raw Output:
```json
{
  "answer": 188,
  "answer_key":

In [7]:
# 5. Final Summary
# Prints the mean of each metric over all evaluated samples; nothing is printed
# when no sample was scored.
if all_scores:
    def _mean_score(metric):
        """Average one metric over every per-sample score dict."""
        return np.mean([entry[metric] for entry in all_scores])

    # All three averages are computed before anything is printed.
    avg_evidence = _mean_score('EvidenceF1')
    avg_answer = _mean_score('AnswerScore_avg')
    avg_total = _mean_score('TotalScore')

    separator = "=" * 45
    print("\n" + separator)
    print("--- Final Average Scores ---")
    print(f"Average Evidence F1:    {avg_evidence:.4f}")
    print(f"Average Answer Score:   {avg_answer:.4f}")
    print(f"Average Total Score:    {avg_total:.4f}")
    print(separator)


--- Final Average Scores ---
Average Evidence F1:    0.0000
Average Answer Score:   0.7056
Average Total Score:    0.0000
