## Prompt

In [None]:
# === PROMPT ===
# Instruction text sent verbatim with every image.
# Kept as a plain (non-f) string: nothing is interpolated, so the JSON braces
# can be written literally instead of being escaped as {{ }}.
PROMPT = """
You are given an image of a graph. Your task is to analyze it and extract structured information.

Return your findings as a **valid, minified JSON object** with the following fields.
If any detail cannot be determined from the image, set its value to null (without quotes).

{
  "maximum": "The highest y-value visible on the graph.",
  "minimum": "The lowest y-value visible on the graph.",
  "range": "The overall span of the y-axis, written as 'min-max'.",
  "title": "The exact title text shown on the graph, if present. If not present, write null (without quotes)",
  "domain": "The subject domain of the graph. Choose ONE ONLY from the following: economics, healthcare, politics, environment, technology, entertainment, animal, linguistics, internet, miscellaneous. If none of these options are correct, output null."
}

**Guidelines:**
1. Base all answers strictly on what is visible in the graph; do not infer or invent data.
2. Include numerical values exactly as they appear (no rounding).
3. Maintain factual, neutral descriptions.
4. Output only the final JSON object — no text, commentary, or markdown.

Output ONLY the JSON object with string values for each aspect.
"""


## Set Up


In [None]:
!pip install roboflow --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
import base64
import csv
from google.colab import userdata
from io import BytesIO
import json
import os
import numpy as np
import pandas as pd
from PIL import Image
import random
import re
import requests
from roboflow import Roboflow
import shutil
import time


ModuleNotFoundError: No module named 'roboflow'

In [None]:
# Mount Google Drive at /content/drive; TARGET_DIR (defined in a later cell)
# points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the Roboflow key from Colab's secret store (userdata) and mirror it into
# the process environment; rf_key is what gets passed to Roboflow(api_key=...).
os.environ["ROBOFLOW_API_KEY"] = userdata.get("ROBOFLOW_API_KEY")
rf_key = os.environ["ROBOFLOW_API_KEY"]
VERSION = 2  # dataset version number handed to project.version()
# os.environ["API_KEY"] = API_KEY = userdata.get('OpenRouterAPI_GMAIL')

In [None]:
# Root folder on the mounted Drive used for the dataset and per-model outputs.
TARGET_DIR = "/content/drive/MyDrive/dl-project"
os.makedirs(TARGET_DIR, exist_ok=True)  # no-op when the folder already exists

In [None]:
## Load Roboflow dataset

# NOTE: this changes the process-wide working directory for every later cell;
# presumably done so the Roboflow download lands under TARGET_DIR.
os.chdir(TARGET_DIR)

rf = Roboflow(api_key=rf_key)
project = rf.workspace("graph-analysis").project("my-first-project-qltkc")
version = project.version(VERSION)
dataset = version.download("jsonl")

print("Dataset downloaded to:", dataset.location)

train_dir = os.path.join(dataset.location, "train")

# Load annotations from the JSONL file (one JSON object per line).
jsonl_path = os.path.join(train_dir, "annotations.jsonl")
if not os.path.exists(jsonl_path):
    raise RuntimeError(f"Annotation file not found at: {jsonl_path}")

# Explicit UTF-8 keeps this read consistent with the later pass over the same
# file; blank lines (e.g. a trailing newline) are skipped instead of crashing
# json.loads.
with open(jsonl_path, "r", encoding="utf-8") as f:
    ground_truth = [json.loads(line) for line in f if line.strip()]

print(f"\nLoaded {len(ground_truth)} samples from {jsonl_path}")

## Vars

In [None]:
# Pick exactly one (FILE_NAME, MODEL) pair by uncommenting it.
# MODEL must be a plain string: a trailing comma after the value would turn it
# into a one-element tuple.
# FILE_NAME = "qwen2_5"; MODEL = "qwen/qwen2.5-vl-32b-instruct:free"
# FILE_NAME = "gemma3-4b"; MODEL = "google/gemma-3-4b-it:free"  # 35.4M tokens, 4B params
# FILE_NAME = "gemma-3-12b"; MODEL = "google/gemma-3-12b-it:free"  # 12B params, 15.5M tokens
# FILE_NAME = "gemma-3-27b"; MODEL = "google/gemma-3-27b-it:free"  # 27B params, 2.07B tokens
# FILE_NAME = "mistralai-3.2"; MODEL = "mistralai/mistral-small-3.2-24b-instruct:free"
FILE_NAME = "mistralai-3.1"; MODEL = "mistralai/mistral-small-3.1-24b-instruct:free"


# SECURITY: never paste OpenRouter keys into the notebook. Any key that has
# ever been committed in this cell must be treated as leaked and revoked.
# The key is read from Colab's secret store instead; the secret name below is
# the one referenced in the setup cell -- adjust it to the name you stored.
API_KEY = userdata.get("OpenRouterAPI_GMAIL")
if not API_KEY:
    raise RuntimeError(
        "OpenRouter API key not found in Colab secrets (expected 'OpenRouterAPI_GMAIL')."
    )
os.environ["API_KEY"] = API_KEY

# Per-model output folder and the JSON checkpoint file written by the benchmark loop.
PROJECT_ROOT = os.path.join(TARGET_DIR, "models", FILE_NAME)
output_path = os.path.join(PROJECT_ROOT, f"{FILE_NAME}_benchmark_results.json")
counter = 0  # incremented once per not-yet-processed image in the benchmark loop
max_retries = 5  # intended retry budget for API calls
base_delay = 2  # intended base back-off delay between retries (presumably seconds)

## Helper methods

In [None]:
## clean json file

def extract_json_from_response(response_text):
    """
    Parse the JSON payload out of a model reply.

    Candidates are tried in this order and the first one that decodes wins:
      1. the object inside a Markdown fence (``` or ```json, tag case-insensitive),
      2. the whole reply with surrounding whitespace/backticks stripped,
      3. the span from the first "{" to the last "}" (handles replies that wrap
         the JSON in explanatory prose).

    Args:
        response_text: raw text returned by the model (may be None or empty).

    Returns:
        The decoded JSON value (normally a dict), or None when the input is
        empty, not a string, or contains nothing that decodes as JSON.
    """
    if not response_text or not isinstance(response_text, str):
        return None

    text = response_text.strip()
    candidates = []

    # 1. Fenced block; the language tag is optional and matched case-insensitively.
    fenced = re.search(
        r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL | re.IGNORECASE
    )
    if fenced:
        candidates.append(fenced.group(1))

    # 2. Raw reply, minus stray backticks.
    candidates.append(text.strip("`"))

    # 3. Outermost brace-delimited span.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    print("Could not parse JSON from model response. Returning None.")
    return None

In [None]:
def query(image_path, annotation_text=None):
    """
    Send one image together with PROMPT to the configured MODEL via OpenRouter.

    Rate-limit (429) and transient server errors (5xx) as well as network
    failures are retried with exponential back-off, using the module-level
    ``max_retries`` and ``base_delay`` settings.

    Args:
        image_path: path of the image file; it is read as bytes and embedded
            as a base64 ``data:image/jpeg`` URL.
        annotation_text: accepted for backward compatibility; not used.

    Returns:
        The message content of the first choice, or None when the request
        ultimately fails or the response body has an unexpected shape.
    """
    # Open the image file and encode it in base64
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

    payload = {
        # Use the model selected in the Vars cell rather than a hard-coded name,
        # so results are attributed to the model that actually produced them.
        "model": MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}},
                ],
            }
        ],
    }

    headers = {
        "Authorization": f"Bearer {os.environ['API_KEY']}",
        "Content-Type": "application/json",
    }

    retryable_statuses = {429, 500, 502, 503, 504}
    request_timeout = 120  # seconds; without a timeout a stalled connection hangs forever
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        response = None
        try:
            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers=headers,
                data=json.dumps(payload),
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            print("Error: request failed:", exc)

        if response is not None:
            if response.status_code == 200:
                try:
                    return response.json()["choices"][0]["message"]["content"]
                except (ValueError, KeyError, IndexError, TypeError):
                    # A 200 can still carry an error object instead of choices.
                    print("Error: unexpected response body:", response.text)
                    return None

            print("Error:", response.status_code, response.text)
            if response.status_code not in retryable_statuses:
                return None

        # Back off before the next attempt: base_delay, 2x, 4x, ...
        if attempt < attempts - 1:
            time.sleep(base_delay * (2 ** attempt))

    return None


## Benchmarking

In [None]:
results = []

# Resume support: reload whatever an earlier (possibly interrupted) run saved.
if os.path.exists(output_path):
    with open(output_path, "r") as f:
        try:
            results = json.load(f)
        except json.JSONDecodeError:
            results = []

# The per-model folder may not exist on a first run; create it up front so the
# first checkpoint write cannot fail after an API call has already been spent.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

processed_images = {r["image_filename"] for r in results}
# Unique, non-empty image names from the annotations, in a stable order.
images = sorted({graph.get("image") for graph in ground_truth if graph.get("image")})
failed_images = []

for i, image_filename in enumerate(images):

    if image_filename in processed_images:
        print("PROCESSED: ", image_filename)
        continue
    counter += 1
    image_path = os.path.join(train_dir, image_filename)
    if not os.path.exists(image_path):
        continue

    model_response = query(image_path)
    parsed_response = extract_json_from_response(model_response)

    # Skip empty / unparseable replies and replies that decoded to an error string
    # or an HTML page; they stay unprocessed so a later run can retry them.
    if not parsed_response or (isinstance(parsed_response, str) and ("Error" in parsed_response or "<html" in parsed_response.lower())):
        failed_images.append(image_filename)
        continue

    results.append({
        "image_filename": image_filename,
        "model_response": parsed_response,
    })
    processed_images.add(image_filename)

    print(f"{i} {image_filename}")

    # --- Checkpoint every iteration ---
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)



print(f"\n✅ Benchmarking complete! {len(results)} total results saved to {output_path}")
if failed_images:
    print(f"{len(failed_images)} image(s) produced no usable response; re-run this cell to retry them.")


0 000001_00b319b5f0f0.rf.16503e60d7b3e2deef95fe1461d91703.jpg
1 000004_01790997a91e.rf.d5e1f9c9738e0444fcba59bb34905a80.jpg
2 000005_017a0940ddab.rf.e7c1b3f549deb811e842478549969c5c.jpg
3 000009_0278b786a87e.rf.81fbfccc955f24ed7d14593981b85f57.jpg
4 000011_02ad9cae0975.rf.04aa5d9315a1a92617f947a7369a5dcf.jpg
5 000012_02e968ffd4c2.rf.cbb25382db5607914b351cf3fad6569d.jpg
6 000013_039c55e5bfc4.rf.9c5c15d842f86cdbdff284482312d3bb.jpg
7 000015_04006ea043ee.rf.c65bad82a50f1701d822319bba6bfdc5.jpg
8 000016_0414fc638de0.rf.cbcbadea73faf9146c7f307373e978da.jpg
Error: 429 {"error":{"message":"Provider returned error","code":429,"metadata":{"raw":"mistralai/mistral-small-3.1-24b-instruct:free is temporarily rate-limited upstream. Please retry shortly, or add your own key to accumulate your rate limits: https://openrouter.ai/settings/integrations","provider_name":"Chutes"}},"user_id":"user_34vsbuG9dTIoWxQHJPZ5kdohIhs"}
Error: 429 {"error":{"message":"Provider returned error","code":429,"metadata":

KeyboardInterrupt: 

## Parse Model Output and Annotations

In [None]:
# Per-model folder for the FILE_NAME currently selected in the Vars cell.
PROJECT_ROOT = os.path.join(TARGET_DIR, "models", FILE_NAME)

# FILE_NAME = "qwen2_5"; MODEL = "qwen/qwen2.5-vl-32b-instruct:free"
# FILE_NAME = "gemma3-4b"; MODEL = "google/gemma-3-4b-it:free" # 35.4M tokens, 4B params
# FILE_NAME = "gemma-3-12b"; MODEL = "google/gemma-3-12b-it:free", # 12B params, 15.5M tokens
# FILE_NAME = "gemma-3-27b"; MODEL = "google/gemma-3-27b-it:free", # 27B params, 2.07B tokens
# FILE_NAME = "mistralai-3.1"; MODEL = "mistralai/mistral-small-3.1-24b-instruct:free"
# FILE_NAME = "mistralai-3.2"; MODEL = "mistralai/mistral-small-3.2-24b-instruct:free"


In [None]:
## write json inference output to csv file

# Parent folder of all per-model folders; generate_csv_files() joins its own
# FILE_NAME argument onto this path.
PROJECT_ROOT = os.path.join(TARGET_DIR, "models")


def generate_csv_files(FILE_NAME):
    """
    Convert one model's benchmark JSON into CSVs and join it with the ground truth.

    Working inside ``PROJECT_ROOT/FILE_NAME`` (the process cwd is changed to it),
    this writes three files:
      * ``{FILE_NAME}_benchmark_results.csv``      - model predictions,
      * ``annotations_train.csv``                  - ground truth pivoted to one row per image,
      * ``{FILE_NAME}_benchmark_results_join.csv`` - inner join of both on ``image_filename``
        with ``_inf`` / ``_truth`` column suffixes.

    Args:
        FILE_NAME: name of the model folder and prefix of its result files.

    Reads the module-level ``PROJECT_ROOT`` and ``jsonl_path``.
    """
    columns = ["image_filename", "max", "min", "range", "title", "domain"]
    # Key in the model's JSON reply -> CSV column. Selecting by name (instead of
    # renaming whatever column order the model happened to emit) keeps values
    # in the right column even when a reply reorders, omits or adds keys.
    response_key_to_column = {
        "maximum": "max",
        "minimum": "min",
        "range": "range",
        "title": "title",
        "domain": "domain",
    }

    os.chdir(os.path.join(PROJECT_ROOT, FILE_NAME))

    ## model predictions
    with open(f"{FILE_NAME}_benchmark_results.json", encoding="utf-8") as f:
        records = json.load(f)

    inference_rows = []
    for record in records:
        response = record.get("model_response")
        if not isinstance(response, dict):
            response = {}  # non-object reply: keep the row, leave all fields empty
        row = {"image_filename": record.get("image_filename")}
        for key, column in response_key_to_column.items():
            row[column] = response.get(key)
        inference_rows.append(row)

    df_inf = pd.DataFrame(inference_rows, columns=columns)
    df_inf.to_csv(f"{FILE_NAME}_benchmark_results.csv", index=False)

    ## parsing annotations jsonl file: one line per (image, question) pair,
    ## pivoted into {image: {question: answer}}
    answers_by_image = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                json_obj = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line.strip()}")
                continue
            image_name = json_obj.get("image")
            answers_by_image.setdefault(image_name, {})[json_obj.get("prefix")] = json_obj.get("suffix")

    with open("annotations_train.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(columns)

        for img, info in answers_by_image.items():
            writer.writerow([
                img,
                info.get("What is the maximum?", ""),
                info.get("What is the minimum?", ""),
                info.get("What is the range of the y-axis? Format as min-max (No spaces)", ""),
                info.get("What is the title?", ""),
                info.get("What is the domain?", "")
            ])

    # Read the truth back from disk so it goes through the same CSV type
    # inference as any later consumer of the file.
    df_truth = pd.read_csv("annotations_train.csv")

    filename = f"{FILE_NAME}_benchmark_results_join.csv"

    # Every non-key column exists on both sides, so the suffixes produce
    # max_inf ... domain_inf followed by max_truth ... domain_truth.
    merged_df = pd.merge(
        df_inf, df_truth, on='image_filename', how='inner', suffixes=("_inf", "_truth")
    )
    merged_df.to_csv(filename, index=False)

    print("DONE: ", filename)

# Build the CSVs for one model per run; uncomment the model to (re)generate.
# Each call needs that model's {FILE_NAME}_benchmark_results.json to exist.
# generate_csv_files("qwen2_5")
# generate_csv_files("gemma3-4b")
# generate_csv_files("gemma-3-12b")
generate_csv_files("gemma-3-27b")
# generate_csv_files("mistralai-3.1")
# generate_csv_files("mistralai-3.2")



DONE:  gemma-3-27b_benchmark_results_join.csv


## Analysis

In [None]:
PROJECT_ROOT = os.path.join(TARGET_DIR, "models")


def _parse_number(text):
    """Return *text* as a float with thousands separators removed, or None if blank or not numeric."""
    if not text:
        return None
    try:
        return float(text.replace(",", ""))
    except ValueError:
        return None


def _numbers_match(left, right):
    """Compare two numeric cells; None when either side is missing or unparseable."""
    a, b = _parse_number(left), _parse_number(right)
    if a is None or b is None:
        return None
    return a == b


def _text_match(left, right):
    """Case-insensitive equality of two text cells (a missing cell counts as empty)."""
    return (left or "").lower() == (right or "").lower()


# Columns appended to every *join*.csv; an existing column is overwritten on re-run.
CORR_COLUMNS = ["max_corr", "min_corr", "range_corr", "title_corr", "domain_corr"]

# Walk every per-model folder and rewrite its joined CSV in place with
# per-field correctness flags.
for dir_name in os.listdir(PROJECT_ROOT):
    dir_path = os.path.join(PROJECT_ROOT, dir_name)

    if not os.path.isdir(dir_path):
        continue

    for file_name in os.listdir(dir_path):
        if "join" not in file_name or not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(dir_path, file_name)
        print(f"Opening: {file_path}")

        # --- Read the original CSV ---
        with open(file_path, "r", newline="", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            rows = []
            for row in reader:
                row["max_corr"] = _numbers_match(row["max_inf"], row["max_truth"])
                row["min_corr"] = _numbers_match(row["min_inf"], row["min_truth"])
                # Range is compared as exact text, title/domain case-insensitively.
                row["range_corr"] = (row["range_inf"] == row["range_truth"])
                row["title_corr"] = _text_match(row["title_inf"], row["title_truth"])
                row["domain_corr"] = _text_match(row["domain_inf"], row["domain_truth"])
                rows.append(row)

            # Preserve original fieldnames, ensuring corr columns exist at the end
            fieldnames = list(reader.fieldnames or [])
            for col in CORR_COLUMNS:
                if col not in fieldnames:
                    fieldnames.append(col)

        # --- Write updated CSV ---
        with open(file_path, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

Opening: /content/drive/MyDrive/dl-project/models/qwen2_5/qwen2_5_benchmark_results_join.csv
Opening: /content/drive/MyDrive/dl-project/models/gemma3-4b/gemma3-4b_benchmark_results_join.csv
Opening: /content/drive/MyDrive/dl-project/models/gemma-3-12b/gemma-3-12b_benchmark_results_join.csv
Opening: /content/drive/MyDrive/dl-project/models/gemma-3-27b/gemma-3-27b_benchmark_results_join.csv
Opening: /content/drive/MyDrive/dl-project/models/mistralai-3.2/mistralai-3.2_benchmark_results_join.csv
Opening: /content/drive/MyDrive/dl-project/models/mistralai-3.1/mistralai-3.1_benchmark_results_join.csv


In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting Levenshtein==0.27.3 (from python-Levenshtein)
  Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.3->python-Levenshtein)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.3-py3-none-any.whl (9.5 kB)
Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packa

In [None]:
import csv
from google.colab import userdata
from io import BytesIO
import json
import os
import numpy as np
import pandas as pd
import re
import requests
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import HuberRegressor
from sklearn.datasets import make_regression
import Levenshtein
import regex as re


# Mount Google Drive at /content/drive; the paths defined in the next cell
# point inside this mount.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Project root on the mounted Drive and the parent folder of the per-model
# folders that evaluate() reads from.
TARGET_DIR = "/content/drive/MyDrive/dl-project"
os.makedirs(TARGET_DIR, exist_ok=True)
PROJECT_ROOT = os.path.join(TARGET_DIR, "models")


In [None]:
def _to_float_series(series):
    """Return *series* as floats; thousands separators are stripped and non-numeric cells become NaN."""
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)
    cleaned = series.astype(str).str.replace(",", "", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")


def _parse_range_bounds(ranges):
    """
    Split 'min-max' strings into (lower, upper) float arrays.

    Both arrays always have one entry per input row (NaN where the cell does
    not look like 'min-max'), so predictions and ground truth stay row-aligned.
    Negative bounds such as '-10-50' are accepted.
    """
    number = r"-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)"
    pattern = re.compile(r"^\s*(" + number + r")\s*-\s*(" + number + r")\s*$")

    lowers, uppers = [], []
    for value in ranges:
        match = pattern.match(str(value).replace(",", ""))
        if match:
            lowers.append(float(match.group(1)))
            uppers.append(float(match.group(2)))
        else:
            lowers.append(np.nan)
            uppers.append(np.nan)
    return np.array(lowers, dtype=float), np.array(uppers, dtype=float)


def _range_normalized_error(actual, predicted, span):
    """
    Mean and mean-squared absolute error, each divided by the true y-axis span.

    Rows where any of the three values is missing/non-finite, or where the
    span is zero, are excluded. Returns (nan, nan) when no row is usable.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    span = np.asarray(span, dtype=float)

    usable = np.isfinite(actual) & np.isfinite(predicted) & np.isfinite(span) & (span != 0)
    if not usable.any():
        return float("nan"), float("nan")

    ratio = np.abs(predicted[usable] - actual[usable]) / span[usable]
    return np.mean(ratio), np.mean(ratio ** 2)


def evaluate(FILE_NAME):
    """
    Score one model's joined predictions and upsert the row in results/analysis.csv.

    Reads ``PROJECT_ROOT/FILE_NAME/{FILE_NAME}_benchmark_results_join.csv``
    (the process cwd is changed to that folder) and computes:
      * ``f1_domain``            - weighted F1 of predicted vs. true domain; a
                                   missing value is scored as its own (empty) label,
      * ``title_lev_distance`` / ``title_lev_similarity`` - mean Levenshtein
                                   distance / ratio between the title strings,
      * ``*_s_mape`` / ``*_s_mape_sq`` - for max, min and the lower/upper y-axis
                                   bound: mean (squared) absolute error divided by
                                   the true y-axis span. Despite the key names this
                                   is a range-normalized error, not a symmetric MAPE.

    Args:
        FILE_NAME: name of the model folder and prefix of its joined CSV.

    Raises:
        ValueError: if the joined CSV contains no rows.
    """
    os.chdir(os.path.join(PROJECT_ROOT, FILE_NAME))
    df = pd.read_csv(f"{FILE_NAME}_benchmark_results_join.csv")

    n = len(df)
    if n == 0:
        raise ValueError(f"No rows to evaluate in {FILE_NAME}_benchmark_results_join.csv")

    # remove commas and convert to floats (unparseable cells become NaN)
    for column in ("max_inf", "max_truth", "min_inf", "min_truth"):
        df[column] = _to_float_series(df[column])

    ## domain
    # f1_score cannot compare a mix of strings and NaN, so a missing label is
    # replaced by the empty string and therefore counts as a wrong class.
    domain_act = df["domain_truth"].fillna("").astype(str)
    domain_pred = df["domain_inf"].fillna("").astype(str)
    f1_domain = f1_score(domain_act, domain_pred, average="weighted")

    ## title
    # str() keeps the historical treatment of a missing title as the text "nan".
    title_pairs = [(str(act), str(pred)) for act, pred in zip(df["title_truth"], df["title_inf"])]
    distance = sum(Levenshtein.distance(act, pred) for act, pred in title_pairs) / n
    similarity = sum(Levenshtein.ratio(act, pred) for act, pred in title_pairs) / n

    ## range
    lower_pred, upper_pred = _parse_range_bounds(df["range_inf"])
    lower_act, upper_act = _parse_range_bounds(df["range_truth"])
    range_act = upper_act - lower_act  # true y-axis span, the normalizer for every error below

    ## max / min
    max_s_mape, max_s_mape_sq = _range_normalized_error(df["max_truth"], df["max_inf"], range_act)
    min_s_mape, min_s_mape_sq = _range_normalized_error(df["min_truth"], df["min_inf"], range_act)

    ## lower / upper axis bound
    s_mape_lower, s_mape_lower_sq = _range_normalized_error(lower_act, lower_pred, range_act)
    s_mape_upper, s_mape_upper_sq = _range_normalized_error(upper_act, upper_pred, range_act)

    results_dict = {
        "file_name": FILE_NAME,
        "f1_domain": f1_domain,  ## f1 score for domain
        "title_lev_distance": distance,  ## calculated Levenshtein distance for title
        "title_lev_similarity": similarity,  ## Levenshtein similarity
        "max_s_mape": max_s_mape,
        "min_s_mape": min_s_mape,
        "lower_s_mape": s_mape_lower,
        "upper_s_mape": s_mape_upper,
        "max_s_mape_sq": max_s_mape_sq,
        "min_s_mape_sq": min_s_mape_sq,
        "lower_s_mape_sq": s_mape_lower_sq,
        "upper_s_mape_sq": s_mape_upper_sq,
    }

    result_df = pd.DataFrame([results_dict])
    analysis_path = os.path.join(TARGET_DIR, "results", "analysis.csv")
    os.makedirs(os.path.dirname(analysis_path), exist_ok=True)

    if os.path.exists(analysis_path):
        existing = pd.read_csv(analysis_path)

        # If the file_name already exists, replace its row
        if "file_name" in existing.columns and FILE_NAME in existing["file_name"].values:
            row_mask = existing["file_name"] == FILE_NAME
            # Assign by column name so a differently ordered analysis.csv
            # cannot receive values in the wrong column.
            for column, value in results_dict.items():
                existing.loc[row_mask, column] = value
            updated = existing
        else:
            updated = pd.concat([existing, result_df], ignore_index=True)

        updated.to_csv(analysis_path, index=False)
    else:
        result_df.to_csv(analysis_path, index=False)


# Score every benchmarked model; each call adds or replaces that model's row
# in results/analysis.csv and requires its *_benchmark_results_join.csv.
evaluate("qwen2_5")
evaluate("gemma3-4b")
evaluate("gemma-3-12b")
evaluate("gemma-3-27b")
evaluate("mistralai-3.1")
evaluate("mistralai-3.2")