In [1]:
# Initial set up
!pip install fastapi uvicorn nest_asyncio pyngrok requests



Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [3]:
!pip install fireworks-ai

Collecting fireworks-ai
  Downloading fireworks_ai-0.19.19-py3-none-any.whl.metadata (2.4 kB)
Collecting httpx-ws (from fireworks-ai)
  Downloading httpx_ws-0.7.2-py3-none-any.whl.metadata (9.3 kB)
Collecting mmh3>=4.1.0 (from fireworks-ai)
  Downloading mmh3-5.2.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (14 kB)
Collecting betterproto-fw>=2.0.3 (from betterproto-fw[compiler]>=2.0.3->fireworks-ai)
  Downloading betterproto_fw-2.0.3-py3-none-any.whl.metadata (18 kB)
Collecting asyncstdlib-fw>=3.13.2 (from fireworks-ai)
  Downloading asyncstdlib_fw-3.13.2-py3-none-any.whl.metadata (5.0 kB)
Collecting protobuf==5.29.3 (from fireworks-ai)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting rich>=14.0.0 (from fireworks-ai)
  Downloading rich-14.1.0-py3-none-any.whl.metadata (18 kB)
Collecting attrs==23.2.0 (from fireworks-ai)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting aiod

In [5]:
import os

# Paste your key between the quotes, or leave it empty to rely on a key that is
# already present in the process environment / a .env file.
FIREWORKS_API_KEY = ""
FIREWORKS_SDK_DEBUG = True

# A plain Python variable is invisible to os.getenv(); export the key so that the
# evaluation cell's os.getenv("FIREWORKS_API_KEY") actually sees it.
if FIREWORKS_API_KEY:
    os.environ["FIREWORKS_API_KEY"] = FIREWORKS_API_KEY
# NOTE(review): presumably the Fireworks SDK reads this flag from the
# environment rather than from a notebook variable — confirm against SDK docs.
os.environ["FIREWORKS_SDK_DEBUG"] = str(FIREWORKS_SDK_DEBUG)

In [None]:
# simple_eval.py
import os, base64, glob, json
from dotenv import load_dotenv
from fireworks import LLM

# Merge variables from a local .env file (if any) into the process environment,
# then read the Fireworks key from there; API_KEY is None when it is not set.
load_dotenv()
API_KEY = os.environ.get("FIREWORKS_API_KEY")

# 🔹 Fill in your Fireworks model IDs here
# Each ID is passed to LLM(model=...) and also names the results_<id>.json file
# that run() writes, so the spelling must match the paths the scoring cell loads.
MODELS = [
    "llama4-maverick-instruct-basic",
    "llama4-scout-instruct-basic",
    "qwen2p5-vl-32b-instruct",
]

# glob returns paths in arbitrary (filesystem) order, while scoring pairs
# prediction i with eval record i — sort so the selection and its order are
# reproducible. NOTE: eval.json must list the documents in this same order.
IMAGES = sorted(glob.glob("images/*.*"))[:10]  # pick your 10 images

# Instruction sent alongside every image; the requested keys are the same ones
# listed in FIELDS in the scoring cell.
PROMPT = (
    "Extract the following fields from this ID document.\n"
    "Return only JSON with keys: name, dob (YYYY-MM-DD), issuing_country (ISO3),\n"
    "id_number, expiry_date (YYYY-MM-DD), address.\n"
    "If a field is missing, set it to null."
)

def encode_image(path):
    """Return the raw bytes of the file at `path` as a base64 ASCII string."""
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("ascii")

def _image_data_url(path):
    """Build a base64 `data:` URL for `path`, labelled with its guessed MIME type."""
    import mimetypes  # local import so this cell stays self-contained

    mime, _encoding = mimetypes.guess_type(path)
    # IMAGES is globbed with "*.*", so files are not necessarily JPEGs. Fall back
    # to the previously hard-coded type when the extension is unknown/non-image.
    if not mime or not mime.startswith("image/"):
        mime = "image/jpeg"
    return f"data:{mime};base64,{encode_image(path)}"


def run():
    """Query every model in MODELS with every image in IMAGES and save the replies.

    For each model, one chat completion is requested per image (the image as a
    data URL plus PROMPT, capped at 500 tokens). Each reply is printed and
    collected as {"image": <path>, "output": <reply text>}; the list is then
    written to results_<model id>.json in the current working directory.
    """
    for model in MODELS:
        print(f"\n=== {model} ===")
        llm = LLM(model=model, api_key=API_KEY)

        outputs = []
        for img in IMAGES:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": _image_data_url(img)}},
                    {"type": "text", "text": PROMPT},
                ],
            }]
            resp = llm.chat.completions.create(messages=messages, max_tokens=500)
            content = resp.choices[0].message.content
            print(f"{os.path.basename(img)} → {content}")
            outputs.append({"image": img, "output": content})

        # Keep only the last path component so an ID containing "/" still
        # yields a flat file name.
        with open(f"results_{model.split('/')[-1]}.json", "w") as f:
            json.dump(outputs, f, indent=2)

# Start the evaluation only when this code runs as the main module (which is the
# case for a notebook cell or `python simple_eval.py`), not when it is imported.
if __name__ == "__main__":
    run()


In [29]:
# score_colab.py (run this cell in Colab)

import json, re, os
from typing import List, Dict, Any

# ---- Fields that get scored (the same keys the extraction prompt asks for) ----
FIELDS = [
    "name",
    "dob",
    "issuing_country",
    "id_number",
    "expiry_date",
    "address",
]

# ---------- Loaders & cleaners ----------
def load_json(path: str) -> Any:
    """Read the UTF-8 text file at `path` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def extract_json_block(text: str) -> Dict[str, Any]:
    """Pull the first {...} block out of a messy string like '```json { ... } ```'.

    Parsing starts at the first "{" and stops where that JSON object ends, so
    code fences or commentary after the object (even commentary that itself
    contains braces) no longer invalidate an otherwise well-formed answer.

    Args:
        text: Raw model output; anything that is not a str is treated as empty.

    Returns:
        The decoded object as a dict, or {} when `text` is not a string,
        contains no "{", or the text from the first "{" is not valid JSON.
    """
    if not isinstance(text, str):
        return {}
    start = text.find("{")
    if start == -1:
        return {}
    try:
        # raw_decode parses one JSON value beginning at `start` and ignores
        # whatever follows it, unlike json.loads which rejects trailing data.
        obj, _end = json.JSONDecoder().raw_decode(text, start)
    except (ValueError, RecursionError):
        # Best-effort by design: an unparseable reply scores as "all missing".
        return {}
    return obj if isinstance(obj, dict) else {}

def load_predictions_anyshape(path: str) -> List[Dict[str, Any]]:
    """
    Load a predictions file and return one plain dict of fields per record.

    Supports two record shapes, decided per record (not from the first record
    only, so one odd entry cannot change how the whole file is interpreted):
      1) A plain dict that already holds the fields — passed through unchanged.
      2) {image, output: '```json { ... } ```'} from the model runs — the JSON
         object is extracted from `output`; an unparseable or non-string
         `output` becomes {} so the record still occupies its index.
    The image path, when present, is kept under "image" for human-friendly
    tables.

    Raises:
        ValueError: if the file's top-level JSON value is not an array.
    """
    data = load_json(path)
    if not data:
        return []
    if not isinstance(data, list):
        raise ValueError(f"{path}: expected a JSON array of records, got {type(data).__name__}")

    cleaned = []
    for rec in data:
        if isinstance(rec, dict) and "output" in rec:
            obj = extract_json_block(rec["output"])
            # Keep image name for human-friendly tables (optional)
            if "image" in rec:
                obj["image"] = rec["image"]
            cleaned.append(obj)
        else:
            # Already a plain record; keep it in place to preserve alignment.
            cleaned.append(rec)
    return cleaned

# ---------- Normalization (simple & friendly) ----------
def norm(v):
    """Canonicalise a field value for comparison.

    None stays None; anything else is converted to str, stripped of
    surrounding whitespace and upper-cased.
    """
    return None if v is None else str(v).strip().upper()

def normalize_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Return a dict with exactly the FIELDS keys, each value passed through norm().

    Keys missing from `rec` come out as None; keys not in FIELDS are dropped.
    """
    normalized = {}
    for field in FIELDS:
        normalized[field] = norm(rec.get(field))
    return normalized

# ---------- Scoring ----------
def compare_by_index(gt: List[Dict[str, Any]], pr: List[Dict[str, Any]]):
    """
    Score predictions against the eval set by position: record i vs record i.

    Both lists are normalized first; only the first min(len(gt), len(pr))
    pairs are compared. A field counts as correct when the normalized values
    are equal (two missing values, both None, also match).

    Returns a 3-tuple:
      - dict of per-field accuracy in percent, plus "ALL_FIELDS_EXACT" (share
        of documents with every field correct),
      - list of display rows {"#": 1-based index, field: "eval  /  model"},
      - the number of document pairs compared.
    """
    # Normalize everything up front
    truth_rows = [normalize_record(r) for r in gt]
    pred_rows = [normalize_record(r) for r in pr]

    n = min(len(truth_rows), len(pred_rows))
    totals = dict.fromkeys(FIELDS, 0)
    correct = dict.fromkeys(FIELDS, 0)
    all_exact = 0
    side_by_side_rows = []  # kept for human inspection

    # zip() stops at the shorter list, i.e. after exactly n pairs
    for doc_no, (truth, pred) in enumerate(zip(truth_rows, pred_rows), start=1):
        row_display = {"#": doc_no}
        mismatches = 0
        for field in FIELDS:
            expected = truth.get(field)
            got = pred.get(field)
            # Missing/empty values are shown as ∅ in the table
            row_display[field] = f"{expected or '∅'}  /  {got or '∅'}"
            totals[field] += 1
            if expected == got:
                correct[field] += 1
            else:
                mismatches += 1
        if mismatches == 0:
            all_exact += 1
        side_by_side_rows.append(row_display)

    acc = {}
    for field in FIELDS:
        if totals[field]:
            acc[field] = 100.0 * correct[field] / totals[field]
        else:
            acc[field] = 0.0
    acc["ALL_FIELDS_EXACT"] = 100.0 * all_exact / n if n else 0.0
    return acc, side_by_side_rows, n

def print_accuracy_table(model_label: str, acc: Dict[str, float], n_docs: int):
    """Print a header for `model_label`, the number of compared docs, and one
    aligned percentage line per field followed by ALL_FIELDS_EXACT."""
    print(f"\n=== {model_label} ===")
    print(f"Docs compared: {n_docs}")
    # The summary metric uses the same layout as the per-field lines
    for metric in [*FIELDS, "ALL_FIELDS_EXACT"]:
        print(f"{metric:16s}: {acc[metric]:5.1f}%")

def print_side_by_side(model_label: str, rows: List[Dict[str, str]], max_rows=None):
    """Print the eval-vs-model comparison rows as a pipe-separated table.

    `rows` are the display rows produced by compare_by_index. When `max_rows`
    is given, printing stops once that many rows have been shown; None prints
    every row.
    """
    print(f"\n-- {model_label}: Eval vs Model (eval  /  model) --")
    column_names = ["#"] + FIELDS
    print(" | ".join(name.upper() for name in column_names))
    print("-" * 110)
    for shown, row in enumerate(rows):
        if max_rows is not None and shown >= max_rows:
            break
        cells = [str(row['#'])]
        cells.extend(row[f] for f in FIELDS)
        print(" | ".join(cells))

# ---------- Main (edit your paths here) ----------
# Ground-truth file and one results file per model. The results_*.json names
# follow the pattern written by run() in the evaluation cell; the /content
# prefix is where this Colab session keeps them.
eval_path = "/content/eval.json"
model_paths = [
    "/content/results_llama4-maverick-instruct-basic.json",
    "/content/results_llama4-scout-instruct-basic.json",
    "/content/results_qwen2p5-vl-32b-instruct.json",
]

# Load eval (already a clean array of dicts)
eval_data = load_json(eval_path)

# Score each model
# best_score starts below any reachable percentage so the first model always
# becomes the initial best.
best_label, best_score = None, -1.0
for mp in model_paths:
    # Predictions are matched to eval records purely by position in the list.
    preds = load_predictions_anyshape(mp)
    acc, rows, n_docs = compare_by_index(eval_data, preds)

    label = os.path.basename(mp)
    print_accuracy_table(label, acc, n_docs)
    # Side-by-side for quick judgment (print all docs)
    print_side_by_side(label, rows)

    # Strict ">" means that on a tie the earlier model in model_paths wins.
    if acc["ALL_FIELDS_EXACT"] > best_score:
        best_score = acc["ALL_FIELDS_EXACT"]
        best_label = label

print(f"\n>>> BEST MODEL: {best_label}  ({best_score:.1f}% ALL_FIELDS_EXACT)")



=== results_llama4-maverick-instruct-basic.json ===
Docs compared: 9
name            :  55.6%
dob             : 100.0%
issuing_country : 100.0%
id_number       :  66.7%
expiry_date     :  66.7%
address         :  44.4%
ALL_FIELDS_EXACT:  11.1%

-- results_llama4-maverick-instruct-basic.json: Eval vs Model (eval  /  model) --
# | NAME | DOB | ISSUING_COUNTRY | ID_NUMBER | EXPIRY_DATE | ADDRESS
--------------------------------------------------------------------------------------------------------------
1 | IMA CARDHLDER  /  IMA CARDHOLDER | 1977-08-31  /  1977-08-31 | USA  /  USA | 000123456789  /  11234568 | 2024-08-31  /  2014-08-31 | 2570 24TH STREET ANYTOWN, CA 95818  /  2570 24TH STREET ANYTOWN, CA 95818
2 | SAMPLE JAICE ANN  /  JANICE ANN SAMPLE | 2005-01-07  /  2005-01-07 | USA  /  USA | ∅  /  99999999 | 2026-01-08  /  2026-01-08 | 123 MAIN STREET APT1 HARRISBURG PA 17101-0000  /  123 MAIN STREET APT.1 HARRISBURG, PA 17101-0000
3 | JOHN Q PUBLIC  /  JOHN Q PUBLIC | 1952-05-28  /