In [None]:
import json
import requests
from itertools import combinations
from deepdiff import DeepDiff
import pymupdf
from glob import glob
from dotenv import load_dotenv
import os
from pathlib import Path
from typing import List
from pydantic import BaseModel, Field, ConfigDict
import yaml
from time import sleep, time

In [None]:
class ExtractMetadata(BaseModel):
    """
    Structured metadata for an academic publication.
    """
    # NOTE: this notebook passes ExtractMetadata.model_json_schema() to the LLM
    # as the structured-output schema. Pydantic copies the class docstring and
    # every Field description into that schema, so rewording any of them changes
    # what is sent to the models.
    #
    # extra="forbid" rejects unknown keys on validation; in the generated JSON
    # schema it shows up as "additionalProperties": false.
    model_config = ConfigDict(extra="forbid")
    # Every field below uses `...` as its default, i.e. it is required.
    title: str = Field(
        ...,
        description="The full name identifying the academic publication.",
    )
    authors: List[str] = Field(
        ...,
        description="The names of individuals who wrote the publication.",
    )
    affiliations: List[str] = Field(
        ...,
        description="Institutions or organizations associated with the authors.",
    )
    email_ids: List[str] = Field(
        ...,
        description="Contact email IDs of the authors.",
    )
    # Kept as a plain string: the DD-MM-YYYY format is only requested through
    # the description, it is not validated here.
    publication_date: str = Field(
        ...,
        description="The date when the publication was officially published in DD-MM-YYYY format.",
    )
    publisher: str = Field(
        ...,
        description="The organization responsible for publishing the document.",
    )
    doi: str = Field(
        ...,
        description="A unique digital object identifier linking directly to the publication online.",
    )
    keywords: List[str] = Field(
        ...,
        description="Specific terms highlighting the main topics of the publication.",
    )
    abstract: str = Field(
        ...,
        description="A brief summary outlining the publication’s content, methods, and findings.",
    )

In [None]:
# Load environment variables from a local .env file, if one exists.
load_dotenv()

# Read the notebook configuration. The encoding is given explicitly so the
# prompts decode the same way on every platform instead of depending on the
# locale default (YAML streams are expected to be Unicode).
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# System prompts for the extraction models and for the judge model.
# evaluate_metadata_system_prompt is used as a str.format template with a
# `fields` placeholder further down in the notebook.
extract_metadata_system_prompt = config["prompts"]["extract_metadata_system_prompt"]
evaluate_metadata_system_prompt = config["prompts"]["evaluate_metadata_system_prompt"]

# `models` is iterated to run one extraction per model; JUDGE_MODEL resolves
# the fields those models disagree on.
models = config["models"]['extract_metadata']
JUDGE_MODEL = config["models"]['evaluate_metadata']

# Names of the metadata fields compared across model outputs.
METADATA_FIELDS = config["metadata_fields"]

In [None]:
# OpenRouter chat-completions endpoint that every LLM request is posted to.
API_URL = "https://openrouter.ai/api/v1/chat/completions"

# The key is read from the OPENAI_API_KEY environment variable; it is None
# when that variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENAI_API_KEY")

# HTTP headers shared by all requests.
headers = {}
headers["Authorization"] = f"Bearer {OPENROUTER_API_KEY}"
headers["Content-Type"] = "application/json"

In [None]:
# Directory that holds the source PDFs.
pdf_dir = Path("../data/metadata_extraction_data/pdf")

# Directory for the per-PDF metadata JSON files; it is created (including any
# missing parents) when it does not exist yet.
output_dir = Path("../data/metadata_extraction_data/metadata")
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def extract_text_from_pdf(pdf_path, max_pages=None):
    """Extract the text of a PDF with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Optional upper bound on the number of pages to read,
            counted from the first page. ``None`` (the default) reads every
            page; ``0`` yields an empty string.

    Returns:
        The text of the selected pages, joined with newlines.

    Raises:
        ValueError: If ``max_pages`` is negative.
    """
    if max_pages is not None and max_pages < 0:
        raise ValueError("max_pages must be None or a non-negative integer")

    page_texts = []
    with pymupdf.open(pdf_path) as doc:
        for page_index, page in enumerate(doc.pages()):
            # Stop as soon as the requested number of pages has been collected.
            if max_pages is not None and page_index >= max_pages:
                break
            page_texts.append(page.get_text())
    return "\n".join(page_texts)

In [None]:
def query_llm(model, system_prompt, user_prompt, name, response_schema, timeout=300, delay=10):
    """Send a chat completion request to the specified model via OpenRouter and return the assistant's response.

    Args:
        model: Model identifier placed in the request payload.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        name: Name given to the JSON schema in ``response_format``.
        response_schema: JSON schema the reply is requested to follow
            (sent with ``strict`` enabled).
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.
        delay: Seconds to pause after a successful call, spacing out
            consecutive requests. ``0`` disables the pause.

    Returns:
        The ``content`` of the first choice's message, as returned by the API.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": name,
                "strict": True,
                "schema": response_schema
            }
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if response.status_code != 200:
        # Show the error body for debugging. It is not guaranteed to be JSON
        # (e.g. a gateway error page), so fall back to the raw text instead of
        # letting a decode error hide the HTTP error raised just below.
        try:
            print(response.json())
        except ValueError:
            print(response.text)
    response.raise_for_status()
    data = response.json()
    content = data["choices"][0]["message"]["content"]
    if delay:
        sleep(delay)
    return content

In [None]:
def field_level_agreement(model_dicts):
    """Split the metadata fields into unanimous and disputed ones.

    A field counts as agreed only when every pair of models produced the same
    value for it (compared with ``DeepDiff(..., ignore_order=True)``). This
    holds for any number of models; a single model trivially agrees with
    itself.

    Args:
        model_dicts: Mapping of model name to that model's parsed metadata
            dict. An entry that is not a dict (``None``, a raw unparsed
            string, ...) is treated as having no fields.

    Returns:
        A tuple ``(agreed, disagreements)``:
        ``agreed`` maps each unanimous field to its value (taken from the
        first model); ``disagreements`` maps each model name to
        ``{field: value}`` for the disputed fields and is empty when nothing
        is disputed. A field a model did not provide is reported as ``""``.
    """
    # Anything that is not a JSON object cannot be compared field by field,
    # so it contributes only missing ("") values.
    parsed = {m: (d if isinstance(d, dict) else {}) for m, d in model_dicts.items()}
    names = list(parsed)

    agreed = {}
    disagreements = {m: {} for m in names}
    if not names:
        return agreed, disagreements

    for field in METADATA_FIELDS:
        values = {m: parsed[m].get(field, "") for m in names}

        # Unanimous means *all* pairs match. Counting matching pairs against
        # the number of models only lines up when there are exactly three.
        unanimous = all(
            not DeepDiff(values[m_x], values[m_y], ignore_order=True)
            for m_x, m_y in combinations(names, 2)
        )
        if unanimous:
            agreed[field] = values[names[0]]
        else:
            for m, val in values.items():
                disagreements[m][field] = val

    # Drop models without any disputed field (happens only when nothing is disputed).
    disagreements = {m: v for m, v in disagreements.items() if v}
    return agreed, disagreements

In [None]:
def build_judge_prompt(excerpt_text, disagreements):
    """
    Assemble the user prompt handed to the judge model.

    The prompt opens with the paper text and is followed by one section per
    model that lists only the fields that model is in dispute over, rendered
    as indented JSON.
    """
    header = f"Paper Text (Excerpt):\n{excerpt_text}\n"
    disputed_sections = [
        f"Disputed fields from {model_name}:\n{json.dumps(fields, indent=2)}"
        for model_name, fields in disagreements.items()
    ]
    return "\n".join([header, *disputed_sections])

In [None]:
# Collect the PDFs from pdf_dir rather than repeating its path literal, and
# sort them: glob() returns matches in arbitrary order, sorting keeps the
# processing order stable between runs.
pdf_files = sorted(glob(str(pdf_dir / "*.pdf")))

In [None]:
# Run the extract -> compare -> judge pipeline for every PDF that does not
# have a saved metadata file yet. Results are written to output_dir and also
# collected in all_metadata (keyed by PDF path).
all_metadata = {}
for pdf_path in pdf_files:
    text_file_path = Path(f"{output_dir / Path(pdf_path).stem}.json")
    if text_file_path.is_file():
        # Result of an earlier run; skip the LLM calls for this PDF.
        print(f"Metadata already exists: {text_file_path}")
        continue

    pdf_text = extract_text_from_pdf(pdf_path)

    # One extraction per model.
    model_outputs = {}
    for model in models:
        raw_response = query_llm(model, extract_metadata_system_prompt, pdf_text, "ExtractMetadata", ExtractMetadata.model_json_schema())

        try:
            metadata = json.loads(raw_response)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a reply whose content is None.
            metadata = None
        if not isinstance(metadata, dict):
            print(f"Something went wrong with metadata extraction for {pdf_path} with {model}")
            metadata = None

        # Store None (not the raw string) for an unusable reply: the agreement
        # check reads fields with .get(), which only works on dicts.
        model_outputs[model] = metadata

    agreed, disagreements = field_level_agreement(model_outputs)

    if not disagreements:
        final_metadata = agreed
        final_analysis  = "All three models were in complete agreement for every field."
    else:
        judge_user_prompt = build_judge_prompt(pdf_text, disagreements)

        # Every model reports the same set of disputed fields, so the first
        # model's keys describe what the judge has to decide.
        disagreed_fields = list(disagreements[models[0]])

        # Restrict the response schema to the disputed fields only.
        res_schema = ExtractMetadata.model_json_schema()
        res_schema["required"] = disagreed_fields
        for k in list(res_schema["properties"].keys()):
            if k not in disagreed_fields:
                res_schema["properties"].pop(k, None)

        # Format into a *local* name. Assigning the result back to
        # evaluate_metadata_system_prompt would destroy the template, and every
        # later PDF would be judged with the first PDF's field list.
        fields_str = ", ".join(disagreed_fields)
        judge_system_prompt = evaluate_metadata_system_prompt.format(fields=fields_str)

        judge_reply = query_llm(JUDGE_MODEL, judge_system_prompt, judge_user_prompt, "JudgeMetadata", res_schema)
        try:
            judged_meta = json.loads(judge_reply)
        except (json.JSONDecodeError, TypeError):
            judged_meta = None
        if not isinstance(judged_meta, dict):
            # Without a usable verdict only the agreed fields are kept.
            print(f"Something went wrong with metadata judge for {pdf_path}")
            judged_meta = {}
        final_metadata = {**agreed, **judged_meta}

    all_metadata[pdf_path] = final_metadata

    with open(text_file_path, 'w', encoding='utf-8') as f:
        json.dump(final_metadata, f, ensure_ascii=False, indent=4)
    print(f"Extracted metadata to: {text_file_path}")