<a href="https://colab.research.google.com/github/sankalpj30/article_metadata_extraction/blob/pydantic-opik-setup/Gemini_Article_MetadataExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade opik google-genai litellm pydantic




In [9]:
from google import genai
from google.genai import client, types
import os, json, re, io, pathlib
from google.colab import files
from opik import track
from opik.integrations.genai import track_genai

from pydantic import BaseModel, Field, HttpUrl, ValidationError
from typing import List, Optional
from datetime import date

import opik


In [10]:

# Configure the Opik SDK with use_local=False.
# NOTE(review): presumably this targets the hosted Opik service rather than a
# locally running deployment — confirm against the Opik documentation.
opik.configure(use_local=False)

OPIK: Existing Opik clients will not use updated values for "url", "api_key", "workspace".
OPIK: Opik is already configured. You can check the settings by viewing the config file at /root/.opik.config


In [36]:
# Logging helper and Gemini client setup.
# Never commit a real API key in a notebook: load it from an environment
# variable or a secrets manager instead.
def log(msg):
    """Print ``msg`` as a highlighted log entry followed by an 80-dash rule.

    The entry is preceded by a blank line and prefixed with a memo emoji.
    """
    header = f"\n📝 {msg}"
    rule = "-" * 80
    print(header, rule, sep="\n")

# Build the Gemini client without embedding a secret in the notebook.
# The key is taken from GEMINI_API_KEY or GOOGLE_API_KEY when set; otherwise the
# user is prompted for it (the input is not echoed and is not stored in the file).
# SECURITY: any key that was previously committed in this cell must be treated
# as compromised and revoked/rotated.
import getpass

_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    _gemini_api_key = getpass.getpass("Gemini API key: ")

client = genai.Client(api_key=_gemini_api_key)

# Project name read by Opik from the environment for the traces logged below.
os.environ["OPIK_PROJECT_NAME"] = "gemini-article-metadata-extraction-demo"


Define Pydantic Models

In [37]:
class Author(BaseModel):
    """A single article author, identified only by a full-name string."""

    # Author name as one string (e.g. "Author One" in the extraction prompt).
    full_name: str

class Reference(BaseModel):
    """One entry of an article's reference list.

    ``authors``, ``title``, ``journal`` and ``year`` are required; the
    remaining bibliographic details are optional and default to ``None``.
    """

    # Required fields (``...`` marks a field without a default).
    authors: List[str] = Field(
        ...,
        description="List of authors in 'Last F.M.' format",
    )
    title: str = Field(
        ...,
        description="Title of the referenced work",
    )
    journal: str = Field(
        ...,
        description="Journal or conference name",
    )
    year: int = Field(
        ...,
        description="Year of publication",
    )

    # Optional fields.
    volume: Optional[str] = Field(default=None, description="Volume number")
    issue: Optional[str] = Field(default=None, description="Issue number")
    pages: Optional[str] = Field(default=None, description="Page range")
    doi: Optional[str] = Field(default=None, description="Digital Object Identifier")

class CitationMetadata(BaseModel):
    """Bibliographic metadata extracted for one article.

    Only ``title`` and ``authors`` are required; every other field is optional
    and defaults to ``None`` when the value is absent.
    """

    # Required: article title and its authors (as Author models).
    title: str
    authors: List[Author]
    # Optional publication details.
    journal: Optional[str] = None
    year: Optional[int] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    pages: Optional[str] = None
    doi: Optional[str] = None
    # Optional content descriptors.
    abstract: Optional[str] = None
    keywords: Optional[List[str]] = None
    # Optional list of cited works, each validated as a Reference.
    references: Optional[List[Reference]] = None

JSON Cleaner Utility

In [38]:
def clean_and_load_json(text: str) -> dict:
    """
    Removes any markdown formatting and loads JSON string safely.

    Handles three shapes of model output:
      * bare JSON;
      * JSON wrapped in a markdown code fence, with or without a language tag
        (any case, e.g. ```json / ```JSON), terminated or not;
      * JSON inside a code fence that is surrounded by extra commentary.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON value (a dict for the object this notebook requests).

    Raises:
        json.JSONDecodeError: If no valid JSON can be recovered from ``text``.
    """
    # Opening fence: three backticks followed by either a language tag that
    # ends its line, or an inline "json" tag glued to the payload.
    # (str.lstrip("```json") is NOT used: it strips a *set of characters*,
    # not a prefix, and can eat the start of the payload.)
    fence_open = r"```(?:[\w+-]*[ \t]*\r?\n|json)?"

    text = text.strip()

    if text.startswith("```"):
        text = re.sub(r"^" + fence_open, "", text, count=1)
        # The closing fence may be missing when the response was truncated.
        text = re.sub(r"```\s*$", "", text, count=1)
        return json.loads(text.strip())

    try:
        # Try the plain text first so that backticks inside a JSON string
        # value are never mistaken for a fence.
        return json.loads(text)
    except json.JSONDecodeError:
        fenced = re.search(fence_open + r"(.*?)```", text, re.DOTALL)
        if fenced is None:
            raise
        return json.loads(fenced.group(1).strip())

Metadata Extractor Function

In [39]:
# Optional Reference fields that the extraction prompt may fill with the
# "Not Found" placeholder; they are normalised to None before validation.
_OPTIONAL_REFERENCE_KEYS = ("volume", "issue", "pages", "doi")


def _placeholder_to_none(value):
    """Return None for the prompt's "Not Found" placeholder, else ``value`` unchanged."""
    if isinstance(value, str) and value.strip().lower() == "not found":
        return None
    return value


def _normalise_reference(ref):
    """Replace placeholder strings in a reference's optional fields with None."""
    if not isinstance(ref, dict):
        return ref  # leave unexpected shapes for pydantic to report
    cleaned = dict(ref)
    for key in _OPTIONAL_REFERENCE_KEYS:
        if key in cleaned:
            cleaned[key] = _placeholder_to_none(cleaned[key])
    return cleaned


@track
def extract_structured_metadata(response_text: str) -> dict:
    """Parse the model's response and validate it as ``CitationMetadata``.

    The prompt allows the model to answer ``"Not Found"`` or ``null`` for
    missing values, so before validation:
      * placeholder strings in optional fields become ``None`` (``title`` is
        left untouched because it is required);
      * ``authors`` given as null / placeholder becomes an empty list, a single
        name string becomes a one-element list, and placeholder entries inside
        the list are dropped.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The validated metadata as a plain dict (``CitationMetadata.model_dump()``).

    Raises:
        Exception: Whatever ``clean_and_load_json`` raises when no JSON can be
            parsed (logged, then re-raised unchanged).
        RuntimeError: If the JSON is not an object or fails model validation.
    """
    try:
        data = clean_and_load_json(response_text)
    except Exception as e:
        log(f"❌ Failed to extract JSON: {e}")
        raise

    log("✅ Raw JSON extracted:\n" + json.dumps(data, indent=2))

    if not isinstance(data, dict):
        raise RuntimeError("Metadata validation failed: expected a JSON object")

    # Work on a copy so the parsed payload itself is not mutated.
    fields = {
        key: value if key in ("title", "authors") else _placeholder_to_none(value)
        for key, value in data.items()
    }

    raw_authors = _placeholder_to_none(data.get("authors"))
    if raw_authors is None:
        raw_authors = []
    elif isinstance(raw_authors, str):
        # A bare string must not be iterated character by character.
        raw_authors = [raw_authors]

    if isinstance(fields.get("references"), list):
        fields["references"] = [_normalise_reference(ref) for ref in fields["references"]]

    try:
        if isinstance(raw_authors, list):
            fields["authors"] = [
                Author(full_name=name) if isinstance(name, str) else name
                for name in raw_authors
                if _placeholder_to_none(name) is not None
            ]
        else:
            # Unexpected type: hand it to pydantic so it is reported as a
            # validation error instead of crashing during iteration.
            fields["authors"] = raw_authors

        # CitationMetadata validates nested references itself, so no separate
        # Reference(**ref) pass is needed.
        metadata = CitationMetadata(**fields)
        log("✅ Metadata validated successfully")
        return metadata.model_dump()
    except ValidationError as ve:
        log(f"❌ Validation error: {ve}")
        raise RuntimeError("Metadata validation failed") from ve


Summarization Function

In [40]:
@track
def summarize_pdf_colab(filepath: str, model: str = "gemini-1.5-flash") -> str:
    """
    Process PDF and extract structured bibliographic metadata using Gemini API.

    Files below the inline limit are sent as inline bytes with the request;
    larger files are uploaded through the Files API first and referenced.

    Args:
        filepath: Path to the PDF on the local file system.
        model: Name of the Gemini model to call.

    Returns:
        The raw text of the model response (expected to contain one JSON
        object, possibly wrapped in a markdown fence).

    Raises:
        OSError: If ``filepath`` cannot be read (e.g. it does not exist).
        RuntimeError: If the model response contains no text.
    """
    # PDFs smaller than this are embedded directly in the request.
    inline_limit_bytes = 20 * 1024 * 1024

    path = pathlib.Path(filepath)
    size = path.stat().st_size

    instruction = """
Respond *ONLY* with a single JSON object (no markdown, no commentary):
{
  "title": "...",
  "authors": ["Author One", "Author Two"],
  "journal": "...",
  "year": 2023,
  "volume": "...",
  "issue": "...",
  "pages": "...",
  "doi": "...",
  "abstract": "...",
  "keywords": ["keyword1", "keyword2"],
  "references": [
  {
    "authors": ["Last, F.M.", "Last, F.M."],
    "title": "...",
    "journal": "...",
    "year": 2023,
    "volume": "...",
    "issue": "...",
    "pages": "...",
    "doi": "..."
  }
]
}
If a field is missing, use "Not Found" or null.
"""

    if size < inline_limit_bytes:
        pdf_bytes = path.read_bytes()
        contents = [
            types.Part.from_bytes(data=pdf_bytes, mime_type="application/pdf"),
            instruction
        ]
    else:
        # Hand the path to the SDK instead of first copying the whole (large)
        # file into an in-memory buffer.
        uploaded_file = client.files.upload(
            file=path,
            config={"mime_type": "application/pdf"}
        )
        contents = [uploaded_file, instruction]

    log("📨 Sending to Gemini model...")
    response = client.models.generate_content(
        model=model,
        contents=contents
    )
    raw = response.text
    if raw is None:
        # Fail here with a clear message rather than later with an
        # AttributeError inside the JSON cleaner.
        raise RuntimeError("Gemini returned no text in its response")
    print("===== Model Response =====")
    print(raw)
    return raw


In [41]:
# Ask for a PDF through Colab's upload widget and run the extraction pipeline on
# the first uploaded file.
uploaded = files.upload()
# First uploaded file name, or None when the upload was cancelled / empty
# (indexing list(...)[0] would raise IndexError in that case).
pdf_path = next(iter(uploaded), None)

if pdf_path is None:
    print("❌ No file was uploaded; nothing to summarize.")
else:
    try:
        raw_response = summarize_pdf_colab(pdf_path)
        structured = extract_structured_metadata(raw_response)
        print("===== Bibliographic Metadata =====")
        log("✅ Final structured metadata:")
        print(json.dumps(structured, indent=2))
    except Exception as e:
        # Best-effort demo cell: report the failure instead of a traceback.
        print("❌ Error during summarization:", str(e))

Saving nihms-1925249.pdf to nihms-1925249 (10).pdf

📝 📨 Sending to Gemini model...
--------------------------------------------------------------------------------
===== Model Response =====
```json
{
  "title": "Developing guidance for feeding tube administration of oral medications",
  "authors": [
    "Mark G. Klang"
  ],
  "journal": "JPEN J Parenter Enteral Nutr.",
  "year": 2023,
  "volume": "47",
  "issue": "4",
  "pages": "519–540",
  "doi": "10.1002/jpen.2490",
  "abstract": "Background: Drug administration through feeding tubes presents many challenges to the healthcare provider. There is little information available on medications than can be delivered safely when crushed and what efforts can be implemented to minimize clogging the feeding tube. Our institution requested a comprehensive examination of all oral medications for the feeding tube route. Materials and Methods: This report is a synopsis of the physical evaluation of 323 different oral medications for their appropr