<a href="https://colab.research.google.com/github/sankalpj30/article_metadata_extraction/blob/api-module-data-extraction/Gemini_Article_MetadataExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install --upgrade fastapi localtunnel uvicorn pyngrok opik google-genai litellm pydantic  python-multipart


[31mERROR: Could not find a version that satisfies the requirement localtunnel (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for localtunnel[0m[31m
[0m

SyntaxError: invalid syntax (<ipython-input-11-c28d41661604>, line 1)

In [2]:
import getpass
import io
import json
import logging
import os
import pathlib
import re
import tempfile
from datetime import date
from typing import List, Optional

import nest_asyncio
import opik
from colabcode import ColabCode
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from google import genai
from google.colab import files
from google.genai import client, types
from opik import track
from opik.integrations.genai import track_genai
from pydantic import BaseModel, Field, HttpUrl, ValidationError
# Allow nested event loops for uvicorn in Colab
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio loop — confirm; the launch cell calls apply() again before starting uvicorn.
nest_asyncio.apply()

[output redacted: credential-like string removed from saved cell output]

In [3]:

# Configure the Opik SDK once, before any @track-decorated function runs.
# NOTE(review): use_local=False presumably selects the hosted Opik service rather
# than a local deployment — confirm against the Opik documentation.
opik.configure(use_local=False)

OPIK: Opik is already configured. You can check the settings by viewing the config file at /root/.opik.config


In [4]:
def log(msg):
    """Print *msg* with a leading blank line and an 80-dash rule underneath."""
    print(f"\n📝 {msg}\n{'-'*80}")

# Never hard-code the API key in the notebook: take it from the environment and
# fall back to an interactive, non-echoing prompt so it is not saved with the file.
# NOTE: any key that was ever committed with this notebook should be revoked and rotated.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    _gemini_api_key = getpass.getpass("Gemini API key: ")

# Rebinds the name `client` (also imported from google.genai by the import cell)
# to the configured Gemini client instance used by the rest of the notebook.
client = genai.Client(api_key=_gemini_api_key)

os.environ["OPIK_PROJECT_NAME"] = "gemini-article-metadata-extraction-demo"


# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

Define Pydantic Models

In [5]:
class Author(BaseModel):
    """A single author of the article, stored as one display string."""
    # Name exactly as supplied; extract_structured_metadata builds one Author
    # per entry of the JSON "authors" list.
    full_name: str

class Reference(BaseModel):
    """One entry of the article's reference list.

    The extraction prompt tells the model to use null for fields it cannot find,
    and real reference lists routinely omit titles, journals or years. Every
    field except ``authors`` is therefore optional, so that a single incomplete
    citation does not fail validation of the whole document.
    """
    authors: List[str] = Field(..., description="List of authors in 'Last F.M.' format")
    title: Optional[str] = Field(None, description="Title of the referenced work")
    journal: Optional[str] = Field(None, description="Journal or conference name")
    year: Optional[int] = Field(None, description="Year of publication")
    volume: Optional[str] = Field(None, description="Volume number")
    issue: Optional[str] = Field(None, description="Issue number")
    pages: Optional[str] = Field(None, description="Page range")
    doi: Optional[str] = Field(None, description="Digital Object Identifier")

class CitationMetadata(BaseModel):
    """Bibliographic metadata of one article; also the /extract-metadata response model.

    Only ``title`` and ``authors`` are required; every other field defaults to None.
    """
    title: str
    authors: List[Author]
    # Publication details; volume/issue/pages are kept as strings, year as an int.
    journal: Optional[str] = None
    year: Optional[int] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    pages: Optional[str] = None
    doi: Optional[str] = None
    abstract: Optional[str] = None
    keywords: Optional[List[str]] = None
    # Works cited by the article, each validated as a Reference.
    references: Optional[List[Reference]] = None

JSON Cleaner Utility

In [6]:
def clean_and_load_json(text: str) -> dict:
    """Parse JSON from a model reply, tolerating a Markdown code fence around it.

    The text is first parsed as-is. Only if that fails is a fenced block looked
    for: everything between the first fence and the last fence (or the end of
    the text when the closing fence is missing) is parsed, after dropping an
    optional ``json`` language tag. Prose before or after the fence is ignored.

    Args:
        text: Raw reply text.

    Returns:
        The decoded JSON value (a dict for the object the prompt asks for).

    Raises:
        json.JSONDecodeError: if no parseable JSON is found.
    """
    text = text.strip()
    try:
        # Plain JSON is the expected case; trying it first also keeps backticks
        # that occur inside string values from being mistaken for a fence.
        return json.loads(text)
    except json.JSONDecodeError:
        opening = text.find("```")
        if opening == -1:
            raise

    body = text[opening + 3:]
    closing = body.rfind("```")
    if closing != -1:
        body = body[:closing]
    # Remove the language tag as a prefix. str.lstrip("```json") would strip a
    # *character set* and can eat the start of the payload (e.g. "null").
    body = re.sub(r"\Ajson", "", body, count=1, flags=re.IGNORECASE)
    return json.loads(body)

Metadata Extractor Function

In [7]:
def _is_missing_placeholder(value) -> bool:
    """Return True when *value* is the prompt's "Not Found" marker (any casing)."""
    return isinstance(value, str) and value.strip().lower() == "not found"


def _null_out_placeholders(record: dict, fields) -> None:
    """Replace "Not Found" markers with None, in place, for the given non-text fields."""
    for field in fields:
        if _is_missing_placeholder(record.get(field)):
            record[field] = None


@track
def extract_structured_metadata(response_text: str) -> dict:
    """Parse the model's reply and validate it against ``CitationMetadata``.

    Args:
        response_text: Raw text returned by the model; expected to hold a single
            JSON object, optionally wrapped in a Markdown code fence.

    Returns:
        The validated metadata as a plain dict (``CitationMetadata.model_dump()``).

    Raises:
        Exception: whatever ``clean_and_load_json`` raises for unparseable text
            (logged, then re-raised unchanged).
        RuntimeError: if the JSON is not an object or fails schema validation.
    """
    try:
        data = clean_and_load_json(response_text)
    except Exception as e:
        log(f"❌ Failed to extract JSON: {e}")
        raise

    log("✅ Raw JSON extracted:\n" + json.dumps(data, indent=2))

    if not isinstance(data, dict):
        # A top-level list/string/number has no fields to validate.
        log(f"❌ Validation error: expected a JSON object, got {type(data).__name__}")
        raise RuntimeError("Metadata validation failed")

    # The prompt allows "Not Found" for missing fields, but that string can never
    # validate as an int or a list, so map it to None for those fields.
    _null_out_placeholders(data, ("year", "keywords", "references"))
    references = data.get("references")
    if isinstance(references, list):
        for ref in references:
            if isinstance(ref, dict):
                _null_out_placeholders(ref, ("year",))

    # "authors" may be absent, null, or a bare string; iterating a string would
    # otherwise yield one "author" per character.
    raw_authors = data.get("authors") or []
    if isinstance(raw_authors, str):
        raw_authors = [] if _is_missing_placeholder(raw_authors) else [raw_authors]

    try:
        data["authors"] = [Author(full_name=name) for name in raw_authors]
        # References are validated here as part of CitationMetadata; no separate pass is needed.
        metadata = CitationMetadata(**data)
        log("✅ Metadata validated successfully")
        return metadata.model_dump()
    except (ValidationError, TypeError) as ve:
        # TypeError covers a non-iterable "authors" value such as a number.
        log(f"❌ Validation error: {ve}")
        raise RuntimeError("Metadata validation failed") from ve


Gemini Metadata Extraction Function (`summarize_pdf`)

In [13]:
@track
def summarize_pdf(filepath: str, model: str = "gemini-1.5-flash") -> str:
    """Send a PDF to Gemini and return the model's raw text reply.

    The reply is expected to be a JSON object describing the article's
    bibliographic metadata; it is returned unparsed so that the caller can
    validate it (see ``extract_structured_metadata``).

    Args:
        filepath: Path of the PDF on local disk.
        model: Gemini model name passed to ``generate_content``.

    Returns:
        The text of the model response.

    Raises:
        OSError: if the file cannot be read.
        RuntimeError: if the model returns no text.
    """
    path = pathlib.Path(filepath)
    size = path.stat().st_size

    # Files below this size are sent inline with the request; larger ones go
    # through the Files API. NOTE(review): 20 MiB presumably mirrors the inline
    # request limit of the Gemini API — confirm against current limits.
    inline_limit_bytes = 20 * 1024 * 1024

    # "year" is validated as an integer and "keywords"/"references" as lists, so
    # the placeholder string is only allowed for text fields.
    instruction = """
Respond *ONLY* with a single JSON object (no markdown, no commentary):
{
  "title": "...",
  "authors": ["Author One", "Author Two"],
  "journal": "...",
  "year": 2023,
  "volume": "...",
  "issue": "...",
  "pages": "...",
  "doi": "...",
  "abstract": "...",
  "keywords": ["keyword1", "keyword2"],
  "references": [
  {
    "authors": ["Last, F.M.", "Last, F.M."],
    "title": "...",
    "journal": "...",
    "year": 2023,
    "volume": "...",
    "issue": "...",
    "pages": "...",
    "doi": "..."
  }
]
}
If a text field is missing, use "Not Found" or null. For "year" and for list fields, use null (never a string) when the value is missing.
"""

    if size < inline_limit_bytes:
        contents = [
            types.Part.from_bytes(data=path.read_bytes(), mime_type="application/pdf"),
            instruction
        ]
    else:
        # Upload straight from disk instead of buffering the whole file in memory.
        uploaded_file = client.files.upload(
            file=path,
            config={"mime_type": "application/pdf"}
        )
        contents = [uploaded_file, instruction]

    log("📨 Sending to Gemini model...")
    response = client.models.generate_content(
        model=model,
        contents=contents
    )
    raw = response.text
    if not raw:
        # Without this guard the failure only shows up later as an
        # AttributeError inside the JSON cleaner.
        raise RuntimeError("Gemini returned no text (empty or blocked response)")
    print("===== Model Response =====")
    print(raw)
    return raw


FastAPI App Definition

In [14]:
# ASGI application object that the route decorators below attach to.
app = FastAPI(
    title="PDF Metadata Extractor",
    version="1.0"
)
# CORS is fully open: every origin, method and header is allowed.
# NOTE(review): fine for a throwaway demo tunnel; restrict allow_origins before
# exposing this service to real users or data.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/health")
def health_check():
    """Liveness endpoint: always answers with the fixed payload ``{"status": "ok"}``."""
    return dict(status="ok")

@app.post("/extract-metadata", response_model=CitationMetadata)
async def extract_metadata(file: UploadFile = File(...)):
    """Extract bibliographic metadata from an uploaded PDF.

    Args:
        file: Multipart upload; its filename must end in ``.pdf`` (case-insensitive).

    Returns:
        The validated metadata dict, serialised through ``CitationMetadata``.

    Raises:
        HTTPException: 400 for a non-PDF or empty upload; 502 when the model's
            reply cannot be parsed or validated.
    """
    # The filename comes from the client and may be missing altogether.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files supported")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Never build the path from the client-supplied name: "../" segments would
    # escape /tmp, and two uploads with the same name would overwrite each other.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file.write(content)
            tmp_path = tmp_file.name

        # NOTE(review): summarize_pdf is synchronous, so this call blocks the
        # event loop for the duration of the model request.
        raw = summarize_pdf(tmp_path)
        return extract_structured_metadata(raw)
    except (ValueError, RuntimeError) as exc:
        # Unparseable or invalid model output is an upstream failure, not a
        # server crash; the helpers have already logged the details.
        raise HTTPException(
            status_code=502,
            detail="Model response could not be converted into citation metadata",
        ) from exc
    finally:
        # Always remove the temporary copy, including on failure.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

Launch FastAPI with Uvicorn (background thread) and expose it via localtunnel

In [19]:
import nest_asyncio, threading, uvicorn

nest_asyncio.apply()  # allow uvicorn's event loop to run alongside the notebook's own loop

def run_server():
    """Serve ``app`` on 0.0.0.0:8000; blocks the calling thread while the server runs."""
    uvicorn.run(app, host="0.0.0.0", port=8000)

# A second server cannot bind port 8000 while the first one is alive, so reuse
# the running thread when this cell is executed again.
_previous_thread = globals().get("server_thread")
if _previous_thread is not None and _previous_thread.is_alive():
    print("ℹ️ Uvicorn already running on port 8000")
else:
    # daemon=True so the server thread never keeps the kernel alive on shutdown.
    server_thread = threading.Thread(target=run_server, daemon=True)
    server_thread.start()
    print("✅ Uvicorn running on port 8000")

✅ Uvicorn running on port 8000


In [20]:
# Install localtunnel if you haven't already
!npx localtunnel --port 8000

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20Gy

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0Kyour url is: https://clear-trains-float.loca.lt
yy
^C


In [None]:
# Manual end-to-end check without the HTTP layer: upload one PDF through the
# Colab file picker and run it through the same two-step pipeline.
uploaded = files.upload()

if not uploaded:
    # Cancelling the picker yields an empty mapping; indexing it would raise IndexError.
    print("❌ No file uploaded; nothing to process.")
else:
    # Only the first uploaded file is processed.
    pdf_path = next(iter(uploaded))

    try:
        raw_response = summarize_pdf(pdf_path)
        structured = extract_structured_metadata(raw_response)
        print("===== Bibliographic Metadata =====")
        log("✅ Final structured metadata:")
        print(json.dumps(structured, indent=2))
    except Exception as e:
        # Deliberately broad: this is an interactive demo cell, so report and carry on.
        print("❌ Error during summarization:", str(e))