In [None]:
import json
import os
import re
import subprocess
from difflib import SequenceMatcher

import pandas as pd
import pytesseract
import requests
from pdf2image import convert_from_path
from scholarly import scholarly

In [None]:
#input PDF, then the two spreadsheets written by the pipeline:
#the raw parsed references and the enriched references
PDF_PATH = "/References.pdf"
OCR_OUTPUT_PATH = "/ocr_extracted.xlsx"
VERIFIED_OUTPUT_PATH = "/verified_data.xlsx"

In [3]:
#default Windows install location of the Tesseract binary
_WINDOWS_TESSERACT = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

#only override pytesseract's command when that binary actually exists;
#otherwise keep pytesseract's default ("tesseract", resolved through PATH)
#so the notebook also runs on machines without this exact install path
if os.path.isfile(_WINDOWS_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT

In [None]:
#in-memory caches consulted by the enrichment helpers before they go to the network
ENRICH_CACHE = dict(
    author={},        #author lookup cache
    title_to_doi={},  #inferred DOIs cache
)

#base endpoint of the CrossRef works API
CROSSREF_URL = "https://api.crossref.org/works"

In [5]:
def extract_text_with_ocr(pdf_path):
    """
    Renders every page of the PDF at 300 dpi and runs Tesseract OCR on it.
    Returns the page texts concatenated in page order, each followed by a newline.
    """
    pages = convert_from_path(pdf_path, dpi=300)
    return "".join(pytesseract.image_to_string(page) + "\n" for page in pages)

In [6]:
def call_ollama(prompt, model="llama3"):
    """
    Sends prompt to Ollama CLI and returns raw string output.

    Args:
        prompt: text piped to the `ollama run` process on stdin.
        model: name of the Ollama model to run.

    Returns:
        The process's stdout decoded as UTF-8. Bytes that are not valid UTF-8
        are replaced instead of raising. The result may be empty; warnings are
        printed for a non-zero exit code and for empty output.
    """
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt.encode("utf-8"),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    #errors="replace": one malformed byte must not discard the whole answer
    output = result.stdout.decode("utf-8", errors="replace")

    #stderr is captured above, so surface it when the command failed
    if result.returncode != 0:
        err = (result.stderr or b"").decode("utf-8", errors="replace").strip()
        print(f"Warning: Ollama exited with code {result.returncode}: {err}")

    if not output.strip():
        print("Warning: Ollama returned empty output")
    return output

In [None]:
def extract_references_with_llm(ocr_text, n_refs=1, model="llama3"):
    """
    Extracts the first `n_refs` references from OCR text using LLaMA via Ollama.
    Handles extra explanatory text in LLaMA output and ensures valid JSON.

    Args:
        ocr_text: text to embed in the prompt.
        n_refs: number of references the prompt asks the model to extract.
        model: name of the Ollama model to run.

    Returns a list of dictionaries with keys:
    - "reference"
    - "title"
    - "first_author"
    - "last_author"
    Missing or null values are returned as empty strings, and array items
    that are not JSON objects are dropped. When no JSON array can be decoded
    the raw response is written to llama_raw_output.txt and [] is returned.
    """
    prompt = f"""
You are given the text of a PDF containing academic references. Extract the first {n_refs} references.

For each reference, output a JSON object with keys:
- "reference": the full reference text as it appears in the PDF
- "title": the title of the paper/book
- "first_author": the first author
- "last_author": the last author

Important:
- Return ONLY a JSON array.
- Do NOT include any explanation, notes, or extra text.
- DO NOT start the array with any other text.
- Ensure all strings use double quotes (") for keys and values.
- If you cannot extract some fields, leave them as empty strings.

PDF text:
{ocr_text}
"""

    #calling Ollama CLI
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt.encode("utf-8"),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    response = result.stdout.decode("utf-8", errors="replace")

    #JSON extraction
    refs = _first_json_array(response)
    if refs is None:
        #same two diagnostics as before: brackets present but undecodable vs. no brackets at all
        if re.search(r'\[.*\]', response, re.DOTALL):
            print("Failed to decode JSON. Saving raw response to llama_raw_output.txt")
        else:
            print("No JSON array found. Saving raw response to llama_raw_output.txt")
        with open("llama_raw_output.txt", "w", encoding="utf-8") as f:
            f.write(response)
        refs = []

    #ensuring all expected keys exist (and are not null) on every object entry
    cleaned = []
    for r in refs:
        if not isinstance(r, dict):
            #a bare string/number in the array cannot carry the expected keys
            continue
        for key in ["reference", "title", "first_author", "last_author"]:
            if r.get(key) is None:
                r[key] = ""
        cleaned.append(r)

    return cleaned


def _first_json_array(text):
    """
    Returns the first JSON array in `text` that contains at least one object.
    Falls back to the first empty array found, or None when nothing decodes.
    """
    decoder = json.JSONDecoder()
    empty_array = None
    #try every "[" as a possible start: a greedy first-"["-to-last-"]" match
    #breaks as soon as the model adds bracketed text around the array
    for opener in re.finditer(r"\[", text):
        try:
            value, _ = decoder.raw_decode(text, opener.start())
        except json.JSONDecodeError:
            continue
        if any(isinstance(item, dict) for item in value):
            return value
        if not value and empty_array is None:
            #an empty array is a legitimate "no references" answer
            empty_array = value
    return empty_array

In [None]:
#DuckDuckGo search
def search_duckduckgo(query):
    """
    Queries the DuckDuckGo JSON API and returns the stripped "AbstractText",
    or the "Heading" when the abstract is empty. Best effort: any error
    (network, bad JSON, unexpected payload) yields an empty string.
    """
    endpoint = "https://api.duckduckgo.com/"
    try:
        payload = requests.get(
            endpoint,
            params={"q": query, "format": "json"},
            timeout=8,
        ).json()
        text = payload.get("AbstractText") or payload.get("Heading") or ""
        return text.strip()
    except Exception:
        return ""

In [None]:
#fuzzy match author name
def similar(a, b):
    """Case-insensitive difflib similarity ratio of two strings (0.0 to 1.0)."""
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

In [None]:
#cached lookup wrapper
def cached_lookup(key, cache, fn, *args):
    """
    Returns cache[key], computing it with fn(*args) and storing it first
    when the key is not in the cache yet.
    """
    if key not in cache:
        cache[key] = fn(*args)
    return cache[key]

In [None]:
#inferring DOI from title
def infer_doi(title):
    """
    Looks up the DOI of the top CrossRef bibliographic match for `title`.

    Args:
        title: title text to search for.

    Returns:
        The DOI string of the first CrossRef result, or "" when the title is
        blank, nothing is found, or the request fails.

    Successful lookups (including "no result") are stored in
    ENRICH_CACHE["title_to_doi"]; failed requests are not, so they can be retried.
    """
    #a blank query would still return some arbitrary work from CrossRef,
    #so never send it and never treat its top hit as this reference's DOI
    if not isinstance(title, str) or not title.strip():
        return ""

    cache = ENRICH_CACHE["title_to_doi"]
    if title in cache:
        return cache[title]

    try:
        resp = requests.get(
            CROSSREF_URL,
            params={"query.bibliographic": title, "rows": 1},
            timeout=10
        )
        resp.raise_for_status()
        items = resp.json()["message"]["items"]
        doi = (items[0].get("DOI") or "") if items else ""
    except Exception:
        #network/HTTP/parse failure: do not cache, otherwise one transient
        #error would hide this title's DOI for the rest of the session
        return ""

    #NOTE(review): the top hit is accepted without comparing its title to the
    #query, so a wrong DOI is possible for noisy OCR titles
    cache[title] = doi
    return doi

In [None]:
#main enrichment
def enrich_data_full_with_ddg(extracted_refs):
    """
    Adds affiliation and email fields for the first and last author of each reference.

    Sources are consulted in decreasing order of trust, and a lower-trust
    source only fills fields that are still empty:
    Google Scholar -> CrossRef -> DuckDuckGo -> LLaMA (via Ollama).
    Author lookups are skipped for empty author names.

    Args:
        extracted_refs: list of dicts with "title", "first_author" and
            "last_author" entries (missing or null entries count as empty).

    Returns:
        A list holding the same dict objects, each updated in place with
        "first_author_affiliation", "first_author_email",
        "last_author_affiliation" and "last_author_email".
    """
    author_cache = ENRICH_CACHE["author"]
    enriched_rows = []

    for r in extracted_refs:
        fa = _as_text(r.get("first_author"))
        la = _as_text(r.get("last_author"))
        title = _as_text(r.get("title"))
        authors = (("fa", fa), ("la", la))

        #default empty
        best = {"fa_aff": "", "fa_email": "", "la_aff": "", "la_email": ""}

        #Google Scholar (highest trust)
        for role, name in authors:
            if not name:
                continue
            aff, email = cached_lookup(f"scholar_{name}", author_cache, _lookup_author_scholar, name)
            if aff:
                best[f"{role}_aff"] = aff
            if email:
                best[f"{role}_email"] = email

        #CrossRef (mid trust); a blank title cannot identify a work
        doi = infer_doi(title) if title else ""
        if doi:
            crossref_aff = _crossref_affiliations(doi, fa, la)
            for role, _name in authors:
                if crossref_aff[role] and not best[f"{role}_aff"]:
                    best[f"{role}_aff"] = crossref_aff[role]

        #DuckDuckGo (low trust; sanitized)
        for role, name in authors:
            if not name:
                continue
            if best[f"{role}_aff"] and best[f"{role}_email"]:
                continue
            ddg_aff, ddg_email = cached_lookup(f"ddg_{name}", author_cache, _lookup_ddg, name)
            if ddg_aff and not best[f"{role}_aff"]:
                best[f"{role}_aff"] = ddg_aff
            if ddg_email and not best[f"{role}_email"]:
                best[f"{role}_email"] = ddg_email

        #LLaMA inference fallback (lowest trust)
        if "" in best.values():
            enrich = _llm_infer_contacts(title, fa, la)
            llm_fields = {
                "fa_aff": "first_author_affiliation",
                "fa_email": "first_author_email",
                "la_aff": "last_author_affiliation",
                "la_email": "last_author_email",
            }
            for slot, field in llm_fields.items():
                if not best[slot]:
                    best[slot] = _as_text(enrich.get(field))

        #applying final values
        r.update({
            "first_author_affiliation": best["fa_aff"],
            "first_author_email": best["fa_email"],
            "last_author_affiliation": best["la_aff"],
            "last_author_email": best["la_email"]
        })

        enriched_rows.append(r)

    return enriched_rows


def _as_text(value):
    """Normalises an extracted field to a stripped string (None becomes "")."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    return str(value).strip()


def _surname(name):
    """Returns the last whitespace-separated token of `name`, or "" for a blank name."""
    parts = name.split()
    return parts[-1] if parts else ""


def _author_field(author, field):
    """Reads `field` from a scholarly author record given as a mapping or as an object."""
    if isinstance(author, dict):
        value = author.get(field, "")
    else:
        value = getattr(author, field, "")
    return value or ""


def _lookup_author_scholar(name):
    """Returns (affiliation, email) of the first Google Scholar author hit, or ("", "")."""
    try:
        author = next(scholarly.search_author(name))
        #NOTE(review): assumes fill() returns the filled record (dict-style
        #scholarly API) or None when it fills in place - confirm for the installed version
        filled = scholarly.fill(author)
        if filled is not None:
            author = filled
        return _author_field(author, "affiliation"), _author_field(author, "email")
    except Exception:
        #best effort: no hit (StopIteration), blocking or network errors give no data
        return "", ""


def _crossref_affiliations(doi, fa, la):
    """
    Fetches the CrossRef record of `doi` and returns {"fa": ..., "la": ...} with
    the affiliation of the first listed author whose family name resembles each surname.
    """
    found = {"fa": "", "la": ""}
    surnames = {"fa": _surname(fa), "la": _surname(la)}
    try:
        resp = requests.get(f"{CROSSREF_URL}/{doi}", timeout=10)
        listed = resp.json()["message"].get("author", [])
    except Exception:
        return found
    if not isinstance(listed, list):
        return found

    for a in listed:
        if not isinstance(a, dict):
            continue
        fam = a.get("family") or ""
        affs = a.get("affiliation") or []
        aff = ""
        if isinstance(affs, list) and affs and isinstance(affs[0], dict):
            aff = affs[0].get("name") or ""
        if not fam or not aff:
            continue
        for role, surname in surnames.items():
            #an empty surname must not match; one author's problem must not block the other
            if surname and not found[role] and similar(fam, surname) > 0.7:
                found[role] = aff
    return found


def _lookup_ddg(name):
    """Returns (snippet, email) from a DuckDuckGo query about `name`; the snippet stands in for the affiliation."""
    snippet = search_duckduckgo(f"{name} researcher affiliation university email")
    if not snippet:
        return "", ""

    snippet_clean = snippet.replace("\n", " ").strip()

    email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", snippet_clean)
    email = email_match.group(0) if email_match else ""

    return snippet_clean, email


def _llm_infer_contacts(title, fa, la):
    """Asks llama3 via Ollama to guess affiliations/emails; returns the parsed JSON object or {}."""
    prompt = f"""
Infer affiliations/emails for:
Title: "{title}"
Authors: "{fa}", "{la}"
Return ONLY JSON:
{{
"first_author_affiliation": "",
"first_author_email": "",
"last_author_affiliation": "",
"last_author_email": ""
}}
"""
    try:
        out = subprocess.run(
            ["ollama", "run", "llama3"],
            input=prompt.encode(),
            stdout=subprocess.PIPE
        )
        text = out.stdout.decode()
        match = re.search(r"\{.*\}", text, re.DOTALL)
        enrich = json.loads(match.group()) if match else {}
    except Exception:
        return {}
    return enrich if isinstance(enrich, dict) else {}

In [None]:
print("Step 1: Extracting text from PDF using OCR")
ocr_text = extract_text_with_ocr(PDF_PATH)

#normalising OCR text: straighten curly single quotes, then turn every
#newline that is followed by a character other than "[" into a space
#(wrapped lines are merged; lines starting with "[" stay separate)
ocr_text_clean = ocr_text.replace("’", "'").replace("‘", "'")
ocr_text_clean = re.sub(r"\n(?=[^\[])", " ", ocr_text_clean)

#splitting into references: each match starts at a "[...]" marker and
#runs up to the next marker or the end of the text
refs_matches = re.findall(r'\[[^\]]+?\].*?(?=\[[^\]]+?\]|$)', ocr_text_clean, re.DOTALL)

print(f"Found {len(refs_matches)} references in PDF.")

#one LLM call per reference
print("Step 2: Parsing references with LLaMA via Ollama")

extracted_refs = []
for i, ref in enumerate(refs_matches):
    print(f"  → Parsing reference {i+1}/{len(refs_matches)}...")
    parsed = extract_references_with_llm(ref, n_refs=1)
    if not parsed:
        continue
    extracted_refs.extend(parsed)

#first Excel: raw parsed references, written before the enrichment step
#adds its fields to the same dicts
df_ocr = pd.DataFrame(extracted_refs)
df_ocr.to_excel(OCR_OUTPUT_PATH, index=False)
print(f"Saved first Excel → {OCR_OUTPUT_PATH}")

#web enrichment
print("Step 3: Enriching with affiliations and emails")
verified_refs = enrich_data_full_with_ddg(extracted_refs)

#second Excel: enriched references
df_verified = pd.DataFrame(verified_refs)
df_verified.to_excel(VERIFIED_OUTPUT_PATH, index=False)
print(f"Saved second Excel → {VERIFIED_OUTPUT_PATH}")

print("\nDONE - two separate Excel files generated successfully.")

Step 1: Extracting text from PDF using OCR
Found 614 references in PDF.
Step 2: Parsing references with LLaMA via Ollama
  → Parsing reference 1/614...
Failed to decode JSON. Saving raw response to llama_raw_output.txt
  → Parsing reference 2/614...
  → Parsing reference 3/614...
  → Parsing reference 4/614...
  → Parsing reference 5/614...
  → Parsing reference 6/614...
  → Parsing reference 7/614...
  → Parsing reference 8/614...
  → Parsing reference 9/614...
Failed to decode JSON. Saving raw response to llama_raw_output.txt
  → Parsing reference 10/614...
  → Parsing reference 11/614...
  → Parsing reference 12/614...
  → Parsing reference 13/614...
  → Parsing reference 14/614...
  → Parsing reference 15/614...
Failed to decode JSON. Saving raw response to llama_raw_output.txt
  → Parsing reference 16/614...
  → Parsing reference 17/614...
  → Parsing reference 18/614...
  → Parsing reference 19/614...
  → Parsing reference 20/614...
  → Parsing reference 21/614...
  → Parsing ref