In [32]:
import pandas as pd
import requests
import fitz  # PyMuPDF
import subprocess
import os
from urllib.parse import quote
from playwright.sync_api import sync_playwright
from playwright.async_api import async_playwright
import asyncio

In [33]:
# -------- Load and Parse Input --------
def parse_bpg(bpg_str):
    parts = bpg_str.split("~")
    return {
        "BIN": None if parts[0] == "NULL" else parts[0],
        "PCN": None if parts[1] == "NULL" else parts[1],
        "GroupID": None if parts[2] == "NULL" else parts[2],
    }

# -------- Search Google for PDFs (async) --------
async def search_pdf_links(query, max_results=3):
    links = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(f"https://www.google.com/search?q={quote(query)}")
        await page.wait_for_timeout(3000)
        anchors = await page.query_selector_all("a")
        for a in anchors:
            href = await a.get_attribute("href")
            if href and ".pdf" in href and "http" in href:
                links.append(href)
                if len(links) >= max_results:
                    break
        await browser.close()
    return links

# -------- Download PDF --------
def download_pdf(url):
    try:
        filename = os.path.join("pdfs", url.split("/")[-1].split("?")[0])
        if os.path.exists(filename):
            return filename
        r = requests.get(url, timeout=10)
        with open(filename, "wb") as f:
            f.write(r.content)
        return filename
    except:
        return None

# -------- Extract Text from PDF --------
def extract_text_from_pdf(path):
    text = ""
    try:
        doc = fitz.open(path)
        for page in doc:
            text += page.get_text()
        return text[:6000]  # truncate for LLM
    except:
        return ""

# -------- Ask Ollama to Extract Info --------
def ask_ollama(text):
    prompt = f"""
You are a data extraction expert. From the following PDF content, extract any BIN, PCN, Group ID, and Plan Type info you find. Return only:

BIN | PCN | Group ID | Plan type | Comments

Text:
{text}
"""
    try:
        result = subprocess.run(
            ["ollama", "run", "llama3.1:8b"],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            timeout=60
        )
        return result.stdout.decode()
    except Exception as e:
        return ""

# -------- Parse LLM Output --------
def parse_llm_output(output):
    lines = output.splitlines()
    rows = []
    for line in lines:
        if "|" in line:
            parts = [p.strip() for p in line.split("|")]
            if len(parts) >= 4:
                rows.append(parts[:5] + [""])  # Ensure 6 columns
    return rows

# -------- Main (async) --------
async def main():
    os.makedirs("pdfs", exist_ok=True)

    df = pd.read_excel("BPG.xlsx", header=None)
    df.columns = ["BPG"]

    # Reverse the DataFrame to process from bottom to top
    df = df[::-1].reset_index(drop=True)

    results = []

    for _, row in df.iterrows():
        bpg_data = parse_bpg(row["BPG"])
        query = " ".join([
            f"BIN {bpg_data['BIN']}" if bpg_data['BIN'] else "",
            f"PCN {bpg_data['PCN']}" if bpg_data['PCN'] else "",
            f"Group ID {bpg_data['GroupID']}" if bpg_data['GroupID'] else "",
            "filetype:pdf"
        ]).strip()

        pdf_links = await search_pdf_links(query)
        if not pdf_links:
            results.append(["", "", bpg_data['GroupID'] or "", "", "No PDF found", ""])
            continue

        for link in pdf_links:
            pdf_path = download_pdf(link)
            if not pdf_path:
                continue

            text = extract_text_from_pdf(pdf_path)
            if not text:
                continue

            llm_output = ask_ollama(text)
            rows = parse_llm_output(llm_output)
            for r in rows:
                r[5] = link  # Add PDF link
                results.append(r)
            break  # Stop after first successful PDF

    # Save results to Excel
    out_df = pd.DataFrame(results, columns=["BIN", "PCN", "Group ID", "Plan Type", "Comments", "PDF Link"])
    out_df.to_excel("BPG_output_CT.xlsx", index=False)
    print("✅ Done. Results saved to BPG_output_CT.xlsx")


# Run the async main:
import nest_asyncio
import asyncio

nest_asyncio.apply()

# For Jupyter or interactive shell, just await main()
await main()

Task exception was never retrieved
future: <Task finished name='Task-3' coro=<Connection.run() done, defined at c:\Users\surya\miniconda3\envs\BPGscript\lib\site-packages\playwright\_impl\_connection.py:272> exception=NotImplementedError()>
Traceback (most recent call last):
  File "C:\Users\surya\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3524, in run_code
    await eval(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\surya\AppData\Local\Temp\ipykernel_4948\1366881351.py", line 139, in <module>
    await main()
  File "C:\Users\surya\AppData\Local\Temp\ipykernel_4948\1366881351.py", line 105, in main
    pdf_links = await search_pdf_links(query)
  File "C:\Users\surya\AppData\Local\Temp\ipykernel_4948\1366881351.py", line 13, in search_pdf_links
    async with async_playwright() as p:
  File "c:\Users\surya\miniconda3\envs\BPGscript\lib\site-packages\playwright\async_api\_context_manager.py", line 46, in __aenter__
    playwrig

NotImplementedError: 