<a href="https://colab.research.google.com/github/soosysoda/fda_devices_tool/blob/main/text_extractor_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers sentence-transformers torch torchvision torchaudio PyMuPDF beautifulsoup4 requests



In [None]:
import re
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import fitz  # PyMuPDF
import requests
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


In [None]:
# `device` is the index later handed to the transformers pipeline:
# 0 when a CUDA GPU is available, -1 (reported as "CPU") otherwise.
if torch.cuda.is_available():
    device = 0
    gpu_status = f"GPU: {torch.cuda.get_device_name(0)}"
else:
    device = -1
    gpu_status = "CPU"
print(f"🔹 Running on {gpu_status}")

🔹 Running on GPU: Tesla T4


In [None]:
# Extractive question-answering pipeline, created on the device index
# selected earlier in the notebook.
qa_pipeline = pipeline(
    task="question-answering",
    model="deepset/roberta-base-squad2",
    device=device,
)

# Sentence-embedding model; used to embed both the category seed terms and
# the extracted keywords so they can be compared by cosine similarity.
embedder = SentenceTransformer(
    "all-MiniLM-L6-v2",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [None]:
# Seed vocabulary per design category. Dict order matters: it is the row
# order of the final table, and ties between categories go to the earlier one.
CATEGORY_KEYWORDS = {
    "Hardware": [
        "probe", "sensor", "ultrasound", "scanner", "device", "unit", "hardware",
    ],
    "Software Components": [
        "app", "module", "software", "system", "dashboard", "firmware",
    ],
    "AI Models": [
        "ai", "ml", "classifier", "neural", "bert", "deep learning", "model",
    ],
    "Data Pipelines": [
        "FHIR", "ETL", "stream", "cloud", "data ingestion", "API",
    ],
    "User Interface": [
        "UI", "interface", "touchscreen", "display", "user input", "dashboard",
    ],
    "Integration": [
        "EHR", "PACS", "HL7", "connect", "integration", "platform",
    ],
}

# One tensor of seed-term embeddings per category, computed once up front.
CATEGORY_EMBEDDINGS = {
    name: embedder.encode(seed_terms, convert_to_tensor=True)
    for name, seed_terms in CATEGORY_KEYWORDS.items()
}

In [None]:
def extract_pdf_text(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        str: ``page.get_text()`` of each page, in document order, separated
        by a single newline character.
    """
    # The context manager closes the document even when extraction raises;
    # previously the document was opened and never closed.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def get_all_internal_links(start_url, domain, max_pages=5, timeout=6):
    """Breadth-first crawl of one site, returning the text of each page fetched.

    Args:
        start_url: URL the crawl starts from.
        domain: Network location (as produced by ``urlparse(...).netloc``)
            that a link must match exactly in order to be followed.
        max_pages: The crawl stops once this many pages have been collected.
        timeout: Timeout passed to every ``requests.get`` call.

    Returns:
        list[tuple[str, str]]: ``(url, text)`` pairs in crawl order, where
        ``text`` is the page text with every whitespace run collapsed to one
        space. URLs whose request raises ``requests.RequestException``, that
        answer with a status other than 200, or whose ``Content-Type`` does
        not contain ``text/html`` are skipped.
    """
    # `seen` holds every URL ever queued, so a link repeated on many pages is
    # queued (and requested) at most once.
    seen = {start_url}
    to_visit = deque([start_url])  # FIFO queue -> breadth-first order
    found = []

    while to_visit and len(found) < max_pages:
        url = to_visit.popleft()

        try:
            res = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
            if res.status_code != 200 or "text/html" not in res.headers.get("Content-Type", ""):
                continue

            soup = BeautifulSoup(res.text, "html.parser")
            text = soup.get_text(separator=" ", strip=True)
            text = re.sub(r"\s+", " ", text)
            found.append((url, text))

            for a in soup.find_all("a", href=True):
                try:
                    # Drop "#fragment": it addresses a position inside the
                    # same document, not a different page.
                    full_url, _fragment = urldefrag(urljoin(url, a["href"]))
                    same_site = urlparse(full_url).netloc == domain
                except ValueError:
                    # Malformed href (e.g. a broken IPv6 literal); skip the
                    # link instead of aborting the whole crawl.
                    continue
                if (
                    same_site
                    and full_url.startswith(("http://", "https://"))
                    and full_url not in seen
                ):
                    seen.add(full_url)
                    to_visit.append(full_url)

        except requests.RequestException:
            # Best-effort crawl: an unreachable page is skipped.
            continue

    return found


def extract_design_with_bert(
    text,
    question="What is the system design of the medical device?",
    chunk_size=1500,
    min_score=0.05,
):
    """Ask the QA pipeline one question over consecutive slices of ``text``.

    Args:
        text: Document text to search.
        question: Question passed to ``qa_pipeline`` for every chunk.
        chunk_size: Number of characters per chunk. Chunks are plain
            character slices, so a chunk boundary can fall mid-word.
        min_score: An answer is kept only if its ``score`` is strictly
            greater than this value.

    Returns:
        list[str]: The distinct answers, in the order they were first found.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # A dict is used as an ordered set: duplicates collapse, and the result
    # order is the same on every run (list(set(...)) depends on string
    # hashing, which is randomized per interpreter session).
    answers = {}
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        if not chunk.strip():
            # Whitespace-only slice: nothing to extract an answer from.
            continue
        result = qa_pipeline(question=question, context=chunk)
        if result and result["score"] > min_score:
            answers.setdefault(result["answer"], None)
    return list(answers)

In [None]:
def normalize_keywords(raw_texts):
    """Turn free-text snippets into a sorted list of unique lower-case keywords.

    Each truthy entry of ``raw_texts`` is split on runs of commas, semicolons
    and newlines. Every piece has its whitespace runs collapsed to a single
    space and surrounding punctuation/spaces trimmed; pieces of 2 to 119
    characters are kept.

    Args:
        raw_texts: Iterable of strings; falsy entries (``None``, ``""``) are
            ignored.

    Returns:
        list[str]: Sorted, de-duplicated, lower-cased keywords.
    """
    edge_chars = " -–—_,.;:()[]"
    cleaned_pieces = (
        re.sub(r"\s+", " ", piece).strip(edge_chars).strip()
        for snippet in raw_texts
        if snippet
        for piece in re.split(r"[,;\n]+", snippet)
    )
    unique = {piece.lower() for piece in cleaned_pieces if 2 <= len(piece) < 120}
    return sorted(unique)

def classify_keyword(keyword, min_similarity=-1.0):
    """Pick the category whose seed terms are most similar to ``keyword``.

    The keyword is embedded with ``embedder`` and compared, by cosine
    similarity, against every category's seed embeddings in
    ``CATEGORY_EMBEDDINGS``. A category's score is its best seed-term
    similarity.

    Args:
        keyword: Keyword text to classify.
        min_similarity: A category is chosen only if its score is strictly
            greater than this value. With the default of -1.0 the best
            category is picked for any score above -1, so "Uncategorized" is
            returned only when no category scores above -1 at all; pass a
            higher value to send weak matches to "Uncategorized".

    Returns:
        str: The best category name, or "Uncategorized". When categories tie,
        the one that comes first in ``CATEGORY_EMBEDDINGS`` wins.
    """
    kw_vec = embedder.encode(keyword, convert_to_tensor=True)
    best_cat = "Uncategorized"
    best_sim = min_similarity
    for category, seed_vecs in CATEGORY_EMBEDDINGS.items():
        top_score = util.cos_sim(kw_vec, seed_vecs).max().item()
        if top_score > best_sim:
            best_sim, best_cat = top_score, category
    return best_cat

def classify_keywords_bulk(keywords):
    """Group keywords by the category ``classify_keyword`` assigns to each.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        dict[str, list[str]]: One list per key of ``CATEGORY_KEYWORDS`` (in
        that order) followed by "Uncategorized"; each list keeps the input
        order of its keywords. Categories with no keywords map to ``[]``.
    """
    buckets = {name: [] for name in [*CATEGORY_KEYWORDS, "Uncategorized"]}
    for keyword in keywords:
        buckets[classify_keyword(keyword)].append(keyword)
    return buckets

def format_to_markdown_table(classified):
    """Render classified keywords as a two-column Markdown table.

    Args:
        classified: Mapping of category name to a list of keywords.
            Categories missing from the mapping are rendered as empty rows.

    Returns:
        str: Header row, separator row, then one row per key of
        ``CATEGORY_KEYWORDS`` followed by "Uncategorized". Keywords in a row
        are de-duplicated, sorted and comma separated. Every line, including
        the last, ends with a newline.
    """
    rows = [
        "| Category | Keywords (comma separated) |",
        "|----------|------------------------------|",
    ]
    for cat in [*CATEGORY_KEYWORDS, "Uncategorized"]:
        kws = ", ".join(sorted(set(classified.get(cat, []))))
        # A literal pipe inside a keyword would be read as a column
        # separator and break the row, so it is backslash-escaped.
        kws = kws.replace("|", "\\|")
        rows.append(f"| {cat} | {kws} |")
    return "\n".join(rows) + "\n"

In [None]:
# Colab-only: google.colab is not importable outside Google Colab, so the
# import stays in this cell rather than in the shared import cell.
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # A cancelled or empty upload previously failed with a bare IndexError.
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")
# Use the first key of the upload result as the PDF path
# (presumably the stored file name — confirm against the Colab docs).
pdf_path = next(iter(uploaded))

Saving K250005.pdf to K250005 (1).pdf


In [None]:
# Read the uploaded PDF's text, run the QA extraction over it, and echo
# every distinct answer. `pdf_text` and `pdf_design_info` stay available
# to later cells.
print("\n🔍 Extracting from PDF...")
pdf_text = extract_pdf_text(pdf_path)
pdf_design_info = extract_design_with_bert(pdf_text)
for para in pdf_design_info:
    print(f"✅ PDF: {para}")


🔍 Extracting from PDF...
✅ PDF: Medical Image Management And Processing System


In [None]:
# Split the PDF answers into cleaned, lower-cased, de-duplicated keywords.
pdf_keywords = normalize_keywords(pdf_design_info)

In [None]:
# Optionally crawl the device's website and collect keywords from each page.
website_url = input("\n🌐 Enter official device website URL (or press Enter to skip): ").strip()
web_keywords = []
if website_url:
    # Without a scheme, urlparse() leaves netloc empty, so a bare
    # "www.example.com" would fail to download and match no link.
    if "://" not in website_url:
        website_url = "https://" + website_url
    domain = urlparse(website_url).netloc
    webpages = get_all_internal_links(website_url, domain, max_pages=10, timeout=10)
    if not webpages:
        # The crawler skips failed requests silently; report an empty result
        # here so it is not mistaken for "no design info found".
        print(f"⚠️ No pages could be fetched from {website_url}")
    for url, content in webpages:
        design_info = extract_design_with_bert(content)
        for chunk in design_info:
            print(f"✅ {url}: {chunk}")
        web_keywords += normalize_keywords(design_info)


🌐 Enter official device website URL (or press Enter to skip): https://www.ewoosoft.com/


In [None]:
# Merge the PDF and website keywords, dropping duplicates, in sorted order.
all_keywords = sorted(set(pdf_keywords + web_keywords))
print(all_keywords)

['medical image management and processing system']


In [None]:
# Assign every keyword to a category and print the result as a Markdown
# table (plain text output, one row per category).
classified = classify_keywords_bulk(all_keywords)
print("\n📊 Classified Keywords Table:\n")
print(format_to_markdown_table(classified))


📊 Classified Keywords Table:

| Category | Keywords (comma separated) |
|----------|------------------------------|
| Hardware | medical image management and processing system |
| Software Components |  |
| AI Models |  |
| Data Pipelines |  |
| User Interface |  |
| Integration |  |
| Uncategorized |  |

