In [4]:
# Install PyMuPDF with the same interpreter that runs this kernel, so the
# package lands in the environment the notebook actually imports from.
# NOTE(review): the version is not pinned; the recorded output below shows
# 1.21.1 was installed — consider pinning for reproducibility.
import sys
!{sys.executable} -m pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.21.1-cp39-cp39-macosx_10_9_x86_64.whl (12.7 MB)
[K     |████████████████████████████████| 12.7 MB 23.4 MB/s eta 0:00:01
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.21.1


In [29]:
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
import fitz

def _line_to_markup(line):
    """Join a line's Times New Roman spans, wrapping italic spans in <i>...</i>.

    Spans whose font name does not contain "Times New Roman" are dropped.
    """
    pieces = []
    for span in line["spans"]:
        font = span["font"]
        if "Times New Roman" not in font:
            continue
        text = span["text"]
        pieces.append(f"<i>{text}</i>" if "Italic" in font else text)
    return "".join(pieces)


def process_pdf(file_path, start_page, end_page):
    """Extract comma-separated item names from a page range of a PDF.

    Each text line is rebuilt from its Times New Roman spans (italic spans are
    tagged with ``<i>...</i>``), split into items with a regular expression,
    and every item is sorted into one of three buckets:

    * italic items            -> complementary items
    * plain items without "+" -> core items (purely numeric items are skipped)
    * plain items with "+"    -> combined items, later split on " + "

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.
        start_page: First page index to read (used directly as ``doc[index]``).
        end_page: Last page index to read, inclusive.

    Returns:
        A tuple ``(core_items, complementary_items, combined_split_items)`` of
        sets of strings. ``combined_split_items`` holds the components of
        combined items that appear in neither of the other two sets.

    Notes:
        * Page indices are not validated here; they are handed straight to the
          document's indexing.
        * The plain-text alternative of the pattern needs at least two
          characters, so single-character entries are never matched.
        * NOTE(review): a comma-free segment that starts with plain text and
          also contains an italic span is matched as one item and, because it
          contains ``<i>``, is filed under complementary items as a whole.
    """
    # Compiled once instead of being looked up for every line.
    item_pattern = re.compile(r'((?:<i>[^<]*</i>|[^<\s,][^,]*[^<\s,]))')

    core_items = set()
    complementary_items = set()
    combined_items = set()

    # The context manager guarantees the document is closed, even on errors.
    with fitz.open(file_path) as doc:
        for page_num in range(start_page, end_page + 1):
            page = doc[page_num]
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text to inspect.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for item in item_pattern.findall(_line_to_markup(line)):
                        if "<i>" in item:
                            item = item.replace("<i>", "").replace("</i>", "").strip()
                            if item and item != '+':
                                complementary_items.add(item)
                        elif "+" not in item:
                            item = item.strip()
                            if item and not item.isdigit():
                                core_items.add(item)
                        elif item.strip() != '+':
                            combined_items.add(item.strip())

    combined_split_items = set()
    for combined_item in combined_items:
        for part in combined_item.split(" + "):
            # Normalise BEFORE the membership tests: the other sets only hold
            # stripped strings, so an unstripped piece would never match them.
            part = part.strip()
            if not part or part == '+':
                continue
            if part not in core_items and part not in complementary_items:
                combined_split_items.add(part)

    return core_items, complementary_items, combined_split_items

# Source document and the inclusive page window to scan
# (page numbers are assumed to be 0-based indices).
file_path = "EML-20-eng.pdf"
start_page = 58
end_page = 61

core_items, complementary_items, combined_split_items = process_pdf(
    file_path, start_page, end_page
)

# Show each result set under its heading; sep="\n" puts the set on its own line.
print("Core items:", core_items, sep="\n")
print("\nComplementary items:", complementary_items, sep="\n")
print("\nCombined split items:", combined_split_items, sep="\n")

Core items:
{'artemether', 'dextran 70', 'tick-borne encephalitis vaccine', 'suramin sodium', 'penicillamine', 'acetylcysteine', 'measles vaccine', 'oral rehydration salts', 'levonorgestrel-releasing intrauterine system', 'benzathine benzylpenicillin', 'caffeine citrate', 'atropine', 'ofloxacin', 'enalapril', 'amlodipine', 'dexamethasone', 'triclabendazole', 'antitetanus immunoglobulin', 'niclosamide', 'mupirocin', 'mefloquine', 'ulipristal', 'voriconazole', 'ergometrine', 'darunavir', 'carbamazepine', 'proguanil', 'rubella vaccine', 'sulfasalazine', 'entecavir', 'ethanol', 'natamycin', 'phenobarbital', 'ivermectin', 'ondansetron', 'amiloride', 'docusate sodium', 'anti-D immunoglobulin', 'acetic acid', 'hydroxocobalamin', 'melarsoprol', 'levamisole', 'glucose with sodium chloride', 'vancomycin', 'atazanavir', 'ascorbic acid', 'pyridoxine', 'clindamycin', 'furosemide', 'poliomyelitis vaccine', 'ribavirin', 'tropicamide', 'sodium stibogluconate or meglumine antimoniate', 'ampicillin', 'a

In [32]:
# Report how many entries each extracted set holds, one count per line.
for item_set in (core_items, complementary_items, combined_split_items):
    print(len(item_set))

301
105
31
