<a href="https://colab.research.google.com/github/rileycsv/PDF-to-CSV-Infoceutical-Extractor/blob/main/PDFtoCSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# ---------------------------------------------
# 1. Install libraries
# ---------------------------------------------
!pip install PyMuPDF pandas --quiet

import fitz  # PyMuPDF
import pandas as pd
import re
from google.colab import files

# ---------------------------------------------
# 2. Upload your PDF
# ---------------------------------------------
# Prompts for a file through the Colab upload widget and opens the first
# uploaded file as a PyMuPDF document bound to `doc`.
print("⬆️ Upload your Infoceuticals PDF…")
uploaded = files.upload()
if not uploaded:
    # Nothing was selected (or the dialog was dismissed): stop with a clear
    # message instead of the bare StopIteration that next(iter({})) raises.
    raise RuntimeError("No PDF was uploaded; re-run the cell and choose a file.")
PDF_NAME = next(iter(uploaded))
# Open from the uploaded bytes rather than by file name. Colab may store the
# upload under a de-duplicated name (e.g. "name (17).pdf"), so the dictionary
# key is not guaranteed to be the path of the file that was just saved.
doc = fitz.open(stream=uploaded[PDF_NAME], filetype="pdf")

# ---------------------------------------------
# 3. Match Titles with Tags
# ---------------------------------------------
# A title line is any text followed by whitespace and one of these tags in
# square brackets at the end of the line, e.g. "Some Name [Imprint]".
END_TAGS = ("Imprint", "PEMF", "Rife", "Nosode", "Audible")
_TAG_ALTERNATION = "|".join(END_TAGS)
# Group 1 captures the title text, group 2 the tag; matching ignores case.
TITLE_RX = re.compile(
    r"^(.*?)\s+\[(" + _TAG_ALTERNATION + r")\]\s*$",
    re.IGNORECASE,
)

# ---------------------------------------------
# 4. Parse Pages
# ---------------------------------------------
# Builds `entries` as a list of [title, description, url] rows. A title is
# recorded only once for the whole document (tracked in `seen`) and only when
# a link on the same page vertically covers the title line.
entries = []
seen = set()

for page in doc:
    # Link rectangles on this page, keeping only links that carry a URI.
    link_boxes = []
    for link in page.get_links():
        if link.get("uri"):
            link_boxes.append((fitz.Rect(link["from"]), link["uri"]))

    # Flatten the page into (vertical position, text) rows; blocks without
    # a "lines" entry and lines that are empty after stripping are skipped.
    text_rows = []
    for block in page.get_text("dict")["blocks"]:
        if "lines" in block:
            for line in block["lines"]:
                top = line["bbox"][1]
                content = "".join([span["text"] for span in line["spans"]]).strip()
                if content:
                    text_rows.append((top, content))
    text_rows.sort(key=lambda row: row[0])

    total = len(text_rows)
    cursor = 0
    while cursor < total:
        top, content = text_rows[cursor]
        cursor += 1

        hit = TITLE_RX.match(content)
        if hit is None:
            continue

        raw_title = hit.group(1).strip()
        if raw_title in seen:
            continue

        # First link whose vertical extent (with 1 unit of slack on each
        # side) contains the title line's position; "" when there is none.
        url = next(
            (uri for rect, uri in link_boxes if rect.y0 - 1 <= top <= rect.y1 + 1),
            "",
        )
        if not url:
            continue

        # The description is every following row up to the next title line
        # on this page (or the end of the page).
        stop = cursor
        while stop < total and not TITLE_RX.match(text_rows[stop][1]):
            stop += 1
        description = " ".join(row_text for _, row_text in text_rows[cursor:stop]).strip()

        entries.append([raw_title, description, url])
        seen.add(raw_title)
        cursor = stop

# ---------------------------------------------
# 5. Format and Clean DataFrame
# ---------------------------------------------
# One row per collected entry, in the order the entries were appended.
column_names = ["Title", "Description", "URL"]
df = pd.DataFrame(data=entries, columns=column_names)

def sanitize(text):
    """Return *text* made safe to store in a spreadsheet cell.

    Double quotes become single quotes, newlines become spaces, and
    surrounding whitespace is stripped. If the result starts with a character
    a spreadsheet would read as the start of a formula ("=", "+", "-", "@"),
    a tab is prepended so the cell is kept as plain text.
    """
    cleaned = text.replace('"', "'")
    cleaned = cleaned.replace('\n', ' ')
    cleaned = cleaned.strip()
    formula_starters = ("=", "+", "-", "@")
    return "\t" + cleaned if cleaned.startswith(formula_starters) else cleaned

def hyperlink(url):
    """Return a spreadsheet ``=HYPERLINK(...)`` formula for *url*.

    The URL is used both as the link target and as the displayed label.
    Any double quote inside the URL is doubled, which is how a literal quote
    is written inside a spreadsheet formula string; without this, a quote in
    the URL would end the string early and corrupt the formula.
    """
    escaped = url.replace('"', '""')
    return f'=HYPERLINK("{escaped}", "{escaped}")'

# Normalise quotes/newlines and neutralise formula-like prefixes in the
# two free-text columns.
df["Title"] = df["Title"].apply(sanitize)
df["Description"] = df["Description"].apply(sanitize)

# 🔧 Global fix of common corrupted characters
# Maps a mis-decoded character sequence to the character it should be.
corrupted_chars = {
    "Î²": "β",
    "â€™": "'",
    "â€œ": '"',
    "â€": '"',
    "â€“": "–",
    "Ã©": "é",
    "Ã¨": "è",
    "Ã¢": "â",
    "Ã´": "ô",
    "Ãª": "ê",
    "Ã§": "ç",
    "Ã¼": "ü",
    "Ã ": "à"
}

# Apply the longest sequences first. Some keys are prefixes of others ("â€"
# begins every three-character "â€…" key), so replacing in plain dictionary
# order would rewrite the shared prefix first and leave a longer key that
# comes later in the dictionary with nothing to match. The sort is stable, so
# keys of equal length keep their dictionary order.
ordered_fixes = sorted(
    corrupted_chars.items(),
    key=lambda pair: len(pair[0]),
    reverse=True,
)

for col in ["Title", "Description"]:
    for bad, good in ordered_fixes:
        df[col] = df[col].str.replace(bad, good, regex=False)

# Replace the raw URL column with a clickable formula column and keep only
# the three columns that are exported.
df["Link (clickable)"] = df["URL"].apply(hyperlink)
df = df[["Title", "Description", "Link (clickable)"]]

# ---------------------------------------------
# 6. Export to CSV
# ---------------------------------------------
# Writes the table without the index column, quoting every field
# (quoting=1 is the value of csv.QUOTE_ALL), then starts a browser download.
csv_name = "infoceuticals_clickable_clean.csv"
export_options = {"index": False, "quoting": 1, "encoding": 'utf-8'}
df.to_csv(csv_name, **export_options)
print(f"\n✅ CSV exported: {csv_name} with {len(df)} entries")
files.download(csv_name)

⬆️ Upload your Infoceuticals PDF…


Saving all_infoceuticals_alphabetic_en.pdf to all_infoceuticals_alphabetic_en (17).pdf

✅ CSV exported: infoceuticals_clickable_clean.csv with 3669 entries


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>