# Install dependency

In [None]:
!pip install pathlib
!pip install pdfminer.six

# Initialise file paths

In [None]:
import os
from pathlib import Path

# Directory that is scanned for the source books (PDF files); the path is
# relative to the current working directory.
book_path = "./../BookAndDataFiles/books"
# Directory that receives one cleaned plain-text file per processed book.
txt_files_path = "./../BookAndDataFiles/txtfiles"

# Clean PDF-extracted text

In [None]:
import re, unicodedata

def clean_pdf_text(s: str) -> str:
    """Normalise raw PDF-extracted text into clean, unwrapped paragraphs.

    Lines that were wrapped inside a paragraph are joined with single
    spaces, words hyphenated across a line break are re-joined, and
    paragraphs end up separated by exactly one blank line.

    Args:
        s: Raw extracted text. Form feeds are treated as page breaks, and
            lines containing only spaces/tabs are treated as blank lines.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    # 0) Normalise unicode and line endings. NFKC also expands ligatures
    #    (e.g. U+FB01 -> "fi") and turns non-breaking spaces into plain
    #    spaces, so neither needs separate handling below.
    s = unicodedata.normalize("NFKC", s).replace("\r\n", "\n").replace("\r", "\n")

    # 1) Replace tabs with spaces; drop zero-width spaces.
    s = s.replace("\t", " ").replace("\u200b", "")

    # 2) Treat page breaks (form feeds) as paragraph breaks.
    s = s.replace("\x0c", "\n\n")

    # 3) De-hyphenate words split across a newline:
    #    "neuro-\nscience" -> "neuroscience".
    s = re.sub(r"(\w+)-\n(\w+)", r"\1\2", s)

    # 4) Collapse runs of spaces, then trim spaces around every newline.
    #    Doing this BEFORE paragraph detection makes whitespace-only lines
    #    count as blank lines instead of being merged into running text.
    s = re.sub(r" {2,}", " ", s)
    s = re.sub(r" ?\n ?", "\n", s)

    # 5) Normalise multiple blank lines to exactly one paragraph separator.
    s = re.sub(r"\n{3,}", "\n\n", s)

    # 6) Unwrap isolated single newlines (line-wrapped lines) to spaces.
    #    Look-arounds leave the "\n\n" paragraph breaks untouched, so no
    #    placeholder token is needed (a literal "<PARA>" in the input is
    #    therefore preserved as-is).
    s = re.sub(r"(?<!\n)\n(?!\n)", " ", s)

    return s.strip()


# Extract, clean, and write text files

In [None]:
from pdfminer.high_level import extract_text

# Convert every PDF found in book_path into a cleaned text file in
# txt_files_path. The output file is named after the PDF without its
# extension (no ".txt" suffix is added, matching the existing naming).
os.makedirs(txt_files_path, exist_ok=True)  # do not fail on a fresh checkout

# sorted() makes the processing order deterministic across platforms.
for filename in sorted(os.listdir(book_path)):
    if filename == "test.pdf":
        continue
    file_path = (Path(book_path) / filename).resolve()
    # Skip sub-directories and stray non-PDF entries (e.g. .DS_Store) rather
    # than handing them to pdfminer.
    if not file_path.is_file() or Path(filename).suffix.lower() != ".pdf":
        continue
    text = extract_text(file_path)
    clean_text = clean_pdf_text(text)
    # Path.stem drops only the real extension, unlike slicing off 4 characters.
    output_path = Path(txt_files_path) / Path(filename).stem
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(clean_text)