In [1]:
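# Install dependencies if not already installed (run once, in the kernel's environment)
# %pip install PyMuPDF Pillow   # PyMuPDF is the package behind the `fitz` module
# %pip install python-dotenv    # provides `from dotenv import load_dotenv`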

In [2]:
# Load the required libraries
import os
import glob
from dotenv import load_dotenv

In [3]:
# Import custom project libraries
import Libraries.PyMuPDF_Parsing as pdf_parser
import Libraries.Text_Parsing as text_parser

Tesseract server: C:\Program Files\Tesseract-OCR\tesseract.exe
Max Chunk Size set to 5000


In [4]:
# Read environment variables from a .env file so the os.environ lookups below can see them.
# The call's return value is echoed as the cell output.
load_dotenv() # NOTE(review): presumably True means a .env file was found and loaded - confirm against python-dotenv docs

True

In [5]:
# Read work folders from environment variables
DOCUMENT_SOURCES = os.environ.get("DOCUMENT_SOURCES")
DOCUMENT_PRODUCED = os.environ.get("DOCUMENT_PRODUCED")

# Set the file types in scope.
# FILE_TYPES is read as a comma-separated list of glob patterns, e.g. "*.pdf,*.txt".
# Whitespace around each pattern is stripped and empty entries are dropped, so
# "*.pdf, *.txt" and a trailing comma both behave as expected.
FILE_TYPES_STRING = os.environ.get("FILE_TYPES")
FILE_TYPES = [
    pattern.strip()
    for pattern in (FILE_TYPES_STRING or "").split(",")
    if pattern.strip()
]
if not FILE_TYPES:
    # Fall back to a default so FILE_TYPES is always defined; without this an
    # unset/empty FILE_TYPES variable caused a NameError in the print below.
    FILE_TYPES = ["*.pdf", "*.txt"]


print(f"Source folder: {DOCUMENT_SOURCES}")
print(f"Produced folder: {DOCUMENT_PRODUCED}")
print(f"File types in scope: {FILE_TYPES}")

Source folder: c:\Users\Eugene\Downloads\Materials\Sources
Produced folder: c:\Users\Eugene\Downloads\Materials\Produced
File types in scope: ['*.pdf', '*.doc', '*.docx', '*.xls', '*.xlsx', '*.ppt', '*.pptx', '*.txt', '*.srt']


In [6]:
def parse_document_content(full_file_name):
    """
    Parse the content of a document and store text chunks in the Produced folder.

    The parser is selected from the file extension (compared case-insensitively):

    * .pdf, .doc, .docx, .xls, .xlsx, .ppt, .pptx are handed to
      pdf_parser.parse_pymupdf_content
    * .srt, .txt are handed to text_parser.parse_plain_text_content
    * anything else is reported as unsupported and skipped

    Both parsers receive the file path and DOCUMENT_PRODUCED; what they write
    there is defined in the respective project library.

    Args:
        full_file_name: Path of the file to parse.

    Returns:
        None.
    """
    print(f"Parsing file: {full_file_name}")

    # Extract the file extension (including the leading dot), lower-cased
    file_extension = os.path.splitext(full_file_name)[1].lower()

    if file_extension in [".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx"]:
        # PDF and Office formats all go through the PyMuPDF-based parser.
        # NOTE(review): dedicated python-docx / openpyxl / python-pptx readers were
        # sketched here earlier but are not in use - confirm that the PyMuPDF
        # parser really handles every Office extension listed above.
        pdf_parser.parse_pymupdf_content(full_file_name, DOCUMENT_PRODUCED)

    elif file_extension in [".srt", ".txt"]:
        # Plain-text sources (subtitles and text files)
        text_parser.parse_plain_text_content(full_file_name, DOCUMENT_PRODUCED)

    else:
        print(f"Unsupported file type: {full_file_name}")

In [7]:
# Collect every file under the source directory (searched recursively) that
# matches one of the glob patterns in FILE_TYPES, in pattern order.
files = [
    matched_path
    for file_type in FILE_TYPES
    for matched_path in glob.glob(
        os.path.join(DOCUMENT_SOURCES, "**", file_type),
        recursive=True,
    )
]

print("Files found:", len(files))

Files found: 6


In [8]:
# Run the parser over each discovered file, one after another
for source_file in files:
    parse_document_content(source_file)

Parsing file: c:\Users\Eugene\Downloads\Materials\Sources\00_afh_full (Airplane Flying Handbook (FAA-H-8083-3C)).pdf
PyMuPDF parsing module in use.
406  pages
406  pages
Page  1
Content written to  c:\Users\Eugene\Downloads\Materials\Produced\MDBfYWZoX2Z1bGwgKEFpcnBsYW5lIEZseWluZyBIYW5kYm9vayAoRkFBLUgtODA4My0zQykpLnBkZg==\Page_0001_Chunk_001.json
Page 1 has 1 images
Processing image file 
Page 1 Image 1:
FAA-b-3083-3C

Airplane Fiyin
Handbook 3

Bi

Content written to  c:\Users\Eugene\Downloads\Materials\Produced\MDBfYWZoX2Z1bGwgKEFpcnBsYW5lIEZseWluZyBIYW5kYm9vayAoRkFBLUgtODA4My0zQykpLnBkZg==\Page_0001_Image_002_Chunk_001.json
Page  2
Content written to  c:\Users\Eugene\Downloads\Materials\Produced\MDBfYWZoX2Z1bGwgKEFpcnBsYW5lIEZseWluZyBIYW5kYm9vayAoRkFBLUgtODA4My0zQykpLnBkZg==\Page_0002_Chunk_001.json
Page  3
Content written to  c:\Users\Eugene\Downloads\Materials\Produced\MDBfYWZoX2Z1bGwgKEFpcnBsYW5lIEZseWluZyBIYW5kYm9vayAoRkFBLUgtODA4My0zQykpLnBkZg==\Page_0003_Chunk_001.json
Page  4

In [9]:
import re

def split_into_words(input_string: str) -> str:
    """
    Break an identifier-like string into space-separated words.

    Runs of underscores, hyphens and whitespace become a single space, and a
    space is inserted at lower->upper case, letter->digit and digit->letter
    transitions. The result has no leading or trailing whitespace.

    Args:
        input_string: Text to split, e.g. "thisIsA_Test-String2023".

    Returns:
        The words of the input joined by single spaces.
    """
    # Treat underscores, hyphens and any whitespace run as one separator
    text = re.sub(r'[_\-\s]+', ' ', input_string)

    # Zero-width boundaries: aB -> a B, a1 -> a 1, 1a -> 1 a
    text = re.sub(
        r'(?<=[a-z])(?=[A-Z])|(?<=[A-Za-z])(?=\d)|(?<=\d)(?=[A-Za-z])',
        ' ',
        text,
    )

    # Collapse any repeated whitespace and trim both ends
    return re.sub(r'\s+', ' ', text).strip()

# Quick check of the splitter on a string mixing camelCase, underscore/hyphen separators and digits
print(split_into_words("thisIsA_Test-String2023With42Numbers"))

this Is A Test String 2023 With 42 Numbers
