In [None]:
# Install dependencies if not already installed
# pip install PyMuPDF Pillow  # PyMuPDF = Fitz
# pip install python-dotenv  # provides the `dotenv` module imported below

In [None]:
# Load the required libraries
import os
import glob
from dotenv import load_dotenv

In [None]:
# Import custom project libraries
import Libraries.PyMuPDF_Parsing as pdf_parser
import Libraries.Text_Parsing as text_parser

In [None]:
# Read environment variables from a .env file.
# The configuration lookups later in this notebook go through os.environ.get(),
# so this call has to run before them.
load_dotenv() # Load variables from .env file

In [None]:
# Read work folders from environment variables
# (os.environ.get returns None when a variable is not set).
DOCUMENT_SOURCES = os.environ.get("DOCUMENT_SOURCES")
DOCUMENT_PRODUCED = os.environ.get("DOCUMENT_PRODUCED")

# Set the file types in scope.
# FILE_TYPES is a comma-separated list of glob patterns, e.g. "*.pdf,*.txt".
# Whitespace around each entry and empty entries are ignored.
FILE_TYPES_STRING = os.environ.get("FILE_TYPES")
FILE_TYPES = [
    pattern.strip()
    for pattern in (FILE_TYPES_STRING or "").split(",")
    if pattern.strip()
]
if not FILE_TYPES:
    # Variable missing or empty: fall back to a default so FILE_TYPES is
    # always defined for the print below and for the file discovery step.
    FILE_TYPES = ["*.pdf", "*.txt"]


print(f"Source folder: {DOCUMENT_SOURCES}")
print(f"Produced folder: {DOCUMENT_PRODUCED}")
print(f"File types in scope: {FILE_TYPES}")

In [None]:
# Extensions routed to the PyMuPDF-based parser.
_PYMUPDF_EXTENSIONS = frozenset(
    {".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx"}
)
# Extensions routed to the plain-text parser.
_PLAIN_TEXT_EXTENSIONS = frozenset({".srt", ".txt"})


def parse_document_content(full_file_name):
    """Parse the content of a document and store text chunks in the produced folder.

    The parser is chosen from the file extension, compared case-insensitively:
    PDF and Office extensions go to ``pdf_parser.parse_pymupdf_content`` and
    ``.srt`` / ``.txt`` go to ``text_parser.parse_plain_text_content``. Both
    receive ``DOCUMENT_PRODUCED`` as their second argument. Any other
    extension is reported on stdout and skipped.

    Args:
        full_file_name: Path of the document to parse.

    Returns:
        None.
    """
    print(f"Parsing file: {full_file_name}")

    # Extract the file extension, lower-cased so ".PDF" and ".pdf" match alike.
    file_extension = os.path.splitext(full_file_name)[1].lower()

    # NOTE: Office formats are currently sent to the PyMuPDF parser. Dedicated
    # readers (python-docx, openpyxl, python-pptx) are not enabled.
    if file_extension in _PYMUPDF_EXTENSIONS:
        pdf_parser.parse_pymupdf_content(full_file_name, DOCUMENT_PRODUCED)
    elif file_extension in _PLAIN_TEXT_EXTENSIONS:
        text_parser.parse_plain_text_content(full_file_name, DOCUMENT_PRODUCED)
    else:
        print(f"Unsupported file type: {full_file_name}")

In [None]:
# Read a list of all files in the source directory that match the patterns in FILE_TYPES.
# glob runs once per pattern and, with recursive=True, "**" also descends into sub-folders.
files = []
for file_type in FILE_TYPES:
    files.extend(glob.glob(os.path.join(DOCUMENT_SOURCES, "**", file_type), recursive=True))

# Overlapping patterns can return the same path more than once; keep only the
# first occurrence (dict.fromkeys preserves order) so no file is parsed twice.
files = list(dict.fromkeys(files))

print("Files found:", len(files))
# print("Files list:", files)

In [None]:
# Iterate processing through the files.
# A failure on one document is reported and skipped so that the rest of the
# batch is still processed.
failed_files = []
for file in files:
    try:
        parse_document_content(file)
    except Exception as error:  # notebook-level boundary: report and keep going
        failed_files.append(file)
        print(f"Failed to parse {file}: {error!r}")

print(f"Files parsed: {len(files) - len(failed_files)}, failed: {len(failed_files)}")