In [1]:
import os
from PyPDF2 import PdfReader
import docx
import re
import pytesseract
from PIL import Image
import io
import fitz

def clean_text(text):
    """Collapse all whitespace in ``text`` to single spaces and trim the ends.

    Args:
        text: The string to normalise.

    Returns:
        The input with every run of whitespace (spaces, tabs, newlines,
        carriage returns, non-breaking spaces, ...) replaced by one space
        and without leading or trailing spaces.
    """
    # For str patterns `\s` is Unicode-aware, so it already covers '\u00a0',
    # '\n' and '\r'. Separate replace() passes after this substitution could
    # never find anything left to replace, so one pass is sufficient.
    return re.sub(r'\s+', ' ', text).strip()

def extract_pdf_text(file_path):
    """Extract the text layer plus OCR'd image text from a PDF.

    For every page, the page's own text is collected first, followed by the
    OCR result of each image listed for that page (each prefixed with a
    single space). The combined string is passed through ``clean_text``.

    Args:
        file_path: Path of the PDF to open with ``fitz``.

    Returns:
        The whitespace-normalised text of the whole document.
    """
    # Gather the pieces in a list and join once instead of growing a string
    # with ``+=`` inside nested loops.
    chunks = []
    with fitz.open(file_path) as pdf:
        for page in pdf:
            chunks.append(page.get_text())

            for img in page.get_images():
                xref = img[0]  # first field of each image entry is used as its xref
                image_bytes = pdf.extract_image(xref)["image"]
                # Close each decoded image as soon as OCR is done so image
                # buffers do not pile up on image-heavy documents.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    chunks.append(' ' + pytesseract.image_to_string(image))

    return clean_text(''.join(chunks))

def extract_docx_text(file_path):
    """Return the cleaned paragraph text of a Word document.

    Only the entries of the document's ``paragraphs`` collection are read;
    their texts are joined with newlines and then normalised by
    ``clean_text``.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The whitespace-normalised paragraph text.
    """
    document = docx.Document(file_path)

    paragraph_texts = []
    for para in document.paragraphs:
        paragraph_texts.append(para.text)

    joined = '\n'.join(paragraph_texts)
    return clean_text(joined)

def extract_txt_text(file_path):
    """Read a UTF-8 text file and return its whitespace-normalised content.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content after ``clean_text``.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # 'utf-8-sig' decodes BOM-less UTF-8 exactly like 'utf-8' but also drops
    # a leading byte-order mark. With plain 'utf-8' the mark would survive as
    # a stray '\ufeff' at the start of the text, because it is not whitespace
    # and therefore is not removed by clean_text.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return clean_text(file.read())

def extract_image_text(file_path):
    """Run OCR on an image file and return the recognised, cleaned text.

    Args:
        file_path: Path of the image to open with PIL.

    Returns:
        The OCR output after ``clean_text``.
    """
    # Image.open keeps the underlying file open; the with-block releases it
    # as soon as OCR has finished instead of waiting for garbage collection.
    with Image.open(file_path) as image:
        ocr_text = pytesseract.image_to_string(image)
    return clean_text(ocr_text)

def upload_and_parse_document(file_path):
    """Pick an extractor from the file extension and return the parsed text.

    The extension is compared case-insensitively. PDFs, Word documents,
    plain-text files and common raster images are each routed to their own
    extractor; anything else is rejected.

    NOTE(review): '.doc' is routed to the same python-docx based extractor
    as '.docx' -- confirm that legacy .doc files are actually readable
    that way.

    Args:
        file_path: Path of the document to parse.

    Returns:
        The cleaned text produced by the matching extractor.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    _, raw_suffix = os.path.splitext(file_path)
    suffix = raw_suffix.lower()

    word_suffixes = ('.docx', '.doc')
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')

    # Guard-clause dispatch: the first matching branch returns immediately.
    if suffix == '.pdf':
        return extract_pdf_text(file_path)
    if suffix in word_suffixes:
        return extract_docx_text(file_path)
    if suffix == '.txt':
        return extract_txt_text(file_path)
    if suffix in image_suffixes:
        return extract_image_text(file_path)

    raise ValueError('Unsupported file type')

ModuleNotFoundError: No module named 'pytesseract'

In [2]:
# Parse the design report PDF; the path has no directory part, so it is
# resolved against the notebook's current working directory.
text = upload_and_parse_document("Pre-Implementation Design Report.pdf")

In [3]:
# Bare last expression: the notebook renders the full extracted string as this cell's output.
text

"Task Overview Develop a prototype for an advanced document analysis system using transformer-based models, incorporating real-time annotation capabilities. This task is designed to be completed in 8-10 hours and should showcase your team's ability to work with state-of-the-art NLP models, handle data processing, and create an interactive user interface. Project Goals 1. Implement a document ingestion and preprocessing pipeline ● Take pdf and store in mongo db ● Upload newly annotated documents to mongo db upon finishing annotation, for the sake of version history and referencing ● Create one pipeline function which will accept all the text in the initial file and then repeatedly accept new text as real time annotation happens ○ Create a function that accepts a file path and set up conditionals to check what the file extension is. ■ Use PyPDF2’s PdfReader module to iterate through pdf documents and use the extract text built in function ■ Use docx module to handle situations where the 