In [95]:
import os
import re
from pprint import pprint

import easyocr
import fitz
import tesserocr
from langchain.text_splitter import CharacterTextSplitter
from ollama import generate
from PIL import Image
from unidecode import unidecode
from unstructured.partition.pdf import partition_pdf

In [2]:
# Set crop-padding environment variables before any PDF partitioning runs.
# NOTE(review): presumably read by unstructured when cropping extracted image
# blocks, with the value in pixels — confirm against the unstructured docs.
%env EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD=100
%env EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD=100

env: EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD=100
env: EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD=100


In [43]:
# Extract elements from PDF
def extract_pdf_elements(path, fname, image_path):
    """
    Partition a PDF with unstructured's `partition_pdf` and save image crops.

    path: Directory containing the PDF (with or without a trailing separator)
    fname: File name of the PDF inside `path`
    image_path: Directory where extracted image blocks are written

    Returns whatever `partition_pdf` returns for the file. Chunking is not
    applied here: the chunking options are deliberately left disabled.
    """
    return partition_pdf(
        # Joined with os.path.join rather than `path + fname`, so a directory
        # passed without a trailing slash still resolves to the right file.
        filename=os.path.join(path, fname),
        strategy="hi_res",
        infer_table_structure=True,
        hi_res_model_name="detectron2_onnx",
        extract_image_block_types=["Image", "Table", "Figure"],
        extract_image_block_output_dir=image_path,
        # Disabled chunking options, kept for reference:
        # chunking_strategy="by_title",
        # max_characters=4000,
        # new_after_n_chars=3800,
        # combine_text_under_n_chars=2000,
    )

In [46]:
# Process the Wf document
# Input directory, PDF file name, and the directory that receives image crops
fpath = "files/"
fname = "2022-annual-report-wf.pdf"
image_path = "data1"

# Partition the PDF into elements
raw_pdf_elements = extract_pdf_elements(path=fpath, fname=fname, image_path=image_path)

Sak detectron2_onnx


In [45]:
# for element in raw_pdf_elements:
#     print(element.category, ' : ', element.text)

UncategorizedText  :  2022
Title  :  Annual Report
NarrativeText  :  Wells Fargo & Company
Title  :  WELLS FARGO
Title  :  CEO Letter
NarrativeText  :  Dear Shareholders,
NarrativeText  :  I’m proud to report that Wells Fargo continued to make progress on our priorities in 2022. Our underlying financial performance is improving, we are moving forward on our risk, control and regulatory agenda, we are focusing on businesses where we can generate appropriate risk-adjusted returns, we continue to strengthen the leadership team, and we are executing on our strategic objectives. While we have made progress, our work is not complete and we remain focused on successful and timely execution of our multi-year journey to complete our risk and control work and to move forward with our businesses.
Title  :  Stronger financial performance
NarrativeText  :  Our financial performance benefitted as we continued to drive improved efficiency, and it was positively impacted by both rising rates and a ben

In [47]:
class Document:
    """A titled node in the section tree assembled from PDF elements.

    Each node stores the content elements collected under its title, a
    reference to its parent node, its heading level, and its child nodes.
    """

    def __init__(self, title, parent=None, level=0):
        self.title = title
        self.parent = parent
        self.level = level
        self.elements = []
        self.children = []

    def add_element(self, element):
        """Append a content element to this node."""
        self.elements.append(element)

    def add_child(self, child):
        """Append a sub-document to this node."""
        self.children.append(child)

    def to_dict(self):
        """Return this node and all descendants as nested plain dictionaries.

        Element text is passed through `unidecode`. The `parent` and `level`
        attributes are not part of the output.
        """
        serialized_elements = []
        for item in self.elements:
            serialized_elements.append(
                {
                    "category": item.category,
                    "text": unidecode(item.text),
                    "metadata": item.metadata.to_dict(),
                }
            )

        serialized_children = []
        for node in self.children:
            serialized_children.append(node.to_dict())

        return {
            "title": self.title,
            "elements": serialized_elements,
            "children": serialized_children,
        }

In [86]:
# Helpers for estimating the heading level of a Title element from the font
# size that Tesseract reports for its rendered crop.

# Cache of loaded PDF pages, keyed by "<filename>-<page_number>".
pages = {}
# Named `pdf_doc` rather than `doc`: the notebook later binds `doc` to the built
# document tree, and sharing the name makes get_title_level fail on a re-run.
pdf_doc = fitz.open("files/2022-annual-report-wf.pdf")
api = tesserocr.PyTessBaseAPI(path="/usr/share/tesseract-ocr/5/tessdata")

# Resolution used to render the page before cropping with the element's
# coordinates. NOTE(review): assumes the element coordinates are pixel
# coordinates at 200 dpi — confirm against the partitioning settings.
TITLE_RENDER_DPI = 200
# Point-size thresholds separating heading levels (below the first -> level 3,
# below the second -> level 2, otherwise level 1).
LEVEL_2_MIN_POINTSIZE = 23
LEVEL_1_MIN_POINTSIZE = 35


def get_title_level(element):
    """Return the heading level (1 = largest, 3 = smallest) of a Title element.

    The element's page is rendered, cropped to the element's bounding box
    (points[0] and points[2] are used as opposite corners), and OCR'd; the
    point size of the first recognised word decides the level.

    element: object exposing `metadata.filename`, `metadata.page_number`
        (1-based) and `metadata.coordinates.points`.

    Returns 3 when OCR yields no font attributes for the crop, so an
    unreadable title is treated as the smallest heading instead of raising.
    """
    metadata = element.metadata

    # Load the page (cached so each page is only loaded once)
    page_key = f"{metadata.filename}-{metadata.page_number}"
    if page_key not in pages:
        pages[page_key] = pdf_doc.load_page(metadata.page_number - 1)
    page = pages[page_key]

    # Get the image clip
    coords = metadata.coordinates.points
    pix = page.get_pixmap(dpi=TITLE_RENDER_DPI)
    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples).crop(
        (coords[0][0], coords[0][1], coords[2][0], coords[2][1])
    )

    api.SetImage(image)
    api.Recognize()
    iterator = api.GetIterator()

    # WordFontAttributes() may give None when nothing was recognised.
    attrs = iterator.WordFontAttributes() if iterator is not None else None
    pointsize = attrs.get("pointsize") if attrs else None
    if pointsize is None:
        return 3

    if pointsize < LEVEL_2_MIN_POINTSIZE:
        return 3
    if pointsize < LEVEL_1_MIN_POINTSIZE:
        return 2
    return 1

In [97]:
# Shared easyocr reader used by has_image_text: English only, GPU requested.
reader = easyocr.Reader(['en'],gpu=True)

def has_image_text(image_file_name):
    """Return True when the OCR reader reports at least one detection for the image file."""
    detections = reader.readtext(image_file_name)
    return len(detections) != 0

In [98]:
# Prompt sent along with every table/image crop. The runs of spaces between
# the sentences are part of the prompt text and are reproduced verbatim.
describe_prommpt = (
    "You are an assistant tasked with summarizing tables and image for retrieval. "
    "    These summaries will be embedded and used to retrieve the raw table or image elements. "
    "    Give a concise summary of the table or image that is well optimized for retrieval:"
)


def get_image_description(image_file_name):
    """Return the "llava" model's response text for the image file.

    The file is read as raw bytes and passed to `generate` together with the
    shared `describe_prommpt` prompt, with streaming disabled.
    """
    with open(image_file_name, "rb") as image_file:
        image_bytes = image_file.read()
        reply = generate(model="llava", prompt=describe_prommpt, images=[image_bytes], stream=False)
    return reply["response"]

In [99]:
def build_documents(elements):
    global parent, pages_to_ignore

    pages_to_ignore = [11]
    regex_roman_numeral = re.compile(r"^[IVXLCDMivxlcdm]+$")

    top = Document("Wells Fargo", "")
    current_document = top

    prev_category = None

    for i, element in enumerate(elements):
        print("Processing element " , i, " from page", element.metadata.page_number, "with category", element.category)

        if element.metadata.page_number in pages_to_ignore:
            print("Ignored page", element.metadata.page_number)
            # Ignore this as this is the footer page number
            pass

        elif element.text == "2022 Annual Report" or element.text == "Wells Fargo & Company":
            print("Ignored page footer", element.text)
            # Ignore this as this is the footer text
            pass

        elif len(element.text) < 6 and (regex_roman_numeral.match(element.text) or element.text.isnumeric()):
            print("Ignored page footer", element.text)
            # Ignore this as this is the footer page number
            pass

        elif element.category == "Title":
            level = get_title_level(element)

            if prev_category == "ListItem" and level == 3 and not element.text[0].isupper():
                # This is a continuation of the previous list item
                # unstructure wrongly classified it as a title
                current_document.add_element(element)

            elif level == current_document.level:
                # Title is at the same level, so create a new document with
                # this title, and add upcoming texts in this document
                current_document = Document(element.text, current_document.parent, level)
                current_document.parent.add_child(current_document)

            elif level < current_document.level:
                # Title is of a higher level, so create a new document with
                # this title, add that to parent.parent
                # and add upcoming texts in this document
                while level <= current_document.parent.level:
                    current_document = current_document.parent
                current_document = Document(element.text, current_document.parent, level)
                current_document.parent.add_child(current_document)

            else:
                # Title is of a lower level, so create a new document with
                # this title, and add upcoming texts in this document
                current_document = Document(element.text, current_document, level)
                current_document.parent.add_child(current_document)

        elif element.category == "Text" or element.category == "NarrativeText" or element.category == "ListItem":
            current_document.add_element(element)

        elif element.category == "Table":
            element.text = get_image_description(element.metadata.image_path)
            current_document.add_element(element)

        elif element.category == "Image":
            if has_image_text(element.metadata.image_path):
                element.text = get_image_description(element.metadata.image_path)
                current_document.add_element(element)
            else:
                print("Ignored image as it has no text", element.category)

        else:
            print("Ignored element", element.category)

        prev_category = element.category

    return top

In [100]:
# Build the section tree from the partitioned PDF elements.
doc = build_documents(raw_pdf_elements)

Processing element  0  from page 62 with category Image
Processing element  1  from page 62 with category NarrativeText
Processing element  2  from page 62 with category NarrativeText
Processing element  3  from page 62 with category NarrativeText
Processing element  4  from page 62 with category UncategorizedText
Ignored page footer 52
Processing element  5  from page 62 with category Title
Ignored page footer Wells Fargo & Company
Processing element  6  from page 63 with category NarrativeText
Processing element  7  from page 63 with category NarrativeText
Processing element  8  from page 63 with category NarrativeText
Processing element  9  from page 63 with category NarrativeText
Processing element  10  from page 63 with category Title
Processing element  11  from page 63 with category NarrativeText
Processing element  12  from page 63 with category NarrativeText
Processing element  13  from page 63 with category NarrativeText
Processing element  14  from page 63 with category Tabl

In [31]:
import json

# Serialize the document tree to output.json as indented JSON.
with open('output.json', 'w') as f:
    json.dump(obj=doc.to_dict(), fp=f, indent=4)