In [10]:
import time
from collections import Counter
from pathlib import Path

from unstructured.partition.pdf import partition_pdf

In [11]:
# PDF handed to the partition_pdf calls below. The path is relative, so it
# resolves against the kernel's current working directory.
filepath = '../../data/docs/2022 Q3 AAPL.pdf'

# Raw parse

In [5]:
def log_raw_parse_by_strategy(
    strategies=('auto', 'hi_res', 'ocr_only', 'fast'),
    pdf_path=None,
    output_dir='outputs',
):
    """Parse the PDF once per strategy, timing each run and dumping its elements.

    For every strategy this prints the elapsed parse time and the distinct
    element classes that were returned, and writes each element (its class and
    its string form) to ``<output_dir>/unstructured-<strategy>.txt``.

    Args:
        strategies: Values passed as ``strategy=`` to ``partition_pdf``,
            processed in the given order.
        pdf_path: PDF to parse. Defaults to the notebook-level ``filepath``.
        output_dir: Directory receiving the per-strategy dumps; it is created
            when missing so the function works on a fresh checkout.
    """
    if pdf_path is None:
        pdf_path = filepath

    dump_dir = Path(output_dir)
    dump_dir.mkdir(parents=True, exist_ok=True)

    for strategy in strategies:
        # perf_counter is monotonic, unlike time.time(), so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        partition = partition_pdf(pdf_path, strategy=strategy)
        elapsed = time.perf_counter() - start

        print(f'Strategy: {strategy} (parse time: {elapsed:.2f}s)')

        # https://docs.unstructured.io/open-source/concepts/document-elements#element-type
        unique_types = set()

        # Explicit UTF-8: the extracted text may contain characters that the
        # platform default encoding cannot represent.
        dump_path = dump_dir / f'unstructured-{strategy}.txt'
        with open(dump_path, 'w', encoding='utf-8') as out:
            for elem in partition:
                unique_types.add(type(elem))
                out.write(
                    f'Element type: {type(elem)}\n'
                    f'{elem}\n\n'
                )

        # Sort by class name so the printed list is stable between runs
        # (set iteration order is not).
        for elem_type in sorted(unique_types, key=lambda cls: cls.__name__):
            print(elem_type)

        print('-' * 48)

log_raw_parse_by_strategy()

Strategy: auto (parse time: 1.51s)
<class 'unstructured.documents.elements.Text'>
<class 'unstructured.documents.elements.ListItem'>
<class 'unstructured.documents.elements.Header'>
<class 'unstructured.documents.elements.Title'>
<class 'unstructured.documents.elements.NarrativeText'>
------------------------------------------------
Strategy: hi_res (parse time: 46.80s)
<class 'unstructured.documents.elements.Text'>
<class 'unstructured.documents.elements.ListItem'>
<class 'unstructured.documents.elements.FigureCaption'>
<class 'unstructured.documents.elements.Header'>
<class 'unstructured.documents.elements.Title'>
<class 'unstructured.documents.elements.Table'>
<class 'unstructured.documents.elements.NarrativeText'>
<class 'unstructured.documents.elements.Image'>
<class 'unstructured.documents.elements.Footer'>
------------------------------------------------
Strategy: ocr_only (parse time: 59.00s)
<class 'unstructured.documents.elements.Title'>
<class 'unstructured.documents.element

## Element types and their meanings
1) Image - a logo, 1 occurrence. Useless. (hi_res only)
2) Footer - redundant, identical on each page. Useless. (hi_res only)
3) ListItem + NarrativeText - necessary elements, but parsed strangely: each strategy (different algorithm) classifies the same content differently
4) Text - necessary, good
5) Header - few occurrences; the strategies that produced them classified them wrongly; ocr_only did not produce any (good)
6) FigureCaption - only produced by hi_res, few occurrences, and the element type is wrongly assigned
7) Table - only captured by hi_res; very useful, with reasonably good classification and parsing

**Notes**:
It seems that hi_res performed better overall (e.g. it is the only strategy that captures Table elements), with only small mistakes and misclassifications. The final parser pipeline will possibly combine several strategies. Links are not preserved by any of the strategies. Next step: try tweaking hi_res, since the library provides many parameters for it.

In [6]:
# Re-run the hi_res strategy on its own, this time also requesting
# table-structure inference, and keep the result for the cells below.
partition = partition_pdf(
    filepath,
    strategy='hi_res',
    infer_table_structure=True,
)

In [9]:
# Tally how many elements of each class the hi_res parse produced.
Counter(type(elem) for elem in partition)

Counter({unstructured.documents.elements.Text: 233,
         unstructured.documents.elements.NarrativeText: 142,
         unstructured.documents.elements.Title: 98,
         unstructured.documents.elements.ListItem: 34,
         unstructured.documents.elements.Table: 33,
         unstructured.documents.elements.Footer: 21,
         unstructured.documents.elements.FigureCaption: 2,
         unstructured.documents.elements.Header: 2,
         unstructured.documents.elements.Image: 1})

Tried parsing the Table elements (all of them, not only the first) into pandas DataFrames - it does not work yet.


In [20]:
# Disabled experiment: read the HTML of the first Table element into pandas.
# NOTE(review): `Table` is not imported in the visible cells - presumably it
# needs `from unstructured.documents.elements import Table`; confirm before
# re-enabling this cell.
# from io import StringIO
# import pandas as pd
#
# tables = [elem for elem in partition if type(elem) == Table]
# dummy = tables[0].metadata.text_as_html
# df = pd.read_html(StringIO(dummy))
# df[0]