In [24]:
import pdfplumber
import os

def extract_content_with_position(pdf_path, output_dir):
    content_data = []

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Extract text with positions
            for char in page.chars:
                text = char.get('text')
                position = {
                    'left': char.get('x0'),
                    'top': char.get('top'),
                    'right': char.get('x1'),
                    'bottom': char.get('bottom')
                }
                content_data.append({
                    'type': 'text',
                    'page_num': page_num,
                    'content': text,
                    'position': position
                })

            # Extract images with positions
            for img_index, img in enumerate(page.images, start=1):
                # Define the bounding box for the image
                bbox = (img['x0'], img['top'], img['x1'], img['bottom'])
                try:
                    # Use within_bbox with strict=False to handle images partially outside the page
                    cropped_page = page.within_bbox(bbox, strict=False)
                    image = cropped_page.to_image()
                    image_path = os.path.join(output_dir, f"page_{page_num}_image_{img_index}.png")
                    image.save(image_path, format="PNG")
                    position = {
                        'left': img['x0'],
                        'top': img['top'],
                        'right': img['x1'],
                        'bottom': img['bottom']
                    }
                    content_data.append({
                        'type': 'image',
                        'page_num': page_num,
                        'content': image_path,
                        'position': position
                    })
                except ValueError as e:
                    print(f"Skipping image extraction on page {page_num}, index {img_index} due to error: {e}")

    return content_data


# Usage
pdf_file = '../data/2024-conocophillips-proxy-statement.pdf'
output_directory = 'extracted_content'
extracted_content = extract_content_with_position(pdf_file, output_directory)


In [30]:
import pdfplumber
pdf = pdfplumber.open(pdf_file)

In [1]:
import pymupdf4llm
pdf_file = '../data/2024-conocophillips-proxy-statement.pdf'
md_text = pymupdf4llm.to_markdown(pdf_file)

Processing ../data/2024-conocophillips-proxy-statement.pdf...


In [3]:
import pathlib
pathlib.Path("../data/pymupdf_content2024-conocophillips-proxy-statement.md").write_bytes(md_text.encode())

413998

In [None]:
md_text

In [23]:
from hotpdf import HotPdf
hotpdf_document = HotPdf(pdf_file)

In [24]:
print(len(hotpdf_document.pages))

62


In [28]:
full_page_text = hotpdf_document.extract_page_text(page=4)
full_page_text

''

In [29]:
import pypdfium2 as pdfium
pdf = pdfium.PdfDocument(pdf_file)
n_pages = len(pdf)
n_pages




62

In [34]:
page = pdf[20]

textpage = page.get_textpage()

# Extract text from the whole page
text_all = textpage.get_text_range()

text_all

''

In [19]:
from unstructured.partition.pdf import partition_pdf
# from unstructured_inference.inference.layoutelement import LayoutElements
import os
pdf_file = '../data/2024-conocophillips-proxy-statement.pdf'
# os.environ["OCR_AGENT"] = "unstructured.partition.utils.ocr_models.paddle_ocr.OCRAgentPaddle"

elements = partition_pdf(filename=pdf_file)

In [20]:
output_file_path = "../data/unstructured_content2024-conocophillips-proxy-statement.md"
parsed_content = "\n\n".join(str(doc) for doc in elements)

with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(parsed_content)

print(f"Parsed content saved to {output_file_path}")

Parsed content saved to ../data/unstructured_content2024-conocophillips-proxy-statement.md


In [9]:
import nest_asyncio

nest_asyncio.apply()

In [10]:
import os

# API access to llama-cloud
os.environ["LLAMA_CLOUD_API_KEY"] = "llx-VkQeYOe7dieXobtJXQbaTsCmHCl4EP03Cknhx08QlzM0FQeV"

In [12]:
from llama_cloud_services import LlamaParse

file_path = pdf_file

documents = LlamaParse(
    result_type="markdown",
    auto_mode=True,
    auto_mode_trigger_on_image_in_page=True,
    auto_mode_trigger_on_table_in_page=True,
).load_data(file_path)

Started parsing the file under job_id 1e36deb3-2f88-4d20-b3c6-f89482e1c14e
.............

In [15]:
documents

[Document(id_='6860d60e-9e19-4890-809e-d38ec5eb51e5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='ConocoPhillips\n\n2023 Analyst & Investor Meeting', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 Document(id_='99951b5b-e02a-4023-9592-da884ef67017', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="# Today's Agenda\n\n| Topic | Speaker | Position |\n|-------|---------|----------|\n| Opening | Ryan Lance | Chairman and CEO |\n| Strategy and Portfolio | Dominic Macklon | EVP, Strategy, Sustainability and Technology |\n| Al

In [17]:
output_file_path = "../data/llama_index_content2023-conocophillips-aim-presentation.md"
parsed_content = "\n\n".join(doc.get_content() for doc in documents)

with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(parsed_content)

print(f"Parsed content saved to {output_file_path}")

Parsed content saved to ../data/llama_index_content2023-conocophillips-aim-presentation.md
