In [7]:
%load_ext autoreload
%autoreload 2

import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# any project module below is imported.
# NOTE(review): presumably this supplies the model/API credentials used by the
# pipeline — confirm against utils.openai_utils.
load_dotenv()

from pydantic import BaseModel
from typing import List, Dict, Union
import re
import json


import sys
# Both entries are relative paths, so they resolve against the kernel's
# current working directory (the notebook is expected to run from the repo
# root). They are appended before the project imports below, presumably so
# that `utils` and `pdf_ingestion_pipeline` can be found — keep this order.
sys.path.append('multimodal_processing_pipeline')
sys.path.append('search')

from IPython.display import Markdown, display

# NOTE(review): the wildcard imports hide where the names used by later cells
# (PDFIngestionPipeline, MulitmodalProcessingModelName,
# TextProcessingModelName, get_token_count) are defined. Consider switching to
# explicit imports once the owning modules are confirmed.
from utils.file_utils import * 
from utils.text_utils import *
from utils.openai_utils import *
from utils.data_models import *
from pdf_ingestion_pipeline import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Example usage: ingest the sample brochure, using "o1" with high reasoning
# effort for both the multimodal model and the text model.
pdf_path = "sample_data/1_London_Brochure.pdf"
output_directory = "output_o1"

pipeline = PDFIngestionPipeline(
    pdf_path=pdf_path,
    output_directory=output_directory,
    multimodal_model=MulitmodalProcessingModelName(
        model_name="o1",
        reasoning_efforts="high",
    ),
    text_model=TextProcessingModelName(
        model_name="o1",
        reasoning_efforts="high",
    ),
)

# Process the PDF
document_content = pipeline.process_pdf()

# Two passes over the processed document, run in this order:
#   1. save the text twins
#   2. condense the text twins
# Each pass handles every page first and then the document as a whole.
for page_step, document_step in [
    (pipeline.save_page_text_twin, pipeline.save_text_twin),
    (pipeline.condense_page_text, pipeline.condense_text),
]:
    for page in document_content.pages:
        page_step(page)
    document_step(document_content)



Processing page 1/2...
Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\process_extracted_text_prompt.txt


Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\text\page_1.txt
Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\image_description_prompt.txt
Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\images\page_1_image_1.txt
Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\images\page_1_image_2.txt


Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\table_description_prompt.txt


Processing page 2/2...
Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\process_extracted_text_prompt.txt


Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\text\page_2.txt
Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\image_description_prompt.txt


Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\table_description_prompt.txt
Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\tables\page_2_table_1.txt


Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\combined\page_1_twin.txt
Page 1 text twin saved at: output_o1\combined\page_1_twin.txt
Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\combined\page_2_twin.txt
Page 2 text twin saved at: output_o1\combined\page_2_twin.txt
Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\combined\text_twin.md
Text twin saved at: output_o1\combined\text_twin.md
Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\document_condensation_prompt.txt
Writing file to full path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\output_o1\combined\page_1_condensed.md
Page 1 condensed text saved at: output_o1\combined\page_1_condensed.md
Reading file from path: c:\Users\selhousseini\Documents\GitHub\mm_doc_proc\multimodal_processing_pipeline\prompts\document_condensation_prompt.txt
Writin

In [9]:
# Render the full text twin (every page's extracted text plus its image and
# table descriptions) as Markdown in the cell output.
full_text_markdown = Markdown(document_content.full_text)
display(full_text_markdown)

##### --- Page 1 ---

# Extracted Text

Margie’s Travel Presents…  
London  

London is the capital and most populous city of England and the United Kingdom. Standing on the River Thames in the south east of the island of Great Britain, London has been a major settlement for two millennia. It was founded by the Romans, who named it Londinium. London's ancient core, the City of London, largely retains its 1.12-square-mile medieval boundaries. Since at least the 19th century, London has also referred to the metropolis around this core, historically split between Middlesex, Essex, Surrey, Kent, and Hertfordshire, which today largely makes up Greater London, governed by the Mayor of London and the London Assembly.  

Mostly popular for: Leisure, Outdoors, Historical, Arts & Culture  
Best time to visit: Jun-Aug  
Averag Precipitation: 1.9 in  
Average Temperature: 56-67°F  

London Hotels  

Margie’s Travel offers the following accommodation options in London:  

The Buckingham Hotel Comfortable hotel close to major sights like Buckingham Palace, Regent’s Park, and Trafalgar Square.  

The City Hotel Luxury rooms in the city, within walking distance of Tower Bridge and the Tower of London..  

The Kensington Hotel Budget accommodation near Earl’s Court.  

To book your trip to London, visit www.margiestravel.com  


# Embedded Images:

### - Image 0:
This is a photograph showing Tower Bridge across the River Thames with its bascules raised. The bridge features tall stone towers and blue metal walkways set against a cloudy sky. The water beneath appears murky brown.

Likely included to highlight a famous London landmark in a travel or tourism context.

The image emphasizes the iconic design of Tower Bridge and its drawbridge mechanism, showcasing a key attraction for visitors.

### - Image 1:
This is a photograph of the fountains in Trafalgar Square. Water cascades from a large central fountain, surrounded by statues and historic buildings in the background.

Likely included to portray another prominent tourist spot in London related to the travel theme.

The fountain and statues illustrate a bustling public square, hinting at the city’s cultural and historical significance.

<br/>
<br/>
<img src="output_o1\images\page_1.png" alt="Page Number 1" width="300" height="425">




##### --- Page 2 ---

# Extracted Text

| Category                   | Information                                           |
|----------------------------|-------------------------------------------------------|
| Country                    | United Kingdom                                       |
| Capital Of                 | England                                              |
| Currency                   | Pound Sterling (GBP)                                 |
| Population (2021 census)   | Approximately 8.8 million                            |
| Famous For                 | Historical landmarks, museums, cultural diversity    |


# Tables:

### - Table 0:

| Category                 | Information                                     |
|--------------------------|-------------------------------------------------|
| Country                  | United Kingdom                                  |
| Capital Of               | England                                         |
| Currency                 | Pound Sterling (GBP)                            |
| Population (2021 census) | Approximately 8.8 million                       |
| Famous For               | Historical landmarks, museums, cultural diversity|


<br/>
<br/>
<img src="output_o1\images\page_2.png" alt="Page Number 2" width="300" height="425">





In [10]:
# Render the condensed version of the whole document as Markdown so it can be
# compared with the full text twin shown above.
condensed_markdown = Markdown(document_content.condensed_full_text)
display(condensed_markdown)

Margie’s Travel Presents… London

London is the capital and most populous city of England and the United Kingdom, standing on the River Thames in the southeast of Great Britain. It has been a major settlement for two millennia and was founded by the Romans, who named it Londinium. London’s ancient core, the City of London, largely retains its 1.12‑square‑mile medieval boundaries. Since at least the 19th century, “London” has also referred to the metropolis around this core—historically split among Middlesex, Essex, Surrey, Kent, and Hertfordshire—which today largely makes up Greater London, governed by the Mayor of London and the London Assembly.

Mostly popular for: Leisure, Outdoors, Historical, Arts & Culture  
Best time to visit: Jun–Aug  
Average Precipitation: 1.9 in  
Average Temperature: 56–67°F  

London Hotels  
Margie’s Travel offers the following accommodation options in London:  
• The Buckingham Hotel: Comfortable hotel near Buckingham Palace, Regent’s Park, and Trafalgar Square.  
• The City Hotel: Luxury rooms within walking distance of Tower Bridge and the Tower of London.  
• The Kensington Hotel: Budget accommodation near Earl’s Court.  

To book your trip to London, visit www.margiestravel.com  

Embedded Images  
• Image 0: Photograph showing Tower Bridge across the River Thames with its bascules raised; the bridge features tall stone towers and blue metal walkways beneath a cloudy sky, with murky brown water below.  
• Image 1: Photograph of the fountains in Trafalgar Square, where water cascades from a large central fountain, surrounded by statues and historic buildings.  

Additional Information (Page 2)  
• Country: United Kingdom  
• Capital Of: England  
• Currency: Pound Sterling (GBP)  
• Population (2021 census): Approximately 8.8 million  
• Famous For: Historical landmarks, museums, cultural diversity  

In [11]:
# Token counts as a (full text twin, condensed text) tuple, to show how much
# the condensation step shrinks the document.
tuple(
    get_token_count(text)
    for text in (document_content.full_text, document_content.condensed_full_text)
)

(684, 402)