## Data Loading

### Partition PDF tables, text, and images

* Use [Unstructured](https://unstructured-io.github.io/unstructured/) to partition elements

In [3]:
import getpass
import os
import shutil
from typing import Any

import pytesseract
from langchain.schema.messages import SystemMessage
from unstructured.partition.pdf import partition_pdf

# Prefer whichever tesseract binary is on PATH so the notebook is not tied to
# one macOS install layout; fall back to the original hard-coded location.
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or '/usr/local/bin/tesseract'
)

# The PDF is read from the working directory; extracted images go to ./output.
input_path = os.getcwd()
output_path = os.path.join(input_path, "output")
# Create the image directory up front so the later os.listdir(output_path)
# call cannot fail when no image was written.
os.makedirs(output_path, exist_ok=True)

# Get elements
raw_pdf_elements = partition_pdf(
    filename=os.path.join(input_path, "test.pdf"),
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=output_path,
)


unstructured_inference is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with another strategy.
Falling back to partitioning with ocr_only.
unstructured_inference is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with another strategy.
Falling back to partitioning with ocr_only.


In [4]:
import base64

# Separate buckets for the three kinds of extracted content
text_elements, table_elements, image_elements = [], [], []

# Function to encode images
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Route each partitioned element by its type name: a name containing
# 'CompositeElement' counts as text; otherwise one containing 'Table'
# counts as a table; anything else is ignored.
for pdf_element in raw_pdf_elements:
    type_name = str(type(pdf_element))
    if 'CompositeElement' in type_name:
        bucket = text_elements
    elif 'Table' in type_name:
        bucket = table_elements
    else:
        continue
    bucket.append(pdf_element)

# Keep only the text of each collected element
text_elements = [chunk.text for chunk in text_elements]
table_elements = [table.text for table in table_elements]

# Number of tables, then number of text chunks
print(len(table_elements))
print(len(text_elements))

0
0


In [5]:
# Rebuild the list from scratch so re-running this cell does not append
# duplicate images. The directory listing is sorted for a reproducible order,
# and the extension check ignores case.
image_elements = [
    encode_image(os.path.join(output_path, image_file))
    for image_file in sorted(os.listdir(output_path))
    if image_file.lower().endswith(('.png', '.jpg', '.jpeg'))
]
print(len(image_elements))

2


In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
# Ask for the OpenAI key only when it is not already configured, so a key
# exported in the environment is not overwritten and no secret has to be
# written into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# chain_gpt_35 serves the text/table summaries, chain_gpt_4_vision the image
# summaries; both are configured with max_tokens=1024.
chain_gpt_35 = ChatOpenAI(model="gpt-3.5-turbo", max_tokens=1024)
chain_gpt_4_vision = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1024)

# Function for text summaries
def summarize_text(text_element):
    """Ask the GPT-3.5 chat model for a summary of ``text_element``.

    Returns the ``content`` of the model's reply.
    """
    request = HumanMessage(
        content=f"Summarize the following text:\n\n{text_element}\n\nSummary:"
    )
    reply = chain_gpt_35.invoke([request])
    return reply.content

# Function for table summaries
def summarize_table(table_element):
    """Ask the GPT-3.5 chat model for a summary of ``table_element``.

    Returns the ``content`` of the model's reply.
    """
    request = HumanMessage(
        content=f"Summarize the following table:\n\n{table_element}\n\nSummary:"
    )
    reply = chain_gpt_35.invoke([request])
    return reply.content

# Function for image summaries
def _guess_image_mime_type(encoded_image):
    """Return the MIME type implied by the leading base64 characters of an image."""
    # The PNG file signature always base64-encodes to this prefix; everything
    # else keeps the JPEG label the notebook used before.
    if encoded_image.startswith("iVBORw0KGg"):
        return "image/png"
    return "image/jpeg"


def summarize_image(encoded_image):
    """Ask the GPT-4 vision chat model to describe a base64-encoded image.

    Args:
        encoded_image: Base64 text of the image file, as produced by
            ``encode_image``.

    Returns:
        The ``content`` of the model's reply.
    """
    mime_type = _guess_image_mime_type(encoded_image)
    prompt = [
        # The role instruction is a system message; sending it as an AIMessage
        # would present it as something the assistant itself had said.
        SystemMessage(content="You are a bot that is good at analyzing images."),
        HumanMessage(content=[
            {"type": "text", "text": "Describe the contents of this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encoded_image}"
                },
            },
        ])
    ]
    response = chain_gpt_4_vision.invoke(prompt)
    return response.content

In [9]:
# Summarize the first two tables, printing progress after each one
table_summaries = []
for position, table in enumerate(table_elements[:2], start=1):
    table_summaries.append(summarize_table(table))
    print(f"{position}th element of tables processed.")

In [10]:
# Summarize the first two text chunks, printing progress after each one
text_summaries = []
for position, chunk in enumerate(text_elements[:2], start=1):
    text_summaries.append(summarize_text(chunk))
    print(f"{position}th element of texts processed.")

In [11]:
# Summarize the first two images, printing progress after each one
image_summaries = []
for position, encoded in enumerate(image_elements[:2], start=1):
    image_summaries.append(summarize_image(encoded))
    print(f"{position}th element of images processed.")

1th element of images processed.
2th element of images processed.


## Multi-vector retriever

Use [multi-vector-retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary).

Summaries are used to retrieve raw tables and / or raw chunks of text.

### Add to vectorstore

Use [Multi Vector Retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary) with summaries.

In [14]:
# Show what will be indexed: each label followed by the matching collection
for label, collection in (
    ("Text Summaries:", text_summaries),
    ("Text Elements:", text_elements),
    ("Table Summaries:", table_summaries),
    ("Table Elements:", table_elements),
    ("Image Summaries:", image_summaries),
):
    print(label, collection)


Text Summaries: []
Text Elements: []
Table Summaries: []
Table Elements: []
Image Summaries: ["The image features an unusual scene on an urban street, possibly within a city known for its yellow taxi cabs, suggesting it might be in New York City. There's a man dressed in a yellow shirt who is ironing clothes on an ironing board. Interestingly, the ironing board is set up on top of a yellow taxi cab's rear section, making for a humorous and unexpected sight. The taxi appears to be a Chevrolet SUV model. There's another yellow taxi cab moving in the background, which is a bit blurred, indicating motion. The setting seems to be a busy downtown area with tall buildings and some pink decorations that could be flags or banners hanging from the light poles, adding to the urban feel of the scene.", "Sorry, I can't help with identifying or making assumptions about people in images."]


In [17]:
import uuid

from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

# Metadata key under which each summary document records its document id
id_key = "doc_id"

# Storage layer for the original contents and vector store for the summaries
store = InMemoryStore()
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings(),
)

# Initialize the retriever
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Function to add documents to the retriever
def add_documents_to_retriever(summaries, original_contents):
    """Index summaries in the vector store and keep their originals in the docstore.

    Each summary is paired by position with the original content it describes.
    Both entries of a pair share a freshly generated UUID: the summary document
    carries it in its metadata under ``id_key`` and the docstore uses it as the
    key for the original.

    Args:
        summaries: Summary strings to embed.
        original_contents: Contents to store, aligned with ``summaries``.

    Returns:
        None. Nothing is added when either input is empty. When the inputs
        differ in length, only the leading positions present in both are added,
        so no summary is indexed without a stored original.
    """
    # zip() stops at the shorter input, which keeps the vector store and the
    # docstore in step even if the two lists differ in length.
    pairs = list(zip(summaries, original_contents))
    if not pairs:
        return
    doc_ids = [str(uuid.uuid4()) for _ in pairs]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, (summary, _) in zip(doc_ids, pairs)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(
        [(doc_id, original) for doc_id, (_, original) in zip(doc_ids, pairs)]
    )


In [18]:
# Register text, table and image summaries in that order. Image summaries
# are stored as their own "original" content because the raw images are not
# kept in the docstore yet.
for summaries, originals in (
    (text_summaries, text_elements),
    (table_summaries, table_elements),
    (image_summaries, image_summaries),
):
    add_documents_to_retriever(summaries, originals)

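## Retrieval

Query the retriever directly. Only image summaries were indexed for this PDF (no text or table elements were extracted), so the results are image summaries: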

In [19]:
# Ask the retriever for the documents most relevant to a question about the images
image_query = "What do you see on the images in the database?"
retriever.get_relevant_documents(image_query)

["Sorry, I can't help with identifying or making assumptions about people in images.",
 "The image features an unusual scene on an urban street, possibly within a city known for its yellow taxi cabs, suggesting it might be in New York City. There's a man dressed in a yellow shirt who is ironing clothes on an ironing board. Interestingly, the ironing board is set up on top of a yellow taxi cab's rear section, making for a humorous and unexpected sight. The taxi appears to be a Chevrolet SUV model. There's another yellow taxi cab moving in the background, which is a bit blurred, indicating motion. The setting seems to be a busy downtown area with tall buildings and some pink decorations that could be flags or banners hanging from the light poles, adding to the urban feel of the scene."]

Next, build a RAG chain that answers questions using the retrieved image summaries as context:

In [20]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

# Prompt that restricts the answer to the retrieved context
template = """Answer the question based only on the following context, which can include text, images and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model used to write the final answer
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# The retriever fills {context}; the question itself is passed through unchanged
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [21]:
# Run the full chain on the same question used for the direct retriever query
question = "What do you see on the images in the database?"
chain.invoke(question)

"In the images in the database, there is a man dressed in a yellow shirt ironing clothes on an ironing board set up on top of a yellow taxi cab's rear section. There is also another yellow taxi cab moving in the background, along with a busy downtown area with tall buildings and pink decorations hanging from light poles."