In [None]:
import os
from unstructured.partition.pdf import partition_pdf
import pytesseract
import uuid

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

In [None]:
import base64
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
from dotenv import load_dotenv
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

In [None]:
# Pull variables from a local .env file (if one exists) into os.environ.
# load_dotenv() does not override variables that are already set.
load_dotenv()

# Fail early with a clear message. The previous self-assignment
# `os.environ[k] = os.getenv(k)` raised an opaque TypeError when the key was
# missing and was a no-op when it was present.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment or add it "
        "to a .env file before running this notebook."
    )

In [None]:
# The PDF is read from the current working directory; extracted figures are
# written to a "figures" subdirectory of it.
input_path = os.getcwd()
output_path = os.path.join(input_path, "figures")

# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the separate exists() check and its
# check-then-create race.
os.makedirs(output_path, exist_ok=True)

Extracting Images, Text, and Tables from the PDF

In [None]:
# Partition the annual report PDF into title-based chunks and request image
# and table extraction.
raw_pdf_elements = partition_pdf(
    # Source document and the directory passed for extracted image files.
    filename=os.path.join(input_path, "annual-report_fy2019_02_en.pdf"),
    image_output_dir_path=output_path,
    # Extraction settings. The strategy was switched to "fast" for better
    # compatibility.
    # NOTE(review): the unstructured docs describe image extraction and
    # table-structure inference as hi_res-only options, so these two flags may
    # have no effect under "fast" - confirm that tables/images are produced.
    strategy="fast",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    # Chunking settings (character-based limits).
    chunking_strategy="by_title",
    max_characters=5000,
    new_after_n_chars=4800,
    combine_text_under_n_chars=3000,
)

In [None]:
# Empty containers for the three kinds of extracted content.
text_elements, table_elements, image_elements = [], [], []


In [None]:
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string.

    The file is opened in binary mode; the base64 output is decoded as UTF-8
    so the result is a ``str`` rather than ``bytes``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')
    
# Rebuild all three collections from scratch so this cell can be re-run safely.
# Previously it appended into lists created in an earlier cell and then rebound
# two of them to plain strings, so a second run failed on `.text` and
# duplicated every image.
text_elements = []
table_elements = []
for element in raw_pdf_elements:
    # Classify by the element's type name: composite (chunked text) elements
    # are checked first, then anything whose type name contains "Table".
    type_name = str(type(element))
    if 'CompositeElement' in type_name:
        text_elements.append(element.text)
    elif 'Table' in type_name:
        table_elements.append(element.text)

# Tables
print("The length of table elements are :", len(table_elements))

# Text
print("The length of text elements are :", len(text_elements))

# Base64-encode every image file in the figures directory. os.listdir()
# returns names in arbitrary order, so sort them to keep the resulting list
# stable across runs.
image_elements = [
    encode_image(os.path.join(output_path, image_file))
    for image_file in sorted(os.listdir(output_path))
    if image_file.endswith(('.png', '.jpg', '.jpeg'))
]

# image
print("The length of image elements are :",len(image_elements))

Applying Semantic Chunking:

In [None]:
# SemanticChunker is instantiated in the next cell; with this import commented
# out that cell raises NameError on a fresh kernel.
from langchain_experimental.text_splitter import SemanticChunker

In [None]:
# Build the semantic text splitter on top of OpenAI embeddings, configured
# with the "percentile" breakpoint threshold type.
chunker = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
)

In [None]:
# Run every extracted text block through the chunker; each entry is replaced
# by whatever split_text returns for it.
text_elements = list(map(chunker.split_text, text_elements))

In [None]:
# Flatten one level: entries that are lists contribute each of their items,
# any other entry is kept as a single item.
text_elements = [
    chunk
    for group in text_elements
    for chunk in (group if isinstance(group, list) else [group])
]

In [None]:
# Instantiate the OpenAI chat model client with the "gpt-4o" model.
model = ChatOpenAI(
    model="gpt-4o",
)