In [35]:
from PIL import Image
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader
import pdfplumber
import fitz
import os
from dotenv import load_dotenv
import openai
import base64
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [8]:
# Load variables from a .env file into the process environment. Presumably this
# is where the OpenAI API key used by the later cells comes from — confirm.
load_dotenv()

True

In [10]:
def extract_text(pdf_file):
    """Load the text content of a PDF through ``PyPDFLoader``.

    Args:
        pdf_file: Path of the PDF; handed unchanged to ``PyPDFLoader``.

    Returns:
        The documents produced by the loader's ``load()`` call.
    """
    pages = PyPDFLoader(pdf_file).load()
    print("Text extracted")
    return pages

In [11]:
def extract_tables(pdf_file):
    """Turn every table that pdfplumber finds in a PDF into a ``Document``.

    A table is rendered as plain text: cells of a row are joined with tabs,
    rows are joined with newlines, ``None`` cells become empty strings and
    empty rows are dropped.

    Args:
        pdf_file: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        list: One ``Document`` per table, in page order. Metadata carries the
        1-based ``page`` number and the 0-based ``table_index`` on that page.
    """

    def _render(table):
        # Tab-separated cells, newline-separated rows; falsy rows are skipped.
        lines = []
        for row in table:
            if not row:
                continue
            lines.append("\t".join("" if cell is None else cell for cell in row))
        return "\n".join(lines)

    table_docs = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            for table_index, table in enumerate(page.extract_tables()):
                table_docs.append(
                    Document(
                        page_content=_render(table),
                        metadata={"page": page_number, "table_index": table_index},
                    )
                )

    print("Tables extracted")
    return table_docs

In [26]:
def openai_image_caption(image_path, model="gpt-4o"):
    """Ask an OpenAI chat model for a concise caption of an image file.

    The file is read, base64-encoded and sent inline as a data URL. The MIME
    type in that URL is derived from the file extension, because the images
    handed to this function are not always PNGs (extracted PDF images are
    saved with whatever extension the extractor reports).

    Args:
        image_path: Path of the image file to caption.
        model: Name of the chat model to call. Defaults to ``"gpt-4o"``.

    Returns:
        The content of the first choice's message in the API response.

    Raises:
        OSError: If the image file cannot be read.
        Exception: Whatever the OpenAI client raises is propagated unchanged.
    """
    import mimetypes  # local import so the cell does not depend on an edit to the import cell

    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        # Unknown or non-image extension: keep the previous PNG assumption.
        mime_type = "image/png"

    with open(image_path, "rb") as img_file:
        image_data = base64.b64encode(img_file.read()).decode("utf-8")

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You describe charts, plots, and images."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please provide a concise caption describing this image."},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_data}"
                        }
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content


In [27]:
def extract_images_with_captions(pdf_file, save_dir="images"):
    """Save every image referenced by a PDF's pages and caption each one.

    Each image is written to ``<save_dir>/page<P>_img<I>.<ext>`` (``P`` and
    ``I`` are 1-based) and captioned with ``openai_image_caption``. Captioning
    is best-effort: if it raises, the error is printed and the placeholder
    text ``"Caption generation failed"`` is stored instead.

    Args:
        pdf_file: Path of the PDF, passed to ``fitz.open``.
        save_dir: Directory for the extracted image files; created if missing.

    Returns:
        list: One ``Document`` per image occurrence, with the caption as
        ``page_content`` and ``page``, ``image_index`` and ``image_path`` in
        the metadata.
    """
    os.makedirs(save_dir, exist_ok=True)
    image_docs = []
    # xref -> caption. The same image object (same xref) can be referenced by
    # many pages, e.g. a logo; reuse a successful caption instead of paying
    # for another API call. Failures are not cached, so they are retried.
    captions_by_xref = {}

    # The context manager closes the PDF even if extraction raises.
    with fitz.open(pdf_file) as doc:
        for page_index in range(len(doc)):
            page_number = page_index + 1
            image_list = doc[page_index].get_images(full=True)

            for image_number, img in enumerate(image_list, start=1):
                xref = img[0]  # first entry of the image tuple is the xref
                base_image = doc.extract_image(xref)
                image_ext = base_image["ext"]
                image_path = os.path.join(
                    save_dir, f"page{page_number}_img{image_number}.{image_ext}"
                )

                with open(image_path, "wb") as f:
                    f.write(base_image["image"])

                if xref in captions_by_xref:
                    caption = captions_by_xref[xref]
                    print(f"Page {page_number} Image {image_number} Caption: {caption}")
                else:
                    try:
                        caption = openai_image_caption(image_path)
                        captions_by_xref[xref] = caption
                        print(f"Page {page_number} Image {image_number} Caption: {caption}")
                    except Exception as e:
                        # Deliberate best-effort: keep going with a placeholder.
                        caption = "Caption generation failed"
                        print(f"Error captioning {image_path}: {e}")

                image_docs.append(
                    Document(
                        page_content=caption,
                        metadata={
                            "page": page_number,
                            "image_index": image_number,
                            "image_path": image_path
                        }
                    )
                )

    return image_docs

In [15]:
# Path of the PDF that the text, table and image extraction cells all read
# (relative to the notebook's working directory).
pdf_file = 'pdf.pdf'

In [17]:
# Text documents loaded from the PDF via PyPDFLoader (see extract_text).
text_docs = extract_text(pdf_file)

Text extracted


In [None]:
# One Document per table found by pdfplumber; "page" metadata is 1-based.
table_docs = extract_tables(pdf_file)

In [None]:
# Writes the PDF's embedded images under images/ and captions them through the
# OpenAI API (network calls; failures fall back to a placeholder caption).
image_docs = extract_images_with_captions(pdf_file, save_dir="images")

In [30]:
# Single corpus for indexing: page text first, then tables, then image captions.
all_docs = [*text_docs, *table_docs, *image_docs]

In [34]:
# Embed every extracted document with OpenAI embeddings and index them in FAISS.
db = FAISS.from_documents(documents=all_docs, embedding=OpenAIEmbeddings())
# Retriever over that index, using the vector store's default search settings.
retriever = db.as_retriever()

In [37]:
# Question-answering chain: documents come from `retriever`, the answer from
# gpt-4o. chain_type="stuff" — per the LangChain docs, the retrieved documents
# are inserted together into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4o"),
    chain_type="stuff",
    retriever=retriever
)

In [43]:
# Run the chain on one question; the answer text is read from
# response["result"] in the following cell.
response = qa_chain.invoke("What do the graphs show in this PDF?")

In [44]:
# print() rather than a bare expression so the multi-line answer renders with
# real line breaks instead of as a quoted repr.
print(response["result"])

The graphs in the PDF show the following:

1. A scatter plot with green and red data points forming a triangular shape, indicating a linear separation between the two color clusters.
2. A histogram displaying the distribution of document toxicity scores, with most documents scoring low on the toxicity scale.
3. A bar chart depicting win, tie, and loss rates across word count quintiles, showing that win rates generally increase in higher quintiles, with error bars indicating variability.
4. A bar chart showing win, tie, and loss rates (%) by the number of turns, where win rates are generally the highest, followed by tie and loss rates, with error bars indicating variability.
