## Talk to PDF Images with GPT-4V

### Install Requirements

In [None]:
!pip3 install PyMuPDF

### Import Requirements

In [None]:
# Required Imports
import base64
import os
from urllib.parse import urlparse

import fitz
import requests

### Set the OpenAI API Key

In [None]:
# OpenAI API key. The OPENAI_API_KEY environment variable is preferred so the
# secret never has to be written into this file; the placeholder below is only
# a fallback for when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "<YOUR_API_KEY>")

### Download a Remote PDF and Extract Its Images

In [None]:
# Base folder for extracted images, created up front; exist_ok=True makes
# re-running this cell harmless when the folder is already there.
# NOTE: save_images_from_pdf assigns its own local `images_folder`, so this
# module-level name is not what that function writes to.
images_folder = "data/images"
os.makedirs(images_folder, exist_ok=True)


def download_pdf(url, timeout=30):
    """Download the PDF at *url* and store it under ``data/pdfs``.

    Args:
        url: HTTP(S) address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The local path the PDF was written to.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Build the file name from the URL *path* only, so a query string or
    # fragment ("paper.pdf?download=1") does not end up in the file name.
    # A URL whose path ends in "/" has no base name; fall back to a fixed one
    # instead of trying to open the directory itself for writing.
    url_path = urlparse(url).path.rstrip("/")
    filename = os.path.basename(url_path) or "download.pdf"

    pdf_path = os.path.join("data", "pdfs", filename)
    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

    with open(pdf_path, "wb") as f:
        f.write(response.content)

    return pdf_path


def save_images_from_pdf(pdf_path):
    """Extract every embedded image from a PDF and write each one to disk.

    Images are written to ``data/images/<pdf name>/`` and named
    ``image_page<page>_<index>.<ext>``. Page and index are 1-based, and
    ``<ext>`` is the format PyMuPDF reports for the extracted bytes.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        List of paths of the image files written, in page order.
    """
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    images_folder = os.path.join("data", "images", pdf_name)
    os.makedirs(images_folder, exist_ok=True)

    image_save_paths = []

    # The context manager closes the document even if extraction or a file
    # write raises part-way through.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            image_list = page.get_images(full=True)
            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first entry of each image tuple is its xref
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]

                # extract_image hands back the bytes in their stored format
                # (often JPEG), so use the extension it reports rather than
                # labelling everything ".png".
                extension = base_image.get("ext") or "png"
                image_filename = f"image_page{page_number}_{image_index}.{extension}"
                image_save_path = os.path.join(images_folder, image_filename)

                with open(image_save_path, "wb") as image_file:
                    image_file.write(image_bytes)

                image_save_paths.append(image_save_path)
                print(f"Saved image to path: {image_save_path}")

    return image_save_paths

### Request to API

In [None]:
# Endpoint that get_image_description sends its POST request to.
API_URL = "https://api.openai.com/v1/chat/completions"


def encode_image(image_path):
    """Return the contents of the file at *image_path* as base64 text.

    The file is read in binary mode; the base64 output is decoded as UTF-8 so
    the result is a ``str`` rather than ``bytes``.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def get_image_description(
    image_data,
    *,
    mime_type="image/jpeg",
    model="gpt-4-vision-preview",
    max_tokens=300,
    timeout=60,
):
    """Ask the chat-completions endpoint to describe one base64-encoded image.

    Args:
        image_data: Base64 text of the image (as produced by ``encode_image``).
        mime_type: Media type placed in the ``data:`` URL. Defaults to
            ``image/jpeg``; pass e.g. ``image/png`` when the bytes are PNG.
        model: Model name sent in the request payload.
        max_tokens: Upper bound on the length of the generated description.
        timeout: Seconds to wait for the API before giving up; passed
            straight to ``requests.post``.

    Returns:
        The parsed JSON body of the response. The HTTP status is not checked,
        so an error response from the API is returned as its JSON body too.

    Raises:
        requests.Timeout: If the API does not respond within *timeout*.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_KEY}"}

    # Adjacent string literals are concatenated with nothing in between, so
    # each piece must carry its own sentence break and trailing space.
    prompt = (
        "I have a collection of images that have been extracted from a PDF document. "
        "Each image may contain a diverse array of elements, such as text, "
        "figures, charts, photographs, illustrations, or anything else. Please describe the content, "
        "context, and any notable details of the following image."
    )

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_data}"},
                    },
                ],
            }
        ],
        "max_tokens": max_tokens,
    }

    # Without an explicit timeout a stalled connection would block forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()


def process_images_from_pdf(PDF_LINK):
    """Download a PDF, extract its images, and request a description of each.

    Every result is printed as it arrives. A failure on one image is reported
    and does not stop the remaining images from being processed.

    Args:
        PDF_LINK: URL of the PDF to download.

    Returns:
        Dict mapping the path of each successfully processed image to the
        parsed API response for it. Images that failed are left out.
    """
    local_pdf_path = download_pdf(PDF_LINK)
    image_paths = save_images_from_pdf(local_pdf_path)

    descriptions = {}
    for image_path in image_paths:
        try:
            base64_image = encode_image(image_path)
            result = get_image_description(base64_image)
        except Exception as e:
            # Deliberately broad: this is a best-effort loop, so one bad image
            # (unreadable file, network error, ...) must not abort the rest.
            print(f"An error occurred while processing {image_path}: {e}")
        else:
            print("Result: ", result)
            descriptions[image_path] = result

    return descriptions


if __name__ == "__main__":
    # Placeholder URL: replace <EXAMPLE_PDF> with a real document before
    # running, otherwise the download step has nothing valid to fetch.
    PDF_LINK = "https://arxiv.org/pdf/<EXAMPLE_PDF>.pdf"
    process_images_from_pdf(PDF_LINK)