# Installation

In [None]:
#!sudo apt install tesseract-ocr
!pip install pytesseract transformers ultralyticsplus==0.0.23 ultralytics==8.0.21
!pip install pymupdf "unstructured[pdf]" pdfminer.six pillow-heif
#!sudo apt-get install poppler-utils

In [None]:
!pip install pdf2image

In [10]:
!pip install langchain_openai

Successfully installed annotated-types-0.7.0 anyio-4.4.0 distro-1.9.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 jsonpatch-1.33 jsonpointer-3.0.0 langchain-core-0.2.26 langchain-openai-0.1.20 langsmith-0.1.95 openai-1.37.1 orjson-3.10.6 pydantic-2.8.2 pydantic-core-2.20.1 sniffio-1.3.1 tenacity-8.5.0 tiktoken-0.7.0
You should consider upgrading via the '/Users/amaan/Table-Extraction-ScienceGPT/myenv/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [None]:
!pip install python-dotenv

# Imports

In [22]:
%%capture
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import pytesseract
from pytesseract import Output

from ultralyticsplus import YOLO, render_result
from PIL import Image
from pdf2image import convert_from_path
from dotenv import load_dotenv
load_dotenv()


# Define YOLOV8 Model

In [5]:
%%capture
model = YOLO('keremberke/yolov8m-table-extraction')

# set model parameters
model.overrides['conf'] = 0.25  # NMS confidence threshold
model.overrides['iou'] = 0.45  # NMS IoU threshold
model.overrides['agnostic_nms'] = False  # NMS class-agnostic
model.overrides['max_det'] = 1000

# Convert pdf to images

In [1]:
import os
from pdf2image import convert_from_path


def pdf_to_jpg(input_folder, output_folder, dpi=300):
    """Render every page of every PDF in ``input_folder`` to a JPEG file.

    Each page is written to ``output_folder`` as ``<pdf name>_page_<n>.jpg``
    (``n`` starts at 1) and the path of every saved file is printed.

    Args:
        input_folder: Directory whose direct entries are scanned for PDFs.
        output_folder: Directory that receives the JPEGs; created if missing.
        dpi: Resolution passed to ``convert_from_path``. Defaults to 300,
            the value that used to be hard-coded.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sort so that files are
    # processed (and logged) in a reproducible order.
    for pdf_file in sorted(os.listdir(input_folder)):
        # Match the extension case-insensitively so "REPORT.PDF" is not
        # silently skipped.
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_folder, pdf_file)
        pages = convert_from_path(pdf_path, dpi=dpi)
        pdf_name = os.path.splitext(pdf_file)[0]

        # Save each page as a JPEG file (page numbers are 1-based)
        for page_number, page in enumerate(pages, start=1):
            jpg_path = os.path.join(
                output_folder, f"{pdf_name}_page_{page_number}.jpg"
            )
            page.save(jpg_path, "JPEG")
            print(f"Saved {jpg_path}")

In [None]:
# Render the PDFs found in ./table-docs into per-page JPEGs in ./output-images.
input_folder, output_folder = "./table-docs", "./output-images"
pdf_to_jpg(input_folder, output_folder)

# Extract Table snippet from images

In [2]:
def extract_table(file_path):
    """Detect tables in a page image and crop each detection out of it.

    Runs the module-level ``model`` on the image at ``file_path``, prints the
    predicted boxes, and slices the pixel region of every box out of the page.

    Args:
        file_path: Path of the image file to analyse.

    Returns:
        A ``(render, cropped_images)`` tuple. ``render`` is the value returned
        by ``render_result`` for the first prediction result;
        ``cropped_images`` is a list of PIL images, one per detected box, in
        the order the boxes are reported.
    """
    # Open the file once and release the handle when done. The page used to be
    # re-opened and re-decoded from disk for every detected box.
    with Image.open(file_path) as img:
        pixels = np.array(img)

        results = model.predict(img)
        detections = results[0]

        print('Boxes: ', detections.boxes)
        render = render_result(model=model, image=img, result=detections)

        cropped_images = []
        for row in detections.boxes.data.cpu().numpy():
            # The first four values of a row are the box corners; the
            # remaining values are not needed for cropping.
            x1, y1, x2, y2 = (int(value) for value in row[:4])
            cropped_images.append(Image.fromarray(pixels[y1:y2, x1:x2]))

    return render, cropped_images

In [8]:
def process_all_images(input_folder, output_folder):
    """Crop the detected tables out of every ``.jpg`` in ``input_folder``.

    Each file is passed to ``extract_table``; every crop it returns is saved
    to ``output_folder`` as ``<image name>_table_<n>.jpg`` (``n`` starts at 1)
    and the saved path is printed. ``output_folder`` is created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        # Anything that is not a .jpg is ignored.
        if not entry.endswith(".jpg"):
            continue

        _, tables = extract_table(os.path.join(input_folder, entry))
        stem = os.path.splitext(entry)[0]

        for table_number, table in enumerate(tables, start=1):
            target = os.path.join(output_folder, f"{stem}_table_{table_number}.jpg")
            table.save(target, "JPEG")
            print(f"Saved {target}")

In [9]:
# Crop the detected tables from the page images in ./output-images into ./cropped-tables.
input_folder, output_folder = "./output-images", "./cropped-tables"
process_all_images(input_folder, output_folder)

[W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.


Boxes:  tensor([[3.82000e+02, 4.28000e+02, 2.16000e+03, 8.26000e+02, 3.13112e-01, 1.00000e+00]])
Saved ./cropped-tables/E - Glyphosate monograph (but it's only the pages with tables)_page_58_table_1.jpg
Boxes:  tensor([[2.46000e+02, 3.91000e+02, 3.04100e+03, 1.35100e+03, 5.86140e-01, 1.00000e+00],
        [2.52000e+02, 1.48400e+03, 3.03400e+03, 2.03700e+03, 3.94473e-01, 1.00000e+00]])
Saved ./cropped-tables/E - Glyphosate monograph (but it's only the pages with tables)_page_64_table_1.jpg
Saved ./cropped-tables/E - Glyphosate monograph (but it's only the pages with tables)_page_64_table_2.jpg
Boxes:  tensor([[3.88000e+02, 1.82200e+03, 2.14000e+03, 2.99400e+03, 9.21162e-01, 1.00000e+00]])
Saved ./cropped-tables/E - Glyphosate monograph (but it's only the pages with tables)_page_70_table_1.jpg
Boxes:  tensor([[3.68000e+02, 1.63000e+03, 2.17600e+03, 1.99500e+03, 7.72271e-01, 1.00000e+00],
        [3.82000e+02, 3.41000e+02, 2.16500e+03, 9.11000e+02, 7.62610e-01, 1.00000e+00]])
Saved ./crop

# Convert table snippet to text using GPT4o vision

In [56]:
from langchain_openai import AzureChatOpenAI
import base64

In [57]:
def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [55]:
# Chat client bound to the Azure OpenAI deployment named "science-gpt4o".
# NOTE(review): no API key or endpoint is passed here; presumably they are read
# from environment variables (e.g. ones loaded by load_dotenv()) — confirm.
llm = AzureChatOpenAI(
    azure_deployment="science-gpt4o",
    api_version="2024-02-15-preview",
)

## Generation for 1 table snippet

In [60]:
# Transcribe a single cropped table image with the vision model.
IMAGE_PATH = "cropped-tables/A - Tables 11 and 21 (less complex)_page_1_table_1.jpg"

base64_image = encode_image(IMAGE_PATH)

prompt = """
I have a table snippet that I need to convert to text. Can you help me with that?

Output Format:
Column 1 | Column 2 | Column 3
Data 1 | Data 2 | Data 3
etc

Output ONLY the table with no additional comments.
"""
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that responds in Markdown.",
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                # The payload is a .jpg file, so declare it as JPEG; the data
                # URL previously claimed image/png.
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
            },
        ],
    },
]
ai_message = llm.invoke(messages)
print(ai_message.content)

0 ppm | 3125 ppm | 6250 ppm | 12500 ppm | 25000 ppm | 50000 ppm
--- | --- | --- | --- | --- | ---
Males | 0/10 | 0/10 | 5/10 (1.0) | 9/10 (1.6) | 10/10 (2.8) | 10/10 (4.0)
Females | 0/10 | 0/10 | 2/10 (1.0) | 9/10 (1.3) | 10/10 (2.4) | 10/10 (3.1)


## Generation for all table snippets

In [62]:
def generate_table_text_from_directory(directory):
    """Transcribe every table image in ``directory`` with the vision model.

    Each ``.jpg``/``.jpeg``/``.png`` file (extension matched
    case-insensitively) is base64-encoded with ``encode_image`` and sent to
    the module-level ``llm`` together with a fixed transcription prompt.
    One request is made per image.

    Args:
        directory: Folder whose direct entries are scanned for images.

    Returns:
        A list with the model's reply text for each image, in sorted filename
        order.
    """
    # Extension -> MIME type declared in the data URL. The URL used to claim
    # image/png for every file, including JPEGs.
    mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
    }

    # Built once, outside the loop, and without the function's indentation so
    # the text sent to the model has no stray leading whitespace.
    prompt = (
        "\n"
        "I have a table snippet that I need to convert to text. Can you help me with that?\n"
        "\n"
        "Output Format:\n"
        "Column 1 | Column 2 | Column 3\n"
        "Data 1 | Data 2 | Data 3\n"
        "etc\n"
        "\n"
        "Output ONLY the table with no additional comments.\n"
    )

    responses = []
    # os.listdir() order is arbitrary; sort so the responses come back in a
    # reproducible order.
    for filename in sorted(os.listdir(directory)):
        lowered = filename.lower()
        mime_type = next(
            (mime for ext, mime in mime_types.items() if lowered.endswith(ext)),
            None,
        )
        if mime_type is None:
            # Not a supported image type.
            continue

        image_path = os.path.join(directory, filename)
        base64_image = encode_image(image_path)

        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that responds in Markdown.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            },
        ]

        ai_message = llm.invoke(messages)
        responses.append(ai_message.content)

    return responses

# CAUTION: Only run this if you want to call GPT-4o on ALL images in the `cropped-tables` directory (one API request per image)

In [None]:
# directory = "path_to_your_directory"
# responses = generate_table_text_from_directory(directory)
# for response in responses:
#     print(response)