<a href="https://colab.research.google.com/github/rendalamili/ml-for-table-extraction/blob/main/Copy_of_Model_Testing_and_Metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notes

Resources Used:

*   Python  - https://docs.python.org/3/contents.html

*   HTML  - https://www.w3schools.com/HTML/html_tables.asp & https://developer.mozilla.org/en-US/docs/Learn/HTML/Tables/Basics

*   HTML to PDF - https://templated.io/blog how-to-convert-html-to-pdf-with-python-pdfkit/

*   Tesseract - https://pypi.org/project/tesseract-ocr-utils/ & https://tesseract-ocr.github.io/ & https://pytesseract.readthedocs.io/en/latest/

*   LLama Parse - https://docs.cloud.llamaindex.ai/llamaparse/getting_started

*   BeautifulSoup - https://tedboy.github.io/bs4_doc/






# Table Creation and PDF Creation

In [2]:
# To convert HTML to pdfs

!pip install pdfkit
!apt-get install -y wkhtmltopdf
import pdfkit


Collecting pdfkit
  Downloading pdfkit-1.0.0-py3-none-any.whl.metadata (9.3 kB)
Downloading pdfkit-1.0.0-py3-none-any.whl (12 kB)
Installing collected packages: pdfkit
Successfully installed pdfkit-1.0.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  avahi-daemon bind9-host bind9-libs geoclue-2.0 glib-networking glib-networking-common
  glib-networking-services gsettings-desktop-schemas iio-sensor-proxy libavahi-core7 libavahi-glib1
  libdaemon0 libevdev2 libfontenc1 libgudev-1.0-0 libhyphen0 libinput-bin libinput10
  libjson-glib-1.0-0 libjson-glib-1.0-common liblmdb0 libmaxminddb0 libmbim-glib4 libmbim-proxy
  libmd4c0 libmm-glib0 libmtdev1 libnl-genl-3-200 libnotify4 libnss-mdns libproxy1v5 libqmi-glib5
  libqmi-proxy libqt5core5a libqt5dbus5 libqt5gui5 libqt5network5 libqt5positioning5
  libqt5printsupport5 libqt5qml5 libqt5qmlmodels5 libqt5quick5 libqt5sensors5 libqt5svg5
  lib

## Table 1: Simple 3x3 Table


In [3]:
from IPython.display import display, HTML

html_table1 = """
<table border="1">
  <tr><th>Name</th><th>Age</th><th>Country</th></tr>
  <tr><td>John</td><td>30</td><td>USA</td></tr>
  <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
</table>
"""
display(HTML(html_table1))

pdfkit.from_string(html_table1, 'Table 1: Simple 3x3 Table.pdf')


Name,Age,Country
John,30,USA
Jane,25,Canada


True

## Table 2: Table with Merged Header Cells


In [4]:
html_table2 = """
<table border="1">
  <tr><th>Name</th><th colspan="2">Details</th></tr>
  <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
  <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
</table>
"""
display(HTML(html_table2))

pdfkit.from_string(html_table2, 'Table 2: Table with Merged Header Cells.pdf')


Name,Details,Details.1
John,Age: 30,Country: USA
Jane,Age: 25,Country: Canada


True

## Table 3: Nested Table


In [5]:
html_table3 = """
<table border="1">
  <tr><th>Name</th><th>Age</th><th>Details</th></tr>
  <tr>
    <td>John</td><td>30</td>
    <td>
      <table border="1">
        <tr><td>Country</td><td>USA</td></tr>
        <tr><td>City</td><td>New York</td></tr>
      </table>
    </td>
  </tr>
</table>
"""
display(HTML(html_table3))

pdfkit.from_string(html_table3, 'Table 3: Nested Table.pdf')



Name,Age,Details
John,30,CountryUSA  CityNew York

0,1
Country,USA
City,New York


True

## Table 4: Table with Rowspan


In [6]:
html_table4 = """
<table border="1">
  <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
  <tr><td>30</td><td>USA</td></tr>
  <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
</table>
"""
display(HTML(html_table4))

pdfkit.from_string(html_table4, 'Table 4: Table with Rowspan.pdf')



Name,Age,Country
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
30,USA,
Jane,25,Canada


True

## Table 5: Table with 1 x Empty Cell


In [7]:
html_table5 = """
<table border="1">
  <tr><th>Name</th><th>Age</th><th>Country</th></tr>
  <tr><td>John</td><td>30</td><td>USA</td></tr>
  <tr><td>Jane</td><td></td><td>Canada</td></tr>
</table>
"""
display(HTML(html_table5))

pdfkit.from_string(html_table5, 'Table 5: Table with 1 x Empty Cell.pdf')


Name,Age,Country
John,30.0,USA
Jane,,Canada


True

## Table 6: Table with Complex Rowspan and Colspan


In [8]:
html_table6 = """
<table border="1">
  <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
  <tr><td>Age</td><td>Country</td></tr>
  <tr><td>John</td><td>30</td><td>USA</td></tr>
  <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
</table>
"""
display(HTML(html_table6))

pdfkit.from_string(html_table6, 'Table 6: Table with Complex Rowspan and Colspan.pdf')



Name,Details,Details
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Age,Country,
John,30,USA
Jane,25,Canada


True

## Table 7: Large Table with Multiple Rows and Columns


In [9]:
html_table7 = """
<table border="1">
  <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
  <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
  <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
  <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
</table>
"""
display(HTML(html_table7))

pdfkit.from_string(html_table7, 'Table 7: Large Table with Multiple Rows and Columns.pdf')


Name,Age,Country,Occupation,Salary
John,30,USA,Engineer,"$100,000"
Jane,25,Canada,Doctor,"$150,000"
Tom,40,UK,Teacher,"$50,000"


True

## Table 8: Table with Mixed Content


In [10]:
html_table8 = """
<table border="1">
  <tr><th>Item</th><th>Description</th><th>Price</th></tr>
  <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
  <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
  <tr><td>Green banana</td><td>$0.6</td></tr>
</table>
"""
display(HTML(html_table8))

pdfkit.from_string(html_table8, 'Table 8: Table with Mixed Content.pdf')

Item,Description,Price
Apple,Fresh red apple,$1
Banana,Yellow banana,$0.5
Banana,Green banana,$0.6


True

## Table 9: Table with Multiple Header Rows


In [11]:
html_table9 = """
<table border="1">
  <tr><th colspan="3">Personal Information</th></tr>
  <tr><th>Name</th><th>Age</th><th>Country</th></tr>
  <tr><td>John</td><td>30</td><td>USA</td></tr>
  <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
</table>
"""
display(HTML(html_table9))

pdfkit.from_string(html_table9, 'Table 9: Table with Multiple Header Rows.pdf')



Personal Information,Personal Information,Personal Information
Name,Age,Country
John,30,USA
Jane,25,Canada


True

## Table 10: Table with Vertical Header


In [12]:
html_table10 = """
<table border="1">
  <tr><th>Name</th><td>John</td></tr>
  <tr><th>Age</th><td>30</td></tr>
  <tr><th>Country</th><td>USA</td></tr>
</table>
"""
display(HTML(html_table10))

pdfkit.from_string(html_table10, 'Table 10: Table with Vertical Header.pdf')



0,1
Name,John
Age,30
Country,USA


True

# Testing Models

## Tesseract OCR

In [None]:
!pip install pytesseract pdf2image
!apt-get install -y tesseract-ocr
!apt-get install -y poppler-utils


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 44 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.c

In [None]:
from pdf2image import convert_from_path
import pytesseract
from IPython.display import display, HTML

# List of PDF file paths
pdf_paths = [
    '/content/Table 1: Simple 3x3 Table.pdf',
    '/content/Table 2: Table with Merged Header Cells.pdf',
    '/content/Table 3: Nested Table.pdf',
    '/content/Table 4: Table with Rowspan',
    '/content/Table 5: Table with 1 x Empty Cell.pdf',
    '/content/Table 6: Table with Complex Rowspan and Colspan.pdf',
    '/content/Table 7: Large Table with Multiple Rows and Columns.pdf',
    '/content/Table 8: Table with Mixed Content.pdf',
    '/content/Table 9: Table with Multiple Header Rows.pdf',
    '/content/Table 10: Table with Vertical Header.pdf',
]

# List to store the HTML output of each PDF
model_htmls = []

# Loop through each PDF file
for pdf_path in pdf_paths:
    # Convert PDF to a list of images
    images = convert_from_path(pdf_path)

    # Initialise the HTML output for each PDF
    html_output = "<html><body>"

    # Process each page in the PDF
    for img in images:
        # Perform OCR on the image
        text = pytesseract.image_to_string(img)

        # Basic conversion of text to HTML
        lines = text.splitlines()
        html_output += "<table border='1'>"

        for line in lines:
            if line.strip():  # Skip empty lines
                html_output += "<tr>"
                cells = line.split()  # Split line into words, treating each as a cell
                for cell in cells:
                    html_output += f"<td>{cell}</td>"
                html_output += "</tr>"

        html_output += "</table><br>"

    # Close the HTML body and HTML tags
    html_output += "</body></html>"

    # Append the HTML output to the list, formatted as a multiline string
    model_htmls.append(f'''"""{html_output}"""''')

# Print the resulting model HTMLs
print("model_htmls = [")
for idx, html in enumerate(model_htmls):
    print(f"    {html},")
print("]")


model_htmls = [
    """<html><body><table border='1'><tr><td>Name</td><td>|Age</td><td>|Country</td></tr><tr><td>John|30</td><td>|USA</td></tr><tr><td>Jane/25</td><td>|Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td></tr><tr><td>Details</td></tr><tr><td>John</td></tr><tr><td>Age:</td></tr><tr><td>30</td></tr><tr><td>Country:</td><td>USA</td></tr><tr><td>Country:</td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td><td>|Age</td><td>Details</td></tr><tr><td>Country</td><td>|USA</td></tr><tr><td>John|30</td><td>:</td></tr><tr><td>City</td><td>New</td><td>York</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Age</td><td>|Country</td></tr><tr><td>30</td><td>|USA</td></tr><tr><td>Name</td></tr><tr><td>Jane/25</td><td>|Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td><td>|Age</td><td>|Country</td></

## Llama Parse

In [None]:
!pip install llama-parse


Collecting llama-parse
  Downloading llama_parse-0.4.9-py3-none-any.whl.metadata (4.4 kB)
Collecting llama-index-core>=0.10.29 (from llama-parse)
  Downloading llama_index_core-0.11.0.post1-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json (from llama-index-core>=0.10.29->llama-parse)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core>=0.10.29->llama-parse)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core>=0.10.29->llama-parse)
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Collecting httpx (from llama-index-core>=0.10.29->llama-parse)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting nltk>3.8.1 (from llama-index-core>=0.10.29->llama-parse)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.2.0 (from llama-index-core>=0.10.29->llama-par

In [None]:
# Import libraries
import os
import nest_asyncio
from llama_parse import LlamaParse

# Apply nest_asyncio to handle the running event loop issue
nest_asyncio.apply()

# Set the LlamaParse API key
os.environ["LLAMA_CLOUD_API_KEY"] = "llx-NpvdXIPyfgkbk3pnaAYJav3j8k97vOH7tGvSeHN5KsgOKoeN"

# List of PDF file paths
pdf_paths = [
    '/content/Table 1: Simple 3x3 Table.pdf',
    '/content/Table 2: Table with Merged Header Cells.pdf',
    '/content/Table 3: Nested Table.pdf',
    '/content/Table 4: Table with Rowspan',
    '/content/Table 5: Table with 1 x Empty Cell.pdf',
    '/content/Table 6: Table with Complex Rowspan and Colspan.pdf',
    '/content/Table 7: Large Table with Multiple Rows and Columns.pdf',
    '/content/Table 8: Table with Mixed Content.pdf',
    '/content/Table 9: Table with Multiple Header Rows.pdf',
    '/content/Table 10: Table with Vertical Header.pdf',
]

# Initialise LlamaParse
llama_parser = LlamaParse(result_type="markdown")

# List to store HTML outputs
model_htmls = []

# Function to convert model text to HTML table format
def convert_model_text_to_html(model_text):
    lines = model_text.strip().splitlines()
    html_output = "<html><body><table border='1'>"

    # Process each line
    for i, line in enumerate(lines):
        # Remove leading and trailing pipe symbols and split by '|'
        cells = line.strip('|').split('|')

        # Identify if it's a header or a row of data
        if i == 0:  # First line is header
            html_output += "<tr>"
            for cell in cells:
                html_output += f"<th>{cell.strip()}</th>"
            html_output += "</tr>"
        elif "---" in line:  # Skip the Markdown-like separator line
            continue
        else:  # Data rows
            html_output += "<tr>"
            for cell in cells:
                html_output += f"<td>{cell.strip()}</td>"
            html_output += "</tr>"

    html_output += "</table><br></body></html>"
    return html_output

# Process each PDF file
for pdf_path in pdf_paths:
    # Load the PDF file
    with open(pdf_path, 'rb') as f:
        pdf_content = f.read()

    # Extract the content from the PDF with extra_info containing the file name
    documents = llama_parser.load_data(pdf_content, extra_info={"file_name": os.path.basename(pdf_path)})

    # Initialise HTML output for the PDF
    for doc in documents:
        # Convert the extracted markdown-like text to HTML
        model_html = convert_model_text_to_html(doc.text)

        # Add the HTML output to the list, formatted as a string
        model_htmls.append(f'"""{model_html}"""')

# Print the resulting model HTMLs
print("model_htmls = [")
for idx, html in enumerate(model_htmls):
    print(f"    {html},")
print("]")

ImportError: cannot import name 'validator' from 'llama_index.core.bridge.pydantic' (/usr/local/lib/python3.10/dist-packages/llama_index/core/bridge/pydantic.py)

## Camelot

In [13]:
# Install packages
!pip install ghostscript
!pip install pdf2image
!apt-get install -y ghostscript
!pip install pypdf2==1.26.0
!pip install camelot-py[cv]

Collecting ghostscript
  Downloading ghostscript-0.7-py2.py3-none-any.whl.metadata (4.4 kB)
Downloading ghostscript-0.7-py2.py3-none-any.whl (25 kB)
Installing collected packages: ghostscript
Successfully installed ghostscript-0.7
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-droid-fallback fonts-noto-mono fonts-urw-base35 libgs9 libgs9-common libidn12 libijs-0.35
  libjbig2dec0 poppler-data
Suggested packages:
  fonts-noto fonts-freefont-otf | fonts-freefont-ttf fonts-texgyre ghostscript-x poppler-utils
  fonts-japanese-mincho | fonts-ipafont-mincho fonts-japanese-gothic | fonts-ipafont-gothic
  fonts-arphic-ukai fonts-arphic-uming fonts-nanum
The following

In [14]:
import tkinter
import json
import camelot
import ghostscript
import os

In [17]:
from pdf2image import convert_from_path
from IPython.display import display, HTML

# List of PDF file paths
pdf_paths = [
    '/content/Table 1: Simple 3x3 Table.pdf',
    '/content/Table 2: Table with Merged Header Cells.pdf',
    '/content/Table 3: Nested Table.pdf',
    '/content/Table 4: Table with Rowspan.pdf',
    '/content/Table 5: Table with 1 x Empty Cell.pdf',
    '/content/Table 6: Table with Complex Rowspan and Colspan.pdf',
    '/content/Table 7: Large Table with Multiple Rows and Columns.pdf',
    '/content/Table 8: Table with Mixed Content.pdf',
    '/content/Table 9: Table with Multiple Header Rows.pdf',
    '/content/Table 10: Table with Vertical Header.pdf',
]

# List to store the HTML output of each PDF
model_htmls = []

# Loop through each PDF file
for pdf_path in pdf_paths:
    # Convert PDF to a list of images
    try:
      # Extract tables from PDF
      tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')

      # Initialise the HTML output for each PDF
      html_output = "<html><body>"

      # Process each page in the PDF
      for table in tables:
        html_output += ",<table border='1'>"

        # Convert text to HTML
        for row in table.df.itertuples(index=False):
          html_output += "<tr>"
          for cell in row:
            html_output += f"<td>{cell}</td>"
          html_output += "</tr>"

          html_output += "</table><br>"

      # Close the HTML body and HTML tags
      html_output += "</body></html>"

      # Append the HTML output to the list, formatted as a multiline string
      model_htmls.append(html_output)
    except:
      model_htmls.append("Error")

# Print the resulting model HTMLs
print("model_htmls = [")
for idx, html in enumerate(model_htmls):
    print(f"    \"\"\"{html}\"\"\",")
print("]")

model_htmls = [
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
    """<html><body></body></html>""",
]


In [19]:
from camelot import read_pdf

# List of PDF file paths
pdf_paths = [
    '/content/Table 1: Simple 3x3 Table.pdf',
    '/content/Table 2: Table with Merged Header Cells.pdf',
    '/content/Table 3: Nested Table.pdf',
    '/content/Table 4: Table with Rowspan.pdf',
    '/content/Table 5: Table with 1 x Empty Cell.pdf',
    '/content/Table 6: Table with Complex Rowspan and Colspan.pdf',
    '/content/Table 7: Large Table with Multiple Rows and Columns.pdf',
    '/content/Table 8: Table with Mixed Content.pdf',
    '/content/Table 9: Table with Multiple Header Rows.pdf',
    '/content/Table 10: Table with Vertical Header.pdf',
]

# List to store the HTML output of each PDF
model_htmls = []

# Loop through each PDF file
for pdf_path in pdf_paths:
    # Convert PDF to a list of images
    try:
      # Extract tables from PDF
      tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')

      # Initialise the HTML output for each PDF
      html_output = "<html><body>"

      # Process each page in the PDF
      for table in tables:
        html_output += ",<table border='1'>"

        # Convert text to HTML
        for row in table.df.itertuples(index=False):
          html_output += "<tr>"
          for cell in row:
            html_output += f"<td>{cell}</td>"
          html_output += "</tr>"

          html_output += "</table><br>"

      # Close the HTML body and HTML tags
      html_output += "</body></html>"

      # Append the HTML output to the list, formatted as a multiline string
      model_htmls.append(html_output)
    except:
      model_htmls.append("Error")

# Print the resulting model HTMLs
print("model_htmls = [")
for idx, html in enumerate(model_htmls):
    print(f"    \"\"\"{html}\"\"\",")
print("]")

model_htmls = [
    """<html><body>,<table border='1'><tr><td>Name Age Country</td><td></td></tr></table><br><tr><td>John 30</td><td>USA</td></tr></table><br><tr><td>Jane 25</td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body>,<table border='1'><tr><td>Name</td><td>Details</td></tr></table><br><tr><td>John Age:	30 Country:	USA</td><td></td></tr></table><br><tr><td>Jane Age:	25 Country:	Canada</td><td></td></tr></table><br></body></html>""",
    """<html><body>,<table border='1'><tr><td>Name Age</td><td></td><td>Details</td></tr></table><br><tr><td></td><td>Country USA</td><td></td></tr></table><br><tr><td>John 30</td><td></td><td></td></tr></table><br><tr><td></td><td>City</td><td>New	York</td></tr></table><br></body></html>""",
    """<html><body>,<table border='1'><tr><td></td><td>Age Country</td></tr></table><br><tr><td>Name</td><td></td></tr></table><br><tr><td>30</td><td>USA</td></tr></table><br><tr><td>Jane 25</td><td>Canada</td></tr></table><br></body></htm

## Adobe

In [21]:
!pip install pdfservices-sdk

import logging
import os
from datetime import datetime

from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult
from adobe.pdfservices.operation.pdfjobs.jobs.export_pdf_job import ExportPDFJob
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_params import ExportPDFParams
from adobe.pdfservices.operation.pdfjobs.params.export_pdf.export_pdf_target_format import ExportPDFTargetFormat
from adobe.pdfservices.operation.pdfjobs.result.export_pdf_result import ExportPDFResult


Collecting pdfservices-sdk
  Downloading pdfservices_sdk-4.0.0-py3-none-any.whl.metadata (2.8 kB)
Collecting requests~=2.31.0 (from pdfservices-sdk)
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting setuptools~=69.5.1 (from pdfservices-sdk)
  Downloading setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting sphinx~=7.3.7 (from pdfservices-sdk)
  Downloading sphinx-7.3.7-py3-none-any.whl.metadata (6.0 kB)
Collecting sphinx-rtd-theme~=2.0.0 (from pdfservices-sdk)
  Downloading sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme~=2.0.0->pdfservices-sdk)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading pdfservices_sdk-4.0.0-py3-none-any.whl (274 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.4/274.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests-2.31.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━

In [22]:
# Initialize the logger
logging.basicConfig(level=logging.INFO)

# Testing Metrics

## Structure

In [None]:
!pip install beautifulsoup4

from bs4 import BeautifulSoup

def parse_html(html_content):
    """Parses HTML content and returns a BeautifulSoup object."""
    return BeautifulSoup(html_content, 'html.parser')

def compare_tables(model_html, ground_truth_html):
    """Compares the structure and content of tables in two HTML documents."""
    model_soup = parse_html(model_html)
    gt_soup = parse_html(ground_truth_html)

    model_tables = model_soup.find_all('table')
    gt_tables = gt_soup.find_all('table')

    total_tables = len(gt_tables)
    correct_tables = 0
    correct_rows = 0
    correct_cells = 0
    total_rows = 0
    total_cells = 0

    for i in range(min(len(model_tables), len(gt_tables))):
        model_table = model_tables[i]
        gt_table = gt_tables[i]

        model_rows = model_table.find_all('tr')
        gt_rows = gt_table.find_all('tr')

        if len(model_rows) == len(gt_rows):
            correct_tables += 1

        total_rows += len(gt_rows)

        for j in range(min(len(model_rows), len(gt_rows))):
            model_row = model_rows[j]
            gt_row = gt_rows[j]

            model_cells = model_row.find_all(['td', 'th'])
            gt_cells = gt_row.find_all(['td', 'th'])

            if len(model_cells) == len(gt_cells):
                correct_rows += 1

            total_cells += len(gt_cells)

            for k in range(min(len(model_cells), len(gt_cells))):
                if model_cells[k].get_text(strip=True) == gt_cells[k].get_text(strip=True):
                    correct_cells += 1

    # Safeguard against division by zero in case there are no tables or rows
    structure_accuracy = correct_tables / total_tables if total_tables > 0 else 0
    row_accuracy = correct_rows / total_rows if total_rows > 0 else 0
    cell_accuracy = correct_cells / total_cells if total_cells > 0 else 0

    return structure_accuracy, row_accuracy, cell_accuracy

# List of Ground Truth HTMLs
ground_truth_htmls = [
    #Table 1
    """<table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    #Table 2
    """
    <table border="1">
      <tr><th>Name</th><th colspan="2">Details</th></tr>
      <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
      <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
    </table>
    """,
    # Table 3
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Details</th></tr>
      <tr>
        <td>John</td><td>30</td>
        <td>
          <table border="1">
            <tr><td>Country</td><td>USA</td></tr>
            <tr><td>City</td><td>New York</td></tr>
          </table>
        </td>
      </tr>
    </table>
    """,
    # Table 4
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 5
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td></td><td>Canada</td></tr>
    </table>
    """,
    # Table 6
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
      <tr><td>Age</td><td>Country</td></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 7
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
      <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
    </table>
    """ ,
    # Table 8
    """
    <table border="1">
      <tr><th>Item</th><th>Description</th><th>Price</th></tr>
      <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
      <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
      <tr><td>Green banana</td><td>$0.6</td></tr>
    </table>
    """ ,
    # Table 9
    """
    <table border="1">
      <tr><th colspan="3">Personal Information</th></tr>
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """ ,
    # Table 10
    """
    <table border="1">
      <tr><th>Name</th><td>John</td></tr>
      <tr><th>Age</th><td>30</td></tr>
      <tr><th>Country</th><td>USA</td></tr>
    </table>
    """
]

# List of Model HTMLs
model_htmls = [
   # ALL YOU NEED TO DO IS ADD IN YOUR OWN HTML OUTPUTS FROM YOUR MODELS
]

# Debugging issues: Ensure that the lengths of ground_truth_htmls and model_htmls are equal
assert len(ground_truth_htmls) == len(model_htmls), "Mismatch in the number of ground truth and model HTMLs."

# Lists to store accuracy values
structure_accuracies = []
row_accuracies = []
cell_accuracies = []


# Iterate over each pair of ground truth and model output
for i in range(len(ground_truth_htmls)):
    structure_acc, row_acc, cell_acc = compare_tables(model_htmls[i], ground_truth_htmls[i])
    print(f"Table {i+1}:")
    print(f"Structure Accuracy: {structure_acc:.2f}")
    print(f"Row Accuracy: {row_acc:.2f}")
    print(f"Cell Accuracy: {cell_acc:.2f}")
    print("-" * 40)

    structure_accuracies.append(structure_acc)
    row_accuracies.append(row_acc)
    cell_accuracies.append(cell_acc)

# Calculate mean averages
if structure_accuracies:
    mean_structure_acc = sum(structure_accuracies) / len(structure_accuracies)
    mean_row_acc = sum(row_accuracies) / len(row_accuracies)
    mean_cell_acc = sum(cell_accuracies) / len(cell_accuracies)

    # Print mean averages
    print("Mean Averages:")
    print(f"Mean Structure Accuracy: {mean_structure_acc:.2f}")
    print(f"Mean Row Accuracy: {mean_row_acc:.2f}")
    print(f"Mean Cell Accuracy: {mean_cell_acc:.2f}")
else:
    print("No tables were able to be processed.")





AssertionError: Mismatch in the number of ground truth and model HTMLs.

## Tesseract OCR

In [None]:
!pip install beautifulsoup4

from bs4 import BeautifulSoup

def parse_html(html_content):
    """Parses HTML content and returns a BeautifulSoup object."""
    return BeautifulSoup(html_content, 'html.parser')

def compare_tables(model_html, ground_truth_html):
    """Compares the structure and content of tables in two HTML documents."""
    model_soup = parse_html(model_html)
    gt_soup = parse_html(ground_truth_html)

    model_tables = model_soup.find_all('table')
    gt_tables = gt_soup.find_all('table')

    total_tables = len(gt_tables)
    correct_tables = 0
    correct_rows = 0
    correct_cells = 0
    total_rows = 0
    total_cells = 0

    for i in range(min(len(model_tables), len(gt_tables))):
        model_table = model_tables[i]
        gt_table = gt_tables[i]

        model_rows = model_table.find_all('tr')
        gt_rows = gt_table.find_all('tr')

        if len(model_rows) == len(gt_rows):
            correct_tables += 1

        total_rows += len(gt_rows)

        for j in range(min(len(model_rows), len(gt_rows))):
            model_row = model_rows[j]
            gt_row = gt_rows[j]

            model_cells = model_row.find_all(['td', 'th'])
            gt_cells = gt_row.find_all(['td', 'th'])

            if len(model_cells) == len(gt_cells):
                correct_rows += 1

            total_cells += len(gt_cells)

            for k in range(min(len(model_cells), len(gt_cells))):
                if model_cells[k].get_text(strip=True) == gt_cells[k].get_text(strip=True):
                    correct_cells += 1

    # Safeguard against division by zero in case there are no tables or rows
    structure_accuracy = correct_tables / total_tables if total_tables > 0 else 0
    row_accuracy = correct_rows / total_rows if total_rows > 0 else 0
    cell_accuracy = correct_cells / total_cells if total_cells > 0 else 0

    return structure_accuracy, row_accuracy, cell_accuracy

# List of Ground Truth HTMLs
ground_truth_htmls = [
    #Table 1
    """<table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    #Table 2
    """
    <table border="1">
      <tr><th>Name</th><th colspan="2">Details</th></tr>
      <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
      <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
    </table>
    """,
    # Table 3
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Details</th></tr>
      <tr>
        <td>John</td><td>30</td>
        <td>
          <table border="1">
            <tr><td>Country</td><td>USA</td></tr>
            <tr><td>City</td><td>New York</td></tr>
          </table>
        </td>
      </tr>
    </table>
    """,
    # Table 4
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 5
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td></td><td>Canada</td></tr>
    </table>
    """,
    # Table 6
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
      <tr><td>Age</td><td>Country</td></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 7
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
      <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
    </table>
    """ ,
    # Table 8
    """
    <table border="1">
      <tr><th>Item</th><th>Description</th><th>Price</th></tr>
      <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
      <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
      <tr><td>Green banana</td><td>$0.6</td></tr>
    </table>
    """ ,
    # Table 9
    """
    <table border="1">
      <tr><th colspan="3">Personal Information</th></tr>
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """ ,
    # Table 10
    """
    <table border="1">
      <tr><th>Name</th><td>John</td></tr>
      <tr><th>Age</th><td>30</td></tr>
      <tr><th>Country</th><td>USA</td></tr>
    </table>
    """
]

# List of Model HTMLs
model_htmls = [
    """<html><body><table border='1'><tr><td>Name</td><td>|Age</td><td>|Country</td></tr><tr><td>John|30</td><td>|USA</td></tr><tr><td>Jane/25</td><td>|Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td></tr><tr><td>Details</td></tr><tr><td>John</td></tr><tr><td>Age:</td></tr><tr><td>30</td></tr><tr><td>Country:</td><td>USA</td></tr><tr><td>Country:</td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td><td>|Age</td><td>Details</td></tr><tr><td>Country</td><td>|USA</td></tr><tr><td>John|30</td><td>:</td></tr><tr><td>City</td><td>New</td><td>York</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Age</td><td>|Country</td></tr><tr><td>30</td><td>|USA</td></tr><tr><td>Name</td></tr><tr><td>Jane/25</td><td>|Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td><td>|Age</td><td>|Country</td></tr><tr><td>John|30</td><td>|USA</td></tr><tr><td>Jane</td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Details</td></tr><tr><td>Age</td><td>|Country</td></tr><tr><td>John|30</td><td>|USA</td></tr><tr><td>Name</td></tr><tr><td>Jane/25</td><td>|Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td><td>|Age</td><td>[Country</td><td>|Occupation|</td><td>Salary</td></tr><tr><td>John|30</td><td>|USA</td><td>Engineer</td><td>[$100,000</td></tr><tr><td>Jane|25</td><td>|Canada</td><td>|Doctor</td><td>$150,</td><td>000</td></tr><tr><td>Tom</td><td>|40</td><td>|UK</td><td>Teacher</td><td>$50,000</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Item</td><td>Description</td><td>(Price</td></tr><tr><td>[Apple</td><td>|Fresh</td><td>red</td><td>apple/$1</td></tr><tr><td>Yellow</td><td>banana</td><td>|$0.5</td></tr><tr><td>Banana</td></tr><tr><td>Green</td><td>banana</td><td>$0.6</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Personal</td><td>Information</td></tr><tr><td>Name</td><td>|</td><td>Age</td><td>|</td><td>Country</td></tr><tr><td>John</td><td>|30</td><td>USA</td></tr><tr><td>Jane</td><td>|25</td><td>|Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><td>Name</td><td>|John</td></tr><tr><td>Age</td><td>[30</td></tr><tr><td>Country</td></tr></table><br></body></html>""",
]

# Debugging issues: Ensure that the lengths of ground_truth_htmls and model_htmls are equal
assert len(ground_truth_htmls) == len(model_htmls), "Mismatch in the number of ground truth and model HTMLs."

# Lists to store accuracy values
structure_accuracies = []
row_accuracies = []
cell_accuracies = []


# Iterate over each pair of ground truth and model output
for i in range(len(ground_truth_htmls)):
    structure_acc, row_acc, cell_acc = compare_tables(model_htmls[i], ground_truth_htmls[i])
    print(f"Table {i+1}:")
    print(f"Structure Accuracy: {structure_acc:.2f}")
    print(f"Row Accuracy: {row_acc:.2f}")
    print(f"Cell Accuracy: {cell_acc:.2f}")
    print("-" * 40)

    structure_accuracies.append(structure_acc)
    row_accuracies.append(row_acc)
    cell_accuracies.append(cell_acc)

# Calculate mean averages
if structure_accuracies:
    mean_structure_acc = sum(structure_accuracies) / len(structure_accuracies)
    mean_row_acc = sum(row_accuracies) / len(row_accuracies)
    mean_cell_acc = sum(cell_accuracies) / len(cell_accuracies)

    # Print mean averages
    print("Mean Averages:")
    print(f"Mean Structure Accuracy: {mean_structure_acc:.2f}")
    print(f"Mean Row Accuracy: {mean_row_acc:.2f}")
    print(f"Mean Cell Accuracy: {mean_cell_acc:.2f}")
else:
    print("No tables were able to be processed.")





Table 1:
Structure Accuracy: 1.00
Row Accuracy: 0.33
Cell Accuracy: 0.11
----------------------------------------
Table 2:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.12
----------------------------------------
Table 3:
Structure Accuracy: 0.50
Row Accuracy: 0.50
Cell Accuracy: 0.21
----------------------------------------
Table 4:
Structure Accuracy: 0.00
Row Accuracy: 0.33
Cell Accuracy: 0.12
----------------------------------------
Table 5:
Structure Accuracy: 1.00
Row Accuracy: 0.33
Cell Accuracy: 0.22
----------------------------------------
Table 6:
Structure Accuracy: 0.00
Row Accuracy: 0.25
Cell Accuracy: 0.10
----------------------------------------
Table 7:
Structure Accuracy: 1.00
Row Accuracy: 0.75
Cell Accuracy: 0.25
----------------------------------------
Table 8:
Structure Accuracy: 0.00
Row Accuracy: 0.50
Cell Accuracy: 0.18
----------------------------------------
Table 9:
Structure Accuracy: 1.00
Row Accuracy: 0.50
Cell Accuracy: 0.40
---------------

## LlamaParse

In [None]:
!pip install beautifulsoup4

from bs4 import BeautifulSoup

def parse_html(html_content):
    """Parses HTML content and returns a BeautifulSoup object."""
    return BeautifulSoup(html_content, 'html.parser')

def compare_tables(model_html, ground_truth_html):
    """Compares the structure and content of tables in two HTML documents."""
    model_soup = parse_html(model_html)
    gt_soup = parse_html(ground_truth_html)

    model_tables = model_soup.find_all('table')
    gt_tables = gt_soup.find_all('table')

    total_tables = len(gt_tables)
    correct_tables = 0
    correct_rows = 0
    correct_cells = 0
    total_rows = 0
    total_cells = 0

    for i in range(min(len(model_tables), len(gt_tables))):
        model_table = model_tables[i]
        gt_table = gt_tables[i]

        model_rows = model_table.find_all('tr')
        gt_rows = gt_table.find_all('tr')

        if len(model_rows) == len(gt_rows):
            correct_tables += 1

        total_rows += len(gt_rows)

        for j in range(min(len(model_rows), len(gt_rows))):
            model_row = model_rows[j]
            gt_row = gt_rows[j]

            model_cells = model_row.find_all(['td', 'th'])
            gt_cells = gt_row.find_all(['td', 'th'])

            if len(model_cells) == len(gt_cells):
                correct_rows += 1

            total_cells += len(gt_cells)

            for k in range(min(len(model_cells), len(gt_cells))):
                if model_cells[k].get_text(strip=True) == gt_cells[k].get_text(strip=True):
                    correct_cells += 1

    # Safeguard against division by zero in case there are no tables or rows
    structure_accuracy = correct_tables / total_tables if total_tables > 0 else 0
    row_accuracy = correct_rows / total_rows if total_rows > 0 else 0
    cell_accuracy = correct_cells / total_cells if total_cells > 0 else 0

    return structure_accuracy, row_accuracy, cell_accuracy

# List of Ground Truth HTMLs
ground_truth_htmls = [
    #Table 1
    """<table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    #Table 2
    """
    <table border="1">
      <tr><th>Name</th><th colspan="2">Details</th></tr>
      <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
      <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
    </table>
    """,
    # Table 3
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Details</th></tr>
      <tr>
        <td>John</td><td>30</td>
        <td>
          <table border="1">
            <tr><td>Country</td><td>USA</td></tr>
            <tr><td>City</td><td>New York</td></tr>
          </table>
        </td>
      </tr>
    </table>
    """,
    # Table 4
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 5
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td></td><td>Canada</td></tr>
    </table>
    """,
    # Table 6
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
      <tr><td>Age</td><td>Country</td></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 7
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
      <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
    </table>
    """ ,
    # Table 8
    """
    <table border="1">
      <tr><th>Item</th><th>Description</th><th>Price</th></tr>
      <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
      <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
      <tr><td>Green banana</td><td>$0.6</td></tr>
    </table>
    """ ,
    # Table 9
    """
    <table border="1">
      <tr><th colspan="3">Personal Information</th></tr>
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """ ,
    # Table 10
    """
    <table border="1">
      <tr><th>Name</th><td>John</td></tr>
      <tr><th>Age</th><td>30</td></tr>
      <tr><th>Country</th><td>USA</td></tr>
    </table>
    """
]

# List of Model HTMLs
model_htmls = [
    """<html><body><table border='1'><tr><th>Name</th><th>Age</th><th>Country</th></tr><tr><td>John</td><td>30</td><td>USA</td></tr><tr><td>Jane</td><td>25</td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>Details</th></tr><tr><td>John</td><td>Age: 30 Country: USA</td></tr><tr><td>Jane</td><td>Age: 25 Country: Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>Age</th><th>Details</th></tr><tr><td>John</td><td>30</td><td>Country: USA City: New York</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>Age</th><th>Country</th></tr><tr><td>John</td><td>30</td><td>USA</td></tr><tr><td>Jane</td><td></td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>Age</th><th>Country</th></tr><tr><td>John</td><td>30</td><td>USA</td></tr><tr><td>Jane</td><td></td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>Age</th><th>Country</th><th>Details</th></tr><tr><td>John</td><td>30</td><td>USA</td><td></td></tr><tr><td>Jane</td><td>25</td><td>Canada</td><td></td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr><tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr><tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr><tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Item</th><th>Description</th><th>Price</th></tr><tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr><tr><td>Banana</td><td>Green banana Yellow banana</td><td>$0.5 $0.6</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>Age</th><th>Country</th></tr><tr><td>John</td><td>30</td><td>USA</td></tr><tr><td>Jane</td><td>25</td><td>Canada</td></tr></table><br></body></html>""",
    """<html><body><table border='1'><tr><th>Name</th><th>John</th></tr><tr><td>Age</td><td>30</td></tr><tr><td>Country</td><td>USA</td></tr></table><br></body></html>""",
]

# Debugging issues: Ensure that the lengths of ground_truth_htmls and model_htmls are equal
assert len(ground_truth_htmls) == len(model_htmls), "Mismatch in the number of ground truth and model HTMLs."

# Lists to store accuracy values
structure_accuracies = []
row_accuracies = []
cell_accuracies = []


# Iterate over each pair of ground truth and model output
for i in range(len(ground_truth_htmls)):
    structure_acc, row_acc, cell_acc = compare_tables(model_htmls[i], ground_truth_htmls[i])
    print(f"Table {i+1}:")
    print(f"Structure Accuracy: {structure_acc:.2f}")
    print(f"Row Accuracy: {row_acc:.2f}")
    print(f"Cell Accuracy: {cell_acc:.2f}")
    print("-" * 40)

    structure_accuracies.append(structure_acc)
    row_accuracies.append(row_acc)
    cell_accuracies.append(cell_acc)

# Calculate mean averages
if structure_accuracies:
    mean_structure_acc = sum(structure_accuracies) / len(structure_accuracies)
    mean_row_acc = sum(row_accuracies) / len(row_accuracies)
    mean_cell_acc = sum(cell_accuracies) / len(cell_accuracies)

    # Print mean averages
    print("Mean Averages:")
    print(f"Mean Structure Accuracy: {mean_structure_acc:.2f}")
    print(f"Mean Row Accuracy: {mean_row_acc:.2f}")
    print(f"Mean Cell Accuracy: {mean_cell_acc:.2f}")
else:
    print("No tables were able to be processed.")



Table 1:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 2:
Structure Accuracy: 1.00
Row Accuracy: 0.33
Cell Accuracy: 0.50
----------------------------------------
Table 3:
Structure Accuracy: 0.00
Row Accuracy: 0.25
Cell Accuracy: 0.50
----------------------------------------
Table 4:
Structure Accuracy: 1.00
Row Accuracy: 0.67
Cell Accuracy: 0.62
----------------------------------------
Table 5:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 6:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.14
----------------------------------------
Table 7:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 8:
Structure Accuracy: 0.00
Row Accuracy: 0.75
Cell Accuracy: 0.78
----------------------------------------
Table 9:
Structure Accuracy: 0.00
Row Accuracy: 0.50
Cell Accuracy: 0.00
---------------

## Google Notebook LM

In [None]:
!pip install beautifulsoup4

from bs4 import BeautifulSoup

def parse_html(html_content):
    """Parses HTML content and returns a BeautifulSoup object."""
    return BeautifulSoup(html_content, 'html.parser')

def compare_tables(model_html, ground_truth_html):
    """Compares the structure and content of tables in two HTML documents."""
    model_soup = parse_html(model_html)
    gt_soup = parse_html(ground_truth_html)

    model_tables = model_soup.find_all('table')
    gt_tables = gt_soup.find_all('table')

    total_tables = len(gt_tables)
    correct_tables = 0
    correct_rows = 0
    correct_cells = 0
    total_rows = 0
    total_cells = 0

    for i in range(min(len(model_tables), len(gt_tables))):
        model_table = model_tables[i]
        gt_table = gt_tables[i]

        model_rows = model_table.find_all('tr')
        gt_rows = gt_table.find_all('tr')

        if len(model_rows) == len(gt_rows):
            correct_tables += 1

        total_rows += len(gt_rows)

        for j in range(min(len(model_rows), len(gt_rows))):
            model_row = model_rows[j]
            gt_row = gt_rows[j]

            model_cells = model_row.find_all(['td', 'th'])
            gt_cells = gt_row.find_all(['td', 'th'])

            if len(model_cells) == len(gt_cells):
                correct_rows += 1

            total_cells += len(gt_cells)

            for k in range(min(len(model_cells), len(gt_cells))):
                if model_cells[k].get_text(strip=True) == gt_cells[k].get_text(strip=True):
                    correct_cells += 1

    # Safeguard against division by zero in case there are no tables or rows
    structure_accuracy = correct_tables / total_tables if total_tables > 0 else 0
    row_accuracy = correct_rows / total_rows if total_rows > 0 else 0
    cell_accuracy = correct_cells / total_cells if total_cells > 0 else 0

    return structure_accuracy, row_accuracy, cell_accuracy

# List of Ground Truth HTMLs
ground_truth_htmls = [
    #Table 1
    """<table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    #Table 2
    """
    <table border="1">
      <tr><th>Name</th><th colspan="2">Details</th></tr>
      <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
      <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
    </table>
    """,
    # Table 3
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Details</th></tr>
      <tr>
        <td>John</td><td>30</td>
        <td>
          <table border="1">
            <tr><td>Country</td><td>USA</td></tr>
            <tr><td>City</td><td>New York</td></tr>
          </table>
        </td>
      </tr>
    </table>
    """,
    # Table 4
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 5
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td></td><td>Canada</td></tr>
    </table>
    """,
    # Table 6
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
      <tr><td>Age</td><td>Country</td></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 7
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
      <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
    </table>
    """ ,
    # Table 8
    """
    <table border="1">
      <tr><th>Item</th><th>Description</th><th>Price</th></tr>
      <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
      <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
      <tr><td>Green banana</td><td>$0.6</td></tr>
    </table>
    """ ,
    # Table 9
    """
    <table border="1">
      <tr><th colspan="3">Personal Information</th></tr>
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """ ,
    # Table 10
    """
    <table border="1">
      <tr><th>Name</th><td>John</td></tr>
      <tr><th>Age</th><td>30</td></tr>
      <tr><th>Country</th><td>USA</td></tr>
    </table>
    """
]

# List of Model HTMLs
model_htmls = [
    # Table 1
    """<table><tr><th>Name</th><th>Age</th><th>Country</th></tr><tr><td>John</td><td>25</td><td>Canada</td></tr><tr><td>Jane</td><td>25</td><td>Canada</td></tr></table>""",
    # Table 2
    """<table>
  <tr>
    <th>Name</th>
    <th colspan="2">Details</th>
  </tr>
  <tr>
    <th></th>
    <th>Age:</th>
    <th>Country:</th>
  </tr>
  <tr>
    <td>Jane</td>
    <td>25</td>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 3
    """<table>
  <tr>
    <th>Name</th>
    <td>John</td>
  </tr>
  <tr>
    <th>Age</th>
    <td>30</td>
  </tr>
  <tr>
    <th>Country</th>
    <td>USA</td>
  </tr>
  <tr>
    <th>City</th>
    <td>New York</td>
  </tr>
</table>""",
    # Table 4
    """<table>
  <tr>
    <th>Name</th>
    <td>Jane</td>
  </tr>
  <tr>
    <th>Age</th>
    <td>25</td>
  </tr>
  <tr>
    <th>Country</th>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 5
    """<table>
  <tr>
    <th>Name</th>
    <td>Jane</td>
  </tr>
  <tr>
    <th>Country</th>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 6
    """<table>
  <tr>
    <td>John</td> <td>30</td> <td>USA</td>
  </tr>
  <tr>
    <td>Jane</td> <td>25</td> <td>Canada</td>
  </tr>
</table>""",
    # Table 7
    """<table>
  <tr>
    <td>Jane</td> <td>25</td> <td>Canada</td> <td>Doctor</td> <td>$150,000</td>
  </tr>
  <tr>
    <td>Tom</td> <td>40</td> <td>UK</td> <td>Teacher</td> <td>$50,000</td>
  </tr>
</table>""",
    # Table 8
    """<table>
  <tr>
    <th>Banana Yellow</th>
    <td>banana $0.5</td>
  </tr>
  <tr>
    <th>Green</th>
    <td>banana $0.6</td>
  </tr>
</table>""",
    # Table 9
    """<table>
  <tr>
    <th>Name</th> <td>John</td>
    <th>Age</th> <td>30</td>
    <th>Country</th> <td>USA</td>
  </tr>
  <tr>
    <th>Name</th> <td>Jane</td>
    <th>Age</th> <td>25</td>
    <th>Country</th> <td>Canada</td>
  </tr>
</table>""",
    # Table 10
    """<table>
  <tr>
    <th>Country</th>
    <td>USA</td>
  </tr>
</table>"""
]

# Debugging issues: Ensure that the lengths of ground_truth_htmls and model_htmls are equal
assert len(ground_truth_htmls) == len(model_htmls), "Mismatch in the number of ground truth and model HTMLs."

# Lists to store accuracy values
structure_accuracies = []
row_accuracies = []
cell_accuracies = []


# Iterate over each pair of ground truth and model output
for i in range(len(ground_truth_htmls)):
    structure_acc, row_acc, cell_acc = compare_tables(model_htmls[i], ground_truth_htmls[i])
    print(f"Table {i+1}:")
    print(f"Structure Accuracy: {structure_acc:.2f}")
    print(f"Row Accuracy: {row_acc:.2f}")
    print(f"Cell Accuracy: {cell_acc:.2f}")
    print("-" * 40)

    structure_accuracies.append(structure_acc)
    row_accuracies.append(row_acc)
    cell_accuracies.append(cell_acc)

# Calculate mean averages
if structure_accuracies:
    mean_structure_acc = sum(structure_accuracies) / len(structure_accuracies)
    mean_row_acc = sum(row_accuracies) / len(row_accuracies)
    mean_cell_acc = sum(cell_accuracies) / len(cell_accuracies)

    # Print mean averages
    print("Mean Averages:")
    print(f"Mean Structure Accuracy: {mean_structure_acc:.2f}")
    print(f"Mean Row Accuracy: {mean_row_acc:.2f}")
    print(f"Mean Cell Accuracy: {mean_cell_acc:.2f}")
else:
    print("No tables were able to be processed.")


Table 1:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 0.78
----------------------------------------
Table 2:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 0.38
----------------------------------------
Table 3:
Structure Accuracy: 0.50
Row Accuracy: 0.50
Cell Accuracy: 0.43
----------------------------------------
Table 4:
Structure Accuracy: 1.00
Row Accuracy: 0.33
Cell Accuracy: 0.12
----------------------------------------
Table 5:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.17
----------------------------------------
Table 6:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.00
----------------------------------------
Table 7:
Structure Accuracy: 0.00
Row Accuracy: 0.50
Cell Accuracy: 0.00
----------------------------------------
Table 8:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.00
----------------------------------------
Table 9:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.25
---------------

## ChatGPT

In [None]:
from bs4 import BeautifulSoup

def parse_html(html_content):
    """Parses HTML content and returns a BeautifulSoup object."""
    return BeautifulSoup(html_content, 'html.parser')

def compare_tables(model_html, ground_truth_html):
    """Compares the structure and content of tables in two HTML documents."""
    model_soup = parse_html(model_html)
    gt_soup = parse_html(ground_truth_html)

    model_tables = model_soup.find_all('table')
    gt_tables = gt_soup.find_all('table')

    total_tables = len(gt_tables)
    correct_tables = 0
    correct_rows = 0
    correct_cells = 0
    total_rows = 0
    total_cells = 0

    for i in range(min(len(model_tables), len(gt_tables))):
        model_table = model_tables[i]
        gt_table = gt_tables[i]

        model_rows = model_table.find_all('tr')
        gt_rows = gt_table.find_all('tr')

        if len(model_rows) == len(gt_rows):
            correct_tables += 1

        total_rows += len(gt_rows)

        for j in range(min(len(model_rows), len(gt_rows))):
            model_row = model_rows[j]
            gt_row = gt_rows[j]

            model_cells = model_row.find_all(['td', 'th'])
            gt_cells = gt_row.find_all(['td', 'th'])

            if len(model_cells) == len(gt_cells):
                correct_rows += 1

            total_cells += len(gt_cells)

            for k in range(min(len(model_cells), len(gt_cells))):
                if model_cells[k].get_text(strip=True) == gt_cells[k].get_text(strip=True):
                    correct_cells += 1

    # Safeguard against division by zero in case there are no tables or rows
    structure_accuracy = correct_tables / total_tables if total_tables > 0 else 0
    row_accuracy = correct_rows / total_rows if total_rows > 0 else 0
    cell_accuracy = correct_cells / total_cells if total_cells > 0 else 0

    return structure_accuracy, row_accuracy, cell_accuracy

# List of Ground Truth HTMLs
ground_truth_htmls = [
    #Table 1
    """<table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    #Table 2
    """
    <table border="1">
      <tr><th>Name</th><th colspan="2">Details</th></tr>
      <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
      <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
    </table>
    """,
    # Table 3
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Details</th></tr>
      <tr>
        <td>John</td><td>30</td>
        <td>
          <table border="1">
            <tr><td>Country</td><td>USA</td></tr>
            <tr><td>City</td><td>New York</td></tr>
          </table>
        </td>
      </tr>
    </table>
    """,
    # Table 4
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 5
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td></td><td>Canada</td></tr>
    </table>
    """,
    # Table 6
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
      <tr><td>Age</td><td>Country</td></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 7
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
      <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
    </table>
    """ ,
    # Table 8
    """
    <table border="1">
      <tr><th>Item</th><th>Description</th><th>Price</th></tr>
      <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
      <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
      <tr><td>Green banana</td><td>$0.6</td></tr>
    </table>
    """ ,
    # Table 9
    """
    <table border="1">
      <tr><th colspan="3">Personal Information</th></tr>
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """ ,
    # Table 10
    """
    <table border="1">
      <tr><th>Name</th><td>John</td></tr>
      <tr><th>Age</th><td>30</td></tr>
      <tr><th>Country</th><td>USA</td></tr>
    </table>
    """
]

# List of Model HTMLs
model_htmls = [
    # Table 1
    """<table border="1">
  <tr>
    <th>Name</th>
    <th>Age</th>
    <th>Country</th>
  </tr>
  <tr>
    <td>John</td>
    <td>30</td>
    <td>USA</td>
  </tr>
  <tr>
    <td>Jane</td>
    <td>25</td>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 2
    """<table border="1">
  <tr>
    <th colspan="2">Name</th>
    <th>Details</th>
  </tr>
  <tr>
    <td rowspan="2">John</td>
    <td>Age:</td>
    <td>30</td>
  </tr>
  <tr>
    <td>Country:</td>
    <td>USA</td>
  </tr>
  <tr>
    <td rowspan="2">Jane</td>
    <td>Age:</td>
    <td>25</td>
  </tr>
  <tr>
    <td>Country:</td>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 3
    """<table border="1">
  <tr>
    <th>Name</th>
    <th>Age</th>
    <th>Details</th>
  </tr>
  <tr>
    <td>John</td>
    <td>30</td>
    <td>
      <table border="0">
        <tr>
          <td>Country:</td>
          <td>USA</td>
        </tr>
        <tr>
          <td>City:</td>
          <td>New York</td>
        </tr>
      </table>
    </td>
  </tr>
</table>""",
    # Table 4
    """<table border="1">
  <tr>
    <th>Name</th>
    <th>Age</th>
    <th>Country</th>
  </tr>
  <tr>
    <td rowspan="2">John</td>
    <td>30</td>
    <td>USA</td>
  </tr>
  <tr>
    <td>25</td>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 5
    """<table border="1">
  <tr>
    <th>Name</th>
    <th>Age</th>
    <th>Country</th>
  </tr>
  <tr>
    <td>John</td>
    <td>30</td>
    <td>USA</td>
  </tr>
  <tr>
    <td>Jane</td>
    <td></td>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 6
    """<table border="1">
  <tr>
    <th rowspan="2">Name</th>
    <th colspan="2">Details</th>
  </tr>
  <tr>
    <th>Age</th>
    <th>Country</th>
  </tr>
  <tr>
    <td>John</td>
    <td>30</td>
    <td>USA</td>
  </tr>
  <tr>
    <td>Jane</td>
    <td>25</td>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 7
    """<table border="1">
  <tr>
    <th>Name</th>
    <th>Age</th>
    <th>Country</th>
    <th>Occupation</th>
    <th>Salary</th>
  </tr>
  <tr>
    <td>John</td>
    <td>30</td>
    <td>USA</td>
    <td>Engineer</td>
    <td>$100,000</td>
  </tr>
  <tr>
    <td>Jane</td>
    <td>25</td>
    <td>Canada</td>
    <td>Doctor</td>
    <td>$150,000</td>
  </tr>
  <tr>
    <td>Tom</td>
    <td>40</td>
    <td>UK</td>
    <td>Teacher</td>
    <td>$50,000</td>
  </tr>
</table>""",
    # Table 8
    """<table border="1">
  <tr>
    <th>Item</th>
    <th>Description</th>
    <th>Price</th>
  </tr>
  <tr>
    <td>Apple</td>
    <td>Fresh red apple</td>
    <td>$1</td>
  </tr>
  <tr>
    <td rowspan="2">Banana</td>
    <td>Yellow banana</td>
    <td>$0.5</td>
  </tr>
  <tr>
    <td>Green banana</td>
    <td>$0.6</td>
  </tr>
</table>""",
    # Table 9
    """<table border="1">
  <tr>
    <th colspan="3">Personal Information</th>
  </tr>
  <tr>
    <th>Name</th>
    <th>Age</th>
    <th>Country</th>
  </tr>
  <tr>
    <td>John</td>
    <td>30</td>
    <td>USA</td>
  </tr>
  <tr>
    <td>Jane</td>
    <td>25</td>
    <td>Canada</td>
  </tr>
</table>""",
    # Table 10
    """<table border="1">
  <tr>
    <th>Name</th>
    <td>John</td>
  </tr>
  <tr>
    <th>Age</th>
    <td>30</td>
  </tr>
  <tr>
    <th>Country</th>
    <td>USA</td>
  </tr>
</table>"""
]

# Debugging issues: Ensure that the lengths of ground_truth_htmls and model_htmls are equal
assert len(ground_truth_htmls) == len(model_htmls), "Mismatch in the number of ground truth and model HTMLs."

# Lists to store accuracy values
structure_accuracies = []
row_accuracies = []
cell_accuracies = []


# Iterate over each pair of ground truth and model output
for i in range(len(ground_truth_htmls)):
    structure_acc, row_acc, cell_acc = compare_tables(model_htmls[i], ground_truth_htmls[i])
    print(f"Table {i+1}:")
    print(f"Structure Accuracy: {structure_acc:.2f}")
    print(f"Row Accuracy: {row_acc:.2f}")
    print(f"Cell Accuracy: {cell_acc:.2f}")
    print("-" * 40)

    structure_accuracies.append(structure_acc)
    row_accuracies.append(row_acc)
    cell_accuracies.append(cell_acc)

# Calculate mean averages
if structure_accuracies:
    mean_structure_acc = sum(structure_accuracies) / len(structure_accuracies)
    mean_row_acc = sum(row_accuracies) / len(row_accuracies)
    mean_cell_acc = sum(cell_accuracies) / len(cell_accuracies)

    # Print mean averages
    print("Mean Averages:")
    print(f"Mean Structure Accuracy: {mean_structure_acc:.2f}")
    print(f"Mean Row Accuracy: {mean_row_acc:.2f}")
    print(f"Mean Cell Accuracy: {mean_cell_acc:.2f}")
else:
    print("No tables were able to be processed.")


Table 1:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 2:
Structure Accuracy: 0.00
Row Accuracy: 0.67
Cell Accuracy: 0.38
----------------------------------------
Table 3:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 0.61
----------------------------------------
Table 4:
Structure Accuracy: 1.00
Row Accuracy: 0.33
Cell Accuracy: 0.38
----------------------------------------
Table 5:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 6:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 7:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 8:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
----------------------------------------
Table 9:
Structure Accuracy: 1.00
Row Accuracy: 1.00
Cell Accuracy: 1.00
---------------

## Camelot

In [20]:
from bs4 import BeautifulSoup

def parse_html(html_content):
    """Parses HTML content and returns a BeautifulSoup object."""
    return BeautifulSoup(html_content, 'html.parser')

def compare_tables(model_html, ground_truth_html):
    """Compares the structure and content of tables in two HTML documents."""
    model_soup = parse_html(model_html)
    gt_soup = parse_html(ground_truth_html)

    model_tables = model_soup.find_all('table')
    gt_tables = gt_soup.find_all('table')

    total_tables = len(gt_tables)
    correct_tables = 0
    correct_rows = 0
    correct_cells = 0
    total_rows = 0
    total_cells = 0

    for i in range(min(len(model_tables), len(gt_tables))):
        model_table = model_tables[i]
        gt_table = gt_tables[i]

        model_rows = model_table.find_all('tr')
        gt_rows = gt_table.find_all('tr')

        if len(model_rows) == len(gt_rows):
            correct_tables += 1

        total_rows += len(gt_rows)

        for j in range(min(len(model_rows), len(gt_rows))):
            model_row = model_rows[j]
            gt_row = gt_rows[j]

            model_cells = model_row.find_all(['td', 'th'])
            gt_cells = gt_row.find_all(['td', 'th'])

            if len(model_cells) == len(gt_cells):
                correct_rows += 1

            total_cells += len(gt_cells)

            for k in range(min(len(model_cells), len(gt_cells))):
                if model_cells[k].get_text(strip=True) == gt_cells[k].get_text(strip=True):
                    correct_cells += 1

    # Safeguard against division by zero in case there are no tables or rows
    structure_accuracy = correct_tables / total_tables if total_tables > 0 else 0
    row_accuracy = correct_rows / total_rows if total_rows > 0 else 0
    cell_accuracy = correct_cells / total_cells if total_cells > 0 else 0

    return structure_accuracy, row_accuracy, cell_accuracy

# List of Ground Truth HTMLs
ground_truth_htmls = [
    #Table 1
    """<table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    #Table 2
    """
    <table border="1">
      <tr><th>Name</th><th colspan="2">Details</th></tr>
      <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
      <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
    </table>
    """,
    # Table 3
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Details</th></tr>
      <tr>
        <td>John</td><td>30</td>
        <td>
          <table border="1">
            <tr><td>Country</td><td>USA</td></tr>
            <tr><td>City</td><td>New York</td></tr>
          </table>
        </td>
      </tr>
    </table>
    """,
    # Table 4
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 5
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td></td><td>Canada</td></tr>
    </table>
    """,
    # Table 6
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
      <tr><td>Age</td><td>Country</td></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 7
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
      <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
    </table>
    """ ,
    # Table 8
    """
    <table border="1">
      <tr><th>Item</th><th>Description</th><th>Price</th></tr>
      <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
      <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
      <tr><td>Green banana</td><td>$0.6</td></tr>
    </table>
    """ ,
    # Table 9
    """
    <table border="1">
      <tr><th colspan="3">Personal Information</th></tr>
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """ ,
    # Table 10
    """
    <table border="1">
      <tr><th>Name</th><td>John</td></tr>
      <tr><th>Age</th><td>30</td></tr>
      <tr><th>Country</th><td>USA</td></tr>
    </table>
    """
]

# List of Model HTMLs
model_htmls = [
    # Table 1
    """<html><body>,<table border='1'><tr><td>Name Age Country</td><td></td></tr></table><br><tr><td>John 30</td><td>USA</td></tr></table><br><tr><td>Jane 25</td><td>Canada</td></tr></table><br></body></html>""",
    # Table 2
    """<html><body>,<table border='1'><tr><td>Name</td><td>Details</td></tr></table><br><tr><td>John Age:	30 Country:	USA</td><td></td></tr></table><br><tr><td>Jane Age:	25 Country:	Canada</td><td></td></tr></table><br></body></html>""",
    # Table 3
    """<html><body>,<table border='1'><tr><td>Name Age</td><td></td><td>Details</td></tr></table><br><tr><td></td><td>Country USA</td><td></td></tr></table><br><tr><td>John 30</td><td></td><td></td></tr></table><br><tr><td></td><td>City</td><td>New	York</td></tr></table><br></body></html>""",
    # Table 4
    """<html><body>,<table border='1'><tr><td></td><td>Age Country</td></tr></table><br><tr><td>Name</td><td></td></tr></table><br><tr><td>30</td><td>USA</td></tr></table><br><tr><td>Jane 25</td><td>Canada</td></tr></table><br></body></html>""",
    # Table 5
     """<html><body>,<table border='1'><tr><td>Name Age Country</td><td></td></tr></table><br><tr><td>John 30</td><td>USA</td></tr></table><br><tr><td>Jane</td><td>Canada</td></tr></table><br></body></html>""",
    # Table 6
    """<html><body>,<table border='1'><tr><td></td><td>Details</td></tr></table><br><tr><td>Name</td><td></td></tr></table><br><tr><td></td><td>Age Country</td></tr></table><br><tr><td>John 30</td><td>USA</td></tr></table><br><tr><td>Jane 25</td><td>Canada</td></tr></table><br></body></html>""",
    # Table 7
    """<html><body>,<table border='1'><tr><td>Name Age Country Occupation</td><td></td><td></td><td>Salary</td></tr></table><br><tr><td>John 30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr></table><br><tr><td>Jane 25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr></table><br><tr><td>Tom
40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr></table><br></body></html>""",
    # Table 8
    """<html><body>,<table border='1'><tr><td>Item</td><td>Description
Price</td></tr></table><br><tr><td>Apple</td><td>Fresh	red	apple $1</td></tr></table><br><tr><td></td><td>Yellow	banana
$0.5</td></tr></table><br><tr><td>Banana</td><td></td></tr></table><br><tr><td></td><td>Green	banana
$0.6</td></tr></table><br></body></html>""",
    # Table 9
    """<html><body>,<table border='1'><tr><td></td><td>Personal	Information</td><td></td></tr></table><br><tr><td>Name</td><td>Age</td><td>Country</td></tr></table><br><tr><td>John</td><td>30</td><td>USA</td></tr></table><br><tr><td>Jane</td><td>25</td><td>Canada</td></tr></table><br></body></html>""",
    # Table 10
    """<html><body>,<table border='1'><tr><td>Name</td><td>John</td></tr></table><br><tr><td>Age</td><td>30</td></tr></table><br><tr><td>Country USA</td><td></td></tr></table><br></body></html>""",
]

# Debugging issues: Ensure that the lengths of ground_truth_htmls and model_htmls are equal
assert len(ground_truth_htmls) == len(model_htmls), "Mismatch in the number of ground truth and model HTMLs."

# Lists to store accuracy values
structure_accuracies = []
row_accuracies = []
cell_accuracies = []


# Iterate over each pair of ground truth and model output
for i in range(len(ground_truth_htmls)):
    structure_acc, row_acc, cell_acc = compare_tables(model_htmls[i], ground_truth_htmls[i])
    print(f"Table {i+1}:")
    print(f"Structure Accuracy: {structure_acc:.2f}")
    print(f"Row Accuracy: {row_acc:.2f}")
    print(f"Cell Accuracy: {cell_acc:.2f}")
    print("-" * 40)

    structure_accuracies.append(structure_acc)
    row_accuracies.append(row_acc)
    cell_accuracies.append(cell_acc)

# Calculate mean averages
if structure_accuracies:
    mean_structure_acc = sum(structure_accuracies) / len(structure_accuracies)
    mean_row_acc = sum(row_accuracies) / len(row_accuracies)
    mean_cell_acc = sum(cell_accuracies) / len(cell_accuracies)

    # Print mean averages
    print("Mean Averages:")
    print(f"Mean Structure Accuracy: {mean_structure_acc:.2f}")
    print(f"Mean Row Accuracy: {mean_row_acc:.2f}")
    print(f"Mean Cell Accuracy: {mean_cell_acc:.2f}")
else:
    print("No tables were able to be processed.")


Table 1:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.00
----------------------------------------
Table 2:
Structure Accuracy: 0.00
Row Accuracy: 0.33
Cell Accuracy: 1.00
----------------------------------------
Table 3:
Structure Accuracy: 0.00
Row Accuracy: 0.25
Cell Accuracy: 0.33
----------------------------------------
Table 4:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.00
----------------------------------------
Table 5:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.00
----------------------------------------
Table 6:
Structure Accuracy: 0.00
Row Accuracy: 0.25
Cell Accuracy: 0.50
----------------------------------------
Table 7:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.00
----------------------------------------
Table 8:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.33
----------------------------------------
Table 9:
Structure Accuracy: 0.00
Row Accuracy: 0.00
Cell Accuracy: 0.00
---------------

## Test Model

In [None]:
from bs4 import BeautifulSoup

def parse_html(html_content):
    """Parses HTML content and returns a BeautifulSoup object."""
    return BeautifulSoup(html_content, 'html.parser')

def compare_tables(model_html, ground_truth_html):
    """Compares the structure and content of tables in two HTML documents."""
    model_soup = parse_html(model_html)
    gt_soup = parse_html(ground_truth_html)

    model_tables = model_soup.find_all('table')
    gt_tables = gt_soup.find_all('table')

    total_tables = len(gt_tables)
    correct_tables = 0
    correct_rows = 0
    correct_cells = 0
    total_rows = 0
    total_cells = 0

    for i in range(min(len(model_tables), len(gt_tables))):
        model_table = model_tables[i]
        gt_table = gt_tables[i]

        model_rows = model_table.find_all('tr')
        gt_rows = gt_table.find_all('tr')

        if len(model_rows) == len(gt_rows):
            correct_tables += 1

        total_rows += len(gt_rows)

        for j in range(min(len(model_rows), len(gt_rows))):
            model_row = model_rows[j]
            gt_row = gt_rows[j]

            model_cells = model_row.find_all(['td', 'th'])
            gt_cells = gt_row.find_all(['td', 'th'])

            if len(model_cells) == len(gt_cells):
                correct_rows += 1

            total_cells += len(gt_cells)

            for k in range(min(len(model_cells), len(gt_cells))):
                if model_cells[k].get_text(strip=True) == gt_cells[k].get_text(strip=True):
                    correct_cells += 1

    # Safeguard against division by zero in case there are no tables or rows
    structure_accuracy = correct_tables / total_tables if total_tables > 0 else 0
    row_accuracy = correct_rows / total_rows if total_rows > 0 else 0
    cell_accuracy = correct_cells / total_cells if total_cells > 0 else 0

    return structure_accuracy, row_accuracy, cell_accuracy

# List of Ground Truth HTMLs
ground_truth_htmls = [
    #Table 1
    """<table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    #Table 2
    """
    <table border="1">
      <tr><th>Name</th><th colspan="2">Details</th></tr>
      <tr><td>John</td><td>Age: 30</td><td>Country: USA</td></tr>
      <tr><td>Jane</td><td>Age: 25</td><td>Country: Canada</td></tr>
    </table>
    """,
    # Table 3
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Details</th></tr>
      <tr>
        <td>John</td><td>30</td>
        <td>
          <table border="1">
            <tr><td>Country</td><td>USA</td></tr>
            <tr><td>City</td><td>New York</td></tr>
          </table>
        </td>
      </tr>
    </table>
    """,
    # Table 4
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 5
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td></td><td>Canada</td></tr>
    </table>
    """,
    # Table 6
    """
    <table border="1">
      <tr><th rowspan="2">Name</th><th colspan="2">Details</th></tr>
      <tr><td>Age</td><td>Country</td></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """,
    # Table 7
    """
    <table border="1">
      <tr><th>Name</th><th>Age</th><th>Country</th><th>Occupation</th><th>Salary</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td><td>Engineer</td><td>$100,000</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td><td>Doctor</td><td>$150,000</td></tr>
      <tr><td>Tom</td><td>40</td><td>UK</td><td>Teacher</td><td>$50,000</td></tr>
    </table>
    """ ,
    # Table 8
    """
    <table border="1">
      <tr><th>Item</th><th>Description</th><th>Price</th></tr>
      <tr><td>Apple</td><td>Fresh red apple</td><td>$1</td></tr>
      <tr><td rowspan="2">Banana</td><td>Yellow banana</td><td>$0.5</td></tr>
      <tr><td>Green banana</td><td>$0.6</td></tr>
    </table>
    """ ,
    # Table 9
    """
    <table border="1">
      <tr><th colspan="3">Personal Information</th></tr>
      <tr><th>Name</th><th>Age</th><th>Country</th></tr>
      <tr><td>John</td><td>30</td><td>USA</td></tr>
      <tr><td>Jane</td><td>25</td><td>Canada</td></tr>
    </table>
    """ ,
    # Table 10
    """
    <table border="1">
      <tr><th>Name</th><td>John</td></tr>
      <tr><th>Age</th><td>30</td></tr>
      <tr><th>Country</th><td>USA</td></tr>
    </table>
    """
]

# List of Model HTMLs
model_htmls = [
    # Table 1
    """""",
    # Table 2
    """""",
    # Table 3
    """""",
    # Table 4
    """""",
    # Table 5
    """""",
    # Table 6
    """""",
    # Table 7
    """""",
    # Table 8
    """""",
    # Table 9
    """""",
    # Table 10
    """"""
]

# Debugging issues: Ensure that the lengths of ground_truth_htmls and model_htmls are equal
assert len(ground_truth_htmls) == len(model_htmls), "Mismatch in the number of ground truth and model HTMLs."

# Lists to store accuracy values
structure_accuracies = []
row_accuracies = []
cell_accuracies = []


# Iterate over each pair of ground truth and model output
for i in range(len(ground_truth_htmls)):
    structure_acc, row_acc, cell_acc = compare_tables(model_htmls[i], ground_truth_htmls[i])
    print(f"Table {i+1}:")
    print(f"Structure Accuracy: {structure_acc:.2f}")
    print(f"Row Accuracy: {row_acc:.2f}")
    print(f"Cell Accuracy: {cell_acc:.2f}")
    print("-" * 40)

    structure_accuracies.append(structure_acc)
    row_accuracies.append(row_acc)
    cell_accuracies.append(cell_acc)

# Calculate mean averages
if structure_accuracies:
    mean_structure_acc = sum(structure_accuracies) / len(structure_accuracies)
    mean_row_acc = sum(row_accuracies) / len(row_accuracies)
    mean_cell_acc = sum(cell_accuracies) / len(cell_accuracies)

    # Print mean averages
    print("Mean Averages:")
    print(f"Mean Structure Accuracy: {mean_structure_acc:.2f}")
    print(f"Mean Row Accuracy: {mean_row_acc:.2f}")
    print(f"Mean Cell Accuracy: {mean_cell_acc:.2f}")
else:
    print("No tables were able to be processed.")
