Resources

---



https://huggingface.co/transformers/v4.3.3/_modules/transformers/training_args.html#:~:text=[docs]%20@dataclass%20class%20TrainingArguments:%20%22%22%22%20TrainingArguments%20is%20the%20subset%20of

https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Using_Table_Transformer_for_table_detection_and_table_structure_recognition.ipynb

https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html?highlight=cuda#cuda-tensors

https://github.com/microsoft/table-transformer/blob/main/docs/INFERENCE.md

https://camelot-py.readthedocs.io/en/master/

https://github.com/tesseract-ocr/tesseract

https://www.kaggle.com/datasets/sreesankar711/pubtables-subset-100k

https://github.com/microsoft/table-transformer

In [51]:
# Install packages
# Python dependencies for PDF handling (camelot-py, PyPDF2, pdf2image) and for the
# Table Transformer model (transformers, torch, timm). Only PyPDF2 is version-pinned.
!pip install camelot-py[cv] ghostscript PyPDF2==1.26.0 pdf2image transformers torch timm
# System binaries. poppler-utils backs pdf2image's convert_from_path, which this
# notebook calls with poppler_path="/usr/bin".
# NOTE(review): ghostscript is presumably needed by Camelot — confirm.
!apt-get install -y ghostscript poppler-utils
# NOTE(review): camelot-py[cv] was already installed by the first command; this call
# only forces an upgrade. The recorded output shows pip backtracking across several
# camelot-py versions here, so consider pinning a single version instead.
!pip install --upgrade camelot-py[cv]
# pytesseract is the Python wrapper used for the OCR fallback; the tesseract-ocr
# apt package supplies the OCR engine it calls.
!pip install pytesseract
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ghostscript is already the newest version (9.55.0~dfsg1-0ubuntu5.9).
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Collecting camelot-py[cv]
  Using cached camelot_py-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting pypdf>=3.0.0 (from camelot-py[cv])
  Using cached pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
INFO: pip is looking at multiple versions of camelot-py[cv] to determine which version is compatible with other requirements. This could take a while.
Collecting camelot-py[cv]
  Using cached camelot_py-0.10.1-py3-none-any.whl.metadata (8.3 kB)
  Using cached camelot_py-0.10.0-py3-none-any.whl.metadata (8.3 kB)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed

In [60]:
# Import libraries
import os
import json
import camelot
import PyPDF2
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import pandas as pd
import torch
from transformers import TableTransformerForObjectDetection, AutoImageProcessor
import cv2
import numpy as np
from google.colab import files

In [61]:
# Upload PDF file
uploaded = files.upload()

# A cancelled or empty upload yields no entries; fail with a clear message instead
# of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded; please select a PDF to process.")

# Only the first uploaded file is processed.
pdf_file = next(iter(uploaded))

# Write the uploaded bytes to the current working directory under the original
# file name so the path-based PDF tools below can open it.
with open(pdf_file, 'wb') as f:
    f.write(uploaded[pdf_file])

Saving 3_merged_row_header_table.pdf to 3_merged_row_header_table.pdf


In [62]:
# Load the pre-trained Table Transformer detector together with its image processor.
# Any failure is reported and both globals are reset to None, which detect_tables()
# treats as "detection unavailable".
try:
    checkpoint = "microsoft/table-transformer-detection"
    model = TableTransformerForObjectDetection.from_pretrained(checkpoint)
    image_processor = AutoImageProcessor.from_pretrained(checkpoint)

    # Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    model.to(device)

    print("Table Transformer model loaded successfully.")
except Exception as load_error:
    print(f"Error loading the Table Transformer model: {load_error}")
    model, image_processor = None, None

Some weights of the model checkpoint at microsoft/table-transformer-detection were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [63]:
def detect_tables(image, threshold=0.7):
    """Detect table regions on a page image with the Table Transformer model.

    Relies on the module-level ``model``, ``image_processor`` and ``device``
    set up by the model-loading cell.

    Args:
        image: Page image to analyse (a PIL image in this notebook; its
            ``size`` attribute is read to rescale the predicted boxes).
        threshold: Minimum detection score for a box to be kept. Defaults to
            0.7, the value previously hard-coded here.

    Returns:
        A list of ``[x1, y1, x2, y2]`` boxes in pixel coordinates of ``image``,
        or an empty list when the model or image processor failed to load.
    """
    if model is None or image_processor is None:
        return []

    inputs = image_processor(images=image, return_tensors="pt").to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # image.size is reversed before being passed as the target size so the
    # boxes are scaled back to the original page dimensions.
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=target_sizes
    )[0]
    return results['boxes'].tolist()

In [64]:
def extract_table_content(image, bbox):
    """Crop one detected table out of a page image and extract its contents.

    Camelot is tried first on the saved crop (stream flavor, then lattice). If
    it yields nothing or raises, the crop is OCR'd with Tesseract and every
    non-blank line is split on whitespace into one row.

    Args:
        image: Page image exposing ``size``, ``crop`` and ``save`` (a PIL image
            in this notebook).
        bbox: ``(x1, y1, x2, y2)`` pixel coordinates of the table on ``image``.

    Returns:
        A ``pandas.DataFrame`` holding the extracted cells, or ``None`` when
        the box is empty after clamping or neither extractor produced rows.
    """
    # Clamp the box to the page so the crop never includes out-of-bounds padding.
    width, height = image.size
    x1, y1, x2, y2 = [int(coord) for coord in bbox]
    x1, x2 = max(0, x1), min(width, x2)
    y1, y2 = max(0, y1), min(height, y2)
    if x2 <= x1 or y2 <= y1:
        print(f"Skipping empty table region: {bbox}")
        return None
    cropped_image = image.crop((x1, y1, x2, y2))

    # Try Camelot first
    # NOTE(review): the recorded run shows Camelot rejecting the PNG crop with
    # "File format not supported", so in practice this always falls through to
    # OCR. Camelot would need the source PDF page, not an image.
    temp_image_path = 'temp_table.png'
    cropped_image.save(temp_image_path)

    try:
        tables = camelot.read_pdf(temp_image_path, flavor='stream')
        if tables.n > 0:
            extracted_table = tables[0].df
        else:
            tables = camelot.read_pdf(temp_image_path, flavor='lattice')
            extracted_table = tables[0].df if tables.n > 0 else None

        if extracted_table is not None:
            print("Table extracted using Camelot.")
            return extracted_table
    except Exception as e:
        print(f"Camelot extraction failed: {e}")
    finally:
        # Always remove the scratch image, including on the early return above.
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)

    # If Camelot fails, use OCR
    try:
        text = pytesseract.image_to_string(cropped_image)
        # Each whitespace-separated token becomes its own column.
        table_data = [line.split() for line in text.split('\n') if line.strip()]
        if not table_data:
            # Do not report an empty frame as a successfully extracted table.
            print("OCR found no text in the table region.")
            return None
        extracted_table = pd.DataFrame(table_data)
        print("Table extracted using OCR.")
        return extracted_table
    except Exception as e:
        print(f"OCR extraction failed: {e}")

    return None

In [65]:
def extract_pdf_table(pdf_file):
    """Render each page of a PDF to an image and collect every table found.

    Pages are rasterised with ``convert_from_path``; each page is passed to
    ``detect_tables`` and every returned box to ``extract_table_content``.

    Args:
        pdf_file: Path to the PDF to process.

    Returns:
        A list with one entry per successfully extracted table, in page order.
        The list is empty when the PDF cannot be converted or nothing is found.
    """
    collected = []

    # Rasterise the PDF; a conversion failure is reported and ends the run early.
    try:
        page_images = convert_from_path(pdf_file, poppler_path="/usr/bin")
        print(f"Successfully converted PDF to {len(page_images)} images.")
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return collected

    for page_number, page_image in enumerate(page_images, start=1):
        print(f"Processing page {page_number}")

        # Locate table regions on this page with the Table Transformer.
        boxes = detect_tables(page_image)
        if not boxes:
            print(f"No tables detected on page {page_number}.")
            continue

        print(f"Detected {len(boxes)} table(s) on page {page_number}.")
        for table_number, box in enumerate(boxes, start=1):
            content = extract_table_content(page_image, box)
            if content is None:
                continue
            collected.append(content)
            print(f"Extracted content from table {table_number} on page {page_number}.")

    return collected

In [66]:
def table_to_json(table):
    """Turn an extracted table into a JSON-serialisable list of row records.

    Args:
        table: The extracted table; expected to be a ``pandas.DataFrame``.

    Returns:
        One dict per row, keyed by column label. Any input that is not a
        DataFrame is reported and yields an empty list.
    """
    # Guard clause: reject anything that is not a DataFrame.
    if not isinstance(table, pd.DataFrame):
        print("Invalid table format for JSON conversion.")
        return []

    row_records = table.to_dict(orient='records')
    return row_records

In [67]:
# Run the full detection + extraction pipeline on the uploaded PDF
tables = extract_pdf_table(pdf_file)

# Serialise every extracted table as a list of row records
json_tables = list(map(table_to_json, tables))

# Persist the records to disk
with open('tables.json', 'w') as json_file:
    json.dump(json_tables, json_file, indent=2)

print(f"Extracted {len(json_tables)} tables and saved as JSON.")

# Show the first table as a sample when anything was extracted
if not json_tables:
    print("No tables were extracted.")
else:
    print("\nFirst extracted table:")
    print(json.dumps(json_tables[0], indent=2))

# Remove the uploaded PDF from the working directory once processing is done
if os.path.exists(pdf_file):
    os.remove(pdf_file)

Successfully converted PDF to 1 images.
Processing page 1
Detected 1 table(s) on page 1.
Camelot extraction failed: File format not supported
Table extracted using OCR.
Extracted content from table 1 on page 1.
Extracted 1 tables and saved as JSON.

First extracted table:
[
  {
    "0": "Header",
    "1": "1",
    "2": "|",
    "3": "Header",
    "4": "2",
    "5": "|",
    "6": "Header",
    "7": "3",
    "8": "|",
    "9": "Header",
    "10": "4",
    "11": null,
    "12": null,
    "13": null,
    "14": null,
    "15": null,
    "16": null,
    "17": null
  },
  {
    "0": "Row",
    "1": "Header",
    "2": "2",
    "3": "|",
    "4": "Row",
    "5": "2b,",
    "6": "Col",
    "7": "2",
    "8": "|",
    "9": "Row",
    "10": "2b,",
    "11": "Col",
    "12": "3",
    "13": "|",
    "14": "Row",
    "15": "2b,",
    "16": "Col",
    "17": "4"
  },
  {
    "0": "Row",
    "1": "Header",
    "2": "1",
    "3": null,
    "4": null,
    "5": null,
    "6": null,
    "7": null,
    "8": 