In [None]:
import os

In [None]:
# Mount Google Drive so its contents are reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install transformers easyocr pdf2image
!apt-get install poppler-utils

In [None]:
from transformers import AutoModelForObjectDetection, TableTransformerForObjectDetection
from torchvision import transforms
from PIL import Image, ImageDraw
from pdf2image import convert_from_path
import torch
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Patch
import numpy as np
import easyocr
from tqdm.auto import tqdm
import csv
import pandas as pd
from tabulate import tabulate
import io
import json

import torch.nn.functional as F
from torchvision.transforms import functional as TF
from torchvision import transforms
from collections import defaultdict

In [None]:
# Location of the JSON configuration file; fill this in before running the cell.
config_path = ''

with open(config_path, 'r') as f:
    config = json.load(f)

# The config must provide these four entries: the input PDF, the combined CSV
# output, and the folders for page images and table files.
pdf_path, csv_path, images_path, tables_path = (
    config[key] for key in ("pdf_path", "csv_path", "images_path", "tables_path")
)

In [None]:
# Create the output folders up front; exist_ok avoids an error when they already exist.
for output_dir in (images_path, tables_path):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint used to find tables on a page image.
detection_model = AutoModelForObjectDetection.from_pretrained(
    "microsoft/table-transformer-detection",
    revision="no_timm",
)
# Checkpoint used on cropped tables; its detections are read as rows/columns later on.
structure_model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-structure-recognition-v1.1-all"
)

detection_model.to(device)
structure_model.to(device)

In [None]:
class MaxResize(object):
    """Callable transform that rescales an image so its longest side becomes ``max_size``.

    Both dimensions are multiplied by the same factor, so the aspect ratio is
    kept (up to rounding to whole pixels). Smaller images are scaled up.
    """

    def __init__(self, max_size=800):
        self.max_size = max_size

    def __call__(self, image):
        """Return ``image.resize(...)`` with ``max(width, height)`` mapped to ``max_size``."""
        longest_side = max(image.size)
        factor = self.max_size / longest_side
        new_size = tuple(int(round(factor * side)) for side in image.size)
        return image.resize(new_size)

def _build_transform(max_size):
    """Compose: resize longest side to ``max_size`` -> tensor -> per-channel normalisation.

    The mean/std constants are the commonly used ImageNet statistics.
    """
    return transforms.Compose([
        MaxResize(max_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])


# Preprocessing for the detection model (longest side 800 px).
detection_transform = _build_transform(800)

# Preprocessing for the structure model (longest side 1000 px).
structure_transform = _build_transform(1000)

In [None]:
def pdf_to_images(pdf_path, output_folder):
    """Render every page of a PDF to a JPEG file and return the written paths.

    Pages are saved as ``image_<n>.jpg`` inside ``output_folder``, numbered from
    0 in the order ``convert_from_path`` yields them.
    """
    saved_paths = []
    for page_number, page in enumerate(convert_from_path(pdf_path)):
        target = os.path.join(output_folder, f'image_{page_number}.jpg')
        page.save(target, 'JPEG')
        saved_paths.append(target)
    return saved_paths

# Rasterise the configured PDF into the images folder; keep the page image paths.
image_paths = pdf_to_images(pdf_path, images_path)

def load_image(image_path):
    """Open an image as RGB and also return it as a batched tensor.

    Returns:
        tuple: ``(img, img_tensor)`` where ``img`` is the opened RGB image and
        ``img_tensor`` is ``TF.to_tensor(img)`` with a leading batch dimension.
        No resizing or normalisation is applied.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    batched = TF.to_tensor(rgb_image).unsqueeze(0)
    return rgb_image, batched

In [None]:
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to (x_min, y_min, x_max, y_max).

    Args:
        x: tensor whose last dimension has size 4, e.g. shape ``(4,)``,
            ``(N, 4)`` or ``(B, N, 4)``.

    Returns:
        Tensor with the same shape as ``x`` holding corner coordinates.
    """
    x_c, y_c, w, h = x.unbind(-1)
    half_w, half_h = 0.5 * w, 0.5 * h
    corners = [x_c - half_w, y_c - half_h, x_c + half_w, y_c + half_h]
    # Stack on the last axis (the one unbind split) so the layout is restored
    # for any number of leading dimensions, not only for 2-D (N, 4) input.
    return torch.stack(corners, dim=-1)

def rescale_bboxes(out_bbox, size):
    """Convert (cx, cy, w, h) boxes to corner form and scale them by the image size.

    ``size`` is unpacked as ``(img_w, img_h)``; x coordinates are multiplied by
    the width and y coordinates by the height. Presumably ``out_bbox`` holds
    relative (0-1) coordinates - verify against the caller.
    """
    img_w, img_h = size
    scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return box_cxcywh_to_xyxy(out_bbox) * scale

# Work on a copy so the model's own config is left untouched and re-running
# this cell does not keep appending further "no object" entries.
id2label = dict(detection_model.config.id2label)
# The index one past the existing labels is mapped to "no object".
id2label[len(id2label)] = "no object"

def outputs_to_objects(outputs, img_size, id2label):
    """Turn raw model outputs for the first batch item into a list of detections.

    For every prediction the class with the highest softmax probability is
    taken; predictions whose label is ``'no object'`` are dropped.

    Returns:
        list of dicts with keys ``'label'``, ``'score'`` and ``'bbox'``, where
        ``'bbox'`` is the box produced by ``rescale_bboxes`` for ``img_size``.
    """
    best = outputs.logits.softmax(-1).max(-1)
    # Index 0 selects the first (and, in this notebook, only) item of the batch.
    labels = best.indices.detach().cpu().numpy()[0]
    scores = best.values.detach().cpu().numpy()[0]
    boxes = rescale_bboxes(outputs['pred_boxes'].detach().cpu()[0], img_size)

    detections = []
    for label_id, score, box in zip(labels, scores, boxes):
        name = id2label[int(label_id)]
        if name == 'no object':
            continue
        detections.append({
            'label': name,
            'score': float(score),
            'bbox': [float(value) for value in box.tolist()],
        })
    return detections

In [None]:
# (page image, all detected objects) for every page with at least one 'table'.
tables = []
# 1-based page number -> number of 'table' detections on that page.
tables_per_page = {}

for idx, image_path in enumerate(image_paths):
    page_number = idx + 1
    print(f"Processing image {page_number}/{len(image_paths)}: {image_path}")
    image = Image.open(image_path).convert("RGB")
    pixel_values = detection_transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = detection_model(pixel_values)

    # Boxes are scaled to the original page size, not the resized model input.
    objects = outputs_to_objects(outputs, image.size, id2label)

    print(f"Detected objects in image {page_number}: {[obj['label'] for obj in objects]}")

    table_objects = [obj for obj in objects if obj['label'] == 'table']
    if table_objects:
        tables.append((image, objects))
        # Record how many tables the page really has (not a constant 1).
        tables_per_page[page_number] = len(table_objects)

        print(f"Tables found on page {page_number}:")
        # Number the tables among themselves so other labels do not leave gaps.
        for table_idx, obj in enumerate(table_objects):
            print(f"Table {table_idx+1}: Bounding Box: {obj['bbox']}")

    else:
        print(f"No table detected in image {page_number}")

if not tables:
    raise ValueError("No tables detected in the PDF.")

In [None]:
def objects_to_crops(img, objects, class_thresholds, padding=10):
    """Crop every sufficiently confident detection out of ``img``.

    A detection is skipped when its score is below ``class_thresholds`` for its
    label. The box is grown by ``padding`` on each side and clamped to the
    image bounds before cropping.

    Returns:
        list of dicts with the cropped image under ``'image'`` and the padded
        box tuple under ``'bbox'``.
    """
    def _padded_box(box):
        left, top, right, bottom = box[0], box[1], box[2], box[3]
        return (max(0, left - padding),
                max(0, top - padding),
                min(img.width, right + padding),
                min(img.height, bottom + padding))

    confident = (
        detection for detection in objects
        if not detection['score'] < class_thresholds[detection['label']]
    )

    crops = []
    for detection in confident:
        region = _padded_box(detection['bbox'])
        crops.append({'image': img.crop(region), 'bbox': region})
    return crops

In [None]:
# Minimum score a detection needs, per label, to be cropped by objects_to_crops.
# With every threshold at 0 no detection is filtered out by score.
class_thresholds = {
    label: 0
    for label in ('table', 'table rotated', 'no object')
}

In [None]:
# Nothing to crop if detection found no tables; fail before writing any file.
if not tables:
    raise ValueError("No tables detected in the PDF.")

for idx, (image, objects) in enumerate(tables):
    print(f"Processing tables in image {idx+1}/{len(tables)}")

    table_crops = objects_to_crops(image, objects, class_thresholds, padding=10)

    for table_idx, crop_info in enumerate(table_crops):
        # Save into the configured tables folder - the same folder that is later
        # passed to process_tables - rather than a fixed /content/table path.
        crop_path = os.path.join(tables_path, f"table_{idx}_{table_idx}.jpg")
        crop_info['image'].save(crop_path)
        print(f"Segment {table_idx+1} cropped and saved.")

In [None]:
def get_cell_coordinates_by_row(table_data):
    """Build a grid of cell boxes from detected table rows and columns.

    Entries labelled ``'table row'`` are ordered top to bottom (by ``bbox[1]``)
    and entries labelled ``'table column'`` left to right (by ``bbox[0]``).
    Each cell takes its x extent from the column and its y extent from the row.

    Returns:
        list with one dict per row: ``'row'`` (the row box), ``'cells'`` (list
        of ``{'column': column box, 'cell': cell box}``) and ``'cell_count'``.
    """
    def _by_label(label, axis):
        matches = [entry for entry in table_data if entry['label'] == label]
        return sorted(matches, key=lambda entry: entry['bbox'][axis])

    ordered_rows = _by_label('table row', 1)
    ordered_columns = _by_label('table column', 0)

    grid = []
    for row in ordered_rows:
        top, bottom = row['bbox'][1], row['bbox'][3]
        cells = [
            {'column': col['bbox'], 'cell': [col['bbox'][0], top, col['bbox'][2], bottom]}
            for col in ordered_columns
        ]
        grid.append({'row': row['bbox'], 'cells': cells, 'cell_count': len(cells)})
    return grid

In [None]:
# OCR reader created once for the 'en' language list; apply_ocr uses it for every cell.
reader = easyocr.Reader(['en'])

In [None]:
def apply_ocr(cell_coordinates, cropped_table):
    """Run OCR on every cell of a table and return the recognised text row by row.

    Args:
        cell_coordinates: list of row dicts in the format returned by
            ``get_cell_coordinates_by_row``; each has a ``"cells"`` list whose
            entries carry a ``"cell"`` bounding box.
        cropped_table: image exposing ``crop(box)`` from which cells are cut.

    Returns:
        dict mapping the row index to a list of strings, one per cell in
        column order. Cells where OCR finds nothing hold ``""``.
    """
    data = dict()
    max_num_columns = 0
    for idx, row in enumerate(tqdm(cell_coordinates)):
        row_text = []
        for cell in row["cells"]:
            cell_image = np.array(cropped_table.crop(cell["cell"]))
            result = reader.readtext(cell_image)
            # Always emit one entry per cell (joining no fragments gives ""):
            # skipping empty cells would shift the rest of the row into the
            # wrong columns.
            row_text.append(" ".join(x[1] for x in result))
        max_num_columns = max(max_num_columns, len(row_text))
        data[idx] = row_text

    # Pad shorter rows so every row ends up with the same number of columns.
    for row_data in data.values():
        row_data += [""] * (max_num_columns - len(row_data))

    return data

In [None]:
def process_tables(tables_path, csv_path, structure_model, device):
    """Recognise the structure of every table image and write the OCR'd text to one CSV.

    Every ``.jpg`` file in ``tables_path`` is passed through ``structure_model``,
    the detected rows/columns are turned into cells, each cell is OCR'd, and the
    resulting rows are appended to ``csv_path``. Tables are separated by three
    empty rows.

    Args:
        tables_path: folder containing the cropped table images.
        csv_path: output CSV file; overwritten if it exists.
        structure_model: model called on the preprocessed image tensor.
        device: device the input tensor is moved to before inference.
    """
    # Build the label map once, on a copy: extending the model's own config
    # inside the loop would add a further "no object" entry per table.
    structure_id2label = dict(structure_model.config.id2label)
    structure_id2label[len(structure_id2label)] = "no object"

    # Sorted so tables are written in a reproducible order (os.listdir order is arbitrary).
    table_files = sorted(
        filename for filename in os.listdir(tables_path) if filename.lower().endswith('.jpg')
    )

    # utf-8-sig matches the encoding the CSV helpers in this notebook read with,
    # instead of depending on the platform's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8-sig') as result_file:
        wr = csv.writer(result_file, dialect='excel')

        for idx, table_image_filename in enumerate(table_files):
            table_image_path = os.path.join(tables_path, table_image_filename)
            print(f"Processing table {idx+1}/{len(table_files)}: {table_image_path}")

            img = Image.open(table_image_path).convert("RGB")
            # Apply the structure model's preprocessing (resize + normalisation)
            # rather than feeding raw, unnormalised pixel values.
            pixel_values = structure_transform(img).unsqueeze(0).to(device)

            with torch.no_grad():
                outputs = structure_model(pixel_values)

            # Boxes are scaled to the original image size, so they can be used
            # directly to crop cells out of `img`.
            cells = outputs_to_objects(outputs, img.size, structure_id2label)

            if len(cells) == 0:
                print(f"No cells detected in table {idx+1}")
                continue

            cell_coordinates = get_cell_coordinates_by_row(cells)

            data = apply_ocr(cell_coordinates, img)

            wr.writerows(data.values())

            # Three empty rows mark the end of a table in the combined CSV.
            for _ in range(3):
                wr.writerow([])

            print(f"Table {idx+1} processed successfully")

    print("All tables processed and data written to", csv_path)

# Run structure recognition + OCR over the table images and write the combined CSV.
process_tables(tables_path, csv_path, structure_model, device)

In [None]:
def read_csv_with_empty_lines(file_path):
    """Read a CSV file into a list of rows, normalising blank rows to ``[]``.

    A row counts as blank when it has no cells or when every cell is the empty
    string. The file is decoded as ``utf-8-sig``.
    """
    with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
        return [
            [] if all(cell == '' for cell in record) else record
            for record in csv.reader(csvfile)
        ]

def save_table_as_csv(table_data, table_index, output_folder):
    """Write ``table_data`` (an iterable of rows) to ``table_<table_index>.csv`` in ``output_folder``.

    The file is encoded as ``utf-8-sig`` and overwritten if it already exists.
    """
    target = f'{output_folder}/table_{table_index}.csv'
    with open(target, 'w', newline='', encoding='utf-8-sig') as csvfile:
        csv.writer(csvfile).writerows(table_data)

def split_csv_into_tables(csv_file, output_folder):
    """Split a combined CSV into one file per table.

    Blank rows act as separators: every run of consecutive non-blank rows is
    written with ``save_table_as_csv``, numbered from 1 in order of appearance.
    """
    pending_rows = []
    next_table_number = 1

    for row in read_csv_with_empty_lines(csv_file):
        if row:
            pending_rows.append(row)
            continue
        # Blank row: flush the table collected so far, if there is one.
        if pending_rows:
            save_table_as_csv(pending_rows, next_table_number, output_folder)
            next_table_number += 1
            pending_rows = []

    # The file may end without a trailing blank row.
    if pending_rows:
        save_table_as_csv(pending_rows, next_table_number, output_folder)

In [None]:
# Split the combined CSV into per-table CSV files stored in the tables folder.
split_csv_into_tables(csv_path, tables_path)

In [None]:
!zip -r /content/table.zip /content/table

In [None]:
# Send the archive created in the previous cell to the browser as a download.
from google.colab import files
files.download('/content/table.zip')