<a href="https://www.kaggle.com/code/shravankumar147/table-extraction-from-pdf?scriptVersionId=219098507" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [17]:
!pip install reportlab pandas lorem -q

In [13]:
!pip install --upgrade pymupdf -q

In [2]:
import pandas as pd

# Sample data
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [24, 27, 22, 32, 29],
    'Occupation': ['Engineer', 'Doctor', 'Artist', 'Lawyer', 'Teacher']
}

# Create a DataFrame
df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,ID,Name,Age,Occupation
0,1,Alice,24,Engineer
1,2,Bob,27,Doctor
2,3,Charlie,22,Artist
3,4,David,32,Lawyer
4,5,Eva,29,Teacher


In [4]:
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph

# Create a PDF document
pdf_filename = "test_data_set.pdf"
doc = SimpleDocTemplate(pdf_filename, pagesize=letter)

# Create a list to hold the elements of the PDF
elements = []

# Add a title to the PDF
styles = getSampleStyleSheet()
title = Paragraph("Test Data Set for RAG Project", styles['Title'])
elements.append(title)

# Convert DataFrame to a list of lists for the table
table_data = [df.columns.to_list()] + df.values.tolist()

# Create the table
table = Table(table_data)

# Add style to the table
style = TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
    ('GRID', (0, 0), (-1, -1), 1, colors.black),
])

table.setStyle(style)

# Add the table to the elements list
elements.append(table)

# Build the PDF
doc.build(elements)

print(f"PDF created successfully: {pdf_filename}")

PDF created successfully: test_data_set.pdf


In [10]:
import fitz

In [14]:
import fitz  # PyMuPDF
import pandas as pd

def extract_tables_with_context(pdf_path, context_window=300):
    """Extract tables with surrounding context from PDF using find_tables()"""
    doc = fitz.open(pdf_path)
    table_data = []
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        
        # Find tables on the page
        tables = page.find_tables()
        
        if tables.tables:  # Check if any tables were found
            for table_num, table in enumerate(tables.tables):
                try:
                    # Extract table as DataFrame
                    df = table.to_pandas()
                    
                    # Get bounding box of the table
                    table_bbox = table.bbox
                    
                    # If table_bbox is a tuple, convert it to a fitz.Rect object
                    if isinstance(table_bbox, tuple):
                        table_bbox = fitz.Rect(table_bbox)
                    
                    # Get surrounding context
                    y0 = max(0, table_bbox.y0 - context_window)
                    context = page.get_text("text", clip=(0, y0, page.rect.width, table_bbox.y0))
                    
                    table_data.append({
                        "page": page_num + 1,
                        "table_num": table_num + 1,
                        "context": context.strip(),
                        "table": df.to_markdown(index=False),
                        "full_text": f"Context: {context}\nTable:\n{df.to_markdown(index=False)}"
                    })
                except Exception as e:
                    print(f"Error on page {page_num} table {table_num}: {str(e)[:50]}")
        else:
            print(f"No tables found on page {page_num + 1}")
    
    return table_data

In [12]:
PDF_PATH = "/kaggle/working/test_data_set.pdf"
table_data = extract_tables_with_context(pdf_path=PDF_PATH, context_window=300)

Error on page 0 table 0: 'tuple' object has no attribute 'y0'


In [15]:
pdf_path = "test_data_set.pdf"  # Replace with your PDF path
tables_with_context = extract_tables_with_context(pdf_path)

# Print the extracted data
for table in tables_with_context:
    print(f"Page: {table['page']}, Table: {table['table_num']}")
    print(table['full_text'])
    print("-" * 50)

Page: 1, Table: 1
Context: Test Data Set for RAG Project

Table:
|   ID | Name    |   Age | Occupation   |
|-----:|:--------|------:|:-------------|
|    1 | Alice   |    24 | Engineer     |
|    2 | Bob     |    27 | Doctor       |
|    3 | Charlie |    22 | Artist       |
|    4 | David   |    32 | Lawyer       |
|    5 | Eva     |    29 | Teacher      |
--------------------------------------------------
