<a href="https://www.kaggle.com/code/shravankumar147/multi-page-tables-extraction-using-pymupdf?scriptVersionId=219101321" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Multi-page Tables Extraction from PDF using PyMuPDF

## Step 1: Install Required Libraries

In [1]:
!pip install reportlab pandas lorem pymupdf wget -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Building wheel for wget (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires notebook==6.5.5, but you have notebook 6.5.4 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 w

## Helper Functions

In [4]:
import fitz  # PyMuPDF
import pandas as pd

def extract_tables_with_context(pdf_path, context_window=300):
    """Extract every table PyMuPDF detects in a PDF, with the text just above it.

    Each page is scanned with ``page.find_tables()``. For every detected table
    the text located in the horizontal band of up to ``context_window`` points
    directly above the table's bounding box is captured as its context.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        context_window: Height (in page units) of the band above each table
            from which context text is read. The band is clamped at the top
            edge of the page.

    Returns:
        A list of dicts, one per successfully extracted table, with keys:
        ``page`` (1-based page number), ``table_num`` (1-based index of the
        table on its page), ``context`` (stripped text above the table),
        ``table`` (the table rendered as markdown) and ``full_text`` (context
        and markdown table combined into one string).

    Notes:
        Extraction is best-effort: a failure on one table is printed and that
        table is skipped, so the remaining tables are still returned. Pages
        without tables are reported on stdout.
    """
    table_data = []

    # Open through the context manager so the document handle is released even
    # if processing a page raises unexpectedly.
    with fitz.open(pdf_path) as doc:
        for page_index in range(doc.page_count):
            page = doc.load_page(page_index)
            page_number = page_index + 1  # 1-based, used in results and messages

            found_tables = page.find_tables().tables
            if not found_tables:
                print(f"No tables found on page {page_number}")
                continue

            for table_number, table in enumerate(found_tables, start=1):
                try:
                    df = table.to_pandas()

                    # fitz.Rect() accepts both an existing Rect and a 4-tuple,
                    # so no type check on table.bbox is needed.
                    table_bbox = fitz.Rect(table.bbox)

                    # Full-width band ending at the table's top edge.
                    y0 = max(0, table_bbox.y0 - context_window)
                    context = page.get_text(
                        "text", clip=(0, y0, page.rect.width, table_bbox.y0)
                    )

                    # Render once and reuse; to_markdown relies on the optional
                    # 'tabulate' package being installed.
                    markdown = df.to_markdown(index=False)

                    table_data.append({
                        "page": page_number,
                        "table_num": table_number,
                        "context": context.strip(),
                        "table": markdown,
                        "full_text": f"Context: {context}\nTable:\n{markdown}"
                    })
                except Exception as e:
                    # Report with the same 1-based numbering used everywhere
                    # else so the message points at the right page and table.
                    print(f"Error on page {page_number} table {table_number}: {str(e)[:50]}")

    return table_data

## Test on Real Data (PDF)

In [7]:
import requests

pdf_url = "https://morth.nic.in/sites/default/files/AR-MoRTH_Annual%20Report_2023-24_English.pdf"
pdf_filename = "AR-MoRTH_Annual_Report_2023-24_English.pdf"

# Download the file.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() prevents an HTTP error page (404, 5xx, ...) from being
# silently written to disk as if it were the PDF.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(pdf_filename, "wb") as f:
    f.write(response.content)

print(f"Downloaded: {pdf_filename}")

Downloaded: AR-MoRTH_Annual_Report_2023-24_English.pdf


In [8]:
# Display the local filename the downloaded PDF was written to.
pdf_filename

'AR-MoRTH_Annual_Report_2023-24_English.pdf'

In [9]:
# Run the extractor over the downloaded PDF. The result is a list with one dict
# per extracted table; pages without tables are reported on stdout.
pdf_path = pdf_filename  # Replace with your PDF path
tables_with_context = extract_tables_with_context(pdf_path)

No tables found on page 1
No tables found on page 2
No tables found on page 3
No tables found on page 4
No tables found on page 6
No tables found on page 7
No tables found on page 8
No tables found on page 9
No tables found on page 10
No tables found on page 11
No tables found on page 12
No tables found on page 13
No tables found on page 14
No tables found on page 15
No tables found on page 16
No tables found on page 17
No tables found on page 18
No tables found on page 19
No tables found on page 20
No tables found on page 21
No tables found on page 22
No tables found on page 31
No tables found on page 32
No tables found on page 33
No tables found on page 34
No tables found on page 35
No tables found on page 37
No tables found on page 38
No tables found on page 39
No tables found on page 40
No tables found on page 41
No tables found on page 42
No tables found on page 43
No tables found on page 44
No tables found on page 45
No tables found on page 47
No tables found on page 48
No tables

In [13]:
# Number of tables that were extracted across all pages of the PDF.
print(f"Total Number of Tables Found: {len(tables_with_context)}")

Total Number of Tables Found: 47


In [14]:
# Print the extracted data
# Preview the first five extracted tables: page number, position on the page,
# and the table itself rendered as markdown.
for table in tables_with_context[:5]:
    print(f"Page: {table['page']}, Table: {table['table_num']}")
    print(table['table'])
    print("*" * 80)  # visual separator between tables
    print("\n")

Page: 5, Table: 1
| Col0        | Table of Contents                                       | Col2    |
|:------------|:--------------------------------------------------------|:--------|
| Sl. No.     | CHAPTER                                                 | PAGE    |
| I Introd    | uction                                                  | 5-7     |
| II Year 2   | 023-24 at a Glance                                      | 9-16    |
| III Road    | Development                                             | 19-30   |
| IV Logist   | ics and Allied Highway Infrastructure                   | 33-40   |
| V Devel     | opment of National Highways in the North Eastern Region | 43-49   |
| VI Road     | Transport                                               | 51-70   |
| VII Road    | Safety                                                  | 73-82   |
| VIII Resear | ch and Training                                         | 85-92   |
| IX Admin    | istration and Finance                     