In [1]:
import fitz
import pandas as pd
from typing import List, Dict
import re

def extract_tables_with_keyword(pdf_path: str, keyword: str, pages: List[int] = None) -> List[pd.DataFrame]:
    """
    Extract tables from a PDF that contain a specific keyword.
    
    Parameters:
    pdf_path (str): Path to the PDF file
    keyword (str): Keyword to search for in tables
    pages (List[int], optional): List of specific pages to search. If None, searches all pages.
    
    Returns:
    List[pd.DataFrame]: List of pandas DataFrames containing the matching tables
    """
    # Open the PDF
    doc = fitz.open(pdf_path)
    matching_tables = []
    
    # Determine pages to process
    if pages is None:
        pages = range(len(doc))
    
    for page_num in pages:
        page = doc[page_num]
        
        # Extract tables from the page
        tables = page.find_tables()
        
        if tables.tables:
            for table in tables:
                # Convert table to pandas DataFrame
                df = pd.DataFrame(table.extract())
                
                # Check if keyword exists in any cell of the table
                table_text = df.astype(str).values.tolist()
                found_keyword = any(
                    keyword.lower() in str(cell).lower()
                    for row in table_text
                    for cell in row
                )
                
                if found_keyword:
                    # Add page number information to help identify the table's source
                    df.insert(0, 'Page_Number', page_num + 1)
                    matching_tables.append(df)
    
    doc.close()
    return matching_tables

def process_extracted_tables(tables: List[pd.DataFrame], output_format: str = 'csv') -> Dict:
    """
    Process the extracted tables and save them in the specified format.
    
    Parameters:
    tables (List[pd.DataFrame]): List of tables extracted from the PDF
    output_format (str): Format to save tables in ('csv' or 'excel')
    
    Returns:
    Dict: Dictionary containing processing results and statistics
    """
    results = {
        'total_tables_found': len(tables),
        'tables_info': []
    }
    
    for i, table in enumerate(tables, 1):
        table_info = {
            'table_number': i,
            'page_number': table['Page_Number'].iloc[0],
            'rows': len(table),
            'columns': len(table.columns)
        }
        results['tables_info'].append(table_info)
        
        # Save individual tables
        if output_format == 'csv':
            table.to_csv(f'extracted_table_{i}.csv', index=False)
        elif output_format == 'excel':
            table.to_excel(f'extracted_table_{i}.xlsx', index=False)
            
    return results

In [2]:
# Example usage
pdf_path = r"klebl_pdf_digitizer\data\02_Binder\FT_XX_02-001_c_F.pdf"
keyword = "Planschlüssel"

# Extract tables containing the keyword
matching_tables = extract_tables_with_keyword(pdf_path, keyword)

# Process and save the extracted tables
results = process_extracted_tables(matching_tables, output_format='csv')

# Print results
print(f"Found {results['total_tables_found']} tables containing the keyword.")
for table_info in results['tables_info']:
    print(f"Table {table_info['table_number']} found on page {table_info['page_number']}")
    print(f"Dimensions: {table_info['rows']} rows × {table_info['columns']} columns")

Found 1 tables containing the keyword.
Table 1 found on page 1
Dimensions: 4 rows × 4 columns


In [3]:
import fitz
import pandas as pd
from typing import List, Dict, Tuple
import re

def extract_tables_with_keyword(pdf_path: str, keyword: str, pages: List[int] = None) -> List[Tuple[pd.DataFrame, Dict]]:
    """
    Extract tables from a PDF that contain a specific keyword, along with their location information.
    
    Parameters:
    pdf_path (str): Path to the PDF file
    keyword (str): Keyword to search for in tables
    pages (List[int], optional): List of specific pages to search. If None, searches all pages.
    
    Returns:
    List[Tuple[pd.DataFrame, Dict]]: List of tuples containing:
        - pandas DataFrame of the table
        - Dictionary with location information
    """
    # Open the PDF
    doc = fitz.open(pdf_path)
    matching_tables = []
    
    # Determine pages to process
    if pages is None:
        pages = range(len(doc))
    
    for page_num in pages:
        page = doc[page_num]
        
        # Extract tables from the page
        tables = page.find_tables()
        
        if tables.tables:
            for table_idx, table in enumerate(tables):
                # Convert table to pandas DataFrame
                df = pd.DataFrame(table.extract())
                
                # Check if keyword exists in any cell of the table
                table_text = df.astype(str).values.tolist()
                found_keyword = False
                keyword_locations = []
                
                # Search for keyword and record its location
                for row_idx, row in enumerate(table_text):
                    for col_idx, cell in enumerate(row):
                        if keyword.lower() in str(cell).lower():
                            found_keyword = True
                            keyword_locations.append({
                                'row': row_idx,
                                'column': col_idx,
                                'cell_content': cell
                            })
                
                if found_keyword:
                    # Get table boundaries on the page
                    table_rect = table.bbox  # Gets coordinates (x0, y0, x1, y1)
                    
                    # Location information
                    location_info = {
                        'page_number': page_num + 1,
                        'table_number': table_idx + 1,
                        'keyword_locations': keyword_locations,
                        'table_coordinates': {
                            'top_left': (table_rect.x0, table_rect.y0),
                            'bottom_right': (table_rect.x1, table_rect.y1)
                        },
                        'table_dimensions': {
                            'rows': len(df),
                            'columns': len(df.columns)
                        },
                        'surrounding_text': get_surrounding_text(page, table_rect)
                    }
                    
                    matching_tables.append((df, location_info))
    
    doc.close()
    return matching_tables

def get_surrounding_text(page, table_rect, margin=50):
    """
    Extract text surrounding the table within a specified margin.
    
    Parameters:
    page: PDF page object
    table_rect: Rectangle coordinates of the table
    margin: Pixel margin around the table to search for text
    
    Returns:
    Dict: Text before and after the table
    """
    # Create rectangles for areas above and below the table
    above_rect = fitz.Rect(
        table_rect.x0,
        max(0, table_rect.y0 - margin),
        table_rect.x1,
        table_rect.y0
    )
    
    below_rect = fitz.Rect(
        table_rect.x0,
        table_rect.y1,
        table_rect.x1,
        min(page.rect.height, table_rect.y1 + margin)
    )
    
    # Extract text from these areas
    return {
        'text_above': page.get_text("text", clip=above_rect).strip(),
        'text_below': page.get_text("text", clip=below_rect).strip()
    }

def save_table_with_context(table_data: Tuple[pd.DataFrame, Dict], output_prefix: str, format: str = 'csv'):
    """
    Save a table and its context information.
    
    Parameters:
    table_data: Tuple containing DataFrame and location information
    output_prefix: Prefix for output files
    format: Output format ('csv' or 'excel')
    """
    df, info = table_data
    
    # Save table data
    if format == 'csv':
        df.to_csv(f"{output_prefix}_table.csv", index=False)
    elif format == 'excel':
        df.to_excel(f"{output_prefix}_table.xlsx", index=False)
    
    # Save context information
    context_df = pd.DataFrame([{
        'Page Number': info['page_number'],
        'Table Number': info['table_number'],
        'Rows': info['table_dimensions']['rows'],
        'Columns': info['table_dimensions']['columns'],
        'Text Above': info['surrounding_text']['text_above'],
        'Text Below': info['surrounding_text']['text_below'],
        'Keyword Locations': str(info['keyword_locations'])
    }])
    
    if format == 'csv':
        context_df.to_csv(f"{output_prefix}_context.csv", index=False)
    elif format == 'excel':
        context_df.to_excel(f"{output_prefix}_context.xlsx", index=False)

In [4]:
# Example usage
pdf_path = r"klebl_pdf_digitizer\data\02_Binder\FT_XX_02-001_c_F.pdf"
keyword = "Planschlüssel"

# Extract tables containing the keyword with their location information
matching_tables = extract_tables_with_keyword(pdf_path, keyword)

# Process each table and its context
for i, (table, info) in enumerate(matching_tables, 1):
    print(f"\nTable {i} found on page {info['page_number']}:")
    print(f"Dimensions: {info['table_dimensions']['rows']} rows × {info['table_dimensions']['columns']} columns")
    print("\nKeyword found in cells:")
    for loc in info['keyword_locations']:
        print(f"Row {loc['row']}, Column {loc['column']}: {loc['cell_content']}")
    print("\nSurrounding text:")
    print("Above:", info['surrounding_text']['text_above'])
    print("Below:", info['surrounding_text']['text_below'])
    
    # Save table and context information
    save_table_with_context((table, info), f"table_{i}", format='csv')

AttributeError: 'tuple' object has no attribute 'x0'

In [5]:
import fitz
import pandas as pd
from typing import List, Dict, Tuple
import re

def extract_tables_with_keyword(pdf_path: str, keyword: str, pages: List[int] = None) -> List[Tuple[pd.DataFrame, Dict]]:
    """
    Extract tables from a PDF that contain a specific keyword, along with their location information.
    
    Parameters:
    pdf_path (str): Path to the PDF file
    keyword (str): Keyword to search for in tables
    pages (List[int], optional): List of specific pages to search. If None, searches all pages.
    
    Returns:
    List[Tuple[pd.DataFrame, Dict]]: List of tuples containing:
        - pandas DataFrame of the table
        - Dictionary with location information
    """
    # Open the PDF
    doc = fitz.open(pdf_path)
    matching_tables = []
    
    # Determine pages to process
    if pages is None:
        pages = range(len(doc))
    
    for page_num in pages:
        page = doc[page_num]
        
        # Extract tables from the page
        tables = page.find_tables()
        
        if tables.tables:
            for table_idx, table in enumerate(tables):
                # Convert table to pandas DataFrame
                df = pd.DataFrame(table.extract())
                
                # Check if keyword exists in any cell of the table
                table_text = df.astype(str).values.tolist()
                found_keyword = False
                keyword_locations = []
                
                # Search for keyword and record its location
                for row_idx, row in enumerate(table_text):
                    for col_idx, cell in enumerate(row):
                        if keyword.lower() in str(cell).lower():
                            found_keyword = True
                            keyword_locations.append({
                                'row': row_idx,
                                'column': col_idx,
                                'cell_content': cell
                            })
                
                if found_keyword:
                    # Get table boundaries on the page - bbox returns (x0, y0, x1, y1)
                    x0, y0, x1, y1 = table.bbox
                    
                    # Location information
                    location_info = {
                        'page_number': page_num + 1,
                        'table_number': table_idx + 1,
                        'keyword_locations': keyword_locations,
                        'table_coordinates': {
                            'top_left': (x0, y0),
                            'bottom_right': (x1, y1)
                        },
                        'table_dimensions': {
                            'rows': len(df),
                            'columns': len(df.columns)
                        },
                        'surrounding_text': get_surrounding_text(page, (x0, y0, x1, y1))
                    }
                    
                    matching_tables.append((df, location_info))
    
    doc.close()
    return matching_tables

def get_surrounding_text(page, table_coords: Tuple[float, float, float, float], margin=50):
    """
    Extract text surrounding the table within a specified margin.
    
    Parameters:
    page: PDF page object
    table_coords: Tuple of (x0, y0, x1, y1) coordinates
    margin: Pixel margin around the table to search for text
    
    Returns:
    Dict: Text before and after the table
    """
    x0, y0, x1, y1 = table_coords
    
    # Create rectangles for areas above and below the table
    above_rect = fitz.Rect(
        x0,
        max(0, y0 - margin),
        x1,
        y0
    )
    
    below_rect = fitz.Rect(
        x0,
        y1,
        x1,
        min(page.rect.height, y1 + margin)
    )
    
    # Extract text from these areas
    return {
        'text_above': page.get_text("text", clip=above_rect).strip(),
        'text_below': page.get_text("text", clip=below_rect).strip()
    }

def save_table_with_context(table_data: Tuple[pd.DataFrame, Dict], output_prefix: str, format: str = 'csv'):
    """
    Save a table and its context information.
    
    Parameters:
    table_data: Tuple containing DataFrame and location information
    output_prefix: Prefix for output files
    format: Output format ('csv' or 'excel')
    """
    df, info = table_data
    
    # Save table data
    if format == 'csv':
        df.to_csv(f"{output_prefix}_table.csv", index=False)
    elif format == 'excel':
        df.to_excel(f"{output_prefix}_table.xlsx", index=False)
    
    # Save context information
    context_df = pd.DataFrame([{
        'Page Number': info['page_number'],
        'Table Number': info['table_number'],
        'Rows': info['table_dimensions']['rows'],
        'Columns': info['table_dimensions']['columns'],
        'Text Above': info['surrounding_text']['text_above'],
        'Text Below': info['surrounding_text']['text_below'],
        'Keyword Locations': str(info['keyword_locations'])
    }])
    
    if format == 'csv':
        context_df.to_csv(f"{output_prefix}_context.csv", index=False)
    elif format == 'excel':
        context_df.to_excel(f"{output_prefix}_context.xlsx", index=False)

In [6]:
pdf_path = r"klebl_pdf_digitizer\data\02_Binder\FT_XX_02-001_c_F.pdf"
keyword = "Planschlüssel"

# Extract tables containing the keyword with their location information
matching_tables = extract_tables_with_keyword(pdf_path, keyword)

# Process each table and its context
for i, (table, info) in enumerate(matching_tables, 1):
    print(f"\nTable {i} found on page {info['page_number']}:")
    print(f"Dimensions: {info['table_dimensions']['rows']} rows × {info['table_dimensions']['columns']} columns")
    print("\nKeyword found in cells:")
    for loc in info['keyword_locations']:
        print(f"Row {loc['row']}, Column {loc['column']}: {loc['cell_content']}")


Table 1 found on page 1:
Dimensions: 4 rows × 3 columns

Keyword found in cells:
Row 3, Column 0: Planschlüssel


In [7]:
import fitz
import pandas as pd
from typing import List, Dict, Tuple
import re

def extract_tables_with_keyword(pdf_path: str, keyword: str, pages: List[int] = None) -> List[Tuple[pd.DataFrame, Dict]]:
    """
    Extract tables from a PDF that contain a specific keyword, along with their location information.
    
    Parameters:
    pdf_path (str): Path to the PDF file
    keyword (str): Keyword to search for in tables
    pages (List[int], optional): List of specific pages to search. If None, searches all pages.
    
    Returns:
    List[Tuple[pd.DataFrame, Dict]]: List of tuples containing:
        - pandas DataFrame of the table
        - Dictionary with location information
    """
    # Open the PDF
    doc = fitz.open(pdf_path)
    matching_tables = []
    
    # Determine pages to process
    if pages is None:
        pages = range(len(doc))
    
    for page_num in pages:
        page = doc[page_num]
        
        # Extract tables from the page
        tables = page.find_tables()
        
        if tables.tables:
            for table_idx, table in enumerate(tables):
                # Convert table to pandas DataFrame
                df = pd.DataFrame(table.extract())
                
                # Check if keyword exists in any cell of the table
                table_text = df.astype(str).values.tolist()
                found_keyword = False
                keyword_locations = []
                
                # Search for keyword and record its location
                for row_idx, row in enumerate(table_text):
                    for col_idx, cell in enumerate(row):
                        if keyword.lower() in str(cell).lower():
                            found_keyword = True
                            keyword_locations.append({
                                'row': row_idx,
                                'column': col_idx,
                                'cell_content': cell
                            })
                
                if found_keyword:
                    # Get table boundaries
                    x0, y0, x1, y1 = table.bbox
                    
                    # Location information
                    location_info = {
                        'page_number': page_num + 1,
                        'table_number': table_idx + 1,
                        'keyword_locations': keyword_locations,
                        'table_coordinates': {
                            'top_left': (x0, y0),
                            'bottom_right': (x1, y1)
                        },
                        'table_dimensions': {
                            'rows': len(df),
                            'columns': len(df.columns)
                        }
                    }
                    
                    matching_tables.append((df, location_info))
    
    doc.close()
    return matching_tables

def display_full_table(table_data: Tuple[pd.DataFrame, Dict]):
    """
    Display the full table content along with its location information.
    
    Parameters:
    table_data: Tuple containing DataFrame and location information
    """
    df, info = table_data
    
    print(f"\nTable found on page {info['page_number']} (Table #{info['table_number']}):")
    print(f"Dimensions: {info['table_dimensions']['rows']} rows × {info['table_dimensions']['columns']} columns")
    print("\nKeyword found in positions:")
    for loc in info['keyword_locations']:
        print(f"Row {loc['row'] + 1}, Column {loc['column'] + 1}")
    
    print("\nFull Table Content:")
    # Set display options to show all rows and columns
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    print(df)
    print("\n" + "="*50 + "\n")  # Separator between tables

In [8]:
pdf_path = r"klebl_pdf_digitizer\data\02_Binder\FT_XX_02-001_c_F.pdf"
keyword = "Planschlüssel"

# Extract tables containing the keyword
matching_tables = extract_tables_with_keyword(pdf_path, keyword)

# Display full content of each matching table
for table_data in matching_tables:
    display_full_table(table_data)
    
# Optionally, save specific tables to CSV or Excel
for i, (df, info) in enumerate(matching_tables, 1):
    df.to_csv(f"table_{i}.csv", index=False)


Table found on page 1 (Table #9):
Dimensions: 4 rows × 3 columns

Keyword found in positions:
Row 4, Column 1

Full Table Content:
                                                   0  \
0  Datum 16.12.2019 B a u v o r h a b e n / B a u...   
1      Gez. Jan. S\nNeub\nstat.Pos B01\nMaßstab 1:25   
2                                               None   
3                                      Planschlüssel   

                                                   1  \
0                                               None   
1  and- und Baustoffwerke Neumarkt GmbH & Co.KG\n...   
2                                               None   
3                                   FT_XX_02-001_c_F   

                           2  
0                       None  
1                     819-19  
2  Plan. Nr / Index / Status  
3                       None  




In [10]:
import fitz
import pandas as pd
from typing import List, Dict, Tuple
import re

def clean_table_data(df):
    """
    Clean and format table data for better readability.
    """
    # Replace NaN with empty string
    df = df.fillna('')
    
    # Clean whitespace and normalize spaces
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str).apply(lambda x: ' '.join(x.split()))
    
    return df

def extract_tables_with_keyword(pdf_path: str, keyword: str, pages: List[int] = None) -> List[Tuple[pd.DataFrame, Dict]]:
    """
    Extract tables from a PDF that contain a specific keyword.
    """
    doc = fitz.open(pdf_path)
    matching_tables = []
    
    # Determine pages to process
    if pages is None:
        pages = range(len(doc))
    
    for page_num in pages:
        page = doc[page_num]
        tables = page.find_tables()
        
        if tables.tables:
            for table_idx, table in enumerate(tables):
                # Convert table to pandas DataFrame and clean it
                df = clean_table_data(pd.DataFrame(table.extract()))
                
                # Check if keyword exists in any cell
                table_text = df.astype(str).values.tolist()
                found_keyword = False
                keyword_locations = []
                
                for row_idx, row in enumerate(table_text):
                    for col_idx, cell in enumerate(row):
                        if keyword.lower() in str(cell).lower():
                            found_keyword = True
                            keyword_locations.append({
                                'row': row_idx,
                                'column': col_idx,
                                'cell_content': cell
                            })
                
                if found_keyword:
                    matching_tables.append((df, {
                        'page_number': page_num + 1,
                        'table_number': table_idx + 1,
                        'keyword_locations': keyword_locations
                    }))
    
    doc.close()
    return matching_tables

def format_table_as_text(df: pd.DataFrame, info: Dict) -> str:
    """
    Format table data as readable text.
    """
    output = []
    
    # Header
    output.append(f"=== Table found on Page {info['page_number']} ===\n")
    
    # Format each row of the table
    for row_idx, row in df.iterrows():
        row_content = []
        for col_idx, value in enumerate(row):
            # Check if this cell contains the keyword
            is_keyword_cell = any(
                loc['row'] == row_idx and loc['column'] == col_idx 
                for loc in info['keyword_locations']
            )
            
            # Format the cell value
            cell_text = str(value).strip()
            if is_keyword_cell:
                cell_text = f"*{cell_text}*"  # Mark keyword cells with asterisks
            
            if cell_text:  # Only add non-empty cells
                row_content.append(cell_text)
        
        if row_content:  # Only add non-empty rows
            output.append(" | ".join(row_content))
    
    output.append("\n" + "="*50 + "\n")
    return "\n".join(output)

def display_tables(matching_tables: List[Tuple[pd.DataFrame, Dict]]):
    """
    Display all matching tables in a clean, readable format.
    """
    for df, info in matching_tables:
        formatted_text = format_table_as_text(df, info)
        print(formatted_text)

In [11]:
pdf_path = r"klebl_pdf_digitizer\data\02_Binder\FT_XX_02-001_c_F.pdf"
keyword = "Planschlüssel"

# Extract and display tables
matching_tables = extract_tables_with_keyword(pdf_path, keyword)
display_tables(matching_tables)

=== Table found on Page 1 ===

Datum 16.12.2019 B a u v o r h a b e n / B a u t e i l Auftr. Nr
Gez. Jan. S Neub stat.Pos B01 Maßstab 1:25 | and- und Baustoffwerke Neumarkt GmbH & Co.KG au einer Ausstellungshalle, und Containerhalle mit Büro Dachbinder Pos. 02-001 | 819-19
Plan. Nr / Index / Status
*Planschlüssel* | FT_XX_02-001_c_F




In [9]:
import fitz
import pandas as pd
import cv2
import numpy as np
from typing import List, Dict, Tuple
import os

def extract_table_with_image(pdf_path: str, keyword: str, output_dir: str = "table_images", pages: List[int] = None):
    """
    Extract tables containing keyword and save their images using coordinates.
    
    Parameters:
    pdf_path (str): Path to the PDF file
    keyword (str): Keyword to search for in tables
    output_dir (str): Directory to save the cropped table images
    pages (List[int], optional): List of specific pages to search. If None, searches all pages.
    
    Returns:
    List[Dict]: List of dictionaries containing table information and image paths
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Open the PDF
    doc = fitz.open(pdf_path)
    tables_info = []
    
    # Determine pages to process
    if pages is None:
        pages = range(len(doc))
    
    for page_num in pages:
        page = doc[page_num]
        
        # Get page dimensions
        page_width = page.rect.width
        page_height = page.rect.height
        
        # Convert PDF page to image
        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
        
        # Extract tables from the page
        tables = page.find_tables()
        
        if tables.tables:
            for table_idx, table in enumerate(tables):
                # Convert table to DataFrame
                df = pd.DataFrame(table.extract())
                
                # Check if keyword exists in any cell
                table_text = df.astype(str).values.tolist()
                found_keyword = False
                keyword_locations = []
                
                for row_idx, row in enumerate(table_text):
                    for col_idx, cell in enumerate(row):
                        if keyword.lower() in str(cell).lower():
                            found_keyword = True
                            keyword_locations.append({
                                'row': row_idx,
                                'column': col_idx,
                                'cell_content': cell
                            })
                
                if found_keyword:
                    # Get table coordinates
                    x0, y0, x1, y1 = table.bbox
                    
                    # Scale coordinates to match image resolution
                    scale = 300/72  # Scale factor for 300 DPI
                    x0, y0, x1, y1 = [int(coord * scale) for coord in [x0, y0, x1, y1]]
                    
                    # Crop the image
                    table_image = img[y0:y1, x0:x1]
                    
                    # Save the cropped image
                    image_path = os.path.join(output_dir, f"table_page{page_num+1}_idx{table_idx}.png")
                    cv2.imwrite(image_path, cv2.cvtColor(table_image, cv2.COLOR_RGB2BGR))
                    
                    # Store table information
                    table_info = {
                        'page_number': page_num + 1,
                        'table_number': table_idx + 1,
                        'keyword_locations': keyword_locations,
                        'coordinates': {
                            'x0': x0,
                            'y0': y0,
                            'x1': x1,
                            'y1': y1,
                            'width': x1 - x0,
                            'height': y1 - y0
                        },
                        'original_pdf_size': {
                            'width': page_width,
                            'height': page_height
                        },
                        'image_path': image_path,
                        'dataframe': df
                    }
                    tables_info.append(table_info)
    
    doc.close()
    return tables_info

def display_table_info(table_info: Dict):
    """
    Display information about the extracted table.
    """
    print(f"\n=== Table found on Page {table_info['page_number']} ===")
    print("\nCoordinates (in pixels at 300 DPI):")
    print(f"Top-left: ({table_info['coordinates']['x0']}, {table_info['coordinates']['y0']})")
    print(f"Bottom-right: ({table_info['coordinates']['x1']}, {table_info['coordinates']['y1']})")
    print(f"Width: {table_info['coordinates']['width']} pixels")
    print(f"Height: {table_info['coordinates']['height']} pixels")
    print(f"\nImage saved to: {table_info['image_path']}")
    print("\nKeyword found in:")
    for loc in table_info['keyword_locations']:
        print(f"Row {loc['row'] + 1}, Column {loc['column'] + 1}: {loc['cell_content']}")
    print("\nTable content:")
    print(table_info['dataframe'])
    print("\n" + "="*50)

In [10]:
# Example usage
pdf_path = r"klebl_pdf_digitizer\data\02_Binder\FT_XX_02-001_c_F.pdf"
keyword = "Planschlüssel"

# Extract tables containing the keyword
matching_tables = extract_tables_with_keyword(pdf_path, keyword)

# Display full content of each matching table
for table_data in matching_tables:
    display_full_table(table_data)
    
# Optionally, save specific tables to CSV or Excel
for i, (df, info) in enumerate(matching_tables, 1):
    df.to_csv(f"table_{i}.csv", index=False)

# Extract tables and save images
tables_info = extract_table_with_image(pdf_path, keyword)

# Display information for each table
for table_info in tables_info:
    display_table_info(table_info)

NameError: name 'extract_tables_with_keyword' is not defined

In [11]:
import fitz
import cv2
import numpy as np
from typing import List, Dict, Tuple
import os

def get_table_images(pdf_path: str, keyword: str) -> List[Dict]:
    """
    Extract tables containing keyword and convert them to PNG images.
    
    Parameters:
    pdf_path (str): Path to the PDF file
    keyword (str): Keyword to search for in tables
    
    Returns:
    List[Dict]: List of dictionaries containing table images and their information
    """
    doc = fitz.open(pdf_path)
    table_images = []
    
    for page_num in range(len(doc)):
        page = doc[page_num]
        
        # Convert PDF page to image at 300 DPI
        zoom = 300/72  # 300 DPI resolution
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
        
        # Find tables in the page
        tables = page.find_tables()
        
        if tables.tables:
            for table_idx, table in enumerate(tables):
                # Convert table to DataFrame to search for keyword
                df = pd.DataFrame(table.extract())
                
                # Search for keyword
                if df.astype(str).apply(lambda x: x.str.contains(keyword, case=False)).any().any():
                    # Get table coordinates
                    x0, y0, x1, y1 = table.bbox
                    
                    # Scale coordinates to match image resolution
                    x0, y0, x1, y1 = [int(coord * zoom) for coord in [x0, y0, x1, y1]]
                    
                    # Add small padding around the table (optional)
                    padding = 10
                    x0 = max(0, x0 - padding)
                    y0 = max(0, y0 - padding)
                    x1 = min(img.shape[1], x1 + padding)
                    y1 = min(img.shape[0], y1 + padding)
                    
                    # Crop the table image
                    table_img = img[y0:y1, x0:x1]
                    
                    # Convert RGB to BGR for OpenCV
                    table_img_bgr = cv2.cvtColor(table_img, cv2.COLOR_RGB2BGR)
                    
                    table_info = {
                        'page_number': page_num + 1,
                        'table_number': table_idx + 1,
                        'coordinates': {
                            'x0': x0, 'y0': y0,
                            'x1': x1, 'y1': y1,
                            'width': x1 - x0,
                            'height': y1 - y0
                        },
                        'image': table_img_bgr  # The actual image data
                    }
                    
                    table_images.append(table_info)
    
    doc.close()
    return table_images

def save_table_images(table_images: List[Dict], output_dir: str = "table_images"):
    """
    Save the extracted table images to files.
    
    Parameters:
    table_images: List of dictionaries containing table images and info
    output_dir: Directory to save the images
    
    Returns:
    List[str]: List of saved image paths
    """
    os.makedirs(output_dir, exist_ok=True)
    saved_paths = []
    
    for table in table_images:
        # Generate filename
        filename = f"table_page{table['page_number']}_num{table['table_number']}.png"
        filepath = os.path.join(output_dir, filename)
        
        # Save image
        cv2.imwrite(filepath, table['image'])
        saved_paths.append(filepath)
        
        print(f"Saved table image from page {table['page_number']} to: {filepath}")
        print(f"Table dimensions: {table['coordinates']['width']}x{table['coordinates']['height']} pixels")
    
    return saved_paths

In [13]:
# Example usage
pdf_path = r"klebl_pdf_digitizer\data\02_Binder\FT_XX_02-001_c_F.pdf"
keyword = "Breite"

# Get table images
table_images = get_table_images(pdf_path, keyword)

# Save the images
saved_files = save_table_images(table_images)

# If you want to display an image (if you're in a Jupyter notebook)
import matplotlib.pyplot as plt
plt.imshow(cv2.cvtColor(table_images[0]['image'], cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

# If you want to do additional processing on a specific table image
for table in table_images:
    img = table['image']
    # Example: Apply some image processing
    # enhanced_img = cv2.enhance(img)  # placeholder for any CV operation
    print(f"Found table on page {table['page_number']}")
    print(f"Coordinates: {table['coordinates']}")

IndexError: list index out of range

In [14]:
!pip install matplotlib

