In [6]:
import pdfplumber
import pandas as pd
from pathlib import Path

def extract_and_display_tables(pdf_path, max_pages=None, preview_rows=5):
    """Print a preview of every table found in a PDF and return the table count.

    For each table returned by ``page.extract_tables()`` this prints the first
    ``preview_rows`` raw rows, a note with the number of remaining rows, and
    the table rendered as a pandas DataFrame (the first row is used as the
    header when the table has more than one row). If pandas cannot build the
    frame, the error is printed and processing continues with the next table.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.
        max_pages: Maximum number of leading pages to scan. ``None`` (default)
            scans every page; ``0`` or a negative value scans no pages.
        preview_rows: How many raw rows and DataFrame rows to print per table.
            Negative values are treated as ``0``. Defaults to 5.

    Returns:
        int: Number of tables found on the scanned pages.
    """
    preview_rows = max(0, preview_rows)
    table_count = 0

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        # Compare against None explicitly: a truthiness test would treat
        # max_pages=0 as "no limit" and scan the whole document.
        if max_pages is None:
            pages_to_process = total_pages
        else:
            # Clamp to [0, total_pages]; a negative slice bound would drop
            # pages from the end instead of limiting from the start.
            pages_to_process = max(0, min(max_pages, total_pages))

        for page in pdf.pages[:pages_to_process]:
            for table in page.extract_tables() or []:
                table_count += 1

                # Raw preview: the first few rows exactly as extracted.
                for row in table[:preview_rows]:
                    print(row)

                if len(table) > preview_rows:
                    print(f"... ({len(table) - preview_rows} more rows)")

                try:
                    if len(table) > 1:
                        # First row is the header, the rest is data.
                        df = pd.DataFrame(table[1:], columns=table[0])
                        print("\nAs DataFrame:")
                        print(df.head(preview_rows))
                        print(f"\nShape: {df.shape}")
                    else:
                        # Zero or one row: there is no separate header row.
                        df = pd.DataFrame(table)
                        print("\nAs DataFrame:")
                        print(df)
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    print(f"\nCould not convert to DataFrame: {e}")

                print("\n" + "=" * 80)

    print(f"\n\nTotal tables found: {table_count}")
    return table_count



In [7]:

def extract_tables_to_list(pdf_path):
    """Collect every table found in a PDF as a list of per-table records.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        list[dict]: One dict per table, in page order, with the keys
            ``'page'`` (1-based page number), ``'table_index'`` (1-based
            position of the table on its page), ``'raw_data'`` (the table as
            returned by ``page.extract_tables()``) and ``'dataframe'`` (a
            pandas DataFrame, or ``None`` if the conversion raised).
    """
    tables_data = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            for table_idx, table in enumerate(page.extract_tables(), 1):
                try:
                    if len(table) > 1:
                        # First row is the header, the rest is data.
                        df = pd.DataFrame(table[1:], columns=table[0])
                    else:
                        # Zero or one row: there is no separate header row.
                        df = pd.DataFrame(table)
                except Exception:
                    # Best effort: keep the raw table even when pandas cannot
                    # build a frame from it. Catching Exception rather than
                    # using a bare except lets KeyboardInterrupt/SystemExit
                    # propagate so the cell can still be interrupted.
                    df = None

                tables_data.append({
                    'page': page_num,
                    'table_index': table_idx,
                    'raw_data': table,
                    'dataframe': df
                })

    return tables_data




In [4]:
# Preview every table in the Astor manual; the call returns the number of tables found.
extract_and_display_tables(
    pdf_path="/Users/santhosh/Documents/study_projects/ringcentral_assessment/pdfs/astor_manual.pdf"
)

['Note']
['The content marked with “Note”\nindicateshelpfulinformationneedtobe\nnotedduringvehicleoperation.Ignoring\nsuch information may lead to wrong\noperation.']

As DataFrame:
                                                Note
0  The content marked with “Note”\nindicateshelpf...

Shape: (1, 1)

['IMPORTANT']
['The statements stated here must be\nfollowed strictly, otherwise your car could\nbe damaged.']

As DataFrame:
                                           IMPORTANT
0  The statements stated here must be\nfollowed s...

Shape: (1, 1)


As DataFrame:

Shape: (1, 1)

['Caution']
['The content marked with “Caution”\nindicates possibility of vehicle damage.\nIgnoring such information may lead to\nvehicledamage.']

As DataFrame:
                                             Caution
0  The content marked with “Caution”\nindicates p...

Shape: (1, 1)

['IMPORTANT']
['To protect the engine from damage, never\nallow the pointer to remain in the red\nsector of the gauge for prolonged p

128

In [8]:
# Collect every table in the Astor manual into `tables`, one record per table.
tables = extract_tables_to_list(
    pdf_path="/Users/santhosh/Documents/study_projects/ringcentral_assessment/pdfs/astor_manual.pdf"
)

128