# In [3]:
import json
from docx import Document

def is_text_below_table(table, paragraph=None):
    """Check if there is any text below the given table.

    Walks the sibling elements that follow the table's XML element and
    reports whether any paragraph among them contains non-whitespace text.

    Args:
        table: Object whose ``_element`` attribute is the table's XML
            element (it must support ``getnext()``).
        paragraph: Unused. Kept (now optional) so existing callers that pass
            a second argument keep working; the text that is inspected is
            that of the elements actually following the table.

    Returns:
        True if at least one following paragraph has visible text,
        otherwise False.
    """
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    paragraph_tag = w_ns + "p"
    text_tag = w_ns + "t"

    sibling = table._element.getnext()
    while sibling is not None:
        # Exact tag comparison: a suffix test would also match any other
        # element whose name merely ends in "p".
        if sibling.tag == paragraph_tag:
            # Look at the sibling's own text runs (<w:t> elements); an empty
            # <w:t/> has text None, hence the `or ""`.
            content = "".join(node.text or "" for node in sibling.iter(text_tag))
            if content.strip():
                return True
        sibling = sibling.getnext()
    return False

def extract_table_data(table):
    """Extract data from a single table.

    The first row supplies the column names; every later row becomes one
    dict mapping column name to the stripped cell text.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text``.

    Returns:
        A list with one dict per data row. Empty when the table has no rows
        or only a header row. A row with fewer cells than the header is
        padded with empty strings; cells beyond the header width are dropped.
    """
    rows = iter(table.rows)
    header_row = next(rows, None)
    if header_row is None:
        return []

    keys = [cell.text.strip() for cell in header_row.cells]
    data = []
    for row in rows:
        values = [cell.text.strip() for cell in row.cells]
        # Pad short rows so indexing past the end can never raise and every
        # record carries the full set of header keys.
        values.extend([""] * (len(keys) - len(values)))
        # zip() stops at the header width, so surplus cells are ignored.
        data.append(dict(zip(keys, values)))
    return data

def merge_tables(doc):
    """Merge tables spanning multiple pages into a single JSON structure.

    Walks the document body in order. Rows of consecutive tables are
    accumulated as one group; a paragraph containing visible text closes the
    current group. Each table is converted with ``extract_table_data``.

    Args:
        doc: A python-docx document (uses ``doc.tables`` and
            ``doc.element.body``).

    Returns:
        A flat list of row dicts for all top-level tables, in document order.
    """
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    paragraph_tag = w_ns + "p"
    text_tag = w_ns + "t"

    # doc.paragraphs never yields tables, so the body's children are walked
    # directly and each table element is mapped back to its Table object.
    tables_by_element = {table._element: table for table in doc.tables}

    all_data = []
    pending = []  # rows of the run of tables currently being merged

    for child in doc.element.body:
        table = tables_by_element.get(child)
        if table is not None:
            pending.extend(extract_table_data(table))
            continue

        # Only a paragraph with real text separates two logical tables; empty
        # paragraphs between table fragments do not.
        if not pending or child.tag != paragraph_tag:
            continue
        content = "".join(node.text or "" for node in child.iter(text_tag))
        if content.strip():
            all_data.extend(pending)
            pending = []

    # Flush the last group when the document ends with a table.
    # NOTE: because the result is one flat list, the group boundaries do not
    # change the output; they only make the table boundaries explicit.
    all_data.extend(pending)
    return all_data

def tables_to_json(doc_path):
    """Return the table rows of the .docx file at *doc_path* as JSON text.

    The document is opened with ``Document``, its tables are collected by
    ``merge_tables``, and the result is serialized with a 4-space indent.
    """
    return json.dumps(merge_tables(Document(doc_path)), indent=4)

# Usage example: convert the tables of one .docx file to JSON text.
doc_path = 'sample_document.docx'  # Replace with your document path
json_output = tables_to_json(doc_path)  # a str, serialized with indent=4

# Print or save the JSON data
print(json_output)

# Optionally, save to a file
# (mode 'w' creates output.json, or overwrites it if it already exists).
with open('output.json', 'w') as f:
    f.write(json_output)


[]
