<a href="https://colab.research.google.com/github/shiragelb/NCC-Statistical-Reports/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber
!pip install camelot-py[cv]
!pip install tabula-py
!pip install python-docx

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m678.4 kB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

Imports

In [2]:
import requests
import os
from docx import Document
import pandas as pd
from google.colab import files
import camelot
import tabula
import pdfplumber
from docx.shared import Inches # Import Inches for setting image size
import json

Extract Tables

In [3]:
def setup_directory():
    """Create tables directory if it doesn't exist"""
    if not os.path.exists('tables'):
        os.makedirs('tables')
        print("Created 'tables/' directory")
    else:
        print("'tables/' directory already exists")

def extract_tables_with_names(docx_path):
    """Extract tables with their names from DOCX"""
    doc = Document(docx_path)
    tables = []

    for i, table in enumerate(doc.tables):
        # Extract table data
        data = []
        for row in table.rows:
            data.append([cell.text.strip() for cell in row.cells])

        if data:
            df = pd.DataFrame(data)

            # Try to find table name from first row or use default
            # Assuming first row might contain the table name
            table_name = f"Table_{i+1}"  # Default name
            if len(data[0]) > 0 and len(data) >= 1:  # Single cell in first row might be title
                table_name = data[0][0] if data[0][0] else table_name
                df = pd.DataFrame(data[1:])  # Skip title row

            tables.append((table_name, df))

    return tables

def save_tables_to_csv(tables, chapter, year):
    """Save tables to CSV files and return reference dictionary"""
    reference_dict = {}

    for i, (name, df) in enumerate(tables, 1):
        # Create filename: table{i}{j}{k}.csv
        filename = f"table{i}{chapter}{year}.csv"
        filepath = os.path.join('tables', filename)

        # Save dataframe to CSV
        df.to_csv(filepath, index=False, header=False)

        # Add to reference dictionary
        reference_dict[name] = filepath
        print(f"Saved: {filepath}")

    return reference_dict

def save_dictionary_to_json(reference_dict, filename='table_references.json'):
    """Save reference dictionary to JSON file with proper Unicode support"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(reference_dict, f, indent=2, ensure_ascii=False)
    print(f"Reference dictionary saved to {filename}")

def process_documents(doc1_path, chapter1, year1, doc2_path, chapter2, year2):
    """Main function to process both documents"""
    # Setup directory
    setup_directory()

    # Combined dictionary for all tables
    all_references = {}

    # Process first document
    print(f"\nProcessing: {doc1_path}")
    tables1 = extract_tables_with_names(doc1_path)
    ref_dict1 = save_tables_to_csv(tables1, chapter1, year1)
    all_references.update(ref_dict1)

    # Process second document
    print(f"\nProcessing: {doc2_path}")
    tables2 = extract_tables_with_names(doc2_path)
    ref_dict2 = save_tables_to_csv(tables2, chapter2, year2)
    all_references.update(ref_dict2)

    # Save combined dictionary
    save_dictionary_to_json(all_references)

    print(f"\nTotal tables processed: {len(all_references)}")
    return all_references

In [4]:
from google.colab import files

# Upload
chp1_2001_raw = files.upload()
chp1_2002_raw = files.upload()

# Extract file names
chp1_2001 = list(chp1_2001_raw.keys())[0]
chp1_2002 = list(chp1_2002_raw.keys())[0]


Saving chap 01.docx to chap 01.docx


Saving chap 01.docx to chap 01 (1).docx


In [5]:
process_documents(chp1_2001, 1, 2001, chp1_2002, 1, 2002)

Created 'tables/' directory

Processing: chap 01.docx
Saved: tables/table112001.csv
Saved: tables/table212001.csv
Saved: tables/table312001.csv
Saved: tables/table412001.csv
Saved: tables/table512001.csv
Saved: tables/table612001.csv
Saved: tables/table712001.csv
Saved: tables/table812001.csv
Saved: tables/table912001.csv
Saved: tables/table1012001.csv
Saved: tables/table1112001.csv
Saved: tables/table1212001.csv
Saved: tables/table1312001.csv
Saved: tables/table1412001.csv
Saved: tables/table1512001.csv
Saved: tables/table1612001.csv
Saved: tables/table1712001.csv
Saved: tables/table1812001.csv
Saved: tables/table1912001.csv
Saved: tables/table2012001.csv
Saved: tables/table2112001.csv
Saved: tables/table2212001.csv
Saved: tables/table2312001.csv
Saved: tables/table2412001.csv
Saved: tables/table2512001.csv
Saved: tables/table2612001.csv

Processing: chap 01 (1).docx
Saved: tables/table112002.csv
Saved: tables/table212002.csv
Saved: tables/table312002.csv
Saved: tables/table412002.csv

{'Table_1': 'tables/table112002.csv',
 'ילדים בישראל*\nלפי דת (אלפים ושיעור גידולם)\n2000-1970': 'tables/table212001.csv',
 'אחוז הילדים בישראל מכלל האוכלוסייה\nלפי דת \n2000-1970': 'tables/table312001.csv',
 'מספר הילדים\nלפי גיל, דת, סוג וגודל יישוב (אלפים*)\nממוצע 2000': 'tables/table412001.csv',
 'מספר הילדים\nלפי גיל, דת, סוג וגודל יישוב (אלפים*)\nממוצע 2000 (המשך)': 'tables/table512001.csv',
 'מספר הילדים, חלקם באוכלוסייה ודתם \nלפי מחוז ונפה (אלפים ואחוזים)\nממוצע 2000': 'tables/table612001.csv',
 'מספר הילדים ביישובים מעורבים נבחרים \nלפי דת (אלפים ואחוזים)\nסוף דצמבר 2000': 'tables/table712001.csv',
 'מספר הילדים לפי גיל וחלקם באוכלוסיית היישובים* \n(אלפים ואחוזים)\nסוף דצמבר 2000': 'tables/table812001.csv',
 'מספר הילדים לפי גיל וחלקם באוכלוסיית היישובים* \n(אלפים ואחוזים)\nסוף דצמבר 2000 (המשך)': 'tables/table1212001.csv',
 'חלקם של  הילדים באוכלוסיית היישובים שמנו 10,000 תושבים ויותר \n(אחוזים)\nסוף דצמבר 2000': 'tables/table1312001.csv',
 'חלקם של  הילדים באוכלוסיית היישוב

# Table Extraction


In [8]:
!pip install python-docx
!pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9


In [17]:
# Automated Table Extraction System
# Phase 1: Environment Setup & Dependencies

# Install required packages (run this in Google Colab)
# !pip install python-docx python-docx2txt

# Core imports
import os
import json
import pandas as pd
from pathlib import Path
import mimetypes

# Document processing imports
from docx import Document
import docx2txt

# Google Colab specific imports
from google.colab import files

# File validation and utilities
import zipfile
from typing import Tuple, List, Dict, Optional, Union

print("✅ All required libraries imported successfully")
print("📋 Environment setup complete")

# Phase 1, Step 1.2: Enhanced Directory Structure Function
def setup_directory_structure(year: int, chapter: int) -> str:
    """
    Create directory structure: Tables/year/chapter_j/

    Args:
        year: Year for the directory structure
        chapter: Chapter number for the directory

    Returns:
        str: Path to the created chapter directory

    Raises:
        OSError: If directory creation fails
    """
    try:
        # Create the directory path
        chapter_dir = f"Tables/{year}/chapter_{chapter}"

        # Create directories (including parents)
        os.makedirs(chapter_dir, exist_ok=True)

        print(f"✅ Directory structure created: {chapter_dir}/")
        return chapter_dir

    except OSError as e:
        error_msg = f"❌ Failed to create directory structure Tables/{year}/chapter_{chapter}/: {e}"
        print(error_msg)
        raise OSError(error_msg)
    except Exception as e:
        error_msg = f"❌ Unexpected error creating directory: {e}"
        print(error_msg)
        raise Exception(error_msg)

# Phase 2: File Processing & Validation
# Step 2.1: File Format Detection and Validation

def validate_and_identify_file(filepath: str) -> str:
    """
    Validate file exists and identify if it's DOC or DOCX format

    Args:
        filepath: Path to the file to validate

    Returns:
        str: 'docx' or 'doc' depending on file type

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If file is not DOC or DOCX format
    """
    try:
        # Check if file exists
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"❌ File not found: {filepath}")

        # Get file extension
        _, extension = os.path.splitext(filepath.lower())

        # Check if it's a supported format
        if extension == '.docx':
            # Additional validation: try to open as zip (DOCX is zip-based)
            try:
                with zipfile.ZipFile(filepath, 'r') as zip_file:
                    # Check if it has the typical DOCX structure
                    if 'word/document.xml' in zip_file.namelist():
                        print(f"✅ Valid DOCX file detected: {os.path.basename(filepath)}")
                        return 'docx'
                    else:
                        raise ValueError(f"❌ File appears corrupted or invalid DOCX: {filepath}")
            except zipfile.BadZipFile:
                raise ValueError(f"❌ File is not a valid DOCX format: {filepath}")

        elif extension == '.doc':
            # Basic validation for DOC files (check file size > 0)
            file_size = os.path.getsize(filepath)
            if file_size > 0:
                print(f"✅ DOC file detected: {os.path.basename(filepath)}")
                return 'doc'
            else:
                raise ValueError(f"❌ DOC file appears empty: {filepath}")

        else:
            supported_formats = ['.doc', '.docx']
            raise ValueError(f"❌ Unsupported file format: {extension}. Supported formats: {supported_formats}")

    except Exception as e:
        if isinstance(e, (FileNotFoundError, ValueError)):
            raise e
        else:
            raise ValueError(f"❌ Error validating file {filepath}: {e}")

# Step 2.2: Unified Document Loader with DOC to DOCX Conversion

def load_document(filepath: str, file_type: str) -> Document:
    """
    Load document using appropriate method, converting DOC to DOCX if needed

    Args:
        filepath: Path to the document file
        file_type: 'doc' or 'docx' as returned by validate_and_identify_file

    Returns:
        Document: python-docx Document object (same for both DOC and DOCX)

    Raises:
        Exception: If document loading or conversion fails
    """
    try:
        if file_type == 'docx':
            # Load DOCX directly using python-docx
            try:
                doc = Document(filepath)
                print(f"✅ DOCX document loaded successfully: {os.path.basename(filepath)}")
                return doc
            except Exception as e:
                raise Exception(f"❌ Failed to load DOCX document: {e}")

        elif file_type == 'doc':
            # Convert DOC to DOCX first, then load
            try:
                print(f"🔄 Converting DOC to DOCX: {os.path.basename(filepath)}")

                # Create temporary file for converted DOCX
                with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as temp_file:
                    temp_docx_path = temp_file.name

                try:
                    # Convert DOC to DOCX using pypandoc (preserves tables)
                    pypandoc.convert_file(
                        filepath,
                        'docx',
                        outputfile=temp_docx_path,
                        extra_args=['--preserve-tabs']  # Help preserve table structure
                    )

                    # Load the converted DOCX file
                    doc = Document(temp_docx_path)
                    print(f"✅ DOC converted and loaded successfully: {os.path.basename(filepath)}")
                    return doc

                finally:
                    # Clean up temporary file
                    try:
                        os.unlink(temp_docx_path)
                    except:
                        pass  # Ignore cleanup errors

            except Exception as e:
                raise Exception(f"❌ Failed to convert/load DOC document: {e}")

        else:
            raise ValueError(f"❌ Unsupported file type: {file_type}")

    except Exception as e:
        error_msg = f"❌ Document loading error for {filepath}: {e}"
        print(error_msg)
        raise Exception(error_msg)

# Phase 3: Table Extraction & Filtering
# Step 3.1: Enhanced Table Extraction with Hebrew Filtering and Name Preservation

def extract_filtered_tables(doc: Document) -> List[Tuple[str, pd.DataFrame]]:
    """
    Extract tables that contain 'לוח' in the title row, preserving table names

    Args:
        doc: python-docx Document object

    Returns:
        List[Tuple[str, pd.DataFrame]]: List of (table_name, dataframe) tuples

    Raises:
        Exception: If table extraction fails
    """
    try:
        tables = []
        filtered_count = 0
        total_tables = len(doc.tables)

        print(f"🔍 Found {total_tables} tables in document, filtering for 'לוח'...")

        for i, table in enumerate(doc.tables):
            try:
                # Extract table data
                data = []
                for row in table.rows:
                    row_data = [cell.text.strip() for cell in row.cells]
                    data.append(row_data)

                # Skip empty tables
                if not data or not any(cell for row in data for cell in row if cell.strip()):
                    print(f"⚠️  Table {i+1}: Empty table, skipping")
                    continue

                # Check if first row contains 'לוח'
                first_row = data[0]
                contains_hebrew_table = any('לוח' in str(cell) for cell in first_row)

                if contains_hebrew_table:
                    # Extract table name from first cell or use default
                    table_name = f"Table_{i+1}"  # Default name
                    if first_row[0] and first_row[0].strip():
                        table_name = first_row[0].strip()

                    # Create DataFrame from data (skip title row)
                    if len(data) > 1:
                        df = pd.DataFrame(data[1:])  # Skip title row

                        # Validate DataFrame has content
                        if not df.empty and df.notna().any().any():
                            tables.append((table_name, df))
                            print(f"✅ Table {i+1}: '{table_name}' - extracted ({len(df)} rows)")
                            filtered_count += 1
                        else:
                            print(f"⚠️  Table {i+1}: '{table_name}' - no extractable data after title row")
                    else:
                        print(f"⚠️  Table {i+1}: '{table_name}' - only title row found, no data to extract")
                else:
                    print(f"❌ Table {i+1}: No 'לוח' found in title row, skipping")

            except Exception as e:
                print(f"⚠️  Error processing table {i+1}: {e}")
                continue

        print(f"📊 Filtering complete: {filtered_count}/{total_tables} tables contain 'לוח'")

        if not tables:
            print("⚠️  No tables with 'לוח' found in document")

        return tables

    except Exception as e:
        error_msg = f"❌ Table extraction failed: {e}"
        print(error_msg)
        raise Exception(error_msg)

# Step 3.2: Table Naming and Numbering

def generate_table_filename(table_number: int, chapter: int, year: int) -> str:
    """
    Generate standardized filename for tables: Ti_chpj_year.csv

    Args:
        table_number: Sequential number of the table (i)
        chapter: Chapter number (j)
        year: Year

    Returns:
        str: Formatted filename (e.g., "T1_chp2_2023.csv")
    """
    filename = f"T{table_number}_chp{chapter}_{year}.csv"
    return filename

def assign_table_numbers(tables: List[Tuple[str, pd.DataFrame]]) -> List[Tuple[str, pd.DataFrame, str]]:
    """
    Assign sequential numbers and generate filenames for filtered tables

    Args:
        tables: List of (table_name, dataframe) tuples from extract_filtered_tables

    Returns:
        List[Tuple[str, pd.DataFrame, str]]: List of (table_name, dataframe, filename) tuples
    """
    numbered_tables = []

    for i, (table_name, df) in enumerate(tables, 1):
        # Note: chapter and year will be provided when saving
        # For now, we just assign the table number
        numbered_tables.append((table_name, df, i))

    print(f"📝 Assigned sequential numbers to {len(numbered_tables)} tables")
    return numbered_tables

# Phase 4: File Operations & Output
# Step 4.1: Enhanced CSV Saving with Directory Structure

def save_tables_to_csv(numbered_tables: List[Tuple[str, pd.DataFrame, int]],
                       chapter: int, year: int, chapter_dir: str) -> Dict[str, str]:
    """
    Save tables to CSV files in organized directory structure

    Args:
        numbered_tables: List of (table_name, dataframe, table_number) tuples
        chapter: Chapter number for filename generation
        year: Year for filename generation
        chapter_dir: Directory path where CSV files should be saved

    Returns:
        Dict[str, str]: Reference dictionary mapping table names to file paths
    """
    reference_dict = {}
    total_tables = len(numbered_tables)

    if total_tables == 0:
        print("⚠️  No tables to save")
        return reference_dict

    for table_name, df, table_number in numbered_tables:
        try:
            # Generate filename using the standardized format
            filename = generate_table_filename(table_number, chapter, year)
            filepath = os.path.join(chapter_dir, filename)

            # Save DataFrame to CSV (no index, no header as per original code)
            df.to_csv(filepath, index=False, header=False, encoding='utf-8')

            # Add to reference dictionary
            reference_dict[table_name] = filepath

        except Exception as e:
            print(f"❌ Failed to save table '{table_name}': {e}")
            continue

    successful_saves = len(reference_dict)
    print(f"✅ Successfully saved {successful_saves}/{total_tables} tables")
    return reference_dict

# Step 4.2: Enhanced JSON Reference Saving

def save_file_reference_dictionary(reference_dict: Dict[str, str],
                                 chapter: int, year: int, chapter_dir: str) -> str:
    """
    Save reference dictionary to JSON file with per-file naming convention

    Args:
        reference_dict: Dictionary mapping table names to file paths
        chapter: Chapter number for filename generation
        year: Year for filename generation
        chapter_dir: Directory where JSON file should be saved

    Returns:
        str: Path to the saved JSON file

    Raises:
        Exception: If JSON saving fails
    """
    try:
        # Generate filename: references_chpj_year.json
        json_filename = f"references_chp{chapter}_{year}.json"
        json_filepath = os.path.join(chapter_dir, json_filename)

        # Save reference dictionary with proper Unicode support
        with open(json_filepath, 'w', encoding='utf-8') as f:
            json.dump(reference_dict, f, indent=2, ensure_ascii=False)

        print(f"✅ Reference dictionary saved: {json_filename}")
        return json_filepath

    except Exception as e:
        error_msg = f"❌ Failed to save reference dictionary: {e}"
        print(error_msg)
        raise Exception(error_msg)

# Phase 5: Main Automation Loop
# Step 5.1: Single-File Processing Function

def process_single_file(filepath: str, chapter: int, year: int) -> Dict[str, Union[int, str, bool]]:
    """
    Complete pipeline to process a single document file

    Args:
        filepath: Path to the document file to process
        chapter: Chapter number for organization and naming
        year: Year for organization and naming

    Returns:
        Dict: Processing statistics and status including:
            - success: bool
            - tables_found: int
            - tables_filtered: int
            - tables_saved: int
            - json_saved: bool
            - error_message: str (if error occurred)
    """
    stats = {
        'success': False,
        'tables_found': 0,
        'tables_filtered': 0,
        'tables_saved': 0,
        'json_saved': False,
        'error_message': None
    }

    try:
        print(f"\n🔄 Processing: {os.path.basename(filepath)} (Chapter {chapter}, Year {year})")

        # Step 1: Validate and identify file type
        try:
            file_type = validate_and_identify_file(filepath)
        except Exception as e:
            stats['error_message'] = f"File validation failed: {e}"
            return stats

        # Step 2: Setup directory structure
        try:
            chapter_dir = setup_directory_structure(year, chapter)
        except Exception as e:
            stats['error_message'] = f"Directory setup failed: {e}"
            return stats

        # Step 3: Load document
        try:
            doc = load_document(filepath, file_type)
            stats['tables_found'] = len(doc.tables)
        except Exception as e:
            stats['error_message'] = f"Document loading failed: {e}"
            return stats

        # Step 4: Extract and filter tables
        try:
            filtered_tables = extract_filtered_tables(doc)
            stats['tables_filtered'] = len(filtered_tables)

            if not filtered_tables:
                print("⚠️  No tables containing 'לוח' found in document")
                stats['success'] = True  # Not an error, just no matching tables
                return stats

        except Exception as e:
            stats['error_message'] = f"Table extraction failed: {e}"
            return stats

        # Step 5: Assign table numbers
        try:
            numbered_tables = assign_table_numbers(filtered_tables)
        except Exception as e:
            stats['error_message'] = f"Table numbering failed: {e}"
            return stats

        # Step 6: Save tables to CSV
        try:
            reference_dict = save_tables_to_csv(numbered_tables, chapter, year, chapter_dir)
            stats['tables_saved'] = len(reference_dict)
        except Exception as e:
            stats['error_message'] = f"CSV saving failed: {e}"
            return stats

        # Step 7: Save reference dictionary
        try:
            json_filepath = save_file_reference_dictionary(reference_dict, chapter, year, chapter_dir)
            stats['json_saved'] = True
            print(f"📁 Files saved to: {chapter_dir}/")
        except Exception as e:
            stats['error_message'] = f"JSON saving failed: {e}"
            return stats

        # Success!
        stats['success'] = True
        print(f"✅ Processing complete: {stats['tables_saved']} tables extracted and saved")

    except Exception as e:
        stats['error_message'] = f"Unexpected error: {e}"
        print(f"❌ Processing failed: {stats['error_message']}")

    return stats

✅ All required libraries imported successfully
📋 Environment setup complete


## Extra steps, needs more refinement.

In [None]:
# Step 5.2: Main Automation Interface

def automated_table_extractor():
    """
    Main automation interface for processing multiple files with Google Colab integration
    Handles file upload loop and displays comprehensive summary statistics
    """
    print("🚀 Automated Table Extractor Started")
    print("📋 This system will process DOC/DOCX files and extract tables containing 'לוח'")
    print("=" * 60)

    # Initialize summary statistics
    session_stats = {
        'files_processed': 0,
        'files_successful': 0,
        'files_failed': 0,
        'total_tables_found': 0,
        'total_tables_filtered': 0,
        'total_tables_saved': 0,
        'processing_results': []
    }

    while True:
        print(f"\n📁 Upload Document #{session_stats['files_processed'] + 1}")
        print("   (Upload your DOC or DOCX file using the file picker)")

        # Upload file
        try:
            uploaded = files.upload()
            if not uploaded:
                print("⚠️  No file uploaded")
                continue

            # Get uploaded filename
            filename = list(uploaded.keys())[0]
            filepath = filename

        except Exception as e:
            print(f"❌ File upload failed: {e}")
            continue

        # Get chapter and year from user
        try:
            chapter = int(input(f"📖 Enter chapter number for '{filename}': ").strip())
            year = int(input(f"📅 Enter year for '{filename}': ").strip())
        except ValueError:
            print("❌ Invalid chapter or year. Please enter numbers only.")
            continue
        except KeyboardInterrupt:
            print("\n🔴 Process interrupted by user")
            break

        # Process the file
        try:
            file_stats = process_single_file(filepath, chapter, year)

            # Update session statistics
            session_stats['files_processed'] += 1
            session_stats['total_tables_found'] += file_stats['tables_found']
            session_stats['total_tables_filtered'] += file_stats['tables_filtered']
            session_stats['total_tables_saved'] += file_stats['tables_saved']

            # Track success/failure
            if file_stats['success']:
                session_stats['files_successful'] += 1
            else:
                session_stats['files_failed'] += 1

            # Store individual file results
            session_stats['processing_results'].append({
                'filename': filename,
                'chapter': chapter,
                'year': year,
                'stats': file_stats
            })

        except Exception as e:
            print(f"❌ Unexpected error processing {filename}: {e}")
            session_stats['files_failed'] += 1

        # Ask if user wants to continue
        print(f"\n📊 Current Session: {session_stats['files_successful']} successful, {session_stats['files_failed']} failed")

        try:
            continue_choice = input("❓ Process another file? (y/n): ").strip().lower()
            if continue_choice not in ['y', 'yes']:
                break
        except KeyboardInterrupt:
            print("\n🔴 Process interrupted by user")
            break

    # Display final summary
    print("\n" + "=" * 60)
    print("📋 FINAL SESSION SUMMARY")
    print("=" * 60)
    print(f"Files Processed: {session_stats['files_processed']}")
    print(f"✅ Successful: {session_stats['files_successful']}")
    print(f"❌ Failed: {session_stats['files_failed']}")
    print(f"📊 Total Tables Found: {session_stats['total_tables_found']}")
    print(f"🔍 Tables Filtered (contain 'לוח'): {session_stats['total_tables_filtered']}")
    print(f"💾 Tables Saved: {session_stats['total_tables_saved']}")

    # Detailed breakdown
    if session_stats['processing_results']:
        print(f"\n📄 FILE-BY-FILE BREAKDOWN:")
        for result in session_stats['processing_results']:
            status = "✅" if result['stats']['success'] else "❌"
            print(f"  {status} {result['filename']} (Ch.{result['chapter']}, {result['year']}): "
                  f"{result['stats']['tables_saved']} tables saved")
            if not result['stats']['success'] and result['stats']['error_message']:
                print(f"      Error: {result['stats']['error_message']}")

    print(f"\n🎉 Automation session complete!")
    return session_stats

# Phase 6: Error Handling & Reporting
# Step 6.1 & 6.2: Comprehensive Error Handling and System Completion

def display_system_info():
    """Display system information and usage instructions"""
    print("🔧 AUTOMATED TABLE EXTRACTION SYSTEM")
    print("=" * 50)
    print("📋 Features:")
    print("  • Supports both DOC and DOCX files")
    print("  • Filters tables containing 'לוח' in title row")
    print("  • Organized output: Tables/year/chapter_j/")
    print("  • File naming: Ti_chpj_year.csv")
    print("  • Reference dictionaries: references_chpj_year.json")
    print("  • Hebrew text support with UTF-8 encoding")
    print("\n🚀 Usage:")
    print("  1. Run: automated_table_extractor()")
    print("  2. Upload files one by one")
    print("  3. Provide chapter and year for each file")
    print("  4. Review extraction results and summary")
    print("=" * 50)

def validate_system_dependencies():
    """
    Validate that all required dependencies are available
    Returns True if all dependencies are available, False otherwise
    """
    try:
        # Test critical imports
        from docx import Document
        import pypandoc
        import pandas as pd
        from google.colab import files

        print("✅ All system dependencies verified")
        return True

    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print("🔧 Please run the pip install commands:")
        print("   !pip install python-docx pypandoc")
        return False

# System initialization and final setup
print("✅ All required libraries imported successfully")
print("📋 Environment setup complete")
print("🔧 System ready for automated table extraction")

# Display system information
display_system_info()

# Validate dependencies
if validate_system_dependencies():
    print("\n🎯 READY TO START")
    print("Run: automated_table_extractor()")
else:
    print("\n⚠️  Please install missing dependencies before starting")