<a href="https://colab.research.google.com/github/shiragelb/NCC-Statistical-Reports/blob/main/Table_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# System dependency: the pandoc binary. -y answers the confirmation prompt so the
# cell cannot stall on a fresh runtime; -qq keeps the package-manager log short.
!apt-get install -y -qq pandoc
# Python dependencies. %pip (rather than !pip) installs into the environment of
# the running kernel; -q suppresses the per-package progress output.
%pip install -q pypandoc python-docx docx2txt

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc-data
Suggested packages:
  texlive-latex-recommended texlive-xetex texlive-luatex pandoc-citeproc
  texlive-latex-extra context wkhtmltopdf librsvg2-bin groff ghc nodejs php
  python ruby libjs-mathjax libjs-katex citation-style-language-styles
The following NEW packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc
  pandoc-data
0 upgraded, 4 newly installed, 0 to remove and 35 not upgraded.
Need to get 20.6 MB of archives.
After this operation, 156 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [115 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm-extensions0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [25.1 kB

# Download report files from Google Drive

In [2]:
from google.colab import auth
from googleapiclient.discovery import build
import pandas as pd
import os
import io
from googleapiclient.http import MediaIoBaseDownload
import logging

# Set up logging
# Shared logger used by GoogleDriveManager below. It is explicitly named
# '__main__' and lets INFO and above through; whether records are actually
# displayed depends on the handlers the environment has installed.
logger = logging.getLogger('__main__')
logger.setLevel(logging.INFO)


class GoogleDriveManager:
    """
    Manages Google Drive operations including listing, filtering, downloading, and uploading files.
    """

    def __init__(self, folder_id):
        """
        Initialize the GoogleDriveManager with authentication and folder ID.

        Args:
            folder_id: The Google Drive folder ID to work with
        """
        self.folder_id = folder_id
        self.drive_service = None
        self.files_df = None  # Cache for file listings

        # Authenticate and build service
        self._authenticate()

    def _authenticate(self):
        """Authenticate with Google Drive and build the service object."""
        try:
            auth.authenticate_user()
            self.drive_service = build('drive', 'v3')
            logger.info("✅ Successfully authenticated with Google Drive")
        except Exception as e:
            logger.error(f"❌ Authentication failed: {e}")
            raise

    def list_all_files(self, force_refresh=False):
        """
        Recursively list all files in the folder and subfolders.

        Args:
            force_refresh: If True, force a new listing even if cached data exists

        Returns:
            pd.DataFrame: DataFrame with columns [file_name, file_path, file_id, file_url]
        """
        if self.files_df is not None and not force_refresh:
            logger.info("📋 Using cached file list")
            return self.files_df

        logger.info("🔍 Listing all files in folder...")
        all_files = self._list_files_recursive(self.folder_id)

        # Convert to DataFrame
        if all_files:
            self.files_df = pd.DataFrame(all_files)

            # Deduplicate by folder+name (file_path already encodes folder)
            self.files_df = self.files_df.drop_duplicates(
                subset=["file_path", "file_name"], keep="first"
            )

            logger.info(f"✅ Found {len(self.files_df)} unique files")
        else:
            self.files_df = pd.DataFrame(columns=['file_name', 'file_path', 'file_id', 'file_url'])
            logger.info("📁 No files found in folder")

        return self.files_df

    def _list_files_recursive(self, parent_id, parent_path=""):
        """
        Recursively list files in a folder.

        Args:
            parent_id: Google Drive folder ID
            parent_path: Path string for tracking folder hierarchy

        Returns:
            list: List of file dictionaries
        """
        all_files = []
        query = f"'{parent_id}' in parents and trashed=false"
        page_token = None

        while True:
            try:
                response = self.drive_service.files().list(
                    q=query,
                    spaces='drive',
                    fields='nextPageToken, files(id, name, mimeType)',
                    pageToken=page_token
                ).execute()

                for item in response.get('files', []):
                    item_path = f"{parent_path}/{item['name']}" if parent_path else item['name']

                    if item['mimeType'] == 'application/vnd.google-apps.folder':
                        # Recurse into subfolder
                        all_files.extend(self._list_files_recursive(item['id'], item_path))
                    else:
                        all_files.append({
                            "file_name": item['name'],
                            "file_path": item_path,
                            "file_id": item['id'],
                            "file_url": f"https://drive.google.com/file/d/{item['id']}/view?usp=sharing"
                        })

                page_token = response.get('nextPageToken', None)
                if page_token is None:
                    break

            except Exception as e:
                logger.error(f"❌ Error listing files in {parent_path}: {e}")
                break

        return all_files

    def filter_files(self, df=None, years=None, chapters=None):
        """
        Filter files based on specified years and chapters using exact matching.

        Args:
            df: DataFrame to filter (if None, uses cached files_df)
            years: List of years to include (e.g., [2021, 2022, 2023])
            chapters: List of chapter numbers to include (e.g., [1, 2, 5, 10])

        Returns:
            pd.DataFrame: Filtered DataFrame containing only requested files
        """
        # Use provided df or cached one
        if df is None:
            if self.files_df is None:
                logger.warning("⚠️ No files listed yet. Running list_all_files() first.")
                self.list_all_files()
            df = self.files_df.copy()
        else:
            df = df.copy()

        if df.empty:
            logger.warning("⚠️ No files to filter")
            return df

        # Apply year filter
        if years is not None:
            year_strings = [str(year) for year in years]
            # Exact match: year must be a folder in the path
            year_mask = df['file_path'].apply(
                lambda path: any(f"/{year}/" in f"/{path}" or path.startswith(f"{year}/")
                               for year in year_strings)
            )
            df = df[year_mask]
            logger.info(f"📅 Filtered for years: {years} - {len(df)} files")

        # Apply chapter filter
        if chapters is not None:
            # Exact match for filename pattern: 01.docx, 02.docx, etc.
            chapter_filenames = [f"{ch:02d}.docx" for ch in chapters]
            chapter_mask = df['file_name'].apply(
                lambda name: name in chapter_filenames
            )
            df = df[chapter_mask]
            logger.info(f"📖 Filtered for chapters: {chapters} - {len(df)} files")

        return df

    def download_files(self, filtered_df, download_dir="/content/reports"):
        """
        Download files from a filtered DataFrame.

        Args:
            filtered_df: DataFrame containing files to download
            download_dir: Base directory for downloads

        Returns:
            dict: Dictionary mapping file paths to local paths
        """
        if filtered_df is None or filtered_df.empty:
            logger.warning("⚠️ No files to download")
            return {}

        downloaded_files = {}
        total_files = len(filtered_df)

        logger.info(f"📥 Starting download of {total_files} files...")

        for idx, row in filtered_df.iterrows():
            file_id = row['file_id']
            file_name = row['file_name']
            file_path = row['file_path']

            # Extract year from path (assuming structure: year/filename)
            path_parts = file_path.split('/')
            if len(path_parts) >= 2:
                year = path_parts[0]
                local_path = os.path.join(download_dir, year, file_name)
            else:
                local_path = os.path.join(download_dir, file_name)

            # Ensure directory exists
            os.makedirs(os.path.dirname(local_path), exist_ok=True)

            try:
                # Download file
                request = self.drive_service.files().get_media(fileId=file_id)
                fh = io.FileIO(local_path, "wb")
                downloader = MediaIoBaseDownload(fh, request)

                done = False
                while not done:
                    status, done = downloader.next_chunk()
                    if status:
                        progress = int(status.progress() * 100)
                        print(f"⬇️  Downloading {file_name}: {progress}%", end='\r')

                logger.info(f"✅ Downloaded {file_name} to {local_path}")
                downloaded_files[file_path] = local_path

            except Exception as e:
                logger.warning(f"⚠️ Failed to download {file_name}: {e}")
                continue

        logger.info(f"✅ Download complete: {len(downloaded_files)}/{total_files} files")
        return downloaded_files

    def download_selective(self, years=None, chapters=None, download_dir="/content/reports"):
        """
        Convenience method to list, filter, and download files in one operation.

        Args:
            years: List of years to download (e.g., [2021, 2022, 2023])
            chapters: List of chapter numbers to download (e.g., [1, 2, 5, 10])
            download_dir: Base directory for downloads

        Returns:
            dict: Dictionary mapping file paths to local paths

        Example:
            # Download chapters 1-5 for years 2021-2023
            manager.download_selective(
                years=range(2021, 2024),
                chapters=range(1, 6),
                download_dir="/content/reports"
            )
        """
        # Step 1: List all files
        logger.info("🚀 Starting selective download workflow...")
        all_files = self.list_all_files()

        # Step 2: Filter files
        filtered_files = self.filter_files(all_files, years=years, chapters=chapters)

        if filtered_files is None or filtered_files.empty:
            logger.warning("⚠️ No files match the specified criteria")
            return {}

        logger.info(f"📊 Found {len(filtered_files)} files matching criteria")

        # Step 3: Download filtered files
        downloaded = self.download_files(filtered_files, download_dir)

        return downloaded

    def get_summary(self, df=None):
        """
        Get summary statistics about the files.

        Args:
            df: DataFrame to summarize (if None, uses cached files_df)

        Returns:
            dict: Summary statistics
        """
        if df is None:
            if self.files_df is None:
                logger.warning("⚠️ No files listed yet. Running list_all_files() first.")
                self.list_all_files()
            df = self.files_df

        if df is None or df.empty:
            return {"total_files": 0, "years": [], "chapters": []}

        # Extract years from paths
        years = df['file_path'].apply(lambda x: x.split('/')[0] if '/' in x else None)
        years = sorted(years.dropna().unique())

        # Extract chapters from filenames (assuming pattern: 01.docx, 02.docx)
        chapters = df['file_name'].apply(
            lambda x: int(x[:2]) if x[:2].isdigit() and x.endswith('.docx') else None
        )
        chapters = sorted(chapters.dropna().unique())

        summary = {
            "total_files": len(df),
            "years": years,
            "year_count": len(years),
            "chapters": chapters,
            "chapter_count": len(chapters),
            "file_types": df['file_name'].apply(lambda x: x.split('.')[-1]).value_counts().to_dict()
        }

        return summary

    def preview_files(self, df=None, n=10):
        """
        Preview first n files from the DataFrame.

        Args:
            df: DataFrame to preview (if None, uses cached files_df)
            n: Number of files to preview
        """
        if df is None:
            if self.files_df is None:
                logger.warning("⚠️ No files listed yet. Running list_all_files() first.")
                self.list_all_files()
            df = self.files_df

        if df is None or df.empty:
            logger.info("No files to preview")
            return

        preview = df.head(n)[['file_name', 'file_path']]
        logger.info(f"\n📋 Preview of first {min(n, len(df))} files:")
        for idx, row in preview.iterrows():
            logger.info(f"  {row['file_path']}")

    def check_missing_files(self, years, chapters):
        """
        Check which year/chapter combinations are missing.

        Args:
            years: List of years to check
            chapters: List of chapter numbers to check

        Returns:
            list: List of missing (year, chapter) tuples
        """
        if self.files_df is None:
            self.list_all_files()

        missing = []

        for year in years:
            for chapter in chapters:
                # Check if this combination exists
                filtered = self.filter_files(
                    self.files_df,
                    years=[year],
                    chapters=[chapter]
                )

                if filtered is None or filtered.empty:
                    missing.append((year, chapter))
                    logger.warning(f"⚠️ Missing: Year {year}, Chapter {chapter:02d}")

        if missing:
            logger.info(f"📊 Total missing files: {len(missing)}")
        else:
            logger.info("✅ All requested files are present")

        return missing

In [3]:
# Initialize
# Root Drive folder to read from; constructing the manager authenticates right away.
folder_id = "1e0eA-AIsz_BSwVHOppJMXECX42hBfG4J"
manager = GoogleDriveManager(folder_id)

# Download specific years and chapters
# range(2001, 2025) covers 2001 through 2024; chapter 1 corresponds to files named 01.docx.
downloaded = manager.download_selective(
    years=range(2001, 2025),
    chapters=[1]
)

# `downloaded` maps each Drive path to the local path it was saved to.
print(f"Downloaded {len(downloaded)} files")

INFO:__main__:✅ Successfully authenticated with Google Drive
INFO:__main__:🚀 Starting selective download workflow...
INFO:__main__:🔍 Listing all files in folder...
INFO:__main__:✅ Found 882 unique files
INFO:__main__:📅 Filtered for years: range(2001, 2025) - 882 files
INFO:__main__:📖 Filtered for chapters: [1] - 24 files
INFO:__main__:📊 Found 24 files matching criteria
INFO:__main__:📥 Starting download of 24 files...
INFO:__main__:✅ Downloaded 01.docx to /content/reports/2020/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2015/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2004/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2003/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2018/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2019/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2021/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2021/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2002/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2005/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2010/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2011/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2009/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2017/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2006/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2007/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2008/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2001/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2024/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2022/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2023/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2023/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2012/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2014/01.docx
INFO:__main__:✅ Download complete: 24/24 files


⬇️  Downloading 01.docx: 100%Downloaded 24 files


# Table extraction from Word documents

In [11]:
import os
import json
import pandas as pd
from docx import Document


class TableExtractor:
    """Simple class for extracting tables from Word documents with statistics tracking."""

    def __init__(self, base_dir="/content/reports", out_dir="/content/tables"):
        """Initialize the extractor with directories and statistics."""
        self.base_dir = base_dir
        self.out_dir = out_dir

        # Configuration constants
        self.YEAR_RANGE = (2001, 2025)
        self.VALID_EXTENSION = ".docx"
        self.TABLE_MARKER = "לוח"  # Hebrew for "table"
        self.EXCLUDE_MARKER = "תרשים"  # Hebrew for "diagram" - exclude these
        self.ENCODING = "utf-8-sig"
        self.SUMMARY_FILE = "tables_summary.json"
        self.COLUMNS_FILE = "tables_columns.json"

        # Statistics tracking
        self.total_tables = 0

        # Metadata collectors
        self.all_summaries = {}
        self.all_colnames = {}

        # Create output directory
        os.makedirs(self.out_dir, exist_ok=True)

    def _is_valid_table(self, table):
        """
        Check if a table is valid (contains Hebrew table marker in first row).

        Args:
            table: A docx table object

        Returns:
            tuple: (is_valid: bool, table_name: str)
        """
        if len(table.rows) == 0:
            return False, ""

        # Check first row cells for table marker
        for cell in table.rows[0].cells:
            cell_text = cell.text
            if self.TABLE_MARKER in cell_text and self.EXCLUDE_MARKER not in cell_text:
                return True, cell_text.strip()

        return False, ""

    def _extract_table_data(self, table):
        """
        Extract data from a docx table and convert to DataFrame.

        Args:
            table: A docx table object

        Returns:
            pd.DataFrame: Table data as a DataFrame
        """
        data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
        return pd.DataFrame(data)

    def _save_table_data(self, df, identifier, year, chapter):
        """
        Save DataFrame as CSV file in the appropriate directory structure.

        Args:
            df: pandas DataFrame to save
            identifier: Unique identifier for the table
            year: Year of the document
            chapter: Chapter identifier

        Returns:
            str: Path where the file was saved
        """
        save_dir = os.path.join(self.out_dir, str(year), chapter)
        os.makedirs(save_dir, exist_ok=True)

        save_path = os.path.join(save_dir, f"{identifier}.csv")
        df.to_csv(save_path, index=False, encoding=self.ENCODING)

        return save_path

    def _process_document(self, fpath, year, chapter):
        """
        Process a single Word document and extract all valid tables.

        Args:
            fpath: Full path to the document
            year: Year of the document
            chapter: Chapter identifier from filename

        Returns:
            int: Number of tables extracted from this document
        """
        summary = {}
        colnames_map = {}
        tables_extracted = 0

        try:
            doc = Document(fpath)
        except Exception as e:
            print(f"skip {fpath}: {e}")
            return 0

        serial = 1

        for table in doc.tables:
            # Validate table
            is_valid, table_name = self._is_valid_table(table)
            if not is_valid:
                continue

            # Extract data
            df = self._extract_table_data(table)

            # Skip empty tables
            if len(df) == 0:
                continue

            # Remove ".docx" suffix from chapter name
            if ".docx" in chapter:
              chapter = chapter.replace('.docx', '')

            # Create identifier
            identifier = f"{serial}_{chapter}_{year}"

            # Record mapping for JSON
            if len(df) > 0:
                # Deduplicate consecutive repeated text in header
                header_cells = df.iloc[0].astype(str).tolist()
                unique_header = []
                for cell in header_cells:
                    if not unique_header or cell != unique_header[-1]:
                        unique_header.append(cell)
                summary[identifier] = " ".join(unique_header)
            else:
                continue

            # Combine rows [1] and [2] for column names
            if len(df) > 2:
                row1 = df.iloc[1].astype(str).tolist()
                row2 = df.iloc[2].astype(str).tolist()
                colnames_map[identifier] = [f"{r1} {r2}".strip() for r1, r2 in zip(row1, row2)]
            elif len(df) > 1:
                colnames_map[identifier] = df.iloc[1].astype(str).tolist()
            else:
                colnames_map[identifier] = []

            # Save to CSV
            self._save_table_data(df, identifier, year, chapter)

            tables_extracted += 1
            serial += 1

        # Update metadata collectors
        self.all_summaries.update(summary)
        self.all_colnames.update(colnames_map)

        return tables_extracted

    def _save_metadata(self):
        """Save summary and column metadata to JSON files."""
        with open(os.path.join(self.out_dir, self.SUMMARY_FILE), "w", encoding="utf-8") as f:
            json.dump(self.all_summaries, f, ensure_ascii=False, indent=2)

        with open(os.path.join(self.out_dir, self.COLUMNS_FILE), "w", encoding="utf-8") as f:
            json.dump(self.all_colnames, f, ensure_ascii=False, indent=2)

    def process_files(self, years=None, chapters=None):
        """
        Process Word documents filtered by years and chapters.

        Args:
            years: List/range of years to process (None = all years in YEAR_RANGE)
            chapters: List of chapter identifiers to process (None = all chapters)

        Example:
            extractor.process_files()  # Process all files
            extractor.process_files(years=[2023, 2024])  # Specific years
            extractor.process_files(chapters=['1', '2', '3'])  # Specific chapters
            extractor.process_files(years=range(2020, 2025), chapters=['1', '2'])  # Both
        """
        # Reset statistics for new extraction session
        self.total_tables = 0
        self.tables_per_chapter_year = {}
        self.all_summaries = {}
        self.all_colnames = {}

        # Determine which years to process
        if years is None:
            years_to_process = range(2001, 2025)
        else:
            years_to_process = years

        # Convert chapters to set for faster lookup (if provided)
        chapters_to_process = set(map(str, chapters)) if chapters else None

        # Process each year
        for year in years_to_process:
            print(year)
            year_path = os.path.join(self.base_dir, str(year))

            if not os.path.isdir(year_path):
                continue

            # Process each document in the year directory
            for fname in os.listdir(year_path):
                if not fname.endswith(self.VALID_EXTENSION):
                    continue

                # Extract chapter from filename
                chapter = fname.split("_")[0]

                # Skip if not in chapters to process
                if chapters_to_process and chapter not in chapters_to_process:
                    continue

                fpath = os.path.join(year_path, fname)

                # Process the document and get table count
                tables_in_doc = self._process_document(fpath, year, chapter)


                # Update statistics
                self.total_tables += tables_in_doc
                if chapter not in self.tables_per_chapter_year:
                    self.tables_per_chapter_year[chapter] = {}
                if year not in self.tables_per_chapter_year[chapter]:
                    self.tables_per_chapter_year[chapter][year] = 0
                self.tables_per_chapter_year[chapter][year] += tables_in_doc

        # Save consolidated metadata
        self._save_metadata()

        print(f"\nExtraction complete! Total tables: {self.total_tables}")

    def get_stats(self):
        """
        Get extraction statistics.

        Returns:
            dict: Dictionary with 'total' and 'per_chapter_year' statistics
        """
        return {
            'total': self.total_tables,
            'per_chapter_year': self.tables_per_chapter_year
        }

    def print_summary(self):
        """Print a formatted summary of extraction statistics."""
        print("\n" + "="*50)
        print("EXTRACTION SUMMARY")
        print("="*50)
        print(f"Total tables extracted: {self.total_tables}")

        if self.tables_per_chapter_year:
            print("\nTables per chapter per year:")
            for chapter in sorted(self.tables_per_chapter_year.keys()):
                print(f"\nChapter {chapter}:")
                for year in sorted(self.tables_per_chapter_year[chapter].keys()):
                    count = self.tables_per_chapter_year[chapter][year]
                    if count > 0:  # Only show years with tables
                        print(f"  {year}: {count}")
        else:
            print("\nNo tables extracted.")
        print("="*50)

    def _identify_continuation_groups(self, summaries):
        """
        Identify groups of tables that should be combined (original + continuations).

        Args:
            summaries: Dictionary of table summaries

        Returns:
            dict: Groups of related tables {original_id: [original_id, continuation_ids...]}
        """
        groups = {}
        continuation_marker = "(המשך)"

        # Sort identifiers to process them in order (important for maintaining sequence)
        sorted_ids = sorted(summaries.keys(), key=lambda x: (
            int(x.split('_')[2]),  # year
            x.split('_')[1],        # chapter
            int(x.split('_')[0])    # serial number
        ))

        current_group_original = None
        current_base_header = None

        for identifier in sorted_ids:
            header = summaries[identifier]

            # Check if this is a continuation
            if continuation_marker in header:
                # Extract base header (without continuation marker)
                base_header = header.replace(continuation_marker, "").strip()

                # If we have a current group and the headers are similar enough, add to group
                if current_group_original and current_base_header:
                    if self._headers_match(base_header, current_base_header):
                        groups[current_group_original].append(identifier)
                    else:
                        print(f"Warning: Orphaned continuation found: {identifier}")
                        print(f"  Looking for base: '{base_header}'")
                        print(f"  Current base: '{current_base_header}'")
            else:
                # This is an original table (not a continuation)
                # Start a new group
                current_group_original = identifier
                current_base_header = header  # Store the full header as the base
                groups[identifier] = [identifier]  # Group starts with the original

        # Filter out groups with only one table (no continuations)
        groups_with_continuations = {k: v for k, v in groups.items() if len(v) > 1}

        return groups_with_continuations

    def _headers_match(self, header1, header2, threshold=0.85):
        """
        Check if two headers are similar enough to be considered the same table.
        Allows for small differences in whitespace, punctuation, etc.

        Args:
            header1: First header string
            header2: Second header string
            threshold: Similarity threshold (0.0 to 1.0)

        Returns:
            bool: True if headers match
        """
        # Normalize headers for comparison
        def normalize(text):
            # Remove extra whitespace, normalize spaces
            text = ' '.join(text.split())
            # Remove common variations
            text = text.replace('\n', ' ').replace('\r', ' ')
            text = text.replace('  ', ' ')
            return text.strip()

        h1_normalized = normalize(header1)
        h2_normalized = normalize(header2)

        # Exact match after normalization
        if h1_normalized == h2_normalized:
            return True

        # Calculate similarity ratio
        # Use character-level similarity
        longer = max(len(h1_normalized), len(h2_normalized))
        if longer == 0:
            return True

        # Simple character matching approach
        matches = 0
        shorter_text = h1_normalized if len(h1_normalized) <= len(h2_normalized) else h2_normalized
        longer_text = h2_normalized if len(h1_normalized) <= len(h2_normalized) else h1_normalized

        # Check if shorter text is substantially contained in longer text
        if shorter_text in longer_text:
            return True

        # Calculate overlap by finding longest common substring ratio
        # For Hebrew text with potential small differences
        words1 = set(h1_normalized.split())
        words2 = set(h2_normalized.split())

        if not words1 or not words2:
            return False

        # Calculate Jaccard similarity of words
        intersection = words1.intersection(words2)
        union = words1.union(words2)

        if not union:
            return False

        similarity = len(intersection) / len(union)

        return similarity >= threshold

    def _combine_csv_files(self, identifiers, summaries):
        """
        Load and combine multiple CSV files into one, removing duplicate headers.

        Args:
            identifiers: List of table identifiers [original, continuation1, ...]
            summaries: Dictionary of table summaries for header detection

        Returns:
            pd.DataFrame: Combined dataframe
        """
        if not identifiers:
            return None

        combined_df = None
        original_id = identifiers[0]
        original_header = summaries[original_id]

        # Parse identifier to get year and chapter
        parts = original_id.split('_')
        year = parts[2]
        chapter = parts[1]

        for i, identifier in enumerate(identifiers):
            # Build path to CSV file
            csv_path = os.path.join(self.out_dir, year, chapter, f"{identifier}.csv")

            if not os.path.exists(csv_path):
                print(f"Warning: CSV file not found: {csv_path}")
                continue

            # Load the CSV
            df = pd.read_csv(csv_path, encoding=self.ENCODING)

            if i == 0:
                # First table (original) - keep everything
                combined_df = df
            else:
                # Continuation table - need to remove duplicate header rows
                # Get the header text from the first table to compare
                if len(combined_df) > 0:
                    # Create a string representation of the first row for comparison
                    original_first_row = combined_df.iloc[0].astype(str).tolist()

                    # Find and skip duplicate header rows in continuation
                    start_idx = 0
                    for idx in range(min(3, len(df))):  # Check first 3 rows max
                        current_row = df.iloc[idx].astype(str).tolist()
                        # Check if this row matches the original header pattern
                        if self._is_header_row(current_row, original_first_row):
                            start_idx = idx + 1
                            break

                    # Append from after the header
                    if start_idx < len(df):
                        combined_df = pd.concat([combined_df, df.iloc[start_idx:]],
                                               ignore_index=True)
                    else:
                        # If all rows were headers, just append everything
                        combined_df = pd.concat([combined_df, df], ignore_index=True)
                else:
                    # Original was empty, just append
                    combined_df = pd.concat([combined_df, df], ignore_index=True)

        return combined_df

    def _is_header_row(self, row, original_header_row):
        """
        Check if a row appears to be a duplicate header row.

        Args:
            row: Row to check (as list of strings)
            original_header_row: Original header row to compare against

        Returns:
            bool: True if this appears to be a duplicate header
        """
        # Check if the rows have significant overlap in content
        # This handles cases where headers might have slight variations
        if len(row) != len(original_header_row):
            return False

        # Count matching cells (allowing for continuation marker)
        matches = 0
        for cell1, cell2 in zip(row, original_header_row):
            # Remove continuation marker for comparison
            cell1_clean = cell1.replace("(המשך)", "").strip()
            cell2_clean = cell2.replace("(המשך)", "").strip()

            if cell1_clean == cell2_clean and cell1_clean:  # Non-empty match
                matches += 1

        # If most cells match, it's likely a header row
        return matches >= len(row) * 0.7  # 70% threshold

    def combine_continuation_tables(self):
        """
        Combine continuation tables with their originals after extraction.
        This should be called after process_files() to merge any continuation tables.

        Returns:
            dict: Information about combined tables
        """
        # Load current summaries
        summary_path = os.path.join(self.out_dir, self.SUMMARY_FILE)
        columns_path = os.path.join(self.out_dir, self.COLUMNS_FILE)

        if not os.path.exists(summary_path):
            print("No summaries file found. Run process_files() first.")
            return {}

        # Load metadata
        with open(summary_path, 'r', encoding='utf-8') as f:
            summaries = json.load(f)

        with open(columns_path, 'r', encoding='utf-8') as f:
            colnames = json.load(f)

        # Identify continuation groups
        groups = self._identify_continuation_groups(summaries)

        if not groups:
            print("No continuation tables found.")
            return {}

        print(f"\nFound {len(groups)} table(s) with continuations to combine...")

        # Track what we combined
        combined_info = {}

        # Process each group
        for original_id, identifier_list in groups.items():
            print(f"\nCombining {original_id} with {len(identifier_list)-1} continuation(s)...")

            # Combine the CSV files
            combined_df = self._combine_csv_files(identifier_list, summaries)

            if combined_df is not None:
                # Parse identifier to get year and chapter
                parts = original_id.split('_')
                year = parts[2]
                chapter = parts[1]

                # Save the combined CSV (overwriting the original)
                save_path = os.path.join(self.out_dir, year, chapter, f"{original_id}.csv")
                combined_df.to_csv(save_path, index=False, encoding=self.ENCODING)

                # Delete continuation CSV files
                for continuation_id in identifier_list[1:]:  # Skip the original
                    continuation_path = os.path.join(self.out_dir, year, chapter, f"{continuation_id}.csv")
                    if os.path.exists(continuation_path):
                        os.remove(continuation_path)
                        print(f"  Removed: {continuation_id}.csv")

                # Track combination info
                combined_info[original_id] = {
                    'parts_combined': len(identifier_list),
                    'continuation_ids': identifier_list[1:],
                    'rows_in_combined': len(combined_df)
                }

                print(f"  Combined table saved as: {original_id}.csv ({len(combined_df)} rows)")

        # Update metadata files (remove continuation entries)
        updated_summaries = {k: v for k, v in summaries.items()
                            if "(המשך)" not in v}
        updated_colnames = {k: v for k, v in colnames.items()
                           if "(המשך)" not in summaries.get(k, "")}

        # Save updated metadata
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(updated_summaries, f, ensure_ascii=False, indent=2)

        with open(columns_path, 'w', encoding='utf-8') as f:
            json.dump(updated_colnames, f, ensure_ascii=False, indent=2)

        # Save combination tracking info
        tracking_path = os.path.join(self.out_dir, "combined_tables_info.json")
        with open(tracking_path, 'w', encoding='utf-8') as f:
            json.dump(combined_info, f, ensure_ascii=False, indent=2)

        print(f"\n✓ Combination complete! Combined {len(groups)} table(s)")
        print(f"  Combination details saved to: combined_tables_info.json")

        # Update statistics to reflect combinations
        self._update_statistics_after_combination(combined_info)

        return combined_info

    def _update_statistics_after_combination(self, combined_info):
        """
        Update statistics after combining continuation tables.

        Args:
            combined_info: Dictionary with combination information
        """
        if not combined_info:
            return

        # Recalculate statistics based on combinations
        for original_id, info in combined_info.items():
            parts = original_id.split('_')
            year = int(parts[2])
            chapter = parts[1]

            # Reduce count by number of continuations removed
            continuations_removed = info['parts_combined'] - 1

            # Update total
            self.total_tables -= continuations_removed

            # Update per-chapter-year statistics
            if chapter in self.tables_per_chapter_year:
                if year in self.tables_per_chapter_year[chapter]:
                    self.tables_per_chapter_year[chapter][year] -= continuations_removed

                    # Clean up if count becomes 0
                    if self.tables_per_chapter_year[chapter][year] <= 0:
                        self.tables_per_chapter_year[chapter][year] = 1  # At least the combined table exists

        print(f"\n📊 Statistics updated: {self.total_tables} unique tables after combination")

In [12]:
# Initialize the extractor. Extracted tables and metadata are written under out_dir;
# NOTE(review): base_dir is presumably where the downloaded reports live - confirm.
extractor = TableExtractor(base_dir="/content/reports", out_dir="/content/tables")

# Process everything (the output below lists each year and the total table count)
extractor.process_files()

2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024

Extraction complete! Total tables: 266


In [13]:
# Combine continuation tables into their originals.
# Must run after process_files(): it reads the summaries file that step produces.
# `combined` maps each original table identifier to details of what was merged.
combined = extractor.combine_continuation_tables()

# Get statistics
# NOTE(review): get_stats() is defined outside this section; presumably it returns
# the table counts that the combination step just adjusted - confirm.
stats = extractor.get_stats()

  Looking for base: 'לידות חי 
לפי יישוב* 
(מספרים מוחלטים) 2000-1995  לוח 1.13'
  Current base: 'לידות חי 
 לפי יישוב* (מספרים)
2000-1995 לוח 1.13'
  Looking for base: 'לידות חי 
לפי יישוב* 
(מספרים מוחלטים) 2001-1995  לוח 1.11'
  Current base: 'לידות חי 
 לפי יישוב* (מספרים)
2001-1995 לוח 1.11'
  Looking for base: 'לידות חי 
לפי יישוב* 
(מספרים מוחלטים) 2001-1995  לוח 1.11'
  Current base: 'לידות חי 
 לפי יישוב* (מספרים)
2001-1995 לוח 1.11'
  Looking for base: 'מספר הילדים לפי גיל וחלקם באוכלוסיית היישובים* 
(אלפים ואחוזים)
סוףשנת2003  לוח 1.6'
  Current base: 'מספר הילדים לפי גיל וחלקם באוכלוסיית היישובים* 
(אלפים ואחוזים)
סוף שנת 2003 לוח 1.6'
  Looking for base: 'מספר הילדים 
לפי גיל וחלקם באוכלוסיית היישוב* (אלפים ואחוזים)
דצמבר 2011  לוח 1.5'
  Current base: 'מספר הילדים  להוסיף את דייר אל אסד ב10000+
לפי גיל וחלקם באוכלוסיית היישוב* (מספרים ואחוזים)
דצמבר 2011 לוח 1.5'
  Looking for base: 'מספר הילדים 
לפי גיל וחלקם באוכלוסיית היישוב* (אלפים ואחוזים)
דצמבר 2011  לוח 1.5'
  Curr

In [14]:
# Print summary: total number of tables and the per-chapter, per-year breakdown
extractor.print_summary()


EXTRACTION SUMMARY
Total tables extracted: 197

Tables per chapter per year:

Chapter 01.docx:
  2001: 23
  2002: 19
  2003: 14
  2004: 16
  2005: 12
  2006: 14
  2007: 15
  2008: 20
  2009: 15
  2010: 14
  2011: 14
  2012: 31
  2014: 14
  2015: 13
  2017: 2
  2018: 2
  2019: 11
  2020: 1
  2021: 1
  2022: 13
  2023: 1
  2024: 1
