<a href="https://colab.research.google.com/github/shiragelb/NCC-Statistical-Reports/blob/main/Table_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!apt-get install pandoc
!pip install pypandoc
!pip install python-docx
!pip install docx2txt

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pandoc is already the newest version (2.9.2.1-3ubuntu2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


# Data extraction from Google Drive

In [16]:
from google.colab import auth
from googleapiclient.discovery import build
import pandas as pd
import os
import io
from googleapiclient.http import MediaIoBaseDownload
import logging

# Shared logger for this notebook; INFO is the lowest severity it lets through.
logger = logging.getLogger('__main__')
logger.setLevel('INFO')


class GoogleDriveManager:
    """
    Manages Google Drive operations for one folder: listing, filtering,
    downloading, and summarizing the files it contains.

    File listings are cached in ``files_df`` (a DataFrame with the columns
    ``file_name``, ``file_path``, ``file_id`` and ``file_url``), where
    ``file_path`` is the '/'-joined path relative to the managed folder.
    """

    def __init__(self, folder_id):
        """
        Initialize the GoogleDriveManager with authentication and folder ID.

        Args:
            folder_id: The Google Drive folder ID to work with

        Raises:
            Exception: Whatever the authentication step raises is propagated.
        """
        self.folder_id = folder_id
        self.drive_service = None
        self.files_df = None  # Cache for file listings

        # Authenticate and build service
        self._authenticate()

    def _authenticate(self):
        """Authenticate with Google Drive and build the service object.

        Raises:
            Exception: Re-raised (after logging) if authentication or building
                the Drive v3 service fails.
        """
        try:
            auth.authenticate_user()
            self.drive_service = build('drive', 'v3')
            logger.info("✅ Successfully authenticated with Google Drive")
        except Exception as e:
            logger.error(f"❌ Authentication failed: {e}")
            raise

    def list_all_files(self, force_refresh=False):
        """
        Recursively list all files in the folder and subfolders.

        Args:
            force_refresh: If True, force a new listing even if cached data exists

        Returns:
            pd.DataFrame: DataFrame with columns [file_name, file_path, file_id, file_url]
        """
        if self.files_df is not None and not force_refresh:
            logger.info("📋 Using cached file list")
            return self.files_df

        logger.info("🔍 Listing all files in folder...")
        all_files = self._list_files_recursive(self.folder_id)

        # Convert to DataFrame
        if all_files:
            self.files_df = pd.DataFrame(all_files)

            # Deduplicate by folder+name (file_path already encodes folder)
            self.files_df = self.files_df.drop_duplicates(
                subset=["file_path", "file_name"], keep="first"
            )

            logger.info(f"✅ Found {len(self.files_df)} unique files")
        else:
            self.files_df = pd.DataFrame(columns=['file_name', 'file_path', 'file_id', 'file_url'])
            logger.info("📁 No files found in folder")

        return self.files_df

    def _list_files_recursive(self, parent_id, parent_path=""):
        """
        Recursively list files in a folder.

        Listing is best-effort: if a request fails, the error is logged and
        whatever was collected for this folder so far is returned.

        Args:
            parent_id: Google Drive folder ID
            parent_path: Path string for tracking folder hierarchy

        Returns:
            list: List of file dictionaries (file_name, file_path, file_id, file_url)
        """
        all_files = []
        query = f"'{parent_id}' in parents and trashed=false"
        page_token = None

        while True:
            try:
                response = self.drive_service.files().list(
                    q=query,
                    spaces='drive',
                    fields='nextPageToken, files(id, name, mimeType)',
                    pageToken=page_token
                ).execute()

                for item in response.get('files', []):
                    item_path = f"{parent_path}/{item['name']}" if parent_path else item['name']

                    if item['mimeType'] == 'application/vnd.google-apps.folder':
                        # Recurse into subfolder
                        all_files.extend(self._list_files_recursive(item['id'], item_path))
                    else:
                        all_files.append({
                            "file_name": item['name'],
                            "file_path": item_path,
                            "file_id": item['id'],
                            "file_url": f"https://drive.google.com/file/d/{item['id']}/view?usp=sharing"
                        })

                # Keep paging until the API stops returning a continuation token
                page_token = response.get('nextPageToken', None)
                if page_token is None:
                    break

            except Exception as e:
                logger.error(f"❌ Error listing files in {parent_path}: {e}")
                break

        return all_files

    def filter_files(self, df=None, years=None, chapters=None):
        """
        Filter files based on specified years and chapters using exact matching.

        A file matches a year when one of the folders in its ``file_path``
        is named exactly that year; it matches a chapter when its
        ``file_name`` is exactly the zero-padded chapter number plus
        ``.docx`` (e.g. ``01.docx``).

        Args:
            df: DataFrame to filter (if None, uses cached files_df)
            years: List of years to include (e.g., [2021, 2022, 2023])
            chapters: List of chapter numbers to include (e.g., [1, 2, 5, 10])

        Returns:
            pd.DataFrame: Filtered DataFrame containing only requested files.
                The input columns are always preserved, even when nothing matches.
        """
        # Use provided df or cached one
        if df is None:
            if self.files_df is None:
                logger.warning("⚠️ No files listed yet. Running list_all_files() first.")
                self.list_all_files()
            df = self.files_df.copy()
        else:
            df = df.copy()

        if df.empty:
            logger.warning("⚠️ No files to filter")
            return df

        # Apply year filter
        if years is not None:
            year_folders = {str(year) for year in years}
            # Exact match: year must be a folder in the path (the last path
            # component is the file itself, so it is excluded).
            # An explicitly boolean mask keeps the columns intact even when
            # the frame is empty.
            year_mask = pd.Series(
                [not year_folders.isdisjoint(path.split('/')[:-1]) for path in df['file_path']],
                index=df.index,
                dtype=bool,
            )
            df = df[year_mask]
            logger.info(f"📅 Filtered for years: {years} - {len(df)} files")

        # Apply chapter filter
        if chapters is not None:
            # Exact match for filename pattern: 01.docx, 02.docx, etc.
            chapter_filenames = {f"{ch:02d}.docx" for ch in chapters}
            # isin() always yields a boolean mask, so an empty frame (e.g. a
            # year with no files) is filtered without losing its columns.
            df = df[df['file_name'].isin(chapter_filenames)]
            logger.info(f"📖 Filtered for chapters: {chapters} - {len(df)} files")

        return df

    def download_files(self, filtered_df, download_dir="/content/reports"):
        """
        Download files from a filtered DataFrame.

        Each file is stored as ``<download_dir>/<first path folder>/<file_name>``
        (or directly in ``download_dir`` for files at the top level). Files from
        different sub-folders can therefore map to the same local path; the
        later one replaces the earlier one and a warning is logged.

        Data is streamed to a temporary ``.part`` file that is renamed only
        after the download completes, so a failed download never leaves a
        truncated file at the final path. Failures are logged and skipped.

        Args:
            filtered_df: DataFrame containing files to download
                (needs the columns file_id, file_name and file_path)
            download_dir: Base directory for downloads

        Returns:
            dict: Dictionary mapping file paths to local paths
        """
        if filtered_df is None or filtered_df.empty:
            logger.warning("⚠️ No files to download")
            return {}

        downloaded_files = {}
        written_paths = {}  # local path -> Drive path that produced it in this run
        total_files = len(filtered_df)

        logger.info(f"📥 Starting download of {total_files} files...")

        for _, row in filtered_df.iterrows():
            file_id = row['file_id']
            file_name = row['file_name']
            file_path = row['file_path']

            # Extract year from path (assuming structure: year/filename)
            path_parts = file_path.split('/')
            if len(path_parts) >= 2:
                year = path_parts[0]
                local_path = os.path.join(download_dir, year, file_name)
            else:
                local_path = os.path.join(download_dir, file_name)

            if local_path in written_paths:
                logger.warning(
                    f"⚠️ {file_path} overwrites {written_paths[local_path]} at {local_path}"
                )

            # Ensure directory exists (dirname is '' for a bare file name)
            target_dir = os.path.dirname(local_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            partial_path = local_path + ".part"
            try:
                # Download file
                request = self.drive_service.files().get_media(fileId=file_id)
                # The context manager guarantees the handle is closed even
                # when a chunk request fails.
                with io.FileIO(partial_path, "wb") as fh:
                    downloader = MediaIoBaseDownload(fh, request)

                    done = False
                    while not done:
                        status, done = downloader.next_chunk()
                        if status:
                            progress = int(status.progress() * 100)
                            print(f"⬇️  Downloading {file_name}: {progress}%", end='\r')

                # Publish the finished file in one step
                os.replace(partial_path, local_path)

                logger.info(f"✅ Downloaded {file_name} to {local_path}")
                downloaded_files[file_path] = local_path
                written_paths[local_path] = file_path

            except Exception as e:
                logger.warning(f"⚠️ Failed to download {file_name}: {e}")
                try:
                    os.remove(partial_path)
                except OSError:
                    pass  # nothing was written, or it is already gone
                continue

        logger.info(f"✅ Download complete: {len(downloaded_files)}/{total_files} files")
        return downloaded_files

    def download_selective(self, years=None, chapters=None, download_dir="/content/reports"):
        """
        Convenience method to list, filter, and download files in one operation.

        Args:
            years: List of years to download (e.g., [2021, 2022, 2023])
            chapters: List of chapter numbers to download (e.g., [1, 2, 5, 10])
            download_dir: Base directory for downloads

        Returns:
            dict: Dictionary mapping file paths to local paths

        Example:
            # Download chapters 1-5 for years 2021-2023
            manager.download_selective(
                years=range(2021, 2024),
                chapters=range(1, 6),
                download_dir="/content/reports"
            )
        """
        # Step 1: List all files
        logger.info("🚀 Starting selective download workflow...")
        all_files = self.list_all_files()

        # Step 2: Filter files
        filtered_files = self.filter_files(all_files, years=years, chapters=chapters)

        if filtered_files is None or filtered_files.empty:
            logger.warning("⚠️ No files match the specified criteria")
            return {}

        logger.info(f"📊 Found {len(filtered_files)} files matching criteria")

        # Step 3: Download filtered files
        downloaded = self.download_files(filtered_files, download_dir)

        return downloaded

    def get_summary(self, df=None):
        """
        Get summary statistics about the files.

        Args:
            df: DataFrame to summarize (if None, uses cached files_df)

        Returns:
            dict: Summary statistics with the keys total_files, years (sorted
                first-folder names), year_count, chapters (sorted ints),
                chapter_count and file_types (extension -> count). The same
                keys are present when there are no files.
        """
        if df is None:
            if self.files_df is None:
                logger.warning("⚠️ No files listed yet. Running list_all_files() first.")
                self.list_all_files()
            df = self.files_df

        if df is None or df.empty:
            return {
                "total_files": 0,
                "years": [],
                "year_count": 0,
                "chapters": [],
                "chapter_count": 0,
                "file_types": {},
            }

        # Extract years from paths: the first folder of every nested file
        years = sorted({path.split('/')[0] for path in df['file_path'] if '/' in path})

        # Extract chapters from filenames (assuming pattern: 01.docx, 02.docx).
        # Built in plain Python so the numbers stay ints; going through a
        # pandas column with missing values would turn them into floats.
        chapters = sorted({
            int(name[:2])
            for name in df['file_name']
            if name[:2].isdigit() and name.endswith('.docx')
        })

        summary = {
            "total_files": len(df),
            "years": years,
            "year_count": len(years),
            "chapters": chapters,
            "chapter_count": len(chapters),
            "file_types": df['file_name'].apply(lambda x: x.split('.')[-1]).value_counts().to_dict()
        }

        return summary

    def preview_files(self, df=None, n=10):
        """
        Preview first n files from the DataFrame by logging their paths.

        Args:
            df: DataFrame to preview (if None, uses cached files_df)
            n: Number of files to preview
        """
        if df is None:
            if self.files_df is None:
                logger.warning("⚠️ No files listed yet. Running list_all_files() first.")
                self.list_all_files()
            df = self.files_df

        if df is None or df.empty:
            logger.info("No files to preview")
            return

        preview = df.head(n)[['file_name', 'file_path']]
        logger.info(f"\n📋 Preview of first {min(n, len(df))} files:")
        for path in preview['file_path']:
            logger.info(f"  {path}")

    def check_missing_files(self, years, chapters):
        """
        Check which year/chapter combinations are missing.

        Uses the same matching rules as filter_files() against the cached
        listing (which is fetched first if it does not exist yet).

        Args:
            years: List of years to check
            chapters: List of chapter numbers to check

        Returns:
            list: List of missing (year, chapter) tuples
        """
        if self.files_df is None:
            self.list_all_files()

        missing = []

        for year in years:
            for chapter in chapters:
                # Check if this combination exists
                filtered = self.filter_files(
                    self.files_df,
                    years=[year],
                    chapters=[chapter]
                )

                if filtered is None or filtered.empty:
                    missing.append((year, chapter))
                    logger.warning(f"⚠️ Missing: Year {year}, Chapter {chapter:02d}")

        if missing:
            logger.info(f"📊 Total missing files: {len(missing)}")
        else:
            logger.info("✅ All requested files are present")

        return missing

In [17]:
# Connect to the Drive folder that holds the reports (prompts for Colab authentication)
folder_id = "1e0eA-AIsz_BSwVHOppJMXECX42hBfG4J"
manager = GoogleDriveManager(folder_id)

# Fetch chapters 1-15 for every year from 2001 through 2024
downloaded = manager.download_selective(years=range(2001, 2025), chapters=range(1, 16))

print(f"Downloaded {len(downloaded)} files")

INFO:__main__:✅ Successfully authenticated with Google Drive
INFO:__main__:🚀 Starting selective download workflow...
INFO:__main__:🔍 Listing all files in folder...
INFO:__main__:✅ Found 896 unique files
INFO:__main__:📅 Filtered for years: range(2001, 2025) - 895 files
INFO:__main__:📖 Filtered for chapters: range(1, 16) - 374 files
INFO:__main__:📊 Found 374 files matching criteria
INFO:__main__:📥 Starting download of 374 files...
INFO:__main__:✅ Downloaded 04.docx to /content/reports/2016/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2016/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2016/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2016/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2016/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2016/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2016/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2016/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2016/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2016/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2016/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2016/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2020/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2020/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2020/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2020/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2020/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2020/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2020/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2020/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2020/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2020/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2020/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2020/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2020/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2020/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2015/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2015/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2015/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2015/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2015/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2015/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2015/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2015/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2015/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2015/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2015/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2015/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2015/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2015/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2015/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2004/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2004/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2004/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2004/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2004/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2004/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2004/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2004/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2004/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2004/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2004/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2004/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2004/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2003/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2003/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2003/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2003/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2003/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2003/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2003/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2003/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2003/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2003/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2003/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2003/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2003/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2003/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2003/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2018/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2018/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2018/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2018/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2018/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2018/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2018/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2018/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2018/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2018/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2018/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2018/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2018/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2018/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2019/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2019/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2019/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2019/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2019/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2019/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2019/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2019/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2019/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2019/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2019/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2019/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2019/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2019/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2021/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2021/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2021/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2021/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2021/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2021/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2021/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2021/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2021/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2021/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2021/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2021/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2021/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2021/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2021/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2021/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2021/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2021/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2021/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2021/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2021/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2021/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2021/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2002/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2002/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2002/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2002/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2002/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2002/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2002/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2002/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2002/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2002/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2002/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2002/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2002/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2002/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2002/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2005/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2005/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2005/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2005/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2005/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2005/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2005/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2005/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2005/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2005/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2005/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2005/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2005/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2005/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2010/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2010/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2010/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2010/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2010/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2010/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2010/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2010/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2010/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2010/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2010/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2010/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2010/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2010/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2010/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2011/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2011/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2011/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2011/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2011/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2011/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2011/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2011/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2011/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2011/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2011/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2011/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2011/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2011/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2011/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2009/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2009/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2009/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2009/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2009/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2009/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2009/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2009/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2009/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2009/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2009/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2009/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2009/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2009/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2017/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2017/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2017/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2017/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2017/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2017/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2017/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2017/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2017/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2017/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2017/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2017/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2017/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2017/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2017/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2013/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2013/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2013/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2013/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2013/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2013/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2013/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2013/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2013/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2013/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2013/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2013/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2013/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2013/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2006/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2006/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2006/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2006/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2006/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2006/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2006/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2006/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2006/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2006/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2006/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2006/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2006/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2006/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2006/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2007/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2007/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2007/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2007/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2007/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2007/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2007/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2007/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2007/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2007/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2007/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2007/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2007/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2007/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2007/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2008/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2008/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2008/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2008/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2008/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2008/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2008/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2008/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2008/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2008/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2008/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2008/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2008/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2008/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2008/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2001/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2001/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2001/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2001/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2001/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2001/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2001/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2001/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2001/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2001/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2001/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2001/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2001/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2001/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2001/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2024/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2024/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2024/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2024/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2024/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2024/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2024/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2024/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2024/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2024/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2024/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2024/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2024/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2024/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2024/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2024/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2024/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2024/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2024/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2024/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2024/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2024/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2024/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2024/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2024/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2024/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2024/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2022/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2022/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2022/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2022/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2022/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2022/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2022/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2022/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2022/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2022/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2022/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2022/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2022/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2022/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2023/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2023/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2023/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2023/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2023/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2023/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2023/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2023/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2023/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2023/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2023/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2023/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2023/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2023/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2023/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2023/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2023/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2023/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2023/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2023/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2023/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2023/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2012/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2012/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 15.docx to /content/reports/2012/15.docx


⬇️  Downloading 15.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2012/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2012/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2012/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2012/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2012/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2012/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2012/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2012/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2012/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2012/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2012/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2012/06.docx


⬇️  Downloading 06.docx: 100%

INFO:__main__:✅ Downloaded 11.docx to /content/reports/2014/11.docx


⬇️  Downloading 11.docx: 100%

INFO:__main__:✅ Downloaded 01.docx to /content/reports/2014/01.docx


⬇️  Downloading 01.docx: 100%

INFO:__main__:✅ Downloaded 14.docx to /content/reports/2014/14.docx


⬇️  Downloading 14.docx: 100%

INFO:__main__:✅ Downloaded 02.docx to /content/reports/2014/02.docx


⬇️  Downloading 02.docx: 100%

INFO:__main__:✅ Downloaded 09.docx to /content/reports/2014/09.docx


⬇️  Downloading 09.docx: 100%

INFO:__main__:✅ Downloaded 07.docx to /content/reports/2014/07.docx


⬇️  Downloading 07.docx: 100%

INFO:__main__:✅ Downloaded 13.docx to /content/reports/2014/13.docx


⬇️  Downloading 13.docx: 100%

INFO:__main__:✅ Downloaded 04.docx to /content/reports/2014/04.docx


⬇️  Downloading 04.docx: 100%

INFO:__main__:✅ Downloaded 12.docx to /content/reports/2014/12.docx


⬇️  Downloading 12.docx: 100%

INFO:__main__:✅ Downloaded 05.docx to /content/reports/2014/05.docx


⬇️  Downloading 05.docx: 100%

INFO:__main__:✅ Downloaded 08.docx to /content/reports/2014/08.docx


⬇️  Downloading 08.docx: 100%

INFO:__main__:✅ Downloaded 10.docx to /content/reports/2014/10.docx


⬇️  Downloading 10.docx: 100%

INFO:__main__:✅ Downloaded 03.docx to /content/reports/2014/03.docx


⬇️  Downloading 03.docx: 100%

INFO:__main__:✅ Downloaded 06.docx to /content/reports/2014/06.docx
INFO:__main__:✅ Download complete: 374/374 files


⬇️  Downloading 06.docx: 100%Downloaded 374 files


# Data extraction class

In [18]:
import os
import json
import pandas as pd
from docx import Document


class TableExtractor:
    """Simple class for extracting tables from Word documents with statistics tracking."""

    def __init__(self, base_dir="/content/reports", out_dir="/content/tables"):
        """Initialize the extractor with directories and statistics."""
        self.base_dir = base_dir
        self.out_dir = out_dir

        # Configuration constants
        self.YEAR_RANGE = (2001, 2025)
        self.VALID_EXTENSION = ".docx"
        self.TABLE_MARKER = "לוח"  # Hebrew for "table"
        self.EXCLUDE_MARKER = "תרשים"  # Hebrew for "diagram" - exclude these
        self.ENCODING = "utf-8-sig"
        self.SUMMARY_FILE = "tables_summary.json"
        self.COLUMNS_FILE = "tables_columns.json"

        # Metadata collectors
        self.all_summaries = {}
        self.all_colnames = {}

        # Create output directory
        os.makedirs(self.out_dir, exist_ok=True)

    def _is_valid_table(self, table):
        """
        Check if a table is valid (contains Hebrew table marker in first row).

        Args:
            table: A docx table object

        Returns:
            tuple: (is_valid: bool, table_name: str)
        """
        if len(table.rows) <= 1:
            return False, ""

        # Check first row cells for table marker
        for cell in table.rows[0].cells:
            cell_text = cell.text
            if self.TABLE_MARKER in cell_text and self.EXCLUDE_MARKER not in cell_text:
                return True, cell_text.strip()

        return False, ""

    def _extract_table_data(self, table):
        """
        Extract data from a docx table and convert to DataFrame.

        Args:
            table: A docx table object

        Returns:
            pd.DataFrame: Table data as a DataFrame
        """
        data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
        return pd.DataFrame(data)

    def _save_table_data(self, df, identifier, year, chapter):
        """
        Save DataFrame as CSV file in the appropriate directory structure.

        Args:
            df: pandas DataFrame to save
            identifier: Unique identifier for the table
            year: Year of the document
            chapter: Chapter identifier

        Returns:
            str: Path where the file was saved
        """
        save_dir = os.path.join(self.out_dir, str(year), chapter)
        os.makedirs(save_dir, exist_ok=True)

        save_path = os.path.join(save_dir, f"{identifier}.csv")
        df.to_csv(save_path, index=False, encoding=self.ENCODING)

        return save_path

    def _process_document(self, fpath, year, chapter):
        """
        Process a single Word document and extract all valid tables.

        Args:
            fpath: Full path to the document
            year: Year of the document
            chapter: Chapter identifier from filename

        Returns:
            int: Number of tables extracted from this document
        """
        summary = {}
        colnames_map = {}
        tables_extracted = 0

        try:
            doc = Document(fpath)
        except Exception as e:
            print(f"skip {fpath}: {e}")
            return 0

        serial = 1

        for table in doc.tables:
            # Validate table
            is_valid, table_name = self._is_valid_table(table)
            if not is_valid:
                continue

            # Extract data
            df = self._extract_table_data(table)

            # Skip empty tables
            if len(df) == 0:
                continue

            # Create identifier
            chapter = chapter.replace(".docx", "")
            identifier = f"{serial}_{chapter}_{year}"

            # Record mapping for JSON
            if len(df) > 0:
                # Deduplicate consecutive repeated text in header
                header_cells = df.iloc[0].astype(str).tolist()
                unique_header = []
                for cell in header_cells:
                    if not unique_header or cell != unique_header[-1]:
                        unique_header.append(cell)
                summary[identifier] = " ".join(unique_header)
            else:
                continue

            # Combine rows [1] and [2] for column names
            if len(df) > 2:
                row1 = df.iloc[1].astype(str).tolist()
                row2 = df.iloc[2].astype(str).tolist()
                colnames_map[identifier] = [f"{r1} {r2}".strip() for r1, r2 in zip(row1, row2)]
            elif len(df) > 1:
                colnames_map[identifier] = df.iloc[1].astype(str).tolist()
            else:
                colnames_map[identifier] = []

            # Save to CSV
            self._save_table_data(df, identifier, year, chapter)

            serial += 1

        # Update metadata collectors
        self.all_summaries.update(summary)
        self.all_colnames.update(colnames_map)

    def _save_metadata(self):
        """Save summary and column metadata to JSON files."""
        with open(os.path.join(self.out_dir, self.SUMMARY_FILE), "w", encoding="utf-8") as f:
            json.dump(self.all_summaries, f, ensure_ascii=False, indent=2)

        with open(os.path.join(self.out_dir, self.COLUMNS_FILE), "w", encoding="utf-8") as f:
            json.dump(self.all_colnames, f, ensure_ascii=False, indent=2)

    def process_files(self, years=None, chapters=None):
        """
        Process Word documents filtered by years and chapters.

        Args:
            years: List/range of years to process (None = all years in YEAR_RANGE)
            chapters: List of chapter identifiers to process (None = all chapters)

        Example:
            extractor.process_files()  # Process all files
            extractor.process_files(years=[2023, 2024])  # Specific years
            extractor.process_files(chapters=['1', '2', '3'])  # Specific chapters
            extractor.process_files(years=range(2020, 2025), chapters=['1', '2'])  # Both
        """
        # Reset statistics for new extraction session
        self.all_summaries = {}
        self.all_colnames = {}

        # Determine which years to process
        if years is None:
            years_to_process = range(*self.YEAR_RANGE)
        else:
            years_to_process = years

        # Convert chapters to set for faster lookup (if provided)
        chapters_to_process = set(map(str, chapters)) if chapters else None

        # Process each year
        for year in years_to_process:
            print(year)
            year_path = os.path.join(self.base_dir, str(year))

            if not os.path.isdir(year_path):
                continue

            # Process each document in the year directory
            for fname in os.listdir(year_path):
                if not fname.endswith(self.VALID_EXTENSION):
                    continue

                # Extract chapter from filename
                chapter = fname.split("_")[0]

                # Skip if not in chapters to process
                if chapters_to_process and chapter not in chapters_to_process:
                    continue

                fpath = os.path.join(year_path, fname)

                # Process the document
                self._process_document(fpath, year, chapter)

        # Save consolidated metadata
        self._save_metadata()

    def _identify_continuation_groups(self, summaries):
        """
        Identify groups of tables that should be combined (original + continuations).
        Groups are formed by sequential position - any table with "(המשך)" belongs
        to the most recent table without "(המשך)".

        Args:
            summaries: Dictionary of table summaries

        Returns:
            dict: Groups of related tables {original_id: [original_id, continuation_ids...]}
        """
        groups = {}
        continuation_marker = "(המשך)"

        # Sort identifiers to process them in order (important for maintaining sequence)
        sorted_ids = sorted(summaries.keys(), key=lambda x: (
            int(x.split('_')[2]),  # year
            x.split('_')[1],        # chapter
            int(x.split('_')[0])    # serial number
        ))

        current_group_original = None

        for identifier in sorted_ids:
            header = summaries[identifier]

            # Check if this is a continuation
            if continuation_marker in header:
                # This is a continuation - add to current group
                if current_group_original:
                    groups[current_group_original].append(identifier)
                else:
                    # This shouldn't happen - continuation without an original
                    print(f"Warning: Continuation table found without a preceding original: {identifier}")
            else:
                # This is an original table (not a continuation)
                # Start a new group
                current_group_original = identifier
                groups[identifier] = [identifier]  # Group starts with the original

        # Filter out groups with only one table (no continuations)
        groups_with_continuations = {k: v for k, v in groups.items() if len(v) > 1}

        return groups_with_continuations

    def _combine_csv_files(self, identifiers, summaries):
        """
        Load and combine multiple CSV files into one, removing duplicate headers.

        Args:
            identifiers: List of table identifiers [original, continuation1, ...]
            summaries: Dictionary of table summaries (not used in simplified version)

        Returns:
            pd.DataFrame: Combined dataframe
        """
        if not identifiers:
            return None

        combined_df = None
        original_id = identifiers[0]

        # Parse identifier to get year and chapter
        parts = original_id.split('_')
        year = parts[2]
        chapter = parts[1]

        for i, identifier in enumerate(identifiers):
            # Build path to CSV file
            csv_path = os.path.join(self.out_dir, year, chapter, f"{identifier}.csv")

            if not os.path.exists(csv_path):
                print(f"Warning: CSV file not found: {csv_path}")
                continue

            # Load the CSV
            df = pd.read_csv(csv_path, encoding=self.ENCODING)

            if i == 0:
                # First table (original) - keep everything
                combined_df = df
            else:
                # Continuation table - skip first row (the title row)
                if len(df) > 1:
                    combined_df = pd.concat([combined_df, df.iloc[1:]],
                                           ignore_index=True)
                else:
                    # If continuation only has header, skip it entirely
                    print(f"  Note: Continuation {identifier} has no data rows")

        return combined_df

    def combine_continuation_tables(self):
        """
        Combine continuation tables with their originals after extraction.

        This should be called after process_files(). For every group returned
        by ``_identify_continuation_groups`` the combined CSV overwrites the
        original table's file and the continuation CSV files are deleted.
        Metadata entries whose summary contains the continuation marker are
        then dropped, the remaining tables are renumbered 1..N within each
        chapter-year, the CSV files are renamed to match, and the updated
        metadata is written back. Details of what was merged are saved to
        ``combined_tables_info.json`` in the output directory.

        Identifiers that do not follow the "serial_chapter_year" pattern
        (fewer than three parts, or a non-integer serial) keep their existing
        key in the metadata instead of being discarded. A missing column-name
        file is reported and treated as empty rather than aborting the merge.

        Returns:
            dict: Maps each combined table's renumbered identifier to a dict
            with 'parts_combined', 'continuation_ids' (identifiers as they
            were before renumbering) and 'rows_in_combined'. Empty when there
            is no summary file or no continuation groups were found.
        """
        # Substring that marks a table summary as a continuation.
        continuation_marker = "(המשך)"

        summary_path = os.path.join(self.out_dir, self.SUMMARY_FILE)
        columns_path = os.path.join(self.out_dir, self.COLUMNS_FILE)

        if not os.path.exists(summary_path):
            print("No summaries file found. Run process_files() first.")
            return {}

        # Load metadata
        with open(summary_path, 'r', encoding='utf-8') as f:
            summaries = json.load(f)

        # Only the summary file is required; without column names the merge
        # can still proceed, so fall back to an empty mapping.
        has_columns_file = os.path.exists(columns_path)
        if has_columns_file:
            with open(columns_path, 'r', encoding='utf-8') as f:
                colnames = json.load(f)
        else:
            print(f"Warning: columns file not found: {columns_path}")
            colnames = {}

        # Identify continuation groups
        groups = self._identify_continuation_groups(summaries)

        if not groups:
            print("No continuation tables found.")
            return {}

        print(f"\nFound {len(groups)} table(s) with continuations to combine...")

        # Track what we combined
        combined_info = {}

        for original_id, identifier_list in groups.items():
            print(f"\nCombining {original_id} with {len(identifier_list)-1} continuation(s)...")

            combined_df = self._combine_csv_files(identifier_list, summaries)
            if combined_df is None:
                continue

            # Identifier layout is "serial_chapter_year"; CSVs live under
            # <out_dir>/<year>/<chapter>/.
            parts = original_id.split('_')
            table_dir = os.path.join(self.out_dir, parts[2], parts[1])

            # Save the combined CSV (overwriting the original)
            save_path = os.path.join(table_dir, f"{original_id}.csv")
            combined_df.to_csv(save_path, index=False, encoding=self.ENCODING)

            # Delete continuation CSV files (the first entry is the original)
            for continuation_id in identifier_list[1:]:
                continuation_path = os.path.join(table_dir, f"{continuation_id}.csv")
                if os.path.exists(continuation_path):
                    os.remove(continuation_path)
                    print(f"  Removed: {continuation_id}.csv")

            combined_info[original_id] = {
                'parts_combined': len(identifier_list),
                'continuation_ids': identifier_list[1:],
                'rows_in_combined': len(combined_df)
            }

            print(f"  Combined table saved as: {original_id}.csv ({len(combined_df)} rows)")

        # Remove continuation entries from metadata
        summaries_without_continuations = {
            k: v for k, v in summaries.items()
            if continuation_marker not in v
        }
        colnames_without_continuations = {
            k: v for k, v in colnames.items()
            if continuation_marker not in summaries.get(k, "")
        }

        # Renumber tables sequentially per chapter-year
        print("\nRenumbering tables sequentially...")

        grouped = {}    # (chapter, year) -> [(serial, identifier), ...]
        unparsed = []   # identifiers that cannot be renumbered; kept verbatim
        for identifier in summaries_without_continuations:
            parts = identifier.split('_')
            try:
                serial = int(parts[0]) if len(parts) >= 3 else None
            except ValueError:
                serial = None
            if serial is None:
                unparsed.append(identifier)
                continue
            grouped.setdefault((parts[1], parts[2]), []).append((serial, identifier))

        new_summaries = {}
        new_colnames = {}
        rename_map = {}  # Track old -> new identifier mapping

        for (chapter, year), entries in grouped.items():
            # Stable sort on the numeric serial keeps the original table order.
            entries.sort(key=lambda entry: entry[0])

            for new_serial, (_, old_identifier) in enumerate(entries, start=1):
                new_identifier = f"{new_serial}_{chapter}_{year}"
                rename_map[old_identifier] = new_identifier

                new_summaries[new_identifier] = summaries_without_continuations[old_identifier]
                if old_identifier in colnames_without_continuations:
                    new_colnames[new_identifier] = colnames_without_continuations[old_identifier]

        # Carry over entries that could not be parsed so no metadata is lost.
        for identifier in unparsed:
            new_summaries[identifier] = summaries_without_continuations[identifier]
            if identifier in colnames_without_continuations:
                new_colnames[identifier] = colnames_without_continuations[identifier]

        # Rename CSV files
        for old_id, new_id in rename_map.items():
            if old_id == new_id:
                continue

            parts_old = old_id.split('_')
            table_dir = os.path.join(self.out_dir, parts_old[2], parts_old[1])
            old_path = os.path.join(table_dir, f"{old_id}.csv")
            new_path = os.path.join(table_dir, f"{new_id}.csv")

            if os.path.exists(old_path):
                os.rename(old_path, new_path)
                print(f"  Renamed: {old_id}.csv -> {new_id}.csv")

        # Update combined_info with new identifiers
        updated_combined_info = {
            rename_map.get(old_id, old_id): info
            for old_id, info in combined_info.items()
        }

        # Save updated metadata with sequential numbering
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(new_summaries, f, ensure_ascii=False, indent=2)

        # Do not create a columns file that was not there to begin with.
        if has_columns_file:
            with open(columns_path, 'w', encoding='utf-8') as f:
                json.dump(new_colnames, f, ensure_ascii=False, indent=2)

        # Save combination tracking info
        tracking_path = os.path.join(self.out_dir, "combined_tables_info.json")
        with open(tracking_path, 'w', encoding='utf-8') as f:
            json.dump(updated_combined_info, f, ensure_ascii=False, indent=2)

        print(f"\n✓ Combination complete! Combined {len(groups)} table(s)")
        print("  Tables renumbered sequentially per chapter-year")
        print("  Combination details saved to: combined_tables_info.json")

        return updated_combined_info

    def calculate_statistics(self):
        """
        Count the tables recorded in the summary JSON file.

        The summary file is read from the output directory and its keys are
        treated as identifiers of the form "serial_chapter_year". Keys with
        fewer than three underscore-separated parts still count towards the
        total but are left out of the per-chapter breakdown.

        Returns:
            dict: ``{'total': int, 'per_chapter_year': {chapter: {year: count}}}``
            with chapter as a string and year as an int. If the summary file
            does not exist, the total is 0 and the breakdown is empty.
        """
        summary_file = os.path.join(self.out_dir, self.SUMMARY_FILE)
        if not os.path.exists(summary_file):
            return {'total': 0, 'per_chapter_year': {}}

        with open(summary_file, 'r', encoding='utf-8') as handle:
            table_summaries = json.load(handle)

        table_count = len(table_summaries)
        breakdown = {}

        for table_id in table_summaries.keys():
            fields = table_id.split('_')
            if len(fields) < 3:
                continue

            chapter_key = fields[1]
            year_key = int(fields[2])

            # Create the chapter bucket on first sight, then bump the year tally.
            yearly_counts = breakdown.setdefault(chapter_key, {})
            yearly_counts[year_key] = yearly_counts.get(year_key, 0) + 1

        return {'total': table_count, 'per_chapter_year': breakdown}

    def print_summary(self):
        """
        Print a report of the extraction statistics to stdout.

        Shows the total number of tables, followed by a count for every
        chapter and year (both in sorted order). Years whose count is zero are
        omitted. When there is no per-chapter data a "No tables found."
        line is printed instead.
        """
        report = self.calculate_statistics()
        rule = "=" * 50

        print("\n" + rule)
        print("EXTRACTION SUMMARY")
        print(rule)
        print(f"Total tables extracted: {report['total']}")

        breakdown = report['per_chapter_year']
        if not breakdown:
            print("\nNo tables found.")
        else:
            print("\nTables per chapter per year:")
            for chapter in sorted(breakdown):
                print(f"\nChapter {chapter}:")
                yearly_counts = breakdown[chapter]
                for year in sorted(yearly_counts):
                    n_tables = yearly_counts[year]
                    if n_tables <= 0:
                        # Only years that actually have tables are listed.
                        continue
                    print(f"  {year}: {n_tables}")
        print(rule)

In [19]:
# Initialize the extractor. out_dir is where the per-table CSVs
# (<out_dir>/<year>/<chapter>/<id>.csv) and the JSON metadata files live;
# base_dir is presumably the root folder of the downloaded reports
# (TODO confirm against TableExtractor.__init__).
extractor = TableExtractor(base_dir="/content/reports", out_dir="/content/tables")

# Process everything. This must run before combine_continuation_tables(),
# which reads the summary file expected to exist after this step.
extractor.process_files()

2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024


In [20]:
# Combine continuation tables: each continuation CSV is merged into its
# original table, the continuation files are deleted, and the remaining tables
# are renumbered sequentially per chapter-year. `combined` maps each merged
# table's identifier to details of what was merged.
combined = extractor.combine_continuation_tables()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Combining 24_07_2010 with 1 continuation(s)...
  Removed: 25_07_2010.csv
  Combined table saved as: 24_07_2010.csv (23 rows)

Combining 29_07_2010 with 1 continuation(s)...
  Removed: 30_07_2010.csv
  Combined table saved as: 29_07_2010.csv (27 rows)

Combining 20_08_2010 with 1 continuation(s)...
  Removed: 21_08_2010.csv
  Combined table saved as: 20_08_2010.csv (34 rows)

Combining 22_08_2010 with 1 continuation(s)...
  Removed: 23_08_2010.csv
  Combined table saved as: 22_08_2010.csv (34 rows)

Combining 25_08_2010 with 2 continuation(s)...
  Removed: 26_08_2010.csv
  Removed: 27_08_2010.csv
  Combined table saved as: 25_08_2010.csv (187 rows)

Combining 33_08_2010 with 1 continuation(s)...
  Removed: 34_08_2010.csv
  Combined table saved as: 33_08_2010.csv (38 rows)

Combining 39_08_2010 with 2 continuation(s)...
  Removed: 40_08_2010.csv
  Removed: 41_08_2010.csv
  Combined table saved as: 39_08_2010.csv (176 rows)

In [21]:
# Print summary: the total number of tables plus a per-chapter, per-year
# count, both derived from the summary JSON in the output directory.
extractor.print_summary()


EXTRACTION SUMMARY
Total tables extracted: 5207

Tables per chapter per year:

Chapter 01:
  2001: 13
  2002: 11
  2003: 8
  2004: 9
  2005: 9
  2006: 10
  2007: 10
  2008: 10
  2009: 10
  2010: 9
  2011: 8
  2012: 8
  2014: 8
  2015: 8
  2017: 2
  2018: 2
  2019: 10
  2020: 1
  2021: 1
  2022: 9
  2023: 1
  2024: 9

Chapter 02:
  2001: 10
  2002: 13
  2003: 13
  2004: 13
  2005: 13
  2006: 12
  2007: 12
  2008: 12
  2009: 14
  2010: 11
  2011: 14
  2012: 14
  2013: 14
  2014: 15
  2015: 14
  2016: 14
  2017: 1
  2018: 1
  2019: 10
  2022: 5
  2023: 5
  2024: 6

Chapter 03:
  2001: 14
  2002: 15
  2003: 15
  2004: 16
  2005: 15
  2006: 16
  2007: 17
  2008: 19
  2009: 17
  2010: 17
  2011: 16
  2012: 17
  2013: 14
  2014: 14
  2015: 13
  2016: 13
  2017: 1
  2018: 1
  2019: 11
  2022: 11
  2024: 11

Chapter 04:
  2001: 4
  2002: 5
  2003: 7
  2004: 11
  2005: 11
  2006: 14
  2007: 13
  2008: 12
  2009: 15
  2010: 13
  2011: 11
  2012: 11
  2013: 9
  2014: 9
  2015: 11
  2016: 10
  201