<a href="https://colab.research.google.com/github/shiragelb/NCC-Statistical-Reports/blob/main/Table_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install pandoc (system package) plus the pypandoc, python-docx and docx2txt Python packages
!apt-get install pandoc
!pip install pypandoc
!pip install python-docx
!pip install docx2txt

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pandoc is already the newest version (2.9.2.1-3ubuntu2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9


In [16]:
# List every file (name, path, ID, URL) under the Google Drive reports folder

from google.colab import auth
from googleapiclient.discovery import build
import pandas as pd

# Authenticate this Colab session, then build the Drive v3 API client that the
# listing and download functions in this notebook use
auth.authenticate_user()
drive_service = build('drive', 'v3')

# ID of the Drive folder that serves as the root of the recursive listing
folder_id = "1e0eA-AIsz_BSwVHOppJMXECX42hBfG4J"

def list_all_files_in_folder_recursive(parent_id, parent_path=""):
    """
    Collect every non-folder file found under a Drive folder, descending into subfolders.

    Args:
        parent_id: Drive ID of the folder to list.
        parent_path: Slash-joined path of folder names leading to ``parent_id``;
            empty for the starting folder.

    Returns:
        list[dict]: One dict per file with keys ``file_name``, ``file_path``,
        ``file_id`` and ``file_url``. Files of a subfolder appear at the position
        where that subfolder was encountered in the listing.
    """
    folder_mime = 'application/vnd.google-apps.folder'
    collected = []
    drive_query = f"'{parent_id}' in parents and trashed=false"

    next_token = None
    finished = False
    while not finished:
        # Fetch one page of direct children of this folder
        listing = drive_service.files().list(
            q=drive_query,
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType)',
            pageToken=next_token
        ).execute()

        for entry in listing.get('files', []):
            if parent_path:
                entry_path = f"{parent_path}/{entry['name']}"
            else:
                entry_path = entry['name']

            if entry['mimeType'] != folder_mime:
                # Plain file: record it and move on to the next entry
                collected.append({
                    "file_name": entry['name'],
                    "file_path": entry_path,
                    "file_id": entry['id'],
                    "file_url": f"https://drive.google.com/file/d/{entry['id']}/view?usp=sharing"
                })
                continue

            # Folder: splice in everything found beneath it
            collected.extend(list_all_files_in_folder_recursive(entry['id'], entry_path))

        # A missing token means the page just processed was the last one
        next_token = listing.get('nextPageToken', None)
        finished = next_token is None

    return collected

# Walk the whole folder tree rooted at folder_id
files_list = list_all_files_in_folder_recursive(folder_id)

# One row per file, with columns file_name, file_path, file_id and file_url;
# the download helpers look files up in this table
df_files = pd.DataFrame(files_list)
##df_files.head()

## Download a single chapter file from Drive and return its local path
!pip install python-docx
import os
from googleapiclient.http import MediaIoBaseDownload
import io

def get_chapter_file(chapter_num: int, year: str, download_dir="/content") -> str:
    """
    Download a chapter file for a given year from Google Drive and return its local path.

    The file is looked up in the module-level ``df_files`` table: the first row whose
    ``file_name`` contains the zero-padded chapter number and whose ``file_path``
    contains ``"<year>/"`` is downloaded to ``<download_dir>/<year>/<file_name>``
    through the module-level ``drive_service`` client.

    Args:
        chapter_num (int): The chapter number (e.g., 1, 2, ..., 14).
        year (str): The year folder name. An int is accepted and converted to str.
        download_dir (str): Local folder to save the file. Defaults to /content in Colab.

    Returns:
        str: Local path to the downloaded file, or None if not found.
    """
    # os.path.join below needs a string, so tolerate callers passing an int year
    year = str(year)

    # Normalize chapter filename (01, 02, ...)
    chapter_str = f"{chapter_num:02}"

    # Find matching file in df_files. regex=False because both needles are
    # literal text, not patterns.
    # NOTE(review): this is a substring test, so chapter "01" would also match a
    # file name such as "2001.doc" - confirm the Drive naming scheme rules that out.
    match = df_files[
        df_files["file_name"].str.contains(chapter_str, regex=False)
        & df_files["file_path"].str.contains(f"{year}/", regex=False)
    ]

    if match.empty:
        print(f"❌ No file found for chapter {chapter_str} in year {year}")
        return None

    file_id = match.iloc[0]["file_id"]
    file_name = match.iloc[0]["file_name"]
    local_path = os.path.join(download_dir, year, file_name)

    # Ensure year directory exists locally
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    # Download the file from Google Drive. The context manager closes (and so
    # flushes) the handle even when a chunk download raises.
    request = drive_service.files().get_media(fileId=file_id)
    with io.FileIO(local_path, "wb") as fh:
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
            if status:
                print(f"⬇️ Download {int(status.progress() * 100)}%.")

    print(f"✅ Saved to {local_path}")
    return local_path

from docx import Document

# Sanity check: download one known chapter and preview how its text reads
path = get_chapter_file(14, "2015")
print("Local file:", path)

if path and path.endswith(".docx"):
    doc = Document(path)
    # Keep only paragraphs that contain something other than whitespace
    full_text = "\n".join(p.text for p in doc.paragraphs if p.text.strip() != "")

    # Naive sentence segmentation: flatten line breaks, then cut at every full stop
    sentences = full_text.replace("\n", " ").split(".")

    # Show the first few non-empty sentences, numbered from 1
    preview_count = 3
    preview_sentences = list(filter(None, map(str.strip, sentences)))[:preview_count]
    print("\nPreview of content:")
    for i, s in enumerate(preview_sentences, start=1):
        print(f"{i}. {s}.")



# Download every chapter of every year into /content/reports
import os
import io
from googleapiclient.http import MediaIoBaseDownload

def download_all_chapters(download_dir="/content/reports", years=range(2001, 2025), chapters=range(1, 16)):
    """
    Downloads all chapters for all years to local environment.

    For each (year, chapter) pair, the first row of the module-level ``df_files``
    table whose ``file_name`` contains the zero-padded chapter number and whose
    ``file_path`` contains ``"<year>/"`` is downloaded through ``drive_service``
    to ``<download_dir>/<year>/<chapter>_<file_name>``. Pairs with no matching
    row are reported and skipped.

    Args:
        download_dir (str): Local root folder for the downloaded files.
        years: Iterable of years to fetch.
        chapters: Iterable of chapter numbers to fetch.

    Returns:
        dict: {year: {chapter_number: local_path}}. Year keys are strings and
        are present even when no chapter was found for that year.
    """
    all_paths = {}

    for year in years:
        year_str = str(year)
        all_paths[year_str] = {}

        for chapter in chapters:
            chapter_str = f"{chapter:02}"

            # Find matching file in df_files. regex=False because both needles
            # are literal text, not patterns.
            # NOTE(review): this is a substring test, so chapter "01" would also
            # match a file name such as "2001.doc" - confirm the naming scheme.
            match = df_files[
                df_files["file_name"].str.contains(chapter_str, regex=False)
                & df_files["file_path"].str.contains(f"{year_str}/", regex=False)
            ]

            if match.empty:
                print(f"⚠️ Chapter {chapter_str} not found for year {year_str}")
                continue

            file_id = match.iloc[0]["file_id"]
            file_name = match.iloc[0]["file_name"]
            local_path = os.path.join(download_dir, year_str, f"{chapter_str}_{file_name}")

            # Ensure folder exists
            os.makedirs(os.path.dirname(local_path), exist_ok=True)

            # Download file. The context manager closes (and so flushes) the
            # handle even when a chunk download raises, so a long run over many
            # files does not accumulate open descriptors.
            request = drive_service.files().get_media(fileId=file_id)
            with io.FileIO(local_path, "wb") as fh:
                downloader = MediaIoBaseDownload(fh, request)

                done = False
                while not done:
                    status, done = downloader.next_chunk()
                    if status:
                        print(f"⬇️ Download {int(status.progress() * 100)}% for {file_name}")

            print(f"✅ Saved {file_name} to {local_path}")
            all_paths[year_str][chapter] = local_path

    return all_paths
# Download all chapters using the default directory, years and chapters;
# the call returns a {year: {chapter_number: local_path}} dict
download_all_chapters()

⬇️ Download 100%.
✅ Saved to /content/2015/14.docx
Local file: /content/2015/14.docx

Preview of content:
1. 14 ילדים במצבי סיכון ומצוקה מבוא פרק זה, המוקדש לילדים במצבי סיכון ומצוקה, הוא אחד הפרקים המורכבים בשנתון.
2. ההגדרה מיהו ילד בסיכון או במצוקה תלויה בסביבה, בחברה, בתקופה ובמגדיר.
3. לא רק זאת, קשה לאמוד את היקף התופעות הקשורות לילדים במצבי סיכון ומצוקה, שכן לפי טבען לא תמיד הן חשופות או מדווחות, והמקרים הידועים אינם אלא קצה הקרחון.
⬇️ Download 100% for 01.doc
✅ Saved 01.doc to /content/reports/2001/01_01.doc
⬇️ Download 100% for 02.doc
✅ Saved 02.doc to /content/reports/2001/02_02.doc
⬇️ Download 100% for 03.doc
✅ Saved 03.doc to /content/reports/2001/03_03.doc
⬇️ Download 100% for 04.doc
✅ Saved 04.doc to /content/reports/2001/04_04.doc
⬇️ Download 100% for 05.doc
✅ Saved 05.doc to /content/reports/2001/05_05.doc
⬇️ Download 100% for 06.doc
✅ Saved 06.doc to /content/reports/2001/06_06.doc
⬇️ Download 100% for 07.doc
✅ Saved 07.doc to /content/reports/2001/07_07.doc
⬇️ Downloa

{'2001': {1: '/content/reports/2001/01_01.doc',
  2: '/content/reports/2001/02_02.doc',
  3: '/content/reports/2001/03_03.doc',
  4: '/content/reports/2001/04_04.doc',
  5: '/content/reports/2001/05_05.doc',
  6: '/content/reports/2001/06_06.doc',
  7: '/content/reports/2001/07_07.doc',
  8: '/content/reports/2001/08_08.doc',
  9: '/content/reports/2001/09_09.doc',
  10: '/content/reports/2001/10_10.doc',
  11: '/content/reports/2001/11_11.doc',
  12: '/content/reports/2001/12_12.doc',
  13: '/content/reports/2001/13_13.doc',
  14: '/content/reports/2001/14_14.doc',
  15: '/content/reports/2001/15_15.doc'},
 '2002': {1: '/content/reports/2002/01_01.doc',
  2: '/content/reports/2002/02_02.doc',
  3: '/content/reports/2002/03_03.doc',
  4: '/content/reports/2002/04_04.doc',
  5: '/content/reports/2002/05_05.doc',
  6: '/content/reports/2002/06_06.doc',
  7: '/content/reports/2002/07_07.doc',
  8: '/content/reports/2002/08_08.doc',
  9: '/content/reports/2002/09_09.doc',
  10: '/content/r

In [22]:
import os
import json
import pandas as pd
from docx import Document
!apt-get install -y libreoffice
import subprocess

def convert_doc_to_docx(base_dir="/content/reports"):
    """
    Convert every ``.doc`` file under ``base_dir`` to ``.docx`` with headless LibreOffice.

    The tree is walked recursively. Each ``.doc`` path is printed, then converted
    by running the ``libreoffice`` command with the file's own folder as output
    directory, so the result lands next to the original. A conversion that exits
    with a non-zero status is reported on stdout and the walk continues.

    Args:
        base_dir (str): Root folder to search for ``.doc`` files.

    Returns:
        None
    """
    for root, _, files in os.walk(base_dir):
        for fname in files:
            # Case-insensitive so "X.DOC" is converted too. A ".docx" name can
            # never end with ".doc", so no separate exclusion is needed.
            if not fname.lower().endswith(".doc"):
                continue

            fpath = os.path.join(root, fname)
            print(fpath)
            result = subprocess.run([
                "libreoffice", "--headless", "--convert-to", "docx", fpath, "--outdir", root
            ])
            # Surface failures instead of silently leaving the file unconverted
            if result.returncode != 0:
                print(f"conversion failed (exit code {result.returncode}): {fpath}")

# Convert every .doc found under /content/reports to .docx before table extraction
convert_doc_to_docx("/content/reports")


def _caption_for_table(table, paragraph_by_element):
    """Return the stripped text of the nearest non-blank paragraph before ``table``, or ""."""
    node = table._element.getprevious()
    while node is not None:
        para = paragraph_by_element.get(node)
        if para is not None:
            text = para.text.strip()
            if text:
                return text
        elif node.tag.endswith("tbl"):
            # Another table comes first; any caption beyond it belongs to that table
            break
        node = node.getprevious()
    return ""


def extract_tables_from_reports(base_dir="/content/reports", out_dir="/content/tables",
                                years=range(2001, 2025)):
    """
    Export every captioned table of the downloaded reports to CSV and write a summary.

    For each year folder ``<base_dir>/<year>`` and each ``.doc``/``.docx`` file in it,
    the document is opened with ``Document``; files that cannot be opened are
    reported and skipped. A table is exported only when the nearest non-blank
    paragraph preceding it contains "לוח". Exported tables are saved as
    ``<out_dir>/<year>/<chapter>/<serial>#<chapter>#<year>.csv``, where ``chapter``
    is the part of the file name before the first "_" and ``serial`` counts the
    exported tables of that file from 1.

    A ``tables_summary.json`` mapping caption text to identifier is written to
    ``out_dir``.
    NOTE(review): captions are the JSON keys, so two tables with identical caption
    text (e.g. the same table in two yearly reports) overwrite each other - confirm
    whether consumers need one entry per table.

    Args:
        base_dir (str): Root folder holding one sub-folder per year.
        out_dir (str): Root folder for the CSV files and the summary JSON.
        years: Iterable of years to process; defaults to 2001-2024.

    Returns:
        None
    """
    os.makedirs(out_dir, exist_ok=True)
    summary = {}

    # loop years
    for year in years:
        print(year)
        year_path = os.path.join(base_dir, str(year))
        if not os.path.isdir(year_path):
            continue

        # Sorted so the processing order (and which entry wins on a caption
        # collision) does not depend on the file system's listing order
        for fname in sorted(os.listdir(year_path)):
            if not fname.endswith((".docx", ".doc")):
                continue

            chapter = fname.split("_")[0]
            fpath = os.path.join(year_path, fname)

            try:
                doc = Document(fpath)
            except Exception as e:
                print(f"skip {fpath}: {e}")
                continue

            # Lets the caption search map a sibling XML element back to its
            # paragraph object; holding the paragraphs keeps the elements alive
            paragraph_by_element = {para._element: para for para in doc.paragraphs}

            serial = 1
            for table in doc.tables:
                # Caption = text directly above THIS table (not above whichever
                # table happens to come last in the document)
                table_name = _caption_for_table(table, paragraph_by_element)
                if "לוח" not in table_name:
                    continue

                # convert table to dataframe
                data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
                df = pd.DataFrame(data)

                # build identifier
                identifier = f"{serial}#{chapter}#{year}"

                # save CSV (utf-8-sig writes a BOM ahead of the UTF-8 text)
                save_dir = os.path.join(out_dir, str(year), chapter)
                os.makedirs(save_dir, exist_ok=True)
                save_path = os.path.join(save_dir, f"{identifier}.csv")
                df.to_csv(save_path, index=False, encoding="utf-8-sig")

                # record mapping
                summary[table_name] = identifier

                serial += 1

    # write summary JSON
    with open(os.path.join(out_dir, "tables_summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

# Run the extraction with the default input (/content/reports) and output (/content/tables) folders
extract_tables_from_reports()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libreoffice is already the newest version (1:7.3.7-0ubuntu0.22.04.10).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
/content/reports/2007/14_14.doc
/content/reports/2007/11_11.doc
/content/reports/2007/03_03.doc
/content/reports/2007/04_04.doc
/content/reports/2007/02_02.doc
/content/reports/2007/07_07.doc
/content/reports/2007/08_08.doc
/content/reports/2007/15_15.doc
/content/reports/2007/13_13.doc
/content/reports/2007/12_12.doc


KeyboardInterrupt: 