<a href="https://colab.research.google.com/github/shiragelb/NCC-Statistical-Reports/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install pdfplumber
!pip install camelot-py[cv]
!pip install tabula-py
!pip install python-docx

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

Imports

In [4]:
import requests
import os
from docx import Document
import pandas as pd
from google.colab import files
import camelot
import tabula
import pdfplumber
from docx.shared import Inches # Import Inches for setting image size
import json

Extract Tables

In [7]:
def setup_directory():
    """Make sure the 'tables' output folder is present and report the outcome."""
    target = 'tables'

    # Guard clause: nothing to create when the path is already there.
    if os.path.exists(target):
        print("'tables/' directory already exists")
        return

    os.makedirs(target)
    print("Created 'tables/' directory")

def extract_tables_with_names(docx_path):
    """Extract every non-empty table from a DOCX file together with a name.

    A table's first row is treated as its title (and removed from the data)
    when the table has more than one row and that first row is either:

    * a single cell, or
    * the same non-empty text repeated in every cell. python-docx reports a
      horizontally merged cell once per grid column it spans, so a title
      merged across the table shows up this way rather than as one cell.

    Otherwise the table is named ``Table_<n>``, where ``n`` is its 1-based
    position among the document's tables. A single-cell title row with empty
    text is still removed, but the default name is kept.

    Args:
        docx_path (str): Path to the DOCX file.

    Returns:
        list[tuple[str, pandas.DataFrame]]: ``(table_name, dataframe)`` pairs
        in document order. Cell texts are stripped of surrounding whitespace.
    """
    doc = Document(docx_path)
    tables = []

    for position, table in enumerate(doc.tables, 1):
        data = [[cell.text.strip() for cell in row.cells] for row in table.rows]
        if not data:
            continue

        table_name = f"Table_{position}"  # Default name
        first_row = data[0]

        single_cell_row = len(first_row) == 1
        # Text-based heuristic for a title merged across all columns; the
        # non-empty requirement keeps genuinely blank rows in the data.
        merged_title_row = (
            len(first_row) > 1
            and first_row[0] != ""
            and len(set(first_row)) == 1
        )

        if len(data) > 1 and (single_cell_row or merged_title_row):
            table_name = first_row[0] or table_name
            data = data[1:]  # Skip title row

        tables.append((table_name, pd.DataFrame(data)))

    return tables

def save_tables_to_csv(tables, chapter, year):
    """Write each table to ``tables/table{i}{chapter}{year}.csv``.

    The output directory is created when missing, so the function also works
    when called without ``setup_directory()``. CSV files are written without
    index or header because the frames only carry auto-generated labels.

    Args:
        tables: Iterable of ``(name, dataframe)`` pairs.
        chapter: Chapter identifier embedded in each file name.
        year: Year identifier embedded in each file name.

    Returns:
        dict: Maps each table name to the path of its CSV file. When a name
        occurs more than once, later occurrences are stored under
        ``"<name> (2)"``, ``"<name> (3)"``, ... instead of silently replacing
        the earlier entry.
    """
    reference_dict = {}
    os.makedirs('tables', exist_ok=True)

    for i, (name, df) in enumerate(tables, 1):
        # File name pattern: table{i}{chapter}{year}.csv
        filename = f"table{i}{chapter}{year}.csv"
        filepath = os.path.join('tables', filename)

        df.to_csv(filepath, index=False, header=False)

        # Keep every table reachable even when two tables share a name.
        key = name
        suffix = 2
        while key in reference_dict:
            key = f"{name} ({suffix})"
            suffix += 1

        reference_dict[key] = filepath
        print(f"Saved: {filepath}")

    return reference_dict

def save_dictionary_to_json(reference_dict, filename='table_references.json'):
    """Write the table-name -> CSV-path mapping to a JSON file.

    The file is written as UTF-8 with non-ASCII characters kept literal, so
    table names in any script stay human-readable in the file and do not
    depend on the platform's default encoding.

    Args:
        reference_dict (dict): Mapping to serialize.
        filename (str): Destination path of the JSON file.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(reference_dict, f, indent=2, ensure_ascii=False)
    print(f"Reference dictionary saved to {filename}")

def process_documents(doc1_path, chapter1, year1, doc2_path, chapter2, year2):
    """Extract the tables of two documents, save them as CSV and index them.

    For each document the tables are extracted, written to CSV, and their
    ``name -> path`` entries merged into one dictionary, which is then saved
    as JSON.

    Both documents can yield the same table name (for example the default
    ``Table_1``). A plain ``dict.update`` would drop the first document's
    entry, so a name that is already mapped to a different file is stored
    under ``"<name> (chapter <chapter>, <year>)"`` instead.

    Args:
        doc1_path: Path of the first DOCX document.
        chapter1: Chapter identifier used in the first document's file names.
        year1: Year identifier used in the first document's file names.
        doc2_path: Path of the second DOCX document.
        chapter2: Chapter identifier used in the second document's file names.
        year2: Year identifier used in the second document's file names.

    Returns:
        dict: Combined mapping of table name to CSV path for both documents.
    """
    setup_directory()

    all_references = {}
    documents = (
        (doc1_path, chapter1, year1),
        (doc2_path, chapter2, year2),
    )

    for doc_path, chapter, year in documents:
        print(f"\nProcessing: {doc_path}")
        tables = extract_tables_with_names(doc_path)
        ref_dict = save_tables_to_csv(tables, chapter, year)

        for name, filepath in ref_dict.items():
            key = name
            # Only disambiguate when the name already points at another file.
            if all_references.get(key, filepath) != filepath:
                key = f"{name} (chapter {chapter}, {year})"
            all_references[key] = filepath

    save_dictionary_to_json(all_references)

    print(f"\nTotal tables processed: {len(all_references)}")
    return all_references

# # Example usage
# if __name__ == "__main__":
#     # Manual input for chapter and year
#     references = process_documents(
#         'document1.docx', 1, 2024,
#         'document2.docx', 2, 2024
#     )

In [11]:
def _first_uploaded_name(uploaded):
    """Return the first key of a `files.upload()` result.

    Raises:
        ValueError: If nothing was uploaded (e.g. the dialog was cancelled).
    """
    if not uploaded:
        raise ValueError("No file was uploaded")
    name = next(iter(uploaded))
    # The recorded run of this notebook uploaded legacy .doc files and the
    # following extraction cell failed with PackageNotFoundError, so warn early.
    if not name.lower().endswith('.docx'):
        print(f"Warning: '{name}' is not a .docx file; convert it to .docx before extracting tables.")
    return name

# Upload files from the pc (one upload dialog per document)
chp1_2001_raw = files.upload()
chp1_2002_raw = files.upload()

# Get path
chp1_2001 = _first_uploaded_name(chp1_2001_raw)
chp1_2002 = _first_uploaded_name(chp1_2002_raw)

Saving chap 01.doc to chap 01 (4).doc


Saving chap 01.doc to chap 01 (5).doc


In [12]:
# Pull the named tables out of each uploaded chapter file.
# NOTE(review): the recorded run of this cell failed with PackageNotFoundError
# on a '.doc' upload; confirm the inputs are .docx before re-running.
tables_chp1_2001 = extract_tables_with_names(docx_path=chp1_2001)
tables_chp1_2002 = extract_tables_with_names(docx_path=chp1_2002)

PackageNotFoundError: Package not found at 'chap 01 (4).doc'

Previous approach: download a file from a URL and extract its tables

In [None]:


def _convert_pdf_text_to_docx(pdf_path, docx_path):
    """Write the text of each PDF page as a paragraph of a new DOCX file."""
    document = Document()
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                document.add_paragraph(text)
    document.save(docx_path)


def _extract_tables_from_pdf(pdf_path):
    """Return one DataFrame per non-empty table pdfplumber detects in the PDF."""
    frames = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for raw_table in page.extract_tables():
                # pdfplumber reports empty cells as None; normalise to "".
                rows = [[(cell or "").strip() for cell in row] for row in raw_table]
                if rows:
                    frames.append(pd.DataFrame(rows))
    return frames


def process_file_from_url(url, year):
    """
    Downloads a file from a URL, extracts its tables, saves them as CSVs,
    and stores metadata.

    DOCX files are read with `extract_tables_from_docx`. For PDF files the
    text is still converted to `<name>.pdf.docx` (best effort), but that
    text-only DOCX cannot contain tables, so the tables are read directly
    from the PDF with pdfplumber.

    Args:
        url (str): The URL of the file.
        year (int): The year to include in the CSV filenames.

    Returns:
        dict: A dictionary mapping table topics ("Table 1", "Table 2", ...)
        to their CSV filenames. Empty when the download fails, the file type
        is unsupported, or table extraction fails.
    """
    # Function-scope import so this cell does not depend on other cells.
    from urllib.parse import urlparse

    # Step 2: Download the file
    # Use only the URL path so query strings/fragments do not end up in the
    # file name (and do not hide the real extension).
    local_filename = os.path.basename(urlparse(url).path)
    if not local_filename:
        print(f"Could not determine a file name from URL: {url}")
        return {}

    print(f"Downloading {url} to {local_filename}")
    try:
        # The timeout prevents the cell from hanging on an unresponsive server.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print("Download complete.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return {}

    # Step 3: Determine file type and convert if necessary
    file_extension = os.path.splitext(local_filename)[1].lower()

    if file_extension == '.pdf':
        print(f"Detected PDF file. Attempting to convert {local_filename} to DOCX.")
        docx_path = local_filename + ".docx"
        try:
            # Text-only conversion; kept as a best-effort by-product.
            _convert_pdf_text_to_docx(local_filename, docx_path)
            print(f"Conversion to DOCX complete: {docx_path}")
        except Exception as e:
            print(f"Error converting PDF to DOCX: {e}")

        # Step 4 (PDF): Extract tables straight from the PDF
        try:
            tables = _extract_tables_from_pdf(local_filename)
            print(f"Extracted {len(tables)} tables from {local_filename}")
        except Exception as e:
            print(f"Error extracting tables from PDF: {e}")
            return {}

    elif file_extension == '.docx':
        print(f"Detected DOCX file: {local_filename}")
        # Step 4 (DOCX): Extract tables from DOCX
        try:
            tables = extract_tables_from_docx(local_filename)
            print(f"Extracted {len(tables)} tables from {local_filename}")
        except Exception as e:
            print(f"Error extracting tables from DOCX: {e}")
            return {}

    else:
        print(f"Unsupported file type: {file_extension}. Skipping table extraction.")
        return {}  # Return empty dictionary for unsupported types

    # Step 5: Save tables as CSV and store metadata
    table_metadata = {}
    for i, table in enumerate(tables, 1):
        # Placeholder topic: the real title depends on the document structure.
        original_topic = f"Table {i}"

        csv_filename = f"{i}-{year}.csv"
        # header=False: the frames only have auto-generated integer column
        # labels, which would otherwise be written as a bogus first row.
        table.to_csv(csv_filename, index=False, header=False)
        table_metadata[original_topic] = csv_filename
        print(f"Saved table {i} to {csv_filename}")

    # Step 6: Return the dictionary
    return table_metadata

def extract_tables_from_docx(docx_path):
    """
    Read every non-empty table of a DOCX file into a DataFrame.

    Args:
        docx_path (str): Location of the DOCX file to read.

    Returns:
        list: pandas DataFrames, one per table that has at least one row,
        in document order. Cell texts are stripped of surrounding whitespace.
    """
    def _cell_texts(tbl):
        # One list of stripped cell strings per table row.
        return [[c.text.strip() for c in r.cells] for r in tbl.rows]

    per_table_rows = (_cell_texts(tbl) for tbl in Document(docx_path).tables)
    # Tables without rows are dropped rather than turned into empty frames.
    return [pd.DataFrame(rows) for rows in per_table_rows if rows]

from google.colab import drive

# Mount Google Drive and work from the shared-drives folder. The path must be
# absolute: the drive is mounted at "/content/drive", so a relative
# 'content/drive/...' only resolves when the working directory is "/".
drive.mount("/content/drive")
os.chdir('/content/drive/Shareddrives/')

# Example usage
# NOTE(review): `pdf_path` is not defined in the cells visible here; it must be
# set to the file URL before this cell runs, otherwise this raises NameError.
url = pdf_path # Replace with the actual file URL
year = 2024 # Replace with the desired year
extracted_info = process_file_from_url(url, year)
print("\nExtracted Information:")
print(extracted_info)