<a href="https://colab.research.google.com/github/shiragelb/NCC-Statistical-Reports/blob/main/File_conversion_and_upload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install pandoc
!pip install pypandoc
!pip install python-docx
!pip install docx2txt
!apt-get install -y libreoffice

In [None]:
# List every file (with its Drive ID) under the shared Drive folder

from google.colab import auth
from googleapiclient.discovery import build
import pandas as pd

# Authenticate the Colab user, then build the Drive v3 API client.
# `drive_service` is a module-level global that the listing, download and
# upload functions below all use.
# NOTE(review): presumably build() picks up the credentials established by
# authenticate_user(), so the order of these two calls matters — confirm.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# ID of the Drive folder that is listed recursively below.
folder_id = "1e0eA-AIsz_BSwVHOppJMXECX42hBfG4J"

def list_all_files_in_folder_recursive(parent_id, parent_path=""):
    """Collect metadata for every non-folder item beneath a Drive folder.

    Uses the module-level ``drive_service`` client and follows
    ``nextPageToken`` until the listing of each folder is exhausted.
    Sub-folders are descended into depth-first, at the position where they
    appear in the listing.

    Args:
        parent_id: Drive ID of the folder to list.
        parent_path: Slash-joined names of the folders above the listed
            items; empty for the starting folder.

    Returns:
        list[dict]: One dict per file with the keys ``file_name``,
        ``file_path`` (path relative to the starting folder), ``file_id``
        and ``file_url``.
    """
    folder_mime = 'application/vnd.google-apps.folder'
    collected = []
    listing_query = f"'{parent_id}' in parents and trashed=false"
    next_token = None
    finished = False

    while not finished:
        page = drive_service.files().list(
            q=listing_query,
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType)',
            pageToken=next_token
        ).execute()

        for entry in page.get('files', []):
            # Items in the starting folder carry no leading path component.
            if parent_path:
                entry_path = f"{parent_path}/{entry['name']}"
            else:
                entry_path = entry['name']

            if entry['mimeType'] != folder_mime:
                collected.append({
                    "file_name": entry['name'],
                    "file_path": entry_path,
                    "file_id": entry['id'],
                    "file_url": f"https://drive.google.com/file/d/{entry['id']}/view?usp=sharing"
                })
                continue

            # A folder: append everything found beneath it.
            collected.extend(
                list_all_files_in_folder_recursive(entry['id'], entry_path)
            )

        next_token = page.get('nextPageToken', None)
        finished = next_token is None

    return collected

# List every file under the root folder (recursing into sub-folders).
files_list = list_all_files_in_folder_recursive(folder_id)

# Convert to a DataFrame. The columns are named explicitly so that an empty
# listing still produces the columns the download/upload helpers index into;
# without them, df_files["file_name"] raises KeyError on an empty folder.
df_files = pd.DataFrame(
    files_list,
    columns=["file_name", "file_path", "file_id", "file_url"],
)
# df_files.head()  # uncomment to inspect the listing

# Download a single chapter file from Drive and get its local path
!pip install python-docx
import os
from googleapiclient.http import MediaIoBaseDownload
import io

def get_chapter_file(chapter_num: int, year: str, download_dir="/content") -> "str | None":
    """
    Download a chapter file for a given year from Google Drive and return its local path.

    The file is looked up in the module-level ``df_files`` listing: the first
    row whose ``file_name`` contains the zero-padded chapter number and whose
    ``file_path`` contains ``"<year>/"`` is used. The download goes through the
    module-level ``drive_service`` client.

    Args:
        chapter_num (int): The chapter number (e.g., 1, 2, ..., 14).
        year (str): The year folder name.
        download_dir (str): Local folder to save the file. Defaults to /content in Colab.
            The file is written to ``<download_dir>/<year>/<file_name>``.

    Returns:
        str | None: Local path to the downloaded file, or None if no matching
        file is listed.
    """
    # Normalize chapter filename (01, 02, ...)
    chapter_str = f"{chapter_num:02}"

    # Find matching file in df_files. regex=False: both values are meant as
    # literal substrings, not regular expressions.
    match = df_files[
        (df_files["file_name"].str.contains(chapter_str, regex=False)) &
        (df_files["file_path"].str.contains(f"{year}/", regex=False))
    ]

    if match.empty:
        print(f"❌ No file found for chapter {chapter_str} in year {year}")
        return None

    first_match = match.iloc[0]
    file_id = first_match["file_id"]
    file_name = first_match["file_name"]
    local_path = os.path.join(download_dir, year, file_name)

    # Ensure year directory exists locally
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    # Download the file from Google Drive. The context manager closes the
    # local file even if a chunk request raises, so the handle is not leaked.
    request = drive_service.files().get_media(fileId=file_id)
    with io.FileIO(local_path, "wb") as fh:
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
            if status:
                print(f"⬇️ Download {int(status.progress() * 100)}%.")

    print(f"✅ Saved to {local_path}")
    return local_path

from docx import Document

# Sanity check: download one known chapter (chapter 14 of 2015) and, when it
# is a .docx file, print the first few sentences of its text.
path = get_chapter_file(14, "2015")
print("Local file:", path)

# `path` is None when no file matched; files that are not .docx are skipped.
if path and path.endswith(".docx"):
    doc = Document(path)
    # Join the text of all non-blank paragraphs, one per line.
    full_text = "\n".join([p.text for p in doc.paragraphs if p.text.strip() != ""])

    # Split into sentences (naive split on "." after flattening line breaks)
    sentences = full_text.replace("\n", " ").split(".")

    # Print the first few non-empty sentences, numbered from 1; the "."
    # removed by the split is added back when printing.
    preview_count = 3
    preview_sentences = [s.strip() for s in sentences if s.strip()][:preview_count]
    print("\nPreview of content:")
    for i, s in enumerate(preview_sentences, 1):
        print(f"{i}. {s}.")



# Download every chapter file into the local directory /content/reports
import os
import io
from googleapiclient.http import MediaIoBaseDownload

def download_all_chapters(download_dir="/content/reports", years=range(2001, 2025), chapters=range(1, 16)):
    """
    Downloads all chapters for all years to local environment.

    For every (year, chapter) pair the first row of the module-level
    ``df_files`` listing whose ``file_name`` contains the zero-padded chapter
    number and whose ``file_path`` contains ``"<year>/"`` is downloaded through
    the module-level ``drive_service`` client. Each file is saved as
    ``<download_dir>/<year>/<NN>_<original file name>``.

    Args:
        download_dir (str): Local root directory for the downloads.
        years (iterable): Years to download; each is converted with ``str()``.
        chapters (iterable[int]): Chapter numbers to download.

    Returns:
        dict: {year: {chapter_number: local_path}}. Year keys are strings and
        are present even when nothing was found for that year; chapters
        without a matching file are omitted.
    """
    all_paths = {}

    for year in years:
        year_str = str(year)
        all_paths[year_str] = {}

        for chapter in chapters:
            chapter_str = f"{chapter:02}"

            # Find matching file in df_files. regex=False: both values are
            # meant as literal substrings, not regular expressions.
            match = df_files[
                (df_files["file_name"].str.contains(chapter_str, regex=False)) &
                (df_files["file_path"].str.contains(f"{year_str}/", regex=False))
            ]

            if match.empty:
                print(f"⚠️ Chapter {chapter_str} not found for year {year_str}")
                continue

            first_match = match.iloc[0]
            file_id = first_match["file_id"]
            file_name = first_match["file_name"]
            local_path = os.path.join(download_dir, year_str, f"{chapter_str}_{file_name}")

            # Ensure folder exists
            os.makedirs(os.path.dirname(local_path), exist_ok=True)

            # Download file. The context manager closes each local file (also
            # when a chunk request raises) instead of leaking one open handle
            # per downloaded chapter.
            request = drive_service.files().get_media(fileId=file_id)
            with io.FileIO(local_path, "wb") as fh:
                downloader = MediaIoBaseDownload(fh, request)

                done = False
                while not done:
                    status, done = downloader.next_chunk()
                    if status:
                        print(f"⬇️ Download {int(status.progress() * 100)}% for {file_name}")

            print(f"✅ Saved {file_name} to {local_path}")
            all_paths[year_str][chapter] = local_path

    return all_paths
# Download everything using the default directory and year/chapter ranges.
# The returned {year: {chapter: path}} mapping is not kept here.
download_all_chapters()

In [None]:
import os
import json
import pandas as pd
from docx import Document
import subprocess

def convert_doc_to_docx(base_dir="/content/reports"):
    """Convert every legacy .doc file under ``base_dir`` to .docx with LibreOffice.

    Walks ``base_dir`` recursively and runs ``libreoffice --headless
    --convert-to docx`` on each ``.doc`` file, writing the result next to the
    source file. The path of each file is printed before it is converted, and
    a warning is printed when a conversion fails; the remaining files are
    still processed.

    Args:
        base_dir (str): Root directory to search for .doc files.

    Returns:
        None
    """
    for root, _, files in os.walk(base_dir):
        for fname in files:
            # Case-insensitive so "REPORT.DOC" is converted too. A name ending
            # in ".docx" can never end in ".doc", so no extra exclusion is
            # needed.
            if not fname.lower().endswith(".doc"):
                continue

            fpath = os.path.join(root, fname)
            print(fpath)
            result = subprocess.run([
                "libreoffice", "--headless", "--convert-to", "docx", fpath, "--outdir", root
            ])

            # Report failures instead of ignoring them. The output file is
            # checked as well because a zero exit code alone does not prove
            # that the .docx was written.
            expected_output = os.path.splitext(fpath)[0] + ".docx"
            if result.returncode != 0 or not os.path.exists(expected_output):
                print(f"⚠️ Conversion failed for {fpath} (exit code {result.returncode})")

# Convert all .doc files under /content/reports; each .docx is written next
# to its source file.
convert_doc_to_docx("/content/reports")

In [None]:
from googleapiclient.http import MediaFileUpload

# Save converted files to google drive
def get_original_drive_folder_id(local_file_path, df_files, base_dir="/content/reports"):
    """
    Find the original Drive folder ID for a local file path.

    The first path component below ``base_dir`` is taken as the year. The
    folder ID is obtained by asking Drive (through the module-level
    ``drive_service`` client) for the parent of a file listed under that year
    in ``df_files``.

    Args:
        local_file_path (str): Local path like "/content/reports/2015/01_chapter01.docx"
        df_files (DataFrame): The existing df_files DataFrame with file metadata
            (the ``file_path`` and ``file_id`` columns are used)
        base_dir (str): Local root directory that contains the year folders.
            Defaults to "/content/reports".

    Returns:
        str: Drive folder ID of the original year folder, or None if not found
    """
    # Strip the local root so the path starts with the year folder. Only a
    # leading prefix is removed; a path outside base_dir is used as given.
    prefix = base_dir.rstrip("/") + "/"
    if local_file_path.startswith(prefix):
        relative_path = local_file_path[len(prefix):]
    else:
        relative_path = local_file_path
    path_parts = relative_path.split("/")

    if len(path_parts) < 2:
        print(f"⚠️ Invalid path structure: {local_file_path}")
        return None

    year = path_parts[0]

    # Find the year folder ID by looking for any file in that year's folder
    # and getting its parent folder ID
    year_files = df_files[df_files["file_path"].str.startswith(f"{year}/")]

    if year_files.empty:
        print(f"⚠️ No files found for year: {year}")
        return None

    # Prefer a file that sits directly in the year folder ("<year>/<name>"):
    # the parent of a file in a nested sub-folder is that sub-folder, not the
    # year folder. Fall back to the first listed file if there is none.
    direct_children = year_files[year_files["file_path"].str.count("/") == 1]
    candidates = year_files if direct_children.empty else direct_children
    sample_file_id = candidates.iloc[0]["file_id"]

    try:
        # Get file metadata to find its parent folder (which is the year folder)
        file_metadata = drive_service.files().get(fileId=sample_file_id, fields='parents').execute()
        # A missing or empty 'parents' entry yields None.
        parents = file_metadata.get('parents') or [None]
        return parents[0]
    except Exception as e:
        print(f"⚠️ Error getting year folder for {local_file_path}: {e}")
        return None

def upload_file_to_drive(local_file_path, target_folder_id):
    """
    Upload one local file into a Google Drive folder as a .docx document.

    The Drive name is the local base name, except that a name of the form
    "<x>_<x>.docx" is shortened to "<x>.docx". The upload goes through the
    module-level ``drive_service`` client. Any exception is reported with a
    printed message rather than raised.

    Args:
        local_file_path (str): Path to the local file to upload
        target_folder_id (str): Drive folder ID where the file should be uploaded

    Returns:
        bool: True if upload successful, False otherwise
    """
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    try:
        upload_name = os.path.basename(local_file_path)

        # Collapse a doubled chapter number (e.g., "09_09.docx" -> "09.docx")
        if "_" in upload_name and upload_name.endswith(".docx"):
            pieces = upload_name.replace(".docx", "").split("_")
            if len(pieces) == 2:
                left, right = pieces
                if left == right:
                    upload_name = f"{left}.docx"

        # Drive metadata: display name plus the folder that will contain it
        body = {'name': upload_name, 'parents': [target_folder_id]}
        payload = MediaFileUpload(local_file_path, mimetype=docx_mime)

        # Create the file and ask only for its new ID in the response
        create_request = drive_service.files().create(
            body=body,
            media_body=payload,
            fields='id'
        )
        created = create_request.execute()

        new_id = created.get('id')
        print(f"✅ Successfully uploaded {upload_name} (ID: {new_id})")
        return True

    except Exception as err:
        print(f"❌ Failed to upload {local_file_path}: {err}")
        return False

def upload_converted_files_to_drive(base_dir="/content/reports", only_converted=True):
    """
    Main function to upload converted .docx files back to their original
    Google Drive year folders.

    Files are uploaded directly into the Drive folder returned by
    ``get_original_drive_folder_id`` (no sub-folder is created). By default
    only .docx files that have a .doc source with the same stem in the same
    local directory are uploaded; a .docx without such a source was not
    produced by the conversion step, and uploading it again would only create
    a duplicate next to the original in Drive.

    Args:
        base_dir (str): Base directory containing the converted files
        only_converted (bool): When True (default), skip .docx files that have
            no matching .doc source next to them. Pass False to upload every
            .docx file found.

    Returns:
        None. Progress and a final summary are printed.
    """
    print("🚀 Starting upload of converted files to Google Drive...")

    total_files = 0
    successful_uploads = 0
    failed_uploads = 0
    skipped_files = 0

    # Drive folder ID per local directory, so the Drive lookup is made once
    # per directory instead of once per file. Failed lookups are not cached
    # and are retried for the next file.
    folder_id_by_dir = {}

    # Walk through the directory structure
    for root, _dirs, files in os.walk(base_dir):
        # Stems of the .doc sources present in this directory
        doc_stems = {
            os.path.splitext(name)[0]
            for name in files
            if name.lower().endswith(".doc")
        }

        for filename in files:
            # Only process .docx files
            if not filename.endswith(".docx"):
                continue

            if only_converted and os.path.splitext(filename)[0] not in doc_stems:
                print(f"⏭️ Skipping {filename}: no .doc source next to it (not a converted file)")
                skipped_files += 1
                continue

            total_files += 1
            local_file_path = os.path.join(root, filename)

            print(f"\n📁 Processing: {local_file_path}")

            # Step 1: Get the original Drive folder ID
            original_folder_id = folder_id_by_dir.get(root)
            if not original_folder_id:
                original_folder_id = get_original_drive_folder_id(local_file_path, df_files)
                if original_folder_id:
                    folder_id_by_dir[root] = original_folder_id
            if not original_folder_id:
                print(f"❌ Upload failed for {filename}: Could not find original Drive folder")
                failed_uploads += 1
                continue

            # Step 2: Upload the file
            upload_success = upload_file_to_drive(local_file_path, original_folder_id)
            if upload_success:
                successful_uploads += 1
            else:
                print(f"❌ Upload failed for {filename}: File upload unsuccessful")
                failed_uploads += 1

    # Print summary
    print(f"\n📊 Upload Summary:")
    print(f"   Total .docx files processed: {total_files}")
    print(f"   Successful uploads: {successful_uploads}")
    print(f"   Failed uploads: {failed_uploads}")
    print(f"   Skipped (not converted): {skipped_files}")
    print("🎉 Upload process completed!")

# Upload the converted files to Drive, walking the default local directory
# (/content/reports).
upload_converted_files_to_drive()