In [None]:
!ls /content/drive/MyDrive/ml_document_extractor/data/train


18325926-Rental-Agreement-1.docx
36199312-Rental-Agreement.png
44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx
46239065-Standard-Rental-Agreement-Rental-With-Performance-Fee.docx
47854715-RENTAL-AGREEMENT.docx
50070534-RENTAL-AGREEMENT.docx
54770958-Rental-Agreement.png
54945838-Rental-Agreement.png
6683127-House-Rental-Contract-GERALDINE-GALINATO-v2-Page-1.docx
6683129-House-Rental-Contract-Geraldine-Galinato-v2.docx


In [None]:
!ls /content/drive/MyDrive/ml_document_extractor/data/test

156155545-Rental-Agreement-Kns-Home.pdf.docx  24158401-Rental-Agreement.png
228094620-Rental-Agreement.pdf.docx	      95980236-Rental-Agreement.png


In [1]:
# ==============================================================================
# SCRIPT: 01_preprocess_data.py
#
# PURPOSE:
#   - Read .docx and image files from the input data folder.
#   - Perform OCR on images using Tesseract.
#   - Merge extracted text with CSV ground truth.
#   - Output structured JSON for training/testing.
# ==============================================================================

import pandas as pd
import numpy as np
from pathlib import Path
import json
import subprocess

from google.colab import drive
drive.mount('/content/drive')

# --- Installation of Dependencies ---
# Each dependency is imported first and only installed when that import fails.
# pip is invoked through the running interpreter (`sys.executable -m pip`) so the
# package is installed into the kernel's own environment, and `check=True` reports a
# failed install right away instead of as a second, less informative ImportError.
import importlib
import sys

try:
    import docx
except ImportError:
    print("Installing python-docx...")
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'python-docx'], check=True)
    importlib.invalidate_caches()  # let the import system see the freshly installed package
    import docx

try:
    from PIL import Image
except ImportError:
    print("Installing Pillow...")
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'Pillow'], check=True)
    importlib.invalidate_caches()
    from PIL import Image

try:
    import pytesseract
except ImportError:
    print("Installing pytesseract and Tesseract OCR...")
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'pytesseract'], check=True)
    # The system package is best-effort: the Python import below does not depend on it,
    # so a failed apt-get is reported rather than raised.
    apt_result = subprocess.run(['sudo', 'apt-get', 'install', '-y', 'tesseract-ocr'])
    if apt_result.returncode != 0:
        print("Warning: could not install the tesseract-ocr system package; OCR may be unavailable.")
    importlib.invalidate_caches()
    import pytesseract

# --- Configuration ---
# All inputs and outputs live under one Drive folder; the output folder is created on demand.
BASE_DIR = Path('/content/drive/MyDrive/ml_document_extractor')
DATA_DIR = BASE_DIR / 'data'
PROCESSED_DATA_DIR = BASE_DIR / 'processed_data'
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

# --- Text Extraction Functions ---
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a .docx file joined by newlines.

    Any exception raised while reading is printed and an empty string is returned.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        # filter(None, ...) drops empty paragraph strings
        return "\n".join(filter(None, paragraph_texts))
    except Exception as e:
        print(f"Error reading docx file {file_path}: {e}")
        return ""

def extract_text_from_image(file_path):
    """Extracts text from an image file using OCR.

    Args:
        file_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``; "" if opening the
        image or running OCR raises (the error is printed).
    """
    try:
        # The context manager closes the image's file handle as soon as OCR is done
        # instead of leaving it open until garbage collection.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error reading image file {file_path}: {e}")
        return ""

# --- Main Processing Function ---
def process_files(csv_path, files_dir):
    """
    Pair every CSV row with the text of its document and return the records.

    The 'File Name' value of each row is matched against the files in ``files_dir``.
    A file matches when its name is exactly that value or the value followed by one
    or more extensions (``name.docx``, ``name.pdf.docx``, ``name.png``). A bare prefix
    is NOT a match, so ``lease-1`` never picks up ``lease-10.docx``. Text is extracted
    from the first match (in sorted order) whose type is supported.

    Args:
        csv_path: Path of the ground-truth CSV; it must contain a 'File Name' column.
        files_dir: ``pathlib.Path`` of the folder holding the documents.

    Returns:
        list[dict]: one record per successfully processed row, with keys
        'file_name', 'context' and 'ground_truth' (six string fields). Rows with a
        blank name, no matching file, an unsupported type or no extracted text are
        logged and skipped.
    """
    image_suffixes = ['.png', '.jpg', '.jpeg']

    # Empty cells become '' so that str() below never produces the text 'nan'.
    df = pd.read_csv(csv_path).replace(np.nan, '')

    processed_records = []

    # Sorted so the file chosen for a row does not depend on filesystem listing order.
    all_files = sorted(files_dir.glob("*"))
    actual_file_stems = {file.stem for file in all_files}

    def find_matches(name):
        """Files named exactly `name`, or `name` followed by '.<extension(s)>'."""
        dotted = name + "."
        return [p for p in all_files if p.name == name or p.name.startswith(dotted)]

    print(f"\n📄 CSV Entries: {len(df)}")
    print(f"📂 Folder Files: {len(actual_file_stems)}")

    matched_paths = set()   # every folder file claimed by some CSV row
    missing_files = []      # CSV names with no file, in CSV order, de-duplicated

    for index, row in df.iterrows():
        filename = str(row['File Name']).strip()
        print(f"\n➡️  Processing: {filename}...", end=' ')

        # A blank name used to glob '*' and silently pick an arbitrary file.
        if not filename:
            print("⚠️  Empty 'File Name' in CSV row; skipped.")
            continue

        matched_files = find_matches(filename)
        if not matched_files:
            print("❌ File not found in folder.")
            if filename not in missing_files:
                missing_files.append(filename)
            continue
        matched_paths.update(matched_files)

        # Prefer a supported file over whichever match happens to come first.
        supported = [p for p in matched_files if p.suffix.lower() in ['.docx'] + image_suffixes]
        if supported:
            file_path = supported[0]
            if file_path.suffix.lower() == '.docx':
                context = extract_text_from_docx(file_path)
            else:
                context = extract_text_from_image(file_path)
        else:
            print(f"⚠️ Unsupported file type: {matched_files[0].suffix.lower()}")
            context = ""

        if not context.strip():
            print("⚠️  No text extracted.")
            continue

        record = {
            'file_name': filename,
            'context': context,
            'ground_truth': {
                'Agreement Value': str(row.get('Agreement Value', '')),
                'Agreement Start Date': str(row.get('Agreement Start Date', '')),
                'Agreement End Date': str(row.get('Agreement End Date', '')),
                'Renewal Notice (Days)': str(row.get('Renewal Notice (Days)', '')),
                'Party One': str(row.get('Party One', '')),
                'Party Two': str(row.get('Party Two', ''))
            }
        }

        processed_records.append(record)
        print("✅ Success.")

    # --- Log missing and extra files (same matching rule as the lookup above) ---
    if missing_files:
        print("\n🚫 Files listed in CSV but NOT found in folder:")
        for f in missing_files:
            print(f"  - {f}")

    extra_files = [p.name for p in all_files if p not in matched_paths]
    if extra_files:
        print("\n📌 Files present in folder but NOT listed in CSV:")
        for f in extra_files:
            print(f"  - {f}")

    return processed_records

# --- Main Execution ---
def main():
    """Build the train and test datasets and write each one as JSON under PROCESSED_DATA_DIR."""

    def write_json(records, destination):
        # UTF-8 with ensure_ascii=False keeps non-ASCII document text readable in the file.
        with destination.open('w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=4)

    # Training split
    train_output_path = PROCESSED_DATA_DIR.joinpath('train_data.json')
    write_json(
        process_files(DATA_DIR.joinpath('train.csv'), DATA_DIR.joinpath('train')),
        train_output_path,
    )
    print(f"\n✅ Training data saved to: {train_output_path}")

    # Testing split
    test_output_path = PROCESSED_DATA_DIR.joinpath('test_data.json')
    write_json(
        process_files(DATA_DIR.joinpath('test.csv'), DATA_DIR.joinpath('test')),
        test_output_path,
    )
    print(f"✅ Testing data saved to: {test_output_path}")

# --- Script Entry Point ---
# Run the whole pipeline when this code is executed directly; the cell output below
# shows that the guard also passes when the code runs as a notebook cell.
if __name__ == '__main__':
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Installing python-docx...
Installing pytesseract and Tesseract OCR...

📄 CSV Entries: 10
📂 Folder Files: 10

➡️  Processing: 6683127-House-Rental-Contract-GERALDINE-GALINATO-v2-Page-1... ✅ Success.

➡️  Processing: 6683129-House-Rental-Contract-Geraldine-Galinato-v2... ✅ Success.

➡️  Processing: 18325926-Rental-Agreement-1... ✅ Success.

➡️  Processing: 24158401-Rental-Agreement... ❌ File not found in folder.

➡️  Processing: 36199312-Rental-Agreement... ✅ Success.

➡️  Processing: 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement... ✅ Success.

➡️  Processing: 47854715-RENTAL-AGREEMENT... ✅ Success.

➡️  Processing: 50070534-RENTAL-AGREEMENT... ✅ Success.

➡️  Processing: 54770958-Rental-Agreement... ✅ Success.

➡️  Processing: 54945838-Rental-Agreement... ✅ Success.

🚫 Files listed in CSV but NOT found in folder:
  - 24158401-Rental-Agreement

📌 Files pr

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# ============================================================================== #
# SCRIPT: 01_combined_preprocess_and_normalize.py
# PURPOSE:
#   - Extracts and normalizes both context and ground_truth for train/test data.
#   - Supports .docx and image (png/jpg) files.
#   - Prepares clean training-ready .json format.
# ============================================================================== #

import pandas as pd
import numpy as np
from pathlib import Path
import json
import subprocess
import re
from dateutil.parser import parse as date_parse
from num2words import num2words

# --- Mount Google Drive (Colab only) ---
from google.colab import drive
drive.mount('/content/drive')

# --- Install dependencies if missing ---
def safe_import(module, pip_name=None, apt_name=None):
    """Import ``module``, installing it first when the initial import fails.

    Args:
        module: Importable module name. For a dotted name such as ``"pkg.sub"`` the
            submodule itself is returned (``__import__`` returned the top-level package).
        pip_name: Optional pip distribution to install when the import fails.
        apt_name: Optional apt package to install when the import fails.

    Returns:
        The imported module object.

    Raises:
        ImportError: if the module is still not importable after the install attempt.
        subprocess.CalledProcessError: if the pip install itself fails.
    """
    import importlib
    import sys

    try:
        return importlib.import_module(module)
    except ImportError:
        if pip_name:
            # Use the running interpreter's pip so the package lands in this kernel's
            # environment; check=True reports a failed install immediately.
            subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', pip_name], check=True)
        if apt_name:
            # System packages are best-effort: the Python import can succeed without them.
            apt_result = subprocess.run(['sudo', 'apt-get', 'install', '-y', apt_name])
            if apt_result.returncode != 0:
                print(f"Warning: apt-get could not install {apt_name}.")
        # Newly installed packages are invisible to the import system until caches are reset.
        importlib.invalidate_caches()
        return importlib.import_module(module)

docx = safe_import("docx", pip_name="python-docx")
# Importing the `PIL` package alone does not load its `Image` submodule, so reading
# `.Image` off the package only works when something else has already imported it.
# Make sure Pillow is installed, then import the submodule explicitly.
safe_import("PIL", pip_name="Pillow")
from PIL import Image
pytesseract = safe_import("pytesseract", pip_name="pytesseract", apt_name="tesseract-ocr")

# --- Path Configuration ---
# All inputs and outputs live under one Drive folder; the output folder is created on demand.
BASE_DIR = Path('/content/drive/MyDrive/ml_document_extractor')
DATA_DIR = BASE_DIR / 'data'
PROCESSED_DATA_DIR = BASE_DIR / 'processed_data'
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

# --- Normalization Utilities ---
def normalize_date(raw_date: str) -> str:
    """Rewrite a date string as '<day> <Month name> <year>' without a leading zero.

    The string is parsed day-first. When parsing or formatting fails the input is
    returned unchanged.

    Args:
        raw_date: Date text in any format ``date_parse`` accepts.

    Returns:
        The normalized date, or ``raw_date`` itself when it cannot be parsed.
    """
    try:
        dt = date_parse(raw_date, dayfirst=True)
        # lstrip("0") turns "05 March 2021" into "5 March 2021"
        return dt.strftime('%d %B %Y').lstrip("0")
    except Exception:
        # `except Exception` (not a bare except) keeps the best-effort fallback while
        # letting KeyboardInterrupt / SystemExit stop the run.
        return raw_date

def normalize_amount(value: str) -> list:
    """Return the textual variants of a monetary amount.

    Thousands separators (commas) are removed before parsing. For a parseable number
    the result has three strings: the integer digits, the comma-grouped two-decimal
    form, and the title-cased number words from ``num2words`` with hyphens replaced
    by spaces. For anything else the result is ``[value]``.

    Args:
        value: Amount as text (or any object whose ``str()`` is the amount).

    Returns:
        list: three variants on success, otherwise a single-element list with ``value``.
    """
    try:
        float_val = float(str(value).replace(",", "").strip())
        return [
            str(int(float_val)),  # int() drops any fractional part
            f"{float_val:,.2f}",
            num2words(float_val, to='cardinal', lang='en').replace('-', ' ').title()
        ]
    except Exception:
        # `except Exception` (not a bare except) keeps the best-effort fallback while
        # letting KeyboardInterrupt / SystemExit stop the run.
        return [value]

def normalize_name(name: str) -> str:
    """Collapse whitespace runs to single spaces, trim both ends and title-case the name."""
    single_spaced = re.sub(r'\s+', ' ', name)
    trimmed = single_spaced.strip()
    return trimmed.title()

def normalize_field(label: str, value: str):
    """Normalize one ground-truth value according to the kind of field named by ``label``.

    Dispatch is by substring of ``label``, checked in this order:
    "Date" -> ``normalize_date``; "Value" -> ``normalize_amount`` (returns a list);
    "Party" -> ``normalize_name``; "Renewal Notice" -> integer text when the value is
    numeric, otherwise the stripped value. Blank values and unrecognised labels are
    returned unchanged.

    Args:
        label: Column name of the field.
        value: Raw field text.

    Returns:
        str for every field kind except "Value", which yields a list of str.
    """
    if not value.strip():
        return value
    if "Date" in label:
        return normalize_date(value)
    elif "Value" in label:
        return normalize_amount(value)
    elif "Party" in label:
        return normalize_name(value)
    elif "Renewal Notice" in label:
        try:
            int_val = int(float(value.strip()))
            return str(int_val)
        except (ValueError, OverflowError):
            # ValueError: not a number (or NaN); OverflowError: infinity.
            # Only these are expected here, so nothing else is swallowed.
            return value.strip()
    return value

# --- Preprocessing Context ---
def preprocess_context(text: str) -> str:
    """Replace every run of whitespace with one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text).strip()

# --- Text Extraction ---
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a .docx file joined by newlines.

    Any exception raised while reading is printed and an empty string is returned.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        # filter(None, ...) drops empty paragraph strings
        return "\n".join(filter(None, paragraph_texts))
    except Exception as e:
        print(f"❌ Error reading docx: {e}")
        return ""

def extract_text_from_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``; "" if opening the
        image or running OCR raises (the error is printed).
    """
    try:
        # The context manager closes the image's file handle as soon as OCR is done
        # instead of leaving it open until garbage collection.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        print(f"❌ Error reading image: {e}")
        return ""

# --- Main Processor ---
def process_files(csv_path, files_dir):
    """Pair every CSV row with its normalized document text and ground truth.

    The 'File Name' value of each row is matched against the files in ``files_dir``.
    A file matches when its name is exactly that value or the value followed by one
    or more extensions (``name.docx``, ``name.pdf.docx``, ``name.png``). A bare prefix
    is NOT a match, so ``lease-1`` never picks up ``lease-10.docx``. Text is extracted
    from the first match (in sorted order) whose type is supported, cleaned with
    ``preprocess_context``, and stored next to the row's fields normalized by
    ``normalize_field``.

    Args:
        csv_path: Path of the ground-truth CSV; it must contain a 'File Name' column.
        files_dir: ``pathlib.Path`` of the folder holding the documents.

    Returns:
        list[dict]: one record per successfully processed row, with keys
        'file_name', 'context' and 'ground_truth'. Rows with a blank name, no
        matching file, an unsupported type or no extracted text are logged and skipped.
    """
    image_suffixes = ['.png', '.jpg', '.jpeg']
    labels = ['Agreement Value', 'Agreement Start Date', 'Agreement End Date',
              'Renewal Notice (Days)', 'Party One', 'Party Two']

    # Empty cells become '' so that str() below never produces the text 'nan'.
    df = pd.read_csv(csv_path).replace(np.nan, '')

    processed_records = []
    failed_contexts = []

    # Sorted so the file chosen for a row does not depend on filesystem listing order.
    all_files = sorted(files_dir.glob("*"))

    def find_matches(name):
        """Files named exactly `name`, or `name` followed by '.<extension(s)>'."""
        dotted = name + "."
        return [p for p in all_files if p.name == name or p.name.startswith(dotted)]

    print(f"\n📄 CSV Entries: {len(df)} | 📂 Files in folder: {len(all_files)}")

    matched_paths = set()   # every folder file claimed by some CSV row
    missing_files = []      # CSV names with no file, in CSV order, de-duplicated

    for idx, row in df.iterrows():
        filename = str(row['File Name']).strip()
        print(f"\n➡️  Processing: {filename}...", end=' ')

        # A blank name used to glob '*' and silently pick an arbitrary file.
        if not filename:
            print("⚠️ Empty 'File Name' in CSV row; skipped.")
            continue

        matched_files = find_matches(filename)
        if not matched_files:
            print("❌ File not found.")
            if filename not in missing_files:
                missing_files.append(filename)
            continue
        matched_paths.update(matched_files)

        # Prefer a supported file over whichever match happens to come first.
        supported = [p for p in matched_files if p.suffix.lower() in ['.docx'] + image_suffixes]
        if not supported:
            print(f"⚠️ Unsupported file type: {matched_files[0].suffix.lower()}")
            continue

        file_path = supported[0]
        if file_path.suffix.lower() == '.docx':
            context = extract_text_from_docx(file_path)
        else:
            context = extract_text_from_image(file_path)

        if not context.strip():
            print(f"⚠️ No text extracted for: {file_path.name}")
            failed_contexts.append(file_path.name)
            continue

        context = preprocess_context(context)

        gt = {}
        for label in labels:
            gt[label] = normalize_field(label, str(row.get(label, '')))

        processed_records.append({
            'file_name': filename,
            'context': context,
            'ground_truth': gt
        })
        print("✅ Success.")

    # Reports use the same matching rule as the lookup, so a file such as
    # 'name.pdf.docx' is no longer listed as both missing and extra.
    if missing_files:
        print("\n🚫 Missing Files from folder:")
        for f in missing_files:
            print(f"  - {f}")

    extra_files = [p.name for p in all_files if p not in matched_paths]
    if extra_files:
        print("\n📌 Extra Files in folder not listed in CSV:")
        for f in extra_files:
            print(f"  - {f}")

    if failed_contexts:
        print("\n⚠️ Files skipped due to empty context:")
        for f in failed_contexts:
            print(f"  - {f}")

    return processed_records

# --- Main ---
def main():
    """Build the train and test datasets and write each one as JSON under PROCESSED_DATA_DIR."""

    def write_json(records, destination):
        # UTF-8 with ensure_ascii=False keeps non-ASCII document text readable in the file.
        with destination.open('w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=4)

    # Training split
    train_output_path = PROCESSED_DATA_DIR.joinpath('train_data.json')
    write_json(
        process_files(DATA_DIR.joinpath('train.csv'), DATA_DIR.joinpath('train')),
        train_output_path,
    )
    print(f"\n✅ train_data.json saved!")

    # Testing split
    test_output_path = PROCESSED_DATA_DIR.joinpath('test_data.json')
    write_json(
        process_files(DATA_DIR.joinpath('test.csv'), DATA_DIR.joinpath('test')),
        test_output_path,
    )
    print(f"✅ test_data.json saved!")

# --- Entry Point ---
# Run the whole pipeline when this code is executed directly; the cell output below
# shows that the guard also passes when the code runs as a notebook cell.
if __name__ == '__main__':
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

📄 CSV Entries: 10 | 📂 Files in folder: 10

➡️  Processing: 6683127-House-Rental-Contract-GERALDINE-GALINATO-v2-Page-1... ✅ Success.

➡️  Processing: 6683129-House-Rental-Contract-Geraldine-Galinato-v2... ✅ Success.

➡️  Processing: 18325926-Rental-Agreement-1... ✅ Success.

➡️  Processing: 24158401-Rental-Agreement... ❌ File not found.

➡️  Processing: 36199312-Rental-Agreement... ✅ Success.

➡️  Processing: 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement... ✅ Success.

➡️  Processing: 47854715-RENTAL-AGREEMENT... ✅ Success.

➡️  Processing: 50070534-RENTAL-AGREEMENT... ✅ Success.

➡️  Processing: 54770958-Rental-Agreement... ✅ Success.

➡️  Processing: 54945838-Rental-Agreement... ✅ Success.

🚫 Missing Files from folder:
  - 24158401-Rental-Agreement

📌 Extra Files in folder not listed in CSV:
  - 46239065-Standard-Rental-Agreement-Rental-With-Performan

In [7]:
# ============================================================================== #
# SCRIPT: 01_combined_preprocess_and_normalize.py
# PURPOSE:
#   - Extracts and normalizes both context and ground_truth for train/test data.
#   - Supports .docx and image (png/jpg) files.
#   - Prepares clean training-ready .json format.
# ============================================================================== #

import pandas as pd
import numpy as np
from pathlib import Path
import json
import subprocess
import re
from dateutil.parser import parse as date_parse
from num2words import num2words

# --- Mount Google Drive (Colab only) ---
from google.colab import drive
drive.mount('/content/drive')

# --- Install dependencies if missing ---
def safe_import(module, pip_name=None, apt_name=None):
    """Import ``module``, installing it first when the initial import fails.

    Args:
        module: Importable module name. For a dotted name such as ``"pkg.sub"`` the
            submodule itself is returned (``__import__`` returned the top-level package).
        pip_name: Optional pip distribution to install when the import fails.
        apt_name: Optional apt package to install when the import fails.

    Returns:
        The imported module object.

    Raises:
        ImportError: if the module is still not importable after the install attempt.
        subprocess.CalledProcessError: if the pip install itself fails.
    """
    import importlib
    import sys

    try:
        return importlib.import_module(module)
    except ImportError:
        if pip_name:
            # Use the running interpreter's pip so the package lands in this kernel's
            # environment; check=True reports a failed install immediately.
            subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', pip_name], check=True)
        if apt_name:
            # System packages are best-effort: the Python import can succeed without them.
            apt_result = subprocess.run(['sudo', 'apt-get', 'install', '-y', apt_name])
            if apt_result.returncode != 0:
                print(f"Warning: apt-get could not install {apt_name}.")
        # Newly installed packages are invisible to the import system until caches are reset.
        importlib.invalidate_caches()
        return importlib.import_module(module)

docx = safe_import("docx", pip_name="python-docx")
# Importing the `PIL` package alone does not load its `Image` submodule, so reading
# `.Image` off the package only works when something else has already imported it.
# Make sure Pillow is installed, then import the submodule explicitly.
safe_import("PIL", pip_name="Pillow")
from PIL import Image
pytesseract = safe_import("pytesseract", pip_name="pytesseract", apt_name="tesseract-ocr")

# --- Path Configuration ---
# All inputs and outputs live under one Drive folder; the output folder is created on demand.
BASE_DIR = Path('/content/drive/MyDrive/ml_document_extractor')
DATA_DIR = BASE_DIR / 'data'
PROCESSED_DATA_DIR = BASE_DIR / 'processed_data'
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

# --- Normalization Utilities ---
def normalize_date(raw_date: str) -> str:
    """Rewrite a date string as '<two-digit day> <Month name> <year>'.

    The string is parsed day-first in fuzzy mode. When parsing or formatting fails
    the stripped input is returned instead.

    Args:
        raw_date: Text containing a date.

    Returns:
        The normalized date, or ``raw_date.strip()`` when no date can be parsed.
    """
    try:
        dt = date_parse(raw_date, dayfirst=True, fuzzy=True)
        return dt.strftime('%d %B %Y')
    except Exception:
        # `except Exception` (not a bare except) keeps the best-effort fallback while
        # letting KeyboardInterrupt / SystemExit stop the run.
        return raw_date.strip()

def normalize_amount(value: str) -> list:
    """Return the textual variants of a monetary amount.

    Thousands separators (commas) are removed before parsing. For a parseable number
    the result has three strings: the integer digits, the comma-grouped two-decimal
    form, and the title-cased number words from ``num2words`` with hyphens replaced
    by spaces. For anything else the result is ``[value.strip()]``.

    Args:
        value: Amount as text.

    Returns:
        list: three variants on success, otherwise a single-element list with the
        stripped input.
    """
    try:
        float_val = float(str(value).replace(",", "").strip())
        return [
            str(int(float_val)),  # int() drops any fractional part
            f"{float_val:,.2f}",
            num2words(float_val, to='cardinal', lang='en').replace('-', ' ').title()
        ]
    except Exception:
        # `except Exception` (not a bare except) keeps the best-effort fallback while
        # letting KeyboardInterrupt / SystemExit stop the run.
        return [value.strip()]

def normalize_name(name: str) -> str:
    """Collapse whitespace runs to single spaces, trim both ends and title-case the name."""
    single_spaced = re.sub(r'\s+', ' ', name)
    trimmed = single_spaced.strip()
    return trimmed.title()

def normalize_field(label: str, value: str):
    """Normalize one ground-truth value according to the kind of field named by ``label``.

    The value is stripped first; a blank value is returned as ''. Dispatch is by
    substring of ``label``, checked in this order:
    "Date" -> ``normalize_date``; "Value" -> ``normalize_amount`` (returns a list);
    "Party" -> ``normalize_name``; "Renewal Notice" -> integer text when the value is
    numeric. A non-numeric renewal notice that mentions "month" is reduced to its
    first number (digits, or a whole word "one".."twelve" converted to digits);
    otherwise the stripped value is returned.

    Args:
        label: Column name of the field.
        value: Raw field text.

    Returns:
        str for every field kind except "Value", which yields a list of str.
    """
    word_to_number = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
        "eleven": 11, "twelve": 12
    }

    value = value.strip()
    if not value:
        return value
    if "Date" in label:
        return normalize_date(value)
    elif "Value" in label:
        return normalize_amount(value)
    elif "Party" in label:
        return normalize_name(value)
    elif "Renewal Notice" in label:
        try:
            return str(int(float(value)))
        except (ValueError, OverflowError):
            # ValueError: not a number (or NaN); OverflowError: infinity.
            if "month" in value.lower():
                # \b keeps number words from matching inside longer words
                # ("someone", "seventeen"), which previously produced 1 and 7.
                match = re.search(
                    r'\d+|\b(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\b',
                    value, re.I)
                if match:
                    word = match.group(0).lower()
                    # Digits are not in the table and are returned as matched.
                    return str(word_to_number.get(word, word))
            return value
    return value

# --- Preprocessing Context ---
def preprocess_context(text: str) -> str:
    """Clean extracted document text so dates and month counts have one canonical form.

    Steps, in order:
      1. Collapse every whitespace run to a single space.
      2. Rewrite "<day>[st|nd|rd|th] <Month>" as "<two-digit day> <Month>"
         (month names matched case-insensitively).
      3. Replace "<one..twelve> month(s)" with the bare number, e.g. "three months" -> "3".
      4. Replace "<digits> month(s)" with the bare digits, e.g. "6 months" -> "6".

    Steps 3 and 4 only match whole words, so "12 monthly" and "someone months" are
    left untouched, and both are case-insensitive.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    number_words = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9,
        "ten": 10, "eleven": 11, "twelve": 12
    }

    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'(\d{1,2})(st|nd|rd|th)?\s+(January|February|March|April|May|June|July|August|September|October|November|December)',
                  lambda m: f"{int(m.group(1)):02d} {m.group(3)}", text, flags=re.I)
    # Leading \b: do not match a number word that ends a longer word ("someone").
    # Trailing \b: do not cut "monthly" down to "ly".
    text = re.sub(r'(?i)\b(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+months?\b',
                  lambda m: str(number_words[m.group(1).lower()]), text)
    # Case-insensitive so "6 Months" is handled the same way as "Six Months" above.
    text = re.sub(r'(\d+)\s+months?\b', r'\1', text, flags=re.I)
    return text.strip()

# --- Text Extraction ---
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a .docx file joined by newlines.

    Any exception raised while reading is printed and an empty string is returned.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        # filter(None, ...) drops empty paragraph strings
        return "\n".join(filter(None, paragraph_texts))
    except Exception as e:
        print(f"❌ Error reading docx: {e}")
        return ""

def extract_text_from_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``; "" if opening the
        image or running OCR raises (the error is printed).
    """
    try:
        # The context manager closes the image's file handle as soon as OCR is done
        # instead of leaving it open until garbage collection.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        print(f"❌ Error reading image: {e}")
        return ""

# --- Main Processor ---
def process_files(csv_path, files_dir):
    """Pair every CSV row with its normalized document text and ground truth.

    The 'File Name' value of each row is matched against the files in ``files_dir``.
    A file matches when its name is exactly that value or the value followed by one
    or more extensions (``name.docx``, ``name.pdf.docx``, ``name.png``). A bare prefix
    is NOT a match, so ``lease-1`` never picks up ``lease-10.docx``. Text is extracted
    from the first match (in sorted order) whose type is supported, cleaned with
    ``preprocess_context``, and stored next to the row's fields normalized by
    ``normalize_field``.

    Args:
        csv_path: Path of the ground-truth CSV; it must contain a 'File Name' column.
        files_dir: ``pathlib.Path`` of the folder holding the documents.

    Returns:
        list[dict]: one record per successfully processed row, with keys
        'file_name', 'context' and 'ground_truth'. Rows with a blank name, no
        matching file, an unsupported type or no extracted text are logged and skipped.
    """
    image_suffixes = ['.png', '.jpg', '.jpeg']
    labels = ['Agreement Value', 'Agreement Start Date', 'Agreement End Date',
              'Renewal Notice (Days)', 'Party One', 'Party Two']

    # Empty cells become '' so that str() below never produces the text 'nan'.
    df = pd.read_csv(csv_path).replace(np.nan, '')

    processed_records = []
    failed_contexts = []

    # Sorted so the file chosen for a row does not depend on filesystem listing order.
    all_files = sorted(files_dir.glob("*"))

    def find_matches(name):
        """Files named exactly `name`, or `name` followed by '.<extension(s)>'."""
        dotted = name + "."
        return [p for p in all_files if p.name == name or p.name.startswith(dotted)]

    print(f"\n📄 CSV Entries: {len(df)} | 📂 Files in folder: {len(all_files)}")

    matched_paths = set()   # every folder file claimed by some CSV row
    missing_files = []      # CSV names with no file, in CSV order, de-duplicated

    for idx, row in df.iterrows():
        filename = str(row['File Name']).strip()
        print(f"\n➡️  Processing: {filename}...", end=' ')

        # A blank name used to glob '*' and silently pick an arbitrary file.
        if not filename:
            print("⚠️ Empty 'File Name' in CSV row; skipped.")
            continue

        matched_files = find_matches(filename)
        if not matched_files:
            print("❌ File not found.")
            if filename not in missing_files:
                missing_files.append(filename)
            continue
        matched_paths.update(matched_files)

        # Prefer a supported file over whichever match happens to come first.
        supported = [p for p in matched_files if p.suffix.lower() in ['.docx'] + image_suffixes]
        if not supported:
            print(f"⚠️ Unsupported file type: {matched_files[0].suffix.lower()}")
            continue

        file_path = supported[0]
        if file_path.suffix.lower() == '.docx':
            context = extract_text_from_docx(file_path)
        else:
            context = extract_text_from_image(file_path)

        if not context.strip():
            print(f"⚠️ No text extracted for: {file_path.name}")
            failed_contexts.append(file_path.name)
            continue

        context = preprocess_context(context)

        gt = {}
        for label in labels:
            gt[label] = normalize_field(label, str(row.get(label, '')))

        processed_records.append({
            'file_name': filename,
            'context': context,
            'ground_truth': gt
        })
        print("✅ Success.")

    # Log extra/missing files. Reports use the same matching rule as the lookup, so a
    # file such as 'name.pdf.docx' is no longer listed as both missing and extra.
    if missing_files:
        print("\n🚫 Missing Files from folder:")
        for f in missing_files:
            print(f"  - {f}")

    extra_files = [p.name for p in all_files if p not in matched_paths]
    if extra_files:
        print("\n📌 Extra Files in folder not listed in CSV:")
        for f in extra_files:
            print(f"  - {f}")

    if failed_contexts:
        print("\n⚠️ Files skipped due to empty context:")
        for f in failed_contexts:
            print(f"  - {f}")

    return processed_records

# --- Main ---
def main():
    """Build the train and test datasets and write each one as JSON under PROCESSED_DATA_DIR."""

    def write_json(records, destination):
        # UTF-8 with ensure_ascii=False keeps non-ASCII document text readable in the file.
        with open(destination, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=4)

    # Training split
    train_records = process_files(DATA_DIR.joinpath('train.csv'), DATA_DIR.joinpath('train'))
    write_json(train_records, PROCESSED_DATA_DIR.joinpath('train_data.json'))
    print(f"\n✅ train_data.json saved!")

    # Testing split
    test_records = process_files(DATA_DIR.joinpath('test.csv'), DATA_DIR.joinpath('test'))
    write_json(test_records, PROCESSED_DATA_DIR.joinpath('test_data.json'))
    print(f"✅ test_data.json saved!")

# --- Entry Point ---
# Run the whole pipeline when this code is executed directly; the cell output below
# shows that the guard also passes when the code runs as a notebook cell.
if __name__ == '__main__':
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

📄 CSV Entries: 10 | 📂 Files in folder: 10

➡️  Processing: 6683127-House-Rental-Contract-GERALDINE-GALINATO-v2-Page-1... ✅ Success.

➡️  Processing: 6683129-House-Rental-Contract-Geraldine-Galinato-v2... ✅ Success.

➡️  Processing: 18325926-Rental-Agreement-1... ✅ Success.

➡️  Processing: 24158401-Rental-Agreement... ❌ File not found.

➡️  Processing: 36199312-Rental-Agreement... ✅ Success.

➡️  Processing: 44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement... ✅ Success.

➡️  Processing: 47854715-RENTAL-AGREEMENT... ✅ Success.

➡️  Processing: 50070534-RENTAL-AGREEMENT... ✅ Success.

➡️  Processing: 54770958-Rental-Agreement... ✅ Success.

➡️  Processing: 54945838-Rental-Agreement... ✅ Success.

🚫 Missing Files from folder:
  - 24158401-Rental-Agreement

📌 Extra Files in folder not listed in CSV:
  - 46239065-Standard-Rental-Agreement-Rental-With-Performan