In [None]:
#@title 1. Setup
import subprocess
import sys
import base64
import re
import tempfile
from pathlib import Path

# Install pandoc: refresh the apt package index, then install quietly and
# non-interactively. check=True aborts the cell if either step fails.
for apt_args in (['update', '-qq'], ['install', '-qq', '-y', 'pandoc']):
    subprocess.run(['apt-get', *apt_args], check=True)

from lxml.html.diff import htmldiff

# Inline stylesheet placed in the <head> of the generated comparison page.
# It sets the page typography and layout, colours <ins> (additions) green and
# <del> (removals) red with a strike-through, and styles headings, images and
# tables.
CSS = '''
<style>
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; max-width: 900px; margin: 40px auto; line-height: 1.6; padding: 20px; color: #333; }
ins { background: #e6ffed; color: #22863a; text-decoration: none; padding: 0 2px; }
del { background: #ffeef0; color: #cb2431; text-decoration: line-through; padding: 0 2px; }
h1, h2, h3 { border-bottom: 1px solid #eee; padding-bottom: 10px; }
img { max-width: 100%; height: auto; display: block; margin: 1em 0; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
td, th { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background: #f5f5f5; }
</style>
'''

def convert_docx_to_html(docx_path: Path, media_dir: Path) -> str:
    """Convert a DOCX file to HTML with pandoc, extracting its media.

    Args:
        docx_path: Path of the ``.docx`` file to convert.
        media_dir: Directory handed to pandoc's ``--extract-media`` option;
            images and other media contained in the document are written
            beneath it.

    Returns:
        The HTML that pandoc wrote to standard output.

    Raises:
        RuntimeError: If pandoc exits with a non-zero status. The message
            contains pandoc's standard error output.
    """
    result = subprocess.run(
        ['pandoc', str(docx_path), '-t', 'html', f'--extract-media={media_dir}'],
        # pandoc always writes UTF-8, so decode explicitly instead of relying
        # on the locale's preferred encoding (which may not be UTF-8 and
        # would then garble or reject non-ASCII text).
        capture_output=True, text=True, encoding='utf-8'
    )
    if result.returncode != 0:
        raise RuntimeError(f"Pandoc error: {result.stderr}")
    return result.stdout

def embed_images_as_base64(html: str, media_dir: Path) -> str:
    """Replace local image ``src`` paths with base64 ``data:`` URIs.

    Every ``src="..."`` attribute in *html* is inspected. When its value names
    a file on disk, the attribute is rewritten so the file's bytes are inlined
    as a ``data:`` URI; otherwise the attribute is left exactly as it was.

    Args:
        html: HTML text whose ``src`` attributes should be inlined.
        media_dir: Base directory for relative ``src`` values. Each value is
            looked up under ``media_dir`` first and then under
            ``media_dir.parent``. Absolute paths are used as given.

    Returns:
        The HTML with every resolvable ``src`` replaced by a data URI.
    """
    # Local import: only this helper needs it.
    from urllib.parse import unquote

    mime_by_ext = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'gif': 'image/gif',
        'svg': 'image/svg+xml',
        'webp': 'image/webp',
        'bmp': 'image/bmp',
        'tif': 'image/tiff',
        'tiff': 'image/tiff',
    }

    # Matches a URI scheme prefix such as "data:", "http:" or "https:". At
    # least two characters are required so a one-letter drive prefix ("C:")
    # is not mistaken for a scheme.
    has_scheme = re.compile(r'[A-Za-z][A-Za-z0-9+.-]+:').match

    def find_file(src):
        """Return the first existing regular file *src* may refer to, or None."""
        names = [src]
        decoded = unquote(src)
        if decoded != src:
            # The path may be percent-encoded (e.g. "%20" for a space); try
            # the decoded form only after the literal one.
            names.append(decoded)
        for name in names:
            # Joining an absolute name onto a base yields the name itself, so
            # absolute paths are handled by the same loop.
            for base in (media_dir, media_dir.parent):
                candidate = base / name
                try:
                    if candidate.is_file():
                        return candidate
                except OSError:
                    # e.g. a path component too long for the filesystem.
                    continue
        return None

    def replace_src(match):
        src = match.group(1)
        if has_scheme(src):
            # Already a data URI or a remote URL: nothing to read from disk.
            return match.group(0)
        img_path = find_file(src)
        if img_path is None:
            return match.group(0)  # Keep original if not found
        ext = img_path.suffix.lower().lstrip('.')
        mime = mime_by_ext.get(ext, 'application/octet-stream')
        data = base64.b64encode(img_path.read_bytes()).decode('ascii')
        return f'src="data:{mime};base64,{data}"'

    return re.sub(r'src="([^"]+)"', replace_src, html)

def create_diff_html(html_a: str, html_b: str, media_dir: Path) -> str:
    """Build a standalone HTML page showing the diff of two HTML inputs.

    The two inputs are compared with ``htmldiff``, image references in the
    result are inlined through ``embed_images_as_base64`` (resolved against
    *media_dir*), and the outcome is wrapped in a full HTML document whose
    head carries the module-level ``CSS``.

    Args:
        html_a: HTML of the older version.
        html_b: HTML of the newer version.
        media_dir: Directory used to resolve image paths in the diff output.

    Returns:
        The complete HTML document as a string.
    """
    # Images are inlined after diffing, so the comparison itself runs on the
    # short file paths rather than on base64 payloads.
    marked_up = embed_images_as_base64(htmldiff(html_a, html_b), media_dir)

    head = (
        '<head>\n'
        '<meta charset="utf-8">\n'
        '<title>Document Comparison</title>\n'
        + CSS +
        '</head>'
    )
    body = '<body>\n' + marked_up + '\n</body>'
    return '\n'.join(['<!DOCTYPE html>', '<html>', head, body, '</html>'])

# Reaching this line means pandoc was installed and the helpers above are
# defined; tell the user to move on to the comparison cell.
print("Setup complete. Run Cell 2 to compare documents.")

In [None]:
#@title 2. Compare Documents
from google.colab import files
from IPython.display import display, HTML
import html as html_module

# Upload files
print("Upload Version A (older document):")
uploaded_a = files.upload()
print("\nUpload Version B (newer document):")
uploaded_b = files.upload()

# Validate
# Fail with a clear message when either upload produced no file (for example
# because the dialog was dismissed) instead of an IndexError further down.
if not uploaded_a or not uploaded_b:
    raise ValueError("Both Version A and Version B must be uploaded")

# Only the first file of each upload is compared.
file_a = next(iter(uploaded_a))
file_b = next(iter(uploaded_b))

if not file_a.lower().endswith('.docx') or not file_b.lower().endswith('.docx'):
    raise ValueError("Both files must be .docx format")

print(f"\nComparing: {file_a} vs {file_b}")

# Save uploaded files and convert
with tempfile.TemporaryDirectory() as tmpdir:
    tmpdir = Path(tmpdir)

    # Each version gets its own working directory:
    #  * the two uploads may share a filename, and writing both into one
    #    directory would make B overwrite A (the diff would then compare B
    #    with itself);
    #  * both conversions may extract identically named media files, and a
    #    shared extraction directory would let B's images replace A's.
    work_a = tmpdir / 'version_a'
    work_b = tmpdir / 'version_b'
    media_a = work_a / 'media'
    media_b = work_b / 'media'
    media_a.mkdir(parents=True)
    media_b.mkdir(parents=True)

    # Fixed names keep the ".docx" extension regardless of what was uploaded.
    path_a = work_a / 'document.docx'
    path_b = work_b / 'document.docx'
    path_a.write_bytes(uploaded_a[file_a])
    path_b.write_bytes(uploaded_b[file_b])

    # Convert to HTML
    print("Converting documents...")
    html_a = convert_docx_to_html(path_a, media_a)
    html_b = convert_docx_to_html(path_b, media_b)

    # Generate diff
    print("Generating diff...")
    # pandoc is given absolute extraction directories, so the image paths it
    # writes into the HTML are expected to be absolute as well; tmpdir only
    # serves as the base for any relative src that might remain.
    result_html = create_diff_html(html_a, html_b, tmpdir)

    # Save output
    name_a = Path(file_a).stem
    name_b = Path(file_b).stem
    output_name = f"diff_{name_a}_{name_b}.html"
    output_path = Path('/content') / output_name
    output_path.write_text(result_html, encoding='utf-8')

print(f"\nDiff saved to: {output_name}")
print(f"File size: {output_path.stat().st_size / 1024:.1f} KB")

# Preview: the whole page is HTML-escaped so it can be placed inside the
# iframe's srcdoc attribute.
print("\n--- Preview (images may not display) ---")
escaped = html_module.escape(result_html)
display(HTML(f'<iframe srcdoc="{escaped}" width="100%" height="600" style="border:1px solid #ccc;"></iframe>'))

# Download
print("\n--- Download ---")
files.download(str(output_path))