# PPTX Media Optimizer

**Quick Start:** Run Cell 1 (Setup), then Cell 2 (Optimize) - that's it!

Features: image compression (pngquant for PNG, Pillow for JPEG), video and audio transcoding with FFmpeg (NVENC GPU encoding when available, CPU otherwise), and removal of unused media.

In [None]:
#@title 1. Setup (Run Once)
import subprocess, sys, os, zipfile, tempfile, shutil, json, time
from pathlib import Path
from dataclasses import dataclass
from typing import Optional
from enum import Enum
import xml.etree.ElementTree as ET

def _run_quietly(cmd, timeout=None):
    """Run *cmd* with its output captured; return True only if it exited with status 0.

    A missing executable, a timeout or any other launch failure counts as
    "did not work" instead of aborting the setup cell.
    """
    try:
        return subprocess.run(cmd, capture_output=True, timeout=timeout).returncode == 0
    except (OSError, subprocess.SubprocessError):
        return False


# Install dependencies
print("Installing dependencies...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "Pillow"])

# Install pngquant for high-quality PNG compression (best effort: its
# availability is verified below, so a failed install is not fatal).
print("Installing pngquant...")
_run_quietly(["apt-get", "update", "-qq"])
_run_quietly(["apt-get", "install", "-y", "-qq", "pngquant"])

from PIL import Image

# Check FFmpeg; try to install it when missing and re-check afterwards so the
# user learns right away whether transcoding can work.
if _run_quietly(["ffmpeg", "-version"]):
    print("FFmpeg: OK")
else:
    print("Installing FFmpeg...")
    _run_quietly(["apt-get", "install", "-y", "-qq", "ffmpeg"])
    if _run_quietly(["ffmpeg", "-version"]):
        print("FFmpeg: OK")
    else:
        print("FFmpeg: Not available (video/audio transcoding will fail)")

# Check pngquant
PNGQUANT_AVAILABLE = _run_quietly(["pngquant", "--version"])
if PNGQUANT_AVAILABLE:
    print("pngquant: OK")
else:
    print("pngquant: Not available (will use Pillow fallback)")

# GPU Detection
GPU_AVAILABLE = False
NVENC_AVAILABLE = False
try:
    result = subprocess.run(["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
                            capture_output=True, text=True, timeout=5)
    GPU_AVAILABLE = result.returncode == 0
except (OSError, subprocess.SubprocessError):
    # nvidia-smi is missing or hung: treat as "no GPU".
    GPU_AVAILABLE = False

if GPU_AVAILABLE:
    print(f"GPU: {result.stdout.strip()}")
    try:
        # The encoder list only shows what this FFmpeg build was compiled with.
        encoders = subprocess.run(["ffmpeg", "-encoders"], capture_output=True,
                                  text=True, timeout=30)
        NVENC_AVAILABLE = "h264_nvenc" in encoders.stdout
    except (OSError, subprocess.SubprocessError):
        NVENC_AVAILABLE = False
    print(f"NVENC: {'Available' if NVENC_AVAILABLE else 'Not available'}")
else:
    print("GPU: Not available (CPU mode)")

# === CORE CLASSES ===
class MediaType(Enum):
    """Broad category of a media file, as assigned by get_media_type() from its extension."""
    IMAGE = "image"      # extensions listed in IMAGE_EXT
    VIDEO = "video"      # extensions listed in VIDEO_EXT
    AUDIO = "audio"      # extensions listed in AUDIO_EXT
    VECTOR = "vector"    # extensions listed in VECTOR_EXT
    UNKNOWN = "unknown"  # any extension not listed in the sets above

@dataclass
class MediaFile:
    """Record describing one file found in the extracted package's media folder."""
    path: Path  # location of the file on disk
    name: str  # file name; the optimize cell fills this from path.name
    size: int  # size in bytes at the time the record was created
    media_type: MediaType  # category derived from the extension
    extension: str  # file suffix; the optimize cell stores it lower-cased
    is_referenced: bool = True  # False when no relationship file points at this media file

# Extension sets (lower-case, with leading dot) used by get_media_type() and
# optimize_image() to decide how a media file is handled.
# Raster images: classified as MediaType.IMAGE.
IMAGE_EXT = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"}
# Vector images: classified as MediaType.VECTOR; optimize_image() returns early for them.
VECTOR_EXT = {".emf", ".wmf", ".svg"}
# Video containers: classified as MediaType.VIDEO.
VIDEO_EXT = {".mp4", ".avi", ".mov", ".wmv", ".m4v", ".mkv", ".webm"}
# Audio formats: classified as MediaType.AUDIO.
AUDIO_EXT = {".mp3", ".wav", ".m4a", ".wma", ".aac", ".ogg"}
# optimize_image() returns early for these; get_media_type() reports them as UNKNOWN
# because they are in none of the sets above.
# NOTE(review): presumably GIFs are skipped to keep animations intact - confirm.
SKIP_EXT = {".gif"}

def get_media_type(ext):
    """Map a file extension (with leading dot, any case) to its MediaType.

    Extensions found in none of the known sets yield MediaType.UNKNOWN.
    """
    normalized = ext.lower()
    # Checked in this order; the first set containing the extension wins.
    categories = (
        (IMAGE_EXT, MediaType.IMAGE),
        (VECTOR_EXT, MediaType.VECTOR),
        (VIDEO_EXT, MediaType.VIDEO),
        (AUDIO_EXT, MediaType.AUDIO),
    )
    for known_extensions, media_type in categories:
        if normalized in known_extensions:
            return media_type
    return MediaType.UNKNOWN

def format_size(b):
    """Format a byte count as a human-readable string such as "1.5 MB".

    Uses 1024-based units from B up to TB with one decimal place. Negative
    values (e.g. a "saved" amount when the output grew) are scaled by their
    magnitude and keep their sign, so -2048 becomes "-2.0 KB".
    """
    sign = "-" if b < 0 else ""
    b = abs(b)
    for u in ['B', 'KB', 'MB', 'GB']:
        if b < 1024:
            return f"{sign}{b:.1f} {u}"
        b /= 1024
    return f"{sign}{b:.1f} TB"

def extract_pptx(pptx_path, extract_dir):
    """Unpack the PPTX (ZIP) archive at *pptx_path* into *extract_dir*.

    Returns True when extraction succeeds. If the file is not a valid ZIP
    archive an error message is printed and False is returned; any other
    error propagates to the caller.
    """
    succeeded = False
    try:
        with zipfile.ZipFile(pptx_path, mode='r') as archive:
            archive.extractall(path=extract_dir)
    except zipfile.BadZipFile:
        print("Error: Invalid PPTX file")
    else:
        succeeded = True
    return succeeded

def repackage_pptx(source_dir, output_path):
    """Zip every file under *source_dir* into *output_path* using DEFLATE.

    Archive member names are the file paths relative to *source_dir*.
    Always returns True; errors propagate as exceptions.
    """
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subdirs, filenames in os.walk(source_dir):
            for filename in filenames:
                absolute = os.path.join(folder, filename)
                member_name = os.path.relpath(absolute, source_dir)
                archive.write(absolute, member_name)
    return True

def get_referenced_media(extract_dir):
    """Collect the names of media files referenced anywhere in the extracted package.

    Every ``*.rels`` relationship file below *extract_dir* is scanned, not just
    those of slides, layouts and masters. Media referenced only from other
    parts (notes slides, charts, diagrams, ...) must also count as "in use",
    otherwise the unused-media cleanup would delete files the deck still needs.

    Returns a set of file names: the part of each relationship Target after
    the last "media/". A relationship file that cannot be read or parsed is
    reported with a warning and skipped.
    """
    referenced = set()
    ns = {"r": "http://schemas.openxmlformats.org/package/2006/relationships"}
    package_root = Path(extract_dir)
    if not package_root.is_dir():
        return referenced

    for rels_file in package_root.rglob("*.rels"):
        try:
            relationships = ET.parse(rels_file).getroot().findall(".//r:Relationship", ns)
        except (ET.ParseError, OSError) as e:
            # Make it visible: references in this file are unknown to the caller.
            print(f"Warning: could not read {rels_file.name}: {e}")
            continue
        for rel in relationships:
            target = rel.get("Target", "")
            if "media/" in target:
                referenced.add(target.split("media/")[-1])
    return referenced

# === IMAGE OPTIMIZATION ===
def optimize_png_pngquant(file_path, quality_min=40, quality_max=70):
    """Optimize a PNG in place using pngquant (lossy but high quality).

    Args:
        file_path: Path of the PNG file; it is overwritten on success.
        quality_min: Lower bound passed to pngquant's ``--quality`` range.
        quality_max: Upper bound passed to pngquant's ``--quality`` range.

    Returns:
        ``(ok, original_size, new_size)``. ``ok`` is True when pngquant either
        wrote a smaller file or deliberately kept the original (exit status 98
        or 99). It is False when pngquant could not be run, timed out or
        reported an error; both sizes then equal the original size.
    """
    original_size = file_path.stat().st_size

    # --ext .png together with --force makes pngquant overwrite the input
    # instead of writing a new "-fs8.png" file next to it.
    cmd = [
        "pngquant",
        "--quality", f"{quality_min}-{quality_max}",
        "--force",
        "--skip-if-larger",
        "--ext", ".png",  # Replace original
        "--strip",  # Remove metadata
        str(file_path)
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True,
                                errors="replace", timeout=60)
    except (OSError, subprocess.SubprocessError):
        return False, original_size, original_size

    # 0  = file rewritten
    # 98 = result would be larger (--skip-if-larger), original kept
    # 99 = quality below the requested minimum, original kept
    if result.returncode not in (0, 98, 99):
        return False, original_size, original_size
    return True, original_size, file_path.stat().st_size

def _flatten_for_jpeg(img):
    """Return *img* in a mode the JPEG encoder accepts, compositing alpha onto white."""
    if img.mode == "LA" or (img.mode == "P" and "transparency" in img.info):
        img = img.convert("RGBA")
    if img.mode == "RGBA":
        background = Image.new("RGB", img.size, (255, 255, 255))
        background.paste(img, mask=img.split()[3])
        return background
    if img.mode not in ("L", "RGB", "CMYK"):
        return img.convert("RGB")
    return img


def optimize_image(file_path, jpeg_quality=65, png_quality=70, max_width=1600):
    """Optimize an image in place: downscale if too wide, then recompress.

    Only JPEG and PNG files are rewritten. Other raster formats are opened
    (so unreadable files are reported) but left untouched.

    The original bytes are kept in memory while working. If anything fails,
    or the result is not smaller than the original, the original file content
    is restored, so the file is never left truncated or larger than before.

    Args:
        file_path: Path of the image file.
        jpeg_quality: Pillow JPEG quality used when re-encoding JPEGs.
        png_quality: Upper bound of the pngquant quality range for PNGs.
        max_width: Images wider than this are scaled down proportionally;
            a value of 0 or less disables resizing.

    Returns:
        ``(ok, original_size, new_size)``. Vector and skipped extensions give
        ``(False, 0, 0)``; a processing error gives
        ``(False, original_size, original_size)``.
    """
    ext = file_path.suffix.lower()
    if ext in VECTOR_EXT or ext in SKIP_EXT:
        return False, 0, 0

    original_size = file_path.stat().st_size
    is_jpeg = ext in (".jpg", ".jpeg")
    is_png = ext == ".png"
    backup = None

    try:
        if is_jpeg or is_png:
            # Snapshot so a failed or counter-productive rewrite can be undone.
            backup = file_path.read_bytes()

        with Image.open(file_path) as source:
            img = source
            resized = bool(
                (is_jpeg or is_png)
                and max_width and max_width > 0
                and source.width > max_width
            )
            if resized:
                scale = max_width / source.width
                # max(1, ...) keeps extremely wide, flat images from collapsing to 0 px.
                target = (max_width, max(1, int(source.height * scale)))
                img = source.resize(target, Image.Resampling.LANCZOS)

            if is_jpeg:
                img.load()  # pixels must be in memory before the file is overwritten
                _flatten_for_jpeg(img).save(file_path, "JPEG", quality=jpeg_quality, optimize=True)
            elif is_png and (resized or not PNGQUANT_AVAILABLE):
                # Pillow writes the resized PNG and is also the fallback
                # compressor when pngquant is unavailable.
                img.load()
                img.save(file_path, "PNG", optimize=True)

        # For PNG, use pngquant (even if not resized)
        if is_png and PNGQUANT_AVAILABLE:
            optimize_png_pngquant(file_path,
                                  quality_min=min(40, png_quality),
                                  quality_max=png_quality)

        new_size = file_path.stat().st_size
        if backup is not None and new_size >= original_size:
            # No gain: keep the untouched original rather than a lossy copy.
            file_path.write_bytes(backup)
            new_size = original_size
        return True, original_size, new_size

    except Exception as e:
        # Best-effort boundary: Pillow raises many exception types for bad files.
        print(f"Warning: {e}")
        if backup is not None:
            try:
                file_path.write_bytes(backup)
            except OSError as restore_error:
                print(f"Warning: could not restore {file_path.name}: {restore_error}")
        return False, original_size, original_size

# === VIDEO OPTIMIZATION ===
def transcode_video(input_path, output_path, bitrate="1.5M", use_gpu=True):
    """Transcode a video to H.264 video / AAC audio with FFmpeg.

    When *use_gpu* is true and NVENC was detected, the NVENC encoder is tried
    first. If that attempt fails, the CPU encoder (libx264) is used as a
    fallback instead of giving up.

    Args:
        input_path: Path of the source video.
        output_path: Path the transcoded file is written to (overwritten).
        bitrate: Target video bitrate for the NVENC encoder. The libx264 path
            uses a fixed CRF of 26 and ignores this value.
        use_gpu: Allow the NVENC encoder.

    Returns:
        ``(ok, original_size, new_size)``; on failure ``ok`` is False and both
        sizes equal the original size.
    """
    original_size = input_path.stat().st_size
    base = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "warning"]
    audio_and_output = ["-c:a", "aac", "-b:a", "96k", str(output_path)]

    attempts = []
    if use_gpu and NVENC_AVAILABLE:
        attempts.append(("GPU", base + [
            "-hwaccel", "cuda", "-i", str(input_path),
            "-c:v", "h264_nvenc", "-preset", "fast", "-b:v", bitrate,
        ] + audio_and_output))
    attempts.append(("CPU", base + [
        "-i", str(input_path),
        "-c:v", "libx264", "-preset", "fast", "-crf", "26",
    ] + audio_and_output))

    for label, cmd in attempts:
        try:
            result = subprocess.run(cmd, capture_output=True, text=True,
                                    errors="replace", timeout=600)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Error: {e}")
            continue
        if result.returncode == 0 and output_path.exists():
            return True, original_size, output_path.stat().st_size
        # Show the last line of FFmpeg's diagnostics so the failure is explainable.
        tail = result.stderr.strip().splitlines()[-1:]
        if tail:
            print(f"Error ({label} encode): {tail[0]}")
    return False, original_size, original_size

def transcode_audio(input_path, output_path):
    """Transcode an audio file to AAC at 96 kbit/s with FFmpeg.

    Args:
        input_path: Path of the source audio file.
        output_path: Path the transcoded file is written to (overwritten).

    Returns:
        ``(ok, original_size, new_size)``; on failure ``ok`` is False and both
        sizes equal the original size.
    """
    original_size = input_path.stat().st_size
    cmd = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "warning",
           "-i", str(input_path), "-c:a", "aac", "-b:a", "96k", str(output_path)]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True,
                                errors="replace", timeout=120)
    except (OSError, subprocess.SubprocessError) as e:
        # Report instead of silently swallowing (missing ffmpeg, timeout, ...).
        print(f"Error: {e}")
        return False, original_size, original_size
    if result.returncode == 0 and output_path.exists():
        return True, original_size, output_path.stat().st_size
    return False, original_size, original_size

# Closing banner for the setup cell: a blank line, a rule, the message, a rule.
print(
    "\n" + "=" * 50,
    "Setup complete! Now run Cell 2 to optimize.",
    "=" * 50,
    sep="\n",
)

In [None]:
#@title 2. Upload & Optimize
#@markdown ### Settings
# The trailing "#@param" annotations are Colab form-field declarations; keep
# each one on the same line as its assignment.
# JPEG quality passed to optimize_image() for re-encoding JPEG files.
jpeg_quality = 65 #@param {type:"slider", min:30, max:95, step:5}
# Upper bound of the pngquant quality range used for PNG files.
png_quality = 70 #@param {type:"slider", min:40, max:95, step:5}
# Images wider than this many pixels are scaled down proportionally.
max_image_width = 1600 #@param {type:"integer"}
# Target video bitrate; transcode_video() applies it on the NVENC path only
# (the libx264 path uses a fixed CRF).
video_bitrate = "1.5M" #@param ["1M", "1.5M", "2M", "3M"] {allow-input: true}
# Delete media files that no relationship file references.
remove_unused = True #@param {type:"boolean"}
# Enable image optimization.
optimize_images_flag = True #@param {type:"boolean"}
# Enable video transcoding; audio transcoding is gated by this flag as well.
optimize_videos_flag = True #@param {type:"boolean"}

from google.colab import files

# Upload
print("Select your PPTX file:")
uploaded = files.upload()
if not uploaded:
    raise Exception("No file uploaded")

INPUT_FILE = next(iter(uploaded))
original_size = os.path.getsize(INPUT_FILE)
print(f"\nFile: {INPUT_FILE} ({format_size(original_size)})")

# Build the output name from the file stem so it can never equal the input
# name (a plain ".pptx" text replacement leaves names such as "deck.pptm" or
# "deck.Pptx" unchanged, and the input would then be overwritten).
_input_path = Path(INPUT_FILE)
_out_suffix = ".pptx" if _input_path.suffix.lower() in ("", ".pptx") else _input_path.suffix
output_file = str(_input_path.with_name(f"{_input_path.stem}_optimized{_out_suffix}"))

# Process
start_time = time.time()
work_dir = tempfile.mkdtemp(prefix="pptx_opt_")
stats = {"images": 0, "videos": 0, "audio": 0, "unused": 0, "saved": 0}

try:
    print("\nExtracting...")
    if not extract_pptx(INPUT_FILE, work_dir):
        raise Exception("Failed to extract PPTX")

    media_dir = Path(work_dir) / "ppt" / "media"

    if media_dir.exists():
        referenced = get_referenced_media(work_dir)

        media_files = [
            MediaFile(
                path=f, name=f.name, size=f.stat().st_size,
                media_type=get_media_type(f.suffix.lower()), extension=f.suffix.lower(),
                is_referenced=f.name in referenced
            )
            for f in media_dir.iterdir() if f.is_file()
        ]

        # Remove unused
        if remove_unused:
            unused = [m for m in media_files if not m.is_referenced]
            if unused:
                print(f"\nRemoving {len(unused)} unused files...")
                for m in unused:
                    try:
                        m.path.unlink()
                    except OSError as e:
                        print(f"  Could not remove {m.name}: {e}")
                        continue
                    stats["unused"] += 1
                    stats["saved"] += m.size
                    print(f"  Removed {m.name} ({format_size(m.size)})")

        # Optimize images
        if optimize_images_flag:
            images = [m for m in media_files if m.media_type == MediaType.IMAGE and m.is_referenced]
            if images:
                print(f"\nOptimizing {len(images)} images...")
                for m in images:
                    if not m.path.exists(): continue
                    print(f"  {m.name}...", end=" ", flush=True)
                    ok, orig, new = optimize_image(m.path, jpeg_quality, png_quality, max_image_width)
                    if ok:
                        saved = orig - new
                        pct = (saved/orig*100) if orig > 0 else 0
                        print(f"{format_size(orig)} -> {format_size(new)} ({pct:.0f}% saved)")
                        stats["images"] += 1
                        stats["saved"] += saved
                    else:
                        print("skipped")

        # Optimize videos
        if optimize_videos_flag:
            videos = [m for m in media_files if m.media_type == MediaType.VIDEO and m.is_referenced]
            if videos:
                encoder = "GPU" if NVENC_AVAILABLE else "CPU"
                print(f"\nTranscoding {len(videos)} videos ({encoder})...")
                for m in videos:
                    if not m.path.exists(): continue
                    print(f"  {m.name}...", end=" ", flush=True)
                    # NOTE(review): the MP4 result replaces the file under its
                    # ORIGINAL name, so e.g. an .avi part ends up holding MP4
                    # data - confirm PowerPoint plays such files.
                    temp_out = m.path.with_suffix(".temp.mp4")
                    ok, orig, new = transcode_video(m.path, temp_out, video_bitrate, NVENC_AVAILABLE)
                    if ok and new < orig:
                        # replace() swaps the file in one step; no window
                        # where the media file is missing.
                        temp_out.replace(m.path)
                        saved = orig - new
                        pct = (saved/orig*100) if orig > 0 else 0
                        print(f"{format_size(orig)} -> {format_size(new)} ({pct:.0f}% saved)")
                        stats["videos"] += 1
                        stats["saved"] += saved
                    else:
                        # Never swap in a transcode that is not smaller.
                        print("kept original (no size gain)" if ok else "failed")
                        if temp_out.exists(): temp_out.unlink()

            # Audio
            audio = [m for m in media_files if m.media_type == MediaType.AUDIO and m.is_referenced]
            if audio:
                print(f"\nTranscoding {len(audio)} audio files...")
                for m in audio:
                    if not m.path.exists(): continue
                    print(f"  {m.name}...", end=" ", flush=True)
                    # NOTE(review): as with video, the AAC/M4A result keeps the
                    # original file name and extension - confirm playback.
                    temp_out = m.path.with_suffix(".temp.m4a")
                    ok, orig, new = transcode_audio(m.path, temp_out)
                    if ok and new < orig:
                        temp_out.replace(m.path)
                        saved = orig - new
                        print(f"{format_size(orig)} -> {format_size(new)}")
                        stats["audio"] += 1
                        stats["saved"] += saved
                    else:
                        print("kept original (no size gain)" if ok else "failed")
                        if temp_out.exists(): temp_out.unlink()

    # Repackage
    print("\nRepackaging...")
    repackage_pptx(work_dir, output_file)
finally:
    # Always remove the extraction directory, even when a step above raised.
    shutil.rmtree(work_dir, ignore_errors=True)

# Results
new_size = os.path.getsize(output_file)
elapsed = time.time() - start_time
total_saved = original_size - new_size
pct_saved = (total_saved / original_size * 100) if original_size > 0 else 0

print(f"\n{'='*60}")
print("DONE!")
print(f"{'='*60}")
print(f"Original:  {format_size(original_size)}")
print(f"Optimized: {format_size(new_size)}")
print(f"Saved:     {format_size(total_saved)} ({pct_saved:.1f}%)")
print(f"Time:      {elapsed:.1f}s")
print(f"\nImages: {stats['images']} | Videos: {stats['videos']} | Audio: {stats['audio']} | Unused removed: {stats['unused']}")

# Auto-download
print(f"\nDownloading {output_file}...")
files.download(output_file)

In [None]:
#@title 3. Analysis Only (Optional)
#@markdown Run this to analyze without optimizing.

from google.colab import files

print("Select PPTX to analyze:")
uploaded = files.upload()
if not uploaded:
    raise Exception("No file uploaded")

INPUT_FILE = next(iter(uploaded))
print(f"\nAnalyzing {INPUT_FILE}...\n")

work_dir = tempfile.mkdtemp(prefix="pptx_analyze_")
try:
    # Stop on an invalid archive instead of reporting "No media found."
    if not extract_pptx(INPUT_FILE, work_dir):
        raise Exception("Failed to extract PPTX")

    media_dir = Path(work_dir) / "ppt" / "media"
    if not media_dir.exists():
        print("No media found.")
    else:
        referenced = get_referenced_media(work_dir)
        total_size = 0
        unused_size = 0

        print(f"{'='*70}")
        print(f"{'File':<35} {'Type':<8} {'Size':>10} {'Status':<10}")
        print(f"{'-'*70}")

        # One table row per media file, sorted by path.
        for f in sorted(media_dir.iterdir()):
            if not f.is_file(): continue
            size = f.stat().st_size
            mtype = get_media_type(f.suffix.lower())
            is_ref = f.name in referenced
            status = "" if is_ref else "UNUSED"

            print(f"{f.name:<35} {mtype.value:<8} {format_size(size):>10} {status:<10}")
            total_size += size
            if not is_ref:
                unused_size += size

        print(f"{'-'*70}")
        print(f"Total media: {format_size(total_size)}")
        print(f"File size:   {format_size(os.path.getsize(INPUT_FILE))}")
        if unused_size > 0:
            print(f"Unused:      {format_size(unused_size)} (can be removed)")
finally:
    # Always remove the extraction directory, even when analysis raised.
    shutil.rmtree(work_dir, ignore_errors=True)