# Unity Catalog Volume - Excel File Cleanup

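This notebook recursively scans Unity Catalog Volume paths and deletes Excel files (`.xlsx`, `.xls`) whose last-modified time is older than a configurable retention period. It runs in dry-run mode by default (eligible files are listed but not deleted); set `dry_run` to `false` to actually delete them.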

**Scheduled**: Daily via Databricks Workflow  
**Configuration**: All parameters are exposed as Databricks widgets for job-level overrides.

## 1. Configuration - Widget Parameters

In [None]:
# ---------------------------------------------------------------
# Notebook parameters, registered as Databricks widgets so that a
# job or task can override any of them without editing the code.
# ---------------------------------------------------------------

# Free-text widgets as (name, default value, UI label); they are
# registered in the order listed here.
_TEXT_WIDGET_SPECS = (
    (
        "volume_paths",
        "/Volumes/catalog1/schema1/volume1,/Volumes/catalog1/schema1/volume2",
        "Comma-separated volume paths to scan",
    ),
    ("retention_days", "29", "Delete files older than this many days"),
    ("file_extensions", ".xlsx,.xls", "Comma-separated file extensions to target"),
)
for _widget_name, _widget_default, _widget_label in _TEXT_WIDGET_SPECS:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

# The dry-run switch defaults to "true", so nothing is deleted unless
# the value is explicitly changed to "false".
dbutils.widgets.dropdown(
    "dry_run",
    "true",
    ["true", "false"],
    "Dry run mode (true = list only, false = delete)",
)

## 2. Parse Widget Values

In [None]:
import os
import time
from datetime import datetime, timezone

# --- Parse widgets into typed config ---
# Every widget value arrives as a string. Values are validated here, before
# any file is touched, because a bad value in this notebook can cause
# unintended deletions.

# Comma-separated list; blank entries (e.g. from a trailing comma) are dropped.
VOLUME_PATHS = [
    p.strip() for p in dbutils.widgets.get("volume_paths").split(",") if p.strip()
]

# A negative retention would place the cutoff in the future and make every
# matching file eligible for deletion, so reject it explicitly.
_retention_raw = dbutils.widgets.get("retention_days")
try:
    RETENTION_DAYS = int(_retention_raw)
except ValueError:
    raise ValueError(
        f"retention_days must be a whole number of days, got {_retention_raw!r}"
    ) from None
if RETENTION_DAYS < 0:
    raise ValueError(f"retention_days must be >= 0, got {RETENTION_DAYS}")

# Lower-cased so they can be compared against lower-cased file names.
# Kept as a tuple because str.endswith accepts a tuple of suffixes.
FILE_EXTENSIONS = tuple(
    ext.strip().lower() for ext in dbutils.widgets.get("file_extensions").split(",") if ext.strip()
)

# Fail closed: only an explicit "false" enables deletion and only an explicit
# "true" enables dry run. Previously any unrecognised value (e.g. " true",
# "yes", "1") silently switched the notebook into live-delete mode.
_dry_run_raw = dbutils.widgets.get("dry_run").strip().lower()
if _dry_run_raw not in ("true", "false"):
    raise ValueError(f"dry_run must be 'true' or 'false', got {_dry_run_raw!r}")
DRY_RUN = _dry_run_raw == "true"

# Pre-compute the cutoff timestamp (seconds since epoch) once, so every file
# in this run is compared against the same instant.
CUTOFF_EPOCH = time.time() - (RETENTION_DAYS * 86400)
CUTOFF_DATE = datetime.fromtimestamp(CUTOFF_EPOCH, tz=timezone.utc)

print("=" * 60)
print("CONFIGURATION")
print("=" * 60)
print(f"Volume paths     : {VOLUME_PATHS}")
print(f"Retention days   : {RETENTION_DAYS}")
print(f"File extensions  : {FILE_EXTENSIONS}")
print(f"Dry run          : {DRY_RUN}")
print(f"Cutoff date (UTC): {CUTOFF_DATE:%Y-%m-%d %H:%M:%S}")
print("=" * 60)

## 3. Helper Functions

In [None]:
def scan_excel_files(base_path: str, extensions: tuple[str, ...]) -> list[str]:
    """Collect every file under ``base_path`` whose name ends with a target suffix.

    The tree is traversed with ``os.walk``. File names are lower-cased before
    the suffix comparison, so matching is case-insensitive as long as
    ``extensions`` contains lower-case suffixes.

    Args:
        base_path: Root directory to walk.
        extensions: Suffixes to match, e.g. ``(".xlsx", ".xls")``.

    Returns:
        Paths (directory joined with file name) of the matching files, in the
        order ``os.walk`` yields them.
    """
    return [
        os.path.join(directory, filename)
        for directory, _subdirs, filenames in os.walk(base_path)
        for filename in filenames
        if filename.lower().endswith(extensions)
    ]


def get_file_age_days(file_path: str) -> float:
    """Compute how many days have passed since the file was last modified.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Elapsed time since the file's modification time, as fractional days.
    """
    seconds_per_day = 86400
    modified_at = os.path.getmtime(file_path)
    elapsed_seconds = time.time() - modified_at
    return elapsed_seconds / seconds_per_day


def is_older_than_cutoff(file_path: str, cutoff_epoch: float) -> bool:
    """Tell whether the file was last modified strictly before the cutoff.

    Args:
        file_path: Path of the file to inspect.
        cutoff_epoch: Cutoff instant in seconds since the epoch.

    Returns:
        True when the modification time is earlier than ``cutoff_epoch``;
        False when it is equal to or later than the cutoff.
    """
    modified_at = os.path.getmtime(file_path)
    return cutoff_epoch > modified_at


def delete_file(file_path: str, dry_run: bool) -> dict:
    """Delete a single file (or only report on it) and describe the outcome.

    File metadata is read with one ``os.stat`` call. Failures while reading
    the metadata or while removing the file are captured in the returned
    dict instead of being raised, so one bad file does not abort a batch.

    Args:
        file_path: Path of the file to remove.
        dry_run: When True the file is left in place and only reported.

    Returns:
        A dict with the keys ``file_path``, ``size_bytes``, ``age_days``
        (rounded to one decimal), ``modified_utc`` (``YYYY-MM-DD HH:MM:SS``)
        and ``status``. ``status`` is ``"dry_run_skipped"``, ``"deleted"`` or
        ``"error: <message>"``. If the metadata could not be read,
        ``size_bytes`` is 0 and ``age_days`` / ``modified_utc`` are None.
    """
    result = {
        "file_path": file_path,
        "size_bytes": 0,
        "age_days": None,
        "modified_utc": None,
        "status": "pending",
    }

    # The file may have been removed or become unreadable since it was
    # listed; report that as an error result rather than raising.
    try:
        info = os.stat(file_path)
    except (OSError, ValueError) as exc:
        result["status"] = f"error: {exc}"
        return result

    mtime = info.st_mtime
    result["size_bytes"] = info.st_size
    result["age_days"] = round((time.time() - mtime) / 86400, 1)
    result["modified_utc"] = datetime.fromtimestamp(mtime, tz=timezone.utc).strftime(
        "%Y-%m-%d %H:%M:%S"
    )

    if dry_run:
        result["status"] = "dry_run_skipped"
        return result

    try:
        os.remove(file_path)
    except (OSError, ValueError) as exc:
        result["status"] = f"error: {exc}"
    else:
        result["status"] = "deleted"

    return result

## 4. Scan and Delete Old Excel Files

In [None]:
# Per-file result dicts for every file that was eligible or failed.
all_results = []
# Running totals, read later by the summary report and the exit cell.
summary = {
    "paths_scanned": 0,
    "files_scanned": 0,
    "files_eligible": 0,
    "files_deleted": 0,
    "errors": 0,
    "total_bytes_freed": 0,
}

for vol_path in VOLUME_PATHS:
    print(f"\nScanning: {vol_path}")

    # A missing or non-directory path is skipped with a warning; it is not
    # counted as scanned.
    if not os.path.isdir(vol_path):
        print("  WARNING: Path does not exist or is not a directory - skipping")
        continue

    summary["paths_scanned"] += 1
    excel_files = scan_excel_files(vol_path, FILE_EXTENSIONS)
    print(f"  Found {len(excel_files)} Excel file(s)")
    summary["files_scanned"] += len(excel_files)

    for fpath in excel_files:
        # A file can disappear or become unreadable between the scan and this
        # check. Record that as a per-file error and keep going, instead of
        # letting one OSError abort the whole run part-way through.
        try:
            if not is_older_than_cutoff(fpath, CUTOFF_EPOCH):
                continue
            summary["files_eligible"] += 1
            result = delete_file(fpath, DRY_RUN)
        except OSError as exc:
            result = {
                "file_path": fpath,
                "size_bytes": 0,
                "age_days": None,
                "modified_utc": None,
                "status": f"error: {exc}",
            }

        all_results.append(result)

        if result["status"] == "deleted":
            summary["files_deleted"] += 1
            summary["total_bytes_freed"] += result["size_bytes"]
        elif result["status"].startswith("error"):
            summary["errors"] += 1

        action = result["status"].upper()
        print(f"  [{action}] {result['file_path']} | age={result['age_days']}d | size={result['size_bytes']}B | modified={result['modified_utc']}")

## 5. Summary Report

In [None]:
# Print the run totals collected in `summary`, followed by the per-file
# error messages and a reminder when the run was a dry run.
divider = "=" * 60
mode_label = "DRY RUN" if DRY_RUN else "LIVE DELETE"

print("\n" + divider)
print("EXECUTION SUMMARY")
print(divider)
print(f"Mode               : {mode_label}")
print(f"Volume paths scanned: {summary['paths_scanned']}")
print(f"Total Excel files   : {summary['files_scanned']}")
print(f"Eligible for delete : {summary['files_eligible']} (older than {RETENTION_DAYS} days)")
if not DRY_RUN:
    # Bytes are converted to mebibytes for display only.
    freed_mb = summary['total_bytes_freed'] / (1024*1024)
    print(f"Files deleted       : {summary['files_deleted']}")
    print(f"Space freed         : {freed_mb:.2f} MB")
else:
    print("Files deleted       : 0 (dry run - no files removed)")
print(f"Errors              : {summary['errors']}")
print(divider)

if summary["errors"] > 0:
    print("\nERROR DETAILS:")
    failed_entries = [entry for entry in all_results if entry["status"].startswith("error")]
    for entry in failed_entries:
        print(f"  {entry['file_path']} -> {entry['status']}")

eligible_count = summary["files_eligible"]
if DRY_RUN and eligible_count > 0:
    print(f"\nNOTE: Set dry_run=false to actually delete the {eligible_count} eligible file(s).")

## 6. Exit with Error if Failures Occurred

In [None]:
# dbutils.notebook.exit() ends the notebook run as *successful* and only
# hands back a string, so using it on the error path meant a scheduled job
# never failed when deletions failed. Raising marks the task as failed so
# the workflow's failure handling can react.
if summary["errors"] > 0:
    raise RuntimeError(
        f"COMPLETED_WITH_ERRORS: {summary['files_deleted']} deleted, {summary['errors']} errors"
    )

# No errors: finish normally and return a short status string to the caller.
mode = "DRY_RUN" if DRY_RUN else "LIVE"
dbutils.notebook.exit(
    f"SUCCESS ({mode}): {summary['files_eligible']} eligible, {summary['files_deleted']} deleted"
)