# Iceberg Table Explorer

Analyze the Iceberg table created by `ingest.py`. Run the ingestion first (note that this deletes any existing `output/` directory before rebuilding it):
```bash
rm -rf output && .venv/bin/python ingest.py
```

In [None]:
from pathlib import Path
from pyiceberg.catalog.sql import SqlCatalog
import polars as pl

# All paths are anchored at the resolved current working directory.
SCRIPT_DIR = Path(".").resolve()
OUTPUT_DIR = SCRIPT_DIR / "output"
WAREHOUSE_DIR = OUTPUT_DIR / "warehouse"
CATALOG_DB = OUTPUT_DIR / "catalog.db"

# SQLite-backed catalog: the URI points at the catalog database file and
# the warehouse property at the directory holding the table files.
catalog_properties = {
    "uri": f"sqlite:///{CATALOG_DB}",
    "warehouse": str(WAREHOUSE_DIR),
}
catalog = SqlCatalog("local", **catalog_properties)

# Load the table once; later cells reuse the `table` handle.
table = catalog.load_table("default.employees")
print(f"Table: {table.name()}", f"Location: {table.location()}", sep="\n")

## Schema

In [None]:
# Print one line per schema field: id, name, type, and whether it is required.
for column in table.schema().fields:
    nullability = "required" if column.required else "optional"
    print(f"  {column.field_id:>3}  {column.name:<20} {column.field_type}  {nullability}")

## Snapshots (version history)

In [None]:
# Print one line per snapshot: its id, its parent's id, and the
# 'total-records' summary property ('?' when it is unavailable).
for snap in table.metadata.snapshots:
    # pyiceberg types `Snapshot.summary` as optional, so a snapshot written
    # without a summary would raise AttributeError on attribute access;
    # fall back to the same '?' placeholder used for a missing property.
    summary = snap.summary
    if summary is None:
        total_records = "?"
    else:
        total_records = summary.additional_properties.get("total-records", "?")
    print(f"  Snapshot {snap.snapshot_id} | parent={snap.parent_snapshot_id} | records={total_records}")

## Full table scan

In [None]:
# Scan the table with no filter, convert the result to Arrow, and wrap it
# in a Polars DataFrame that the following cells reuse as `df`.
full_scan = table.scan()
df = pl.from_arrow(full_scan.to_arrow())

# DataFrame.shape is (row count, column count).
row_count, column_count = df.shape
print(f"Rows: {row_count}  Columns: {column_count}")
df

## Null analysis (schema evolution gaps)

In [None]:
# Count the nulls in every column; the result is a single-row frame with
# one count per column of `df`.
null_counts = df.select(pl.all().null_count())
null_counts

## Filter examples

In [None]:
# Keep only the rows whose email is populated (from employees_added.csv)
has_email = pl.col("email").is_not_null()
df.filter(has_email)

In [None]:
# Keep only the rows that carry full_name rather than name
# (from employees_renamed.csv)
has_full_name = pl.col("full_name").is_not_null()
df.filter(has_full_name)

In [None]:
# Keep only the rows where age is null (from employees_deleted.csv)
age_missing = pl.col("age").is_null()
df.filter(age_missing)

## Group by department

In [None]:
# Count rows per department. Polars does not guarantee the order of groups
# returned by group_by, so sort by department to keep the displayed table
# stable from run to run.
df.group_by("department").agg(pl.len().alias("count")).sort("department")

## Data files on disk

In [None]:
# For every file task planned by an unfiltered scan, print the data file's
# base name, format, record count, and size in bytes.
for scan_task in table.scan().plan_files():
    data_file = scan_task.file
    file_name = Path(data_file.file_path).name
    print(f"  {file_name}  format={data_file.file_format}  records={data_file.record_count}  size={data_file.file_size_in_bytes} bytes")