In [1]:
from pathlib import Path
from pyiceberg.catalog.sql import SqlCatalog
import polars as pl

# All paths are derived from the current working directory, so the notebook
# must be started from the directory that contains the `output/` folder.
SCRIPT_DIR = Path(".").resolve()
WAREHOUSE_DIR = SCRIPT_DIR / "output" / "warehouse"
CATALOG_DB = SCRIPT_DIR / "output" / "catalog.db"

# Fail early with a clear message. Pointing SQLite at a path that does not
# exist can silently create a new, empty database file, and the problem would
# then only surface later as a missing-table error from load_table().
if not CATALOG_DB.is_file():
    raise FileNotFoundError(
        f"Catalog database not found at {CATALOG_DB}; "
        "start the notebook from the directory that contains 'output/'."
    )

# SQLite-backed Iceberg catalog; table data lives under WAREHOUSE_DIR.
catalog = SqlCatalog(
    "local",
    uri=f"sqlite:///{CATALOG_DB}",
    warehouse=str(WAREHOUSE_DIR),
)

# Load the table once; the remaining cells reuse this `table` handle.
table = catalog.load_table("default.employees")
print(f"Table: {table.name()}")
print(f"Location: {table.location()}")

Table: ('default', 'employees')
Location: /mnt/c/Work/playground/components/de/iceberg/upload/output/warehouse/default/employees


## Schema

In [2]:
# One line per column: field id, column name, Iceberg type, and nullability.
for column in table.schema().fields:
    nullability = "required" if column.required else "optional"
    print(f"  {column.field_id:>3}  {column.name:<20} {column.field_type}  {nullability}")

    1  id                   string  optional
    2  name                 string  optional
    3  age                  string  optional
    4  department           string  optional
    5  email                string  optional
    6  full_name            string  optional


## Snapshots (version history)

In [3]:
# One line per snapshot: its id, its parent id, and the total record count
# reported in the snapshot summary.
for snap in table.metadata.snapshots:
    # PyIceberg types `Snapshot.summary` as optional, so guard against None
    # instead of raising AttributeError; '?' marks an unknown record count.
    summary = snap.summary
    if summary is None:
        total_records = "?"
    else:
        total_records = summary.additional_properties.get("total-records", "?")
    print(f"  Snapshot {snap.snapshot_id} | parent={snap.parent_snapshot_id} | records={total_records}")

  Snapshot 2031915129896079601 | parent=None | records=9


## Full table scan

In [4]:
# Unfiltered scan of the table, converted Arrow -> Polars.
# `df` is reused by the cells below.
df = pl.from_arrow(
    table.scan().to_arrow()
)
print(f"Rows: {df.height}  Columns: {df.width}")
df

Rows: 9  Columns: 6


id,name,age,department,email,full_name
str,str,str,str,str,str
"""7""","""Grace Lee""","""28""","""Sales""","""grace@example.com""",
"""8""","""Hank Wilson""","""42""","""Engineering""","""hank@example.com""",
"""9""","""Ivy Chen""","""31""","""Marketing""","""ivy@example.com""",
"""4""","""Dave Brown""",,"""Sales""",,
"""5""","""Eve Davis""",,"""Engineering""",,
"""6""","""Frank Miller""",,"""Marketing""",,
"""1""",,"""30""","""Engineering""",,"""Alice Smith"""
"""2""",,"""25""","""Marketing""",,"""Bob Jones"""
"""3""",,"""35""","""Engineering""",,"""Carol White"""


## Null analysis (schema evolution gaps)

In [None]:
# Single-row frame holding the number of nulls in each column of `df`.
null_counts = df.select(pl.all().null_count())
null_counts

## Filter examples

In [5]:
# Keep only the rows whose email is populated (from employees_added.csv)
df.drop_nulls(subset=["email"])

id,name,age,department,email,full_name
str,str,str,str,str,str
"""7""","""Grace Lee""","""28""","""Sales""","""grace@example.com""",
"""8""","""Hank Wilson""","""42""","""Engineering""","""hank@example.com""",
"""9""","""Ivy Chen""","""31""","""Marketing""","""ivy@example.com""",


In [None]:
# Keep only the rows populated under `full_name` rather than `name`
# (from employees_renamed.csv)
df.drop_nulls(subset=["full_name"])

In [None]:
# Rows with no age value (from employees_deleted.csv)
age_is_missing = pl.col("age").is_null()
df.filter(age_is_missing)

## Group by department

In [None]:
# Row count per department. group_by() does not guarantee the order of the
# groups by default, so sort explicitly to get the same output on every run.
(
    df.group_by("department")
    .agg(pl.len().alias("count"))
    .sort("department")
)

## Data files on disk

In [None]:
# One line per data file planned by a full scan: base name, format,
# record count, and size in bytes.
for scan_task in table.scan().plan_files():
    data_file = scan_task.file
    base_name = Path(data_file.file_path).name
    print(
        f"  {base_name}  format={data_file.file_format}"
        f"  records={data_file.record_count}  size={data_file.file_size_in_bytes} bytes"
    )