# Run Metadata

**E-Commerce Pipeline Metadata Management**


This script demonstrates how to use clgraph's metadata capabilities:
1. Parse inline SQL comment metadata
2. Manually assign metadata (PII, owner, tags)
3. Propagate metadata through lineage
4. Generate descriptions (with LLM or fallback)
5. Query columns by metadata
6. Trace PII through lineage
7. Export metadata to JSON

### Imports

In [1]:
import json
from pathlib import Path

from clgraph import Pipeline


def load_sql_queries(sql_dir: Path) -> list[tuple[str, str]]:
    """Load all SQL files from a directory in sorted order."""
    queries = []
    for sql_file in sorted(sql_dir.glob("*.sql")):
        with open(sql_file) as f:
            sql = f.read()
        query_name = sql_file.stem
        queries.append((query_name, sql))
        print(f"  Loaded: {query_name}")
    return queries

### Code

In [2]:
print("E-Commerce Pipeline Metadata Management")
print()

# Load SQL files from current directory
sql_dir = Path(".")
queries = load_sql_queries(sql_dir)

print(f"\nLoaded {len(queries)} SQL files")
print()

# Build the pipeline
print("Building pipeline...")
pipeline = Pipeline(queries, dialect="duckdb")
print(f"  Built pipeline with {len(pipeline.table_graph.queries)} queries")
print(f"  Found {len(pipeline.columns)} columns")
print()

E-Commerce Pipeline Metadata Management

  Loaded: 01_raw_orders
  Loaded: 02_raw_customers
  Loaded: 03_raw_products
  Loaded: 04_raw_order_items
  Loaded: 05_stg_orders_enriched
  Loaded: 06_int_daily_metrics
  Loaded: 07_mart_customer_ltv
  Loaded: 08_mart_product_perf

Loaded 8 SQL files

Building pipeline...
  Built pipeline with 8 queries
  Found 326 columns



### Section

In [3]:
print("1. INLINE SQL COMMENT METADATA")
print("""
graph can parse structured metadata from SQL comments in the format:
<description> [key: value, key2: value2, ...]

ample SQL:
SELECT
    email,           -- User email address [pii: true, owner: data-team]
    SUM(amount) as total  /* Total revenue [tags: metric finance] */
FROM ...

pported metadata keys:
- description: Free-text description (before brackets)
- pii: Boolean flag (true/false)
- owner: String identifying data owner
- tags: Space-separated tags
- Any custom key-value pairs
""")

# Show columns that have inline metadata (from SQL comments)
# Metadata is extracted and stored directly on column properties
cols_with_inline_metadata = [
    col
    for col in pipeline.columns.values()
    if col.description_source and col.description_source.value == "source"
]

# Deduplicate by (table_name, column_name)
seen = set()
unique_cols = []
for col in cols_with_inline_metadata:
    key = (col.table_name, col.column_name)
    if key not in seen:
        seen.add(key)
        unique_cols.append(col)

if unique_cols:
    print(f"  Found {len(unique_cols)} columns with inline metadata:")
    for col in unique_cols[:15]:
        print(f"    {col.table_name}.{col.column_name}:")
        if col.description:
            print(f"      description: {col.description}")
        if col.pii:
            print(f"      pii: {col.pii}")
        if col.owner:
            print(f"      owner: {col.owner}")
        if col.tags:
            print(f"      tags: {col.tags}")
    if len(unique_cols) > 15:
        print(f"    ... and {len(unique_cols) - 15} more")
else:
    print("  No columns with inline metadata found in these SQL files.")
    print("  (Add comments like '-- Description [pii: true]' to columns)")
print()

1. INLINE SQL COMMENT METADATA

graph can parse structured metadata from SQL comments in the format:
<description> [key: value, key2: value2, ...]

ample SQL:
SELECT
    email,           -- User email address [pii: true, owner: data-team]
    SUM(amount) as total  /* Total revenue [tags: metric finance] */
FROM ...

pported metadata keys:
- description: Free-text description (before brackets)
- pii: Boolean flag (true/false)
- owner: String identifying data owner
- tags: Space-separated tags
- Any custom key-value pairs

  Found 49 columns with inline metadata:
    raw_orders.order_id:
      description: Unique order identifier
      owner: data-platform
    raw_orders.customer_id:
      description: Reference to customer
      owner: data-platform
    raw_orders.order_date:
      description: Date order was placed
      owner: finance
      tags: {'time'}
    raw_orders.order_timestamp:
      description: Timestamp order was placed
      owner: finance
      tags: {'time'}
    raw_ord

### Section

In [4]:
print("2. MANUAL METADATA ASSIGNMENT")
print("""
r columns without inline metadata, you can assign metadata programmatically:
col.pii = True
col.owner = "data-team"
col.tags.add("metric")
col.description = "Custom description"
col.custom_metadata["sensitivity"] = "high"

te: Columns in raw_orders, raw_customers, raw_products already have metadata
om inline SQL comments. Here we add metadata to order_items (which has none).
""")

# Add metadata to raw_order_items (which doesn't have inline comments)
order_item_metadata = [
    ("raw_order_items", "order_item_id", "data-platform", None, set()),
    ("raw_order_items", "order_id", "data-platform", None, set()),
    ("raw_order_items", "product_id", "data-platform", None, set()),
    ("raw_order_items", "quantity", "operations", "Number of units ordered", {"metric"}),
    ("raw_order_items", "unit_price", "finance", "Price per unit", {"metric", "revenue"}),
    (
        "raw_order_items",
        "line_total",
        "finance",
        "Total line item amount",
        {"metric", "revenue"},
    ),
]

print("  Adding metadata to raw_order_items:")
marked_count = 0
for table, column, owner, description, tags in order_item_metadata:
    for col in pipeline.columns.values():
        if col.table_name == table and col.column_name == column and not col.owner:
            col.owner = owner
            if description:
                col.description = description
            col.tags.update(tags)
            marked_count += 1
            tag_str = f", tags: {tags}" if tags else ""
            desc_str = f", desc: {description[:30]}..." if description else ""
            print(f"    {table}.{column} [owner: {owner}{tag_str}{desc_str}]")

print(f"\n  Added metadata to {marked_count} columns programmatically")
print()

2. MANUAL METADATA ASSIGNMENT

r columns without inline metadata, you can assign metadata programmatically:
col.pii = True
col.owner = "data-team"
col.tags.add("metric")
col.description = "Custom description"
col.custom_metadata["sensitivity"] = "high"

te: Columns in raw_orders, raw_customers, raw_products already have metadata
om inline SQL comments. Here we add metadata to order_items (which has none).

  Adding metadata to raw_order_items:
    raw_order_items.order_item_id [owner: data-platform]
    raw_order_items.order_id [owner: data-platform]
    raw_order_items.product_id [owner: data-platform]
    raw_order_items.quantity [owner: operations, tags: {'metric'}, desc: Number of units ordered...]
    raw_order_items.unit_price [owner: finance, tags: {'revenue', 'metric'}, desc: Price per unit...]
    raw_order_items.line_total [owner: finance, tags: {'revenue', 'metric'}, desc: Total line item amount...]

  Added metadata to 6 columns programmatically



### Section

In [5]:
print("3. METADATA PROPAGATION")
print("""
tadata propagates through lineage automatically:
- PII: If any source column is PII, derived column is PII
- Owner: First owner found in sources wins
- Tags: Union of all source tags

is ensures data governance follows the data through transformations.
""")

# Check PII before propagation
pii_before = len(pipeline.get_pii_columns())
print(f"  PII columns before propagation: {pii_before}")

# Propagate metadata
pipeline.propagate_all_metadata()

# Check PII after propagation
pii_after = len(pipeline.get_pii_columns())
print(f"  PII columns after propagation:  {pii_after}")
print(f"  New PII columns discovered:     {pii_after - pii_before}")
print()

3. METADATA PROPAGATION

tadata propagates through lineage automatically:
- PII: If any source column is PII, derived column is PII
- Owner: First owner found in sources wins
- Tags: Union of all source tags

is ensures data governance follows the data through transformations.

  PII columns before propagation: 6
ðŸ“Š Pass 1: Propagating metadata backward from 172 output columns...
ðŸ“Š Pass 2: Propagating metadata forward for 172 columns...
âœ… Done! Propagated metadata for 172 columns
  PII columns after propagation:  16
  New PII columns discovered:     10



### Section

In [6]:
print("4. DESCRIPTION GENERATION")
print("""
graph can generate descriptions for columns that don't have them.

pported LLM backends:
- Ollama (local, free): ollama pull llama3.2
- OpenAI: requires OPENAI_API_KEY

age:
pipeline.llm = llm_instance
pipeline.generate_all_descriptions()
""")

# Count columns without descriptions in derived tables
cols_without_desc = [
    col
    for col in pipeline.columns.values()
    if not col.description and col.table_name.startswith(("int_", "mart_", "stg_"))
]

# Deduplicate
seen = set()
unique_without_desc = []
for col in cols_without_desc:
    key = (col.table_name, col.column_name)
    if key not in seen:
        seen.add(key)
        unique_without_desc.append(col)

print(f"  Columns in derived tables without descriptions: {len(unique_without_desc)}")
print()

# Try to use Ollama for description generation
llm_available = False
ollama_models = ["llama3:latest", "llama3.2", "qwen3-coder:30b"]  # Try these in order

try:
    from langchain_ollama import ChatOllama

    print("  Attempting to connect to Ollama...")
    for model_name in ollama_models:
        try:
            llm = ChatOllama(
                model=model_name,
                temperature=0.3,
            )
            # Test connection with a simple call
            llm.invoke("test")
            pipeline.llm = llm
            llm_available = True
            print(f"  Connected to Ollama (model: {model_name})")
            break
        except Exception:
            continue

    if not llm_available:
        raise Exception("No working Ollama model found")

except Exception as e:
    print(f"  Ollama not available: {type(e).__name__}")
    print("  To enable LLM descriptions:")
    print("    1. Install Ollama: brew install ollama")
    print("    2. Pull model: ollama pull llama3:latest")
    print("    3. Start server: ollama serve")
print()

if llm_available:
    # Generate descriptions for a few sample columns to demonstrate
    print("  Generating descriptions for sample columns...")
    print("  (Generating for 5 columns to save time)")
    print()

    sample_cols = unique_without_desc[:5]
    for col in sample_cols:
        try:
            # Import the generate function
            from clgraph.column import generate_description

            generate_description(col, pipeline.llm, pipeline)
            print(f"    {col.table_name}.{col.column_name}:")
            print(f"      -> {col.description}")
        except Exception as e:
            print(f"    {col.table_name}.{col.column_name}: Error - {e}")
    print()
    print("  To generate all descriptions, run: pipeline.generate_all_descriptions()")
else:
    print("  Sample columns that would get descriptions:")
    for col in unique_without_desc[:5]:
        print(f"    - {col.table_name}.{col.column_name}")
    if len(unique_without_desc) > 5:
        print(f"    ... and {len(unique_without_desc) - 5} more")
print()

4. DESCRIPTION GENERATION

graph can generate descriptions for columns that don't have them.

pported LLM backends:
- Ollama (local, free): ollama pull llama3.2
- OpenAI: requires OPENAI_API_KEY

age:
pipeline.llm = llm_instance
pipeline.generate_all_descriptions()

  Columns in derived tables without descriptions: 119



  Attempting to connect to Ollama...


  Connected to Ollama (model: qwen3-coder:30b)

  Generating descriptions for sample columns...
  (Generating for 5 columns to save time)



    stg_orders_enriched.customer_id:
      -> Customer identifier referencing raw_orders table, per order record.


    stg_orders_enriched.order_date:
      -> Order date when customers placed their purchases per day.


    stg_orders_enriched.order_timestamp:
      -> Order timestamp from raw data, per order record.


    stg_orders_enriched.status:
      -> Order status indicator per raw_orders source table.


    stg_orders_enriched.channel:
      -> Sales channel per order, derived from raw_orders table.

  To generate all descriptions, run: pipeline.generate_all_descriptions()



### Section

In [7]:
print("5. QUERYING COLUMNS BY METADATA")

# 5a. Get PII columns
print("  5a. PII Columns (get_pii_columns)")
pii_columns = pipeline.get_pii_columns()
print(f"  Found {len(pii_columns)} PII columns:")

# Group by table for cleaner output
pii_by_table = {}
for col in pii_columns:
    if col.table_name not in pii_by_table:
        pii_by_table[col.table_name] = []
    pii_by_table[col.table_name].append(col.column_name)

for table in sorted(pii_by_table.keys()):
    cols = sorted(set(pii_by_table[table]))  # dedupe
    print(f"    {table}:")
    for col_name in cols[:5]:
        print(f"      - {col_name}")
    if len(cols) > 5:
        print(f"      ... and {len(cols) - 5} more")
print()

# 5b. Get columns by owner
print("  5b. Columns by Owner (get_columns_by_owner)")

owners = set()
for col in pipeline.columns.values():
    if col.owner:
        owners.add(col.owner)

for owner in sorted(owners):
    cols = pipeline.get_columns_by_owner(owner)
    unique_cols = {(c.table_name, c.column_name) for c in cols}
    print(f"    {owner}: {len(unique_cols)} unique columns")
print()

# 5c. Get columns by tag
print("  5c. Columns by Tag (get_columns_by_tag)")

all_tags = set()
for col in pipeline.columns.values():
    all_tags.update(col.tags)

for tag in sorted(all_tags):
    cols = pipeline.get_columns_by_tag(tag)
    unique_cols = {(c.table_name, c.column_name) for c in cols}
    print(f"    '{tag}': {len(unique_cols)} unique columns")
print()

5. QUERYING COLUMNS BY METADATA
  5a. PII Columns (get_pii_columns)
  Found 16 PII columns:
    raw_customers:
      - email
      - first_name
      - last_name
      - phone_number
    raw_orders:
      - ip_address
      - shipping_address
    source_customers:
      - email
      - first_name
      - last_name
      - phone_number
    source_orders:
      - ip_address
      - shipping_address
    stg_orders_enriched:
      - customer_email
      - customer_first_name
      - customer_full_name
      - customer_last_name

  5b. Columns by Owner (get_columns_by_owner)
    data-governance: 14 unique columns
    data-platform: 22 unique columns
    finance: 33 unique columns
    inventory: 2 unique columns
    marketing: 18 unique columns
    operations: 5 unique columns
    product: 8 unique columns
    security: 2 unique columns

  5c. Columns by Tag (get_columns_by_tag)
    'attribution': 6 unique columns
    'category': 2 unique columns
    'confidential': 2 unique columns
    'con

### Section

In [8]:
print("6. TRACING PII THROUGH LINEAGE")
print("""
mbine metadata with lineage to understand PII data flow:
- Forward trace: Where does PII data go?
- Backward trace: Where did PII originate?
""")

# Find a PII column in a derived table (not raw_)
pii_derived_cols = [
    col
    for col in pipeline.get_pii_columns()
    if not col.table_name.startswith("raw_") and not col.table_name.startswith("source_")
]

if pii_derived_cols:
    # Pick one example
    example_col = pii_derived_cols[0]
    print(f"  Example: {example_col.table_name}.{example_col.column_name}")
    print()

    # Trace backward to find PII source
    print("  Backward trace (where did PII originate?):")
    try:
        sources = pipeline.trace_column_backward(example_col.table_name, example_col.column_name)
        for source in sources:
            pii_flag = " [PII SOURCE]" if source.pii else ""
            print(f"    <- {source.table_name}.{source.column_name}{pii_flag}")
    except Exception as e:
        print(f"    Error: {e}")
else:
    print("  No PII columns found in derived tables.")

# Forward trace from a source PII column
print()
print("  Forward trace (where does raw_customers.email go?):")
try:
    impacts = pipeline.trace_column_forward("raw_customers", "email")
    if impacts:
        for impact in impacts[:8]:
            print(f"    -> {impact.table_name}.{impact.column_name}")
        if len(impacts) > 8:
            print(f"    ... and {len(impacts) - 8} more")
    else:
        print("    (no downstream impacts found)")
except Exception as e:
    print(f"    Error: {e}")
print()

6. TRACING PII THROUGH LINEAGE

mbine metadata with lineage to understand PII data flow:
- Forward trace: Where does PII data go?
- Backward trace: Where did PII originate?

  Example: stg_orders_enriched.customer_email

  Backward trace (where did PII originate?):
    <- source_customers.email [PII SOURCE]

  Forward trace (where does raw_customers.email go?):
    -> mart_customer_ltv.customer_email



### Section

In [9]:
print("7. EXPORT TO JSON")
print("""
port pipeline with metadata to JSON for external tools:
data = pipeline.to_json(include_metadata=True)
""")

json_data = pipeline.to_json(include_metadata=True)

print("  JSON export contains:")
print(f"    - columns: {len(json_data.get('columns', []))} entries")
print(f"    - edges: {len(json_data.get('edges', []))} entries")
print(f"    - tables: {len(json_data.get('tables', []))} entries")

# Show sample column with metadata
print("\n  Sample column entry:")
sample_col = None
for col_data in json_data.get("columns", []):
    if col_data.get("pii") or col_data.get("owner"):
        sample_col = col_data
        break

if sample_col:
    # Pretty print with limited fields
    display_data = {
        k: v
        for k, v in sample_col.items()
        if k in ["table_name", "column_name", "pii", "owner", "tags", "description"]
    }
    print(f"    {json.dumps(display_data, indent=6)}")

# Export to file example
print("\n  To save to file:")
print('    with open("lineage.json", "w") as f:')
print("        json.dump(pipeline.to_json(), f, indent=2)")
print()

7. EXPORT TO JSON

port pipeline with metadata to JSON for external tools:
data = pipeline.to_json(include_metadata=True)

  JSON export contains:
    - columns: 326 entries
    - edges: 350 entries
    - tables: 22 entries

  Sample column entry:
    {
      "column_name": "order_id",
      "table_name": "raw_orders",
      "description": "Unique order identifier",
      "owner": "data-platform",
      "pii": false,
      "tags": []
}

  To save to file:
    with open("lineage.json", "w") as f:
        json.dump(pipeline.to_json(), f, indent=2)



### Section

In [10]:
print("METADATA SUMMARY")

# Count unique columns with each type of metadata
unique_pii = set()
unique_owners = {}
unique_tags = {}

for col in pipeline.columns.values():
    key = (col.table_name, col.column_name)
    if col.pii:
        unique_pii.add(key)
    if col.owner:
        if col.owner not in unique_owners:
            unique_owners[col.owner] = set()
        unique_owners[col.owner].add(key)
    for tag in col.tags:
        if tag not in unique_tags:
            unique_tags[tag] = set()
        unique_tags[tag].add(key)

print(f"  Total columns:     {len(pipeline.columns)}")
print(f"  PII columns:       {len(unique_pii)} unique")
print(f"  Owned columns:     {sum(len(v) for v in unique_owners.values())} unique")
print(f"  Tagged columns:    {sum(len(v) for v in unique_tags.values())} unique")
print()
print("  Owners:")
for owner in sorted(unique_owners.keys()):
    print(f"    - {owner}: {len(unique_owners[owner])} columns")
print()
print("  Tags:")
for tag in sorted(unique_tags.keys()):
    print(f"    - {tag}: {len(unique_tags[tag])} columns")
print()
print("Metadata management complete!")

METADATA SUMMARY
  Total columns:     326
  PII columns:       16 unique
  Owned columns:     104 unique
  Tagged columns:    97 unique

  Owners:
    - data-governance: 14 columns
    - data-platform: 22 columns
    - finance: 33 columns
    - inventory: 2 columns
    - marketing: 18 columns
    - operations: 5 columns
    - product: 8 columns
    - security: 2 columns

  Tags:
    - attribution: 6 columns
    - category: 2 columns
    - confidential: 2 columns
    - contact: 7 columns
    - cost: 5 columns
    - geo: 6 columns
    - loyalty: 3 columns
    - metric: 26 columns
    - payment: 3 columns
    - product: 8 columns
    - revenue: 15 columns
    - sensitive: 2 columns
    - status: 5 columns
    - time: 7 columns

Metadata management complete!
