# Working with Local Data

This notebook shows how to load event data from local Parquet files, inspect sample events, run the full pipeline on the loaded data, and summarize the resulting segments.

In [None]:
import sys

# Put the parent directory on the import path so the `src.*` imports in the
# cells below resolve (assumes the kernel's working directory is the
# notebook's own folder — TODO confirm for other launch setups).
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate '..' entries onto sys.path.
if '..' not in sys.path:
    sys.path.insert(0, '..')

## Load Data with Statistics

In [None]:
from src.data import load_local_data, LocalDataLoader

# Read the sample dataset. The returned object exposes the loaded events and
# ID history together with summary statistics about the load.
result = load_local_data("../data/samples")

# Headline numbers: one print call, one output line per entry.
print(
    f"Tables loaded: {result.tables_loaded}",
    f"Total events: {len(result.events):,}",
    f"ID history records: {len(result.id_history):,}",
    f"Unique customers: {result.unique_customers:,}",
    f"Load time: {result.load_duration_ms:.1f}ms",
    sep="\n",
)

# Breakdown of event counts per event type.
print("\nEvents by type:")
for type_name, n_events in result.events_by_type.items():
    print(f"  {type_name}: {n_events:,}")

## Inspect Sample Events

In [None]:
# Keep only the events whose type value is "purchase".
purchases = list(filter(lambda ev: ev.event_type.value == "purchase", result.events))
print(f"Purchase events: {len(purchases):,}")

# Show the first purchase as an example; skipped when no purchases were loaded.
if len(purchases) > 0:
    p = purchases[0]
    print(
        "\nSample purchase:",
        f"  Customer: {p.internal_customer_id}",
        f"  Timestamp: {p.timestamp}",
        f"  Order ID: {p.properties.order_id}",
        f"  Order Total: {p.properties.order_total}",
        sep="\n",
    )

In [None]:
# Keep only the events whose type value is "view_item".
views = list(filter(lambda ev: ev.event_type.value == "view_item", result.events))
print(f"View events: {len(views):,}")

# Show the first view as an example; skipped when no view events were loaded.
if len(views) > 0:
    v = views[0]
    print(
        "\nSample view:",
        f"  Product ID: {v.properties.product_id}",
        f"  Product Name: {v.properties.product_name}",
        f"  Category: {v.properties.product_category}",
        sep="\n",
    )

## Run Full Pipeline

In [None]:
from src.pipeline import run_pipeline, PipelineConfig, format_pipeline_summary

# Tunable settings, named so they are easy to find and change.
MIN_EVENTS_PER_CUSTOMER = 3
N_CLUSTERS = 6

# Keyword arguments handed to PipelineConfig; all optional stages are enabled.
pipeline_options = dict(
    min_events_per_customer=MIN_EVENTS_PER_CUSTOMER,
    n_clusters=N_CLUSTERS,
    run_sensitivity=True,
    run_integrated_analysis=True,
    verbose=True,
)
config = PipelineConfig(**pipeline_options)

# Run the pipeline on the events and ID history loaded earlier in the
# notebook, then print its formatted summary.
pipeline_result = run_pipeline(config, events=result.events, id_history=result.id_history)
pipeline_summary = format_pipeline_summary(pipeline_result)
print(pipeline_summary)

## Segment Analysis

In [None]:
import pandas as pd

# Build a one-row-per-segment summary table, ordered by total CLV (highest
# first). Robustness and actionability results are looked up by segment ID
# and may be missing for a segment, in which case those cells stay None.
SEGMENT_COLUMNS = ["Segment", "Size", "Total CLV", "Avg AOV", "Robustness", "Tier", "Actionable"]

segment_data = []
for seg in sorted(pipeline_result.segments, key=lambda s: float(s.total_clv), reverse=True):
    rob = pipeline_result.robustness_scores.get(seg.segment_id)
    act = pipeline_result.actionability_evaluations.get(seg.segment_id)

    segment_data.append({
        "Segment": seg.name,
        "Size": seg.size,
        "Total CLV": float(seg.total_clv),
        "Avg AOV": float(seg.avg_order_value),
        # Test for presence with `is not None` rather than truthiness, so a
        # result object that happens to evaluate as falsy is still reported.
        "Robustness": rob.overall_robustness if rob is not None else None,
        "Tier": rob.robustness_tier.value if rob is not None else None,
        "Actionable": act.is_actionable if act is not None else None,
    })

# Passing the column list keeps the schema stable when there are no segments:
# without it, an empty list yields a frame with no columns and the formatting
# step below raises KeyError.
df = pd.DataFrame(segment_data, columns=SEGMENT_COLUMNS)

# Render the monetary columns as dollar strings for display. After this step
# these two columns hold text, not numbers.
for money_col in ("Total CLV", "Avg AOV"):
    df[money_col] = df[money_col].apply(lambda x: f"${x:,.2f}")
df