# Custom Configuration Guide

This notebook shows how to customize the segmentation pipeline with different configurations.

In [None]:
import sys
sys.path.insert(0, '..')

from datetime import datetime, timezone
from src.pipeline import (
    PipelineConfig,
    run_pipeline,
    format_pipeline_summary,
    get_pipeline_metrics,
)
from src.segmentation.segment_validator import ValidationCriteria

## 1. PipelineConfig Options

The `PipelineConfig` dataclass provides full control over pipeline behavior:

In [None]:
# Print the defaults for a selection of PipelineConfig fields
config = PipelineConfig()

displayed_fields = (
    "n_customers",
    "data_seed",
    "merge_probability",
    "n_clusters",
    "auto_select_k",
    "k_range",
    "run_sensitivity",
    "generate_report",
    "use_llm",
)

print("Default PipelineConfig values:")
for field_name in displayed_fields:
    # Each line is "  <field>: <value>", read straight off the default instance
    print(f"  {field_name}: {getattr(config, field_name)}")

## 2. Controlling Data Generation

In [None]:
# Generate a larger customer population over an explicit, timezone-aware date range
config = PipelineConfig(
    n_customers=1000,
    data_seed=123,  # For reproducibility
    merge_probability=0.2,  # 20% of customers have merged IDs
    date_range=(
        datetime(2024, 1, 1, tzinfo=timezone.utc),
        datetime(2024, 12, 31, tzinfo=timezone.utc),
    ),
    # Skip expensive operations for faster testing
    run_sensitivity=False,
    generate_report=False,
)

result = run_pipeline(config)
# Take the customer count from the config instead of repeating the literal,
# so the message stays correct when n_customers above is changed.
print(f"Generated {len(result.profiles)} profiles from {config.n_customers} customers")

## 3. Clustering Configuration

In [None]:
# Pin the number of clusters explicitly and turn automatic k selection off
config_fixed = PipelineConfig(
    auto_select_k=False,
    n_clusters=4,
    cluster_seed=42,
    n_customers=500,
    run_sensitivity=False,
)
result_fixed = run_pipeline(config_fixed)

n_segments_fixed = len(result_fixed.segments)
print(f"Fixed k: {n_segments_fixed} segments")

In [None]:
# Automatic k selection with range
config_auto = PipelineConfig(
    n_customers=500,
    auto_select_k=True,
    k_range=(3, 8),  # Try k from 3 to 8
    cluster_seed=42,
    run_sensitivity=False,
)

result_auto = run_pipeline(config_auto)

# Read the bounds back from the config rather than repeating "3-8" in the
# message, so the printed range always matches the k_range actually used.
# NOTE(review): assumes PipelineConfig keeps k_range as a (low, high) pair — confirm.
k_low, k_high = config_auto.k_range
print(f"Auto k: {len(result_auto.segments)} segments (selected from range {k_low}-{k_high})")

## 4. Sensitivity Analysis Options

In [None]:
# Full sensitivity analysis, including the optional sampling-stability check
config_robust = PipelineConfig(
    n_customers=300,
    n_clusters=4,
    auto_select_k=False,
    run_sensitivity=True,
    include_sampling_stability=True,
    generate_report=False,
)

result_robust = run_pipeline(config_robust)

# Look the sensitivity result up once instead of on every line below
sensitivity = result_robust.sensitivity_result

print("Robustness Analysis:")
print(f"  Overall robustness: {sensitivity.overall_robustness:.3f}")
print(f"  Feature stability: {sensitivity.feature_sensitivity.feature_stability:.3f}")
print(f"  Time consistency: {sensitivity.time_window_sensitivity.time_consistency:.3f}")
# Compare against None instead of relying on truthiness: a genuine score of
# 0.0 is falsy and would otherwise be silently left out of the report.
# NOTE(review): assumes sampling_stability is None when it was not computed — confirm.
if sensitivity.sampling_stability is not None:
    print(f"  Sampling stability: {sensitivity.sampling_stability:.3f}")

## 5. Custom Validation Criteria

In [None]:
from decimal import Decimal

# Strict acceptance thresholds; monetary limits are given as Decimal values
strict_criteria = ValidationCriteria(
    # Size limits
    min_segment_size=20,
    max_segment_size_pct=0.4,  # Max 40% of customers
    # Customer lifetime value limits
    min_total_clv=Decimal("5000"),
    min_avg_clv=Decimal("100"),
    # Robustness limits
    min_feature_stability=0.5,
    min_overall_robustness=0.6,
    # Return on investment
    min_expected_roi=1.0,  # Require 100% ROI
)

# Sensitivity analysis is switched on for this run
config_strict = PipelineConfig(
    validation_criteria=strict_criteria,
    n_customers=500,
    n_clusters=5,
    auto_select_k=False,
    run_sensitivity=True,
)
result_strict = run_pipeline(config_strict)

n_valid_segments = len(result_strict.valid_segments)
n_total_segments = len(result_strict.segments)
print(f"Strict validation: {n_valid_segments}/{n_total_segments} segments valid")

In [None]:
# List why each rejected segment failed validation
for seg in result_strict.segments:
    outcome = result_strict.validation_results.get(seg.segment_id)
    # Skip segments that have no validation entry or that passed
    if not outcome or outcome.is_valid:
        continue
    print(f"\n{seg.name} rejected:")
    for why in outcome.rejection_reasons:
        print(f"  - {why}")

## 6. Comparing Configurations

In [None]:
# Compare clustering quality and segment sizes across several fixed k values
import pandas as pd

results = []
for k in [3, 4, 5, 6, 7]:
    config = PipelineConfig(
        n_customers=500,
        n_clusters=k,
        auto_select_k=False,
        # Pin the clustering seed (same value as the Section 3 examples) so the
        # rows of the comparison differ only in k, not in random initialisation.
        cluster_seed=42,
        run_sensitivity=False,
        generate_report=False,
    )
    result = run_pipeline(config)

    clustering = result.clustering_result
    segment_sizes = [s.size for s in result.segments]

    results.append({
        'k': k,
        # clustering_result may be missing; record None instead of failing
        'silhouette': clustering.silhouette if clustering else None,
        'inertia': clustering.inertia if clustering else None,
        # default=None keeps a run that produced no segments from raising
        # ValueError inside min()/max() and aborting the whole comparison.
        'min_size': min(segment_sizes, default=None),
        'max_size': max(segment_sizes, default=None),
    })

# One row per k; built once from the list of records
df = pd.DataFrame(results)
print(df.to_string(index=False))

## 7. Using Pre-Generated Data

In [None]:
from src.data.synthetic_generator import generate_small_dataset
from src.pipeline import run_pipeline_on_dataset

# Build the dataset a single time with a fixed seed
dataset = generate_small_dataset(seed=42)
dataset_summary = f"Generated dataset: {dataset.n_customers} customers, {dataset.n_events} events"
print(dataset_summary)

# Hand the already-built dataset to the pipeline
result = run_pipeline_on_dataset(dataset, n_clusters=5)
segment_count = len(result.segments)
print(f"Pipeline result: {segment_count} segments")

## 8. Performance Metrics

In [None]:
# Fetch the metrics for the most recent result and print one per line
metrics = get_pipeline_metrics(result)

print("Pipeline Metrics:")
print("-" * 50)
for metric_name, metric_value in metrics.items():
    # Floats are shown with two decimals; anything else uses its default formatting
    rendered = f"{metric_value:.2f}" if isinstance(metric_value, float) else f"{metric_value}"
    print(f"  {metric_name}: {rendered}")

In [None]:
# Per-stage duration report; stages that did not succeed are flagged
print("\nStage Timings:")
print("-" * 50)
for stage_result in result.stage_results:
    failure_marker = " (FAILED)" if not stage_result.success else ""
    print(f"  {stage_result.stage_name}: {stage_result.duration_ms:.1f}ms{failure_marker}")