# Custom Schema Configuration

Configure the data loader for different data sources (Bloomreach, GA4, Segment, or your own custom schema).

In [None]:
import sys

# Prepend the parent directory to the module search path. The cells below
# import from `src.data`; presumably the `src` package sits one level above
# this notebook -- adjust the path if your layout differs.
parent_dir = '..'
sys.path.insert(0, parent_dir)

## Preset Configurations

The engine includes presets for common data platforms:

In [None]:
from src.data import (
    create_bloomreach_config,
    create_ga4_config,
    create_segment_config,
    create_generic_config,
)

# Build the Bloomreach preset and print a few of its attributes so the
# field names it is configured with are visible before loading any data.
br_config = create_bloomreach_config()
print("Bloomreach Config:")  # plain string: nothing to interpolate here
print(f"  Customer ID field: {br_config.customer_id_field}")
print(f"  Timestamp field: {br_config.timestamp_field}")
print(f"  Properties field: {br_config.properties_field}")
# The next two attributes are printed as-is; the custom-config example later
# in this notebook passes lists of alternative field names for them.
print(f"  Category fields: {br_config.category_fields}")
print(f"  Revenue fields: {br_config.revenue_fields}")

In [None]:
# Build the GA4 preset and print a few of its attributes for comparison
# with the Bloomreach preset.
ga4_config = create_ga4_config()
print("GA4 Config:")  # plain string: nothing to interpolate here
print(f"  Customer ID field: {ga4_config.customer_id_field}")
print(f"  Timestamp field: {ga4_config.timestamp_field}")
print(f"  Properties field: {ga4_config.properties_field}")
# Device values this preset is configured to treat as "mobile".
print(f"  Mobile values: {ga4_config.mobile_device_values}")

## Create Custom Configuration

In [None]:
from src.data import ClientSchemaConfig

# Alternative source field names; each is tried until a value is found.
category_candidates = ["category", "product_type", "item_category", "dept"]
revenue_candidates = ["total", "amount", "transaction_value", "order_total"]

# Raw device labels used for device detection.
mobile_labels = ["mobile", "iOS", "Android", "tablet", "phone"]
desktop_labels = ["desktop", "web", "browser"]

# Describe your own data layout to the loader.
custom_config = ClientSchemaConfig(
    client_name="my_company",
    description="Custom e-commerce data warehouse",
    # Core fields
    customer_id_field="user_id",
    timestamp_field="event_time",
    properties_field="event_data",  # pass None instead for a flat structure
    # ID merge configuration
    id_history_table="user_id_mapping",
    past_id_field="old_user_id",
    canonical_id_field="current_user_id",
    # Field alternatives
    category_fields=category_candidates,
    revenue_fields=revenue_candidates,
    # Device detection
    mobile_device_values=mobile_labels,
    desktop_device_values=desktop_labels,
)

print(f"Created config: {custom_config.client_name}")

## Use Custom Config to Load Data

In [None]:
from src.data import load_events_only, LocalDataLoader

# Choose the schema to load with. A preset is used here; substitute the
# `custom_config` built earlier to load your own data layout instead.
preset_schema = create_bloomreach_config()

# The call returns a pair, unpacked here as the events and the ID history.
events, id_history = load_events_only("../data/samples", schema_config=preset_schema)

print(f"Loaded {len(events):,} events")

In [None]:
# Alternative: construct a LocalDataLoader directly when you need extra
# options, such as leaving certain tables out of the load.
skipped_tables = ["customers_external_ids"]

loader = LocalDataLoader(
    "../data/samples",
    schema_config=create_bloomreach_config(),
    exclude_tables=skipped_tables,
)

# Run the load, then report which tables were loaded and the event count.
result = loader.load()
print(f"Tables: {result.tables_loaded}")
print(f"Events: {len(result.events):,}")

## Adding Custom Event Types

In [None]:
from src.data import EventTypeConfig, EventTypeMapping, FieldMapping, SemanticFieldType

# Pair each raw wishlist property path with the semantic field type it
# should be treated as.
wishlist_field_mappings = [
    FieldMapping("properties.item_id", SemanticFieldType.PRODUCT_ID),
    FieldMapping("properties.item_name", SemanticFieldType.PRODUCT_NAME),
]

# Custom event type: rows from the "wishlist_add" table, flagged as
# engagement rather than transactional.
wishlist_config = EventTypeConfig(
    source_table="wishlist_add",
    canonical_type=EventTypeMapping.WISHLIST_ADD,
    is_transactional=False,
    is_engagement=True,
    field_mappings=wishlist_field_mappings,
)

# Attach the custom event type to a client schema via `event_types`.
extended_config = ClientSchemaConfig(
    client_name="extended",
    customer_id_field="internal_customer_id",
    event_types=[wishlist_config],
)

print(f"Config with custom event types: {len(extended_config.event_types)} event types")