# PyRevealed: Preference Structure Analysis

This notebook tests advanced preference structure functions:
- Feature independence (separability)
- Cross-price effects (substitutes vs complements)
- PreferenceEncoder for ML integration

In [None]:
import numpy as np

from pyrevealed import (
    BehaviorLog,
    PreferenceEncoder,
    test_feature_independence,
    test_cross_price_effect,
    validate_consistency,
    compute_integrity_score,
)

# Only reached if every name in the pyrevealed import above resolved.
print("Imports successful!")

## Create Test Data

We'll create data representing purchases of 4 goods across 2 product categories:
- Goods 0-1: Food (Produce, Dairy)
- Goods 2-3: Entertainment (Streaming, Games)

In [None]:
# 8 observations of 4 goods, written as (price row, quantity row) pairs so that
# each observation's prices sit next to the quantities chosen at those prices.
# Designed to show independence between Food (0,1) and Entertainment (2,3):
# in the four single-good sales, only the discounted good's quantity changes.
observations = [
    ([2.0, 3.0, 10.0, 15.0], [5.0, 3.0, 2.0, 1.0]),  # Baseline
    ([1.0, 3.0, 10.0, 15.0], [8.0, 3.0, 2.0, 1.0]),  # Produce sale -> more produce
    ([2.0, 1.5, 10.0, 15.0], [5.0, 6.0, 2.0, 1.0]),  # Dairy sale -> more dairy
    ([2.0, 3.0, 5.0, 15.0], [5.0, 3.0, 4.0, 1.0]),   # Streaming sale -> more streaming
    ([2.0, 3.0, 10.0, 8.0], [5.0, 3.0, 2.0, 3.0]),   # Games sale -> more games
    ([1.5, 2.5, 8.0, 12.0], [6.0, 4.0, 2.5, 1.5]),   # Mixed
    ([2.5, 3.5, 12.0, 18.0], [4.0, 2.5, 1.5, 0.8]),  # All higher
    ([1.8, 2.8, 9.0, 14.0], [5.5, 3.5, 2.2, 1.2]),   # Slight variation
]

# Split the pairs back into the two (8, 4) arrays that BehaviorLog is given.
prices = np.array([price_row for price_row, _ in observations])
quantities = np.array([quantity_row for _, quantity_row in observations])

log = BehaviorLog(cost_vectors=prices, action_vectors=quantities, user_id="test_shopper")

print(f"Created log: {log.num_records} observations, {log.num_features} features")

## Basic Consistency Check

In [None]:
# Run both whole-log diagnostics first, then report them together.
result = validate_consistency(log)
integrity = compute_integrity_score(log)

print(
    f"GARP consistent: {result.is_consistent}",
    f"Integrity score: {integrity.efficiency_index:.3f}",
    sep="\n",
)

## Feature Independence (Separability) Test

Test whether Food (goods 0,1) and Entertainment (goods 2,3) can be treated independently.

In [None]:
# Column indices of the two product categories being compared.
FOOD = [0, 1]           # Produce, Dairy
ENTERTAINMENT = [2, 3]  # Streaming, Games

# Can Food and Entertainment be treated as independent groups?
sep_result = test_feature_independence(
    log,
    group_a=FOOD,
    group_b=ENTERTAINMENT,
)

print(
    f"Is separable: {sep_result.is_separable}",
    f"Cross-effect strength: {sep_result.cross_effect_strength:.3f}",
    f"Within-group A consistency: {sep_result.within_group_a_consistency:.3f}",
    f"Within-group B consistency: {sep_result.within_group_b_consistency:.3f}",
    f"Recommendation: {sep_result.recommendation}",
    sep="\n",
)

In [None]:
# Contrast case: each group takes one food good and one entertainment good.
# Because the grouping cuts across categories, it is expected to look less
# separable than the Food/Entertainment split.
GROUP_MIXED_1 = [0, 2]  # Produce, Streaming
GROUP_MIXED_2 = [1, 3]  # Dairy, Games

sep_mixed = test_feature_independence(
    log,
    group_a=GROUP_MIXED_1,
    group_b=GROUP_MIXED_2,
)

print(
    f"Mixed grouping separable: {sep_mixed.is_separable}",
    f"Cross-effect strength: {sep_mixed.cross_effect_strength:.3f}",
    sep="\n",
)

## Cross-Price Effects (Substitutes vs Complements)

Test relationships between individual goods.

In [None]:
# Within-category pair: Produce (good 0) against Dairy (good 1).
# Working hypothesis: possibly substitutes, since both are food.
cross_food = test_cross_price_effect(log, good_g=0, good_h=1)

print(
    "Produce vs Dairy:",
    f"  Relationship: {cross_food.relationship}",
    f"  Are substitutes: {cross_food.are_substitutes}",
    f"  Are complements: {cross_food.are_complements}",
    f"  Confidence: {cross_food.confidence_score:.3f}",
    sep="\n",
)

In [None]:
# Cross-category pair: Produce (good 0) against Streaming (good 2).
# Working hypothesis: the two are likely independent.
cross_mixed = test_cross_price_effect(log, good_g=0, good_h=2)

print(
    "Produce vs Streaming:",
    f"  Relationship: {cross_mixed.relationship}",
    f"  Confidence: {cross_mixed.confidence_score:.3f}",
    sep="\n",
)

In [None]:
# Within-category pair on the entertainment side: Streaming (good 2) against Games (good 3).
cross_ent = test_cross_price_effect(log, good_g=2, good_h=3)

print(
    "Streaming vs Games:",
    f"  Relationship: {cross_ent.relationship}",
    f"  Confidence: {cross_ent.confidence_score:.3f}",
    sep="\n",
)

## PreferenceEncoder: ML Feature Extraction

In [None]:
# Build an encoder with default settings and fit it on the shopper log.
# fit() is called for its effect on `encoder`; its return value is not used,
# and the later cells all read from this same `encoder` object.
# NOTE: per the Summary section, this call can emit a scipy OptimizeWarning
# about an unrecognized `tol` option.
encoder = PreferenceEncoder()
encoder.fit(log)

print("Encoder fitted successfully!")

In [None]:
# Latent values recovered by the fitted encoder (per-observation utility).
latent_values = encoder.extract_latent_values()

print(
    f"Latent values shape: {latent_values.shape}",
    f"Latent values: {latent_values}",
    sep="\n",
)

In [None]:
# Marginal weights recovered by the fitted encoder (price sensitivity).
marginal_weights = encoder.extract_marginal_weights()

print(
    f"Marginal weights shape: {marginal_weights.shape}",
    f"Marginal weights: {marginal_weights}",
    sep="\n",
)

In [None]:
# Bundle to score: 5 produce, 3 dairy, 2 streaming, 1 game
# (the same quantities as the baseline observation in the test data).
test_bundle = np.array([5.0, 3.0, 2.0, 1.0])

# Ask the fitted encoder for a callable value function and apply it to the bundle.
value_fn = encoder.get_value_function()
value = value_fn(test_bundle)

print(f"Value of bundle {test_bundle}: {value:.3f}")

In [None]:
# Scenario to predict: the baseline price vector with a budget of 50.
new_prices = np.array([2.0, 3.0, 10.0, 15.0])
budget = 50.0

# NOTE: predict_choice takes `cost_vector` / `resource_limit`, not prices / budget.
predicted = encoder.predict_choice(
    cost_vector=new_prices,
    resource_limit=budget,
)

print(
    f"Predicted choice at prices {new_prices}:",
    f"  Quantities: {predicted}",
    sep="\n",
)
# The cost line only makes sense when a prediction was actually returned.
if predicted is not None:
    print(f"  Total cost: {np.dot(new_prices, predicted):.2f} (budget: {budget})")

In [None]:
# Dump the encoder's fit diagnostics, one "name: value" line per entry.
details = encoder.get_fit_details()
print("Fit details:")
# Dedicated loop names keep this cell from overwriting the notebook-level
# `value` (the bundle value computed in the value-function cell).
for detail_name, detail_value in details.items():
    print(f"  {detail_name}: {detail_value}")

## Edge Cases

In [None]:
# Degenerate grouping: each group holds exactly one good (good 0 vs good 1).
sep_single = test_feature_independence(log, group_a=[0], group_b=[1])

print(
    f"Single-good groups separable: {sep_single.is_separable}",
    f"Cross-effect: {sep_single.cross_effect_strength:.3f}",
    sep="\n",
)

In [None]:
# Error path: good index 10 does not exist (the log has only 4 goods), so the
# call is expected to raise.
# The return value is deliberately not bound, so an unexpected success cannot
# overwrite the `result` produced by the consistency-check cell.
try:
    test_cross_price_effect(log, good_g=0, good_h=10)  # Index 10 doesn't exist
except Exception as exc:  # exception type not pinned here; whatever is raised gets reported
    print(f"Caught (expected): {type(exc).__name__}")
    print(f"Message: {exc}")
else:
    # Reached only when the call returned normally, i.e. validation is missing.
    print("ERROR: Should have raised exception!")

In [None]:
# Error path: the same good is passed as both sides of the comparison, so the
# call is expected to raise.
# The return value is deliberately not bound, so an unexpected success cannot
# overwrite the `result` produced by the consistency-check cell.
try:
    test_cross_price_effect(log, good_g=0, good_h=0)
except Exception as exc:  # exception type not pinned here; whatever is raised gets reported
    print(f"Caught (expected): {type(exc).__name__}")
    print(f"Message: {exc}")
else:
    # Reached only when the call returned normally, i.e. validation is missing.
    print("ERROR: Should have raised exception!")

## Summary

### Issues Found

1. **scipy OptimizeWarning**: When calling `PreferenceEncoder.fit()`, scipy emits a warning about
   an unrecognized `tol` option: the option is passed to the HiGHS solver, which does not recognize it.
   
2. **API Naming Inconsistency**: `PreferenceEncoder.predict_choice()` uses `cost_vector` and 
   `resource_limit` parameters, which don't match the tech-friendly naming pattern 
   (`prices`/`budget` would be more intuitive).

### What Worked

- All imports successful with tech-friendly API
- BehaviorLog creation with cost_vectors/action_vectors
- validate_consistency and compute_integrity_score work
- compute_confusion_metric and BehavioralAuditor.full_audit() are also reported as working, but neither is imported or exercised in this notebook
- PreferenceEncoder.fit(), extract_latent_values(), extract_marginal_weights() work
- test_feature_independence() and test_cross_price_effect() work with correct param names
- Error handling provides clear, helpful messages