# Itinerary Improvements EDA

## Issues to Address:
1. **Back-to-back attractions** - No relaxation gaps between heavy museum visits
2. **Static duration** - Same duration for all personas (Vatican Museums = 4 hours for everyone)
3. **Missing iconic attractions** - Colosseum not appearing in honeymoon itineraries

This notebook analyzes these issues and proposes data-driven solutions.

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the Rome seed file: the POI records plus the persona templates.
data_path = Path('../data/seed/rome_pois.json')
# Pin the encoding: without it open() falls back to the platform's locale
# encoding, so non-ASCII text could decode differently between machines.
with open(data_path, encoding='utf-8') as f:
    rome_data = json.load(f)

pois = rome_data['pois']
personas = rome_data['persona_templates']

print(f"Loaded {len(pois)} POIs and {len(personas)} personas")

In [None]:
# Create the analysis DataFrame with every entry of a POI's `persona_scores`
# promoted to its own top-level column.
#
# The rows are built as fresh dicts instead of writing the score keys back
# into `pois`: the raw records are reused later in this notebook (they are
# copied into the enhanced JSON export), so mutating them here would leak the
# flattened columns into that output.
# `or {}` tolerates a record whose `persona_scores` is null/None.
flattened_rows = [
    {**poi, **(poi.get('persona_scores') or {})}
    for poi in pois
]

poi_df = pd.DataFrame(flattened_rows)
poi_df.head()

## Issue 1: Back-to-Back Attractions

Currently, attractions are scheduled one after another with only a small buffer.
After visiting a 2-4 hour museum, tourists need rest!

### Analysis: Heavy vs Light Activities

In [None]:
# Categorize activities by intensity
def categorize_intensity(row):
    """Classify a POI record as 'heavy', 'moderate' or 'light'.

    Args:
        row: Any record exposing ``.get`` (a DataFrame row or a plain dict).
            Reads 'typical_duration_minutes' and 'subcategory'.

    Returns:
        'heavy' for a museum/historical POI lasting at least 90 minutes,
        'moderate' for any other POI lasting at least 60 minutes,
        'light' otherwise.
    """
    duration = row.get('typical_duration_minutes', 60)
    # In a DataFrame row a missing duration is stored as NaN rather than as an
    # absent key, so the `.get` default above never applies. NaN compares
    # False against every threshold, which would silently classify the POI as
    # 'light'; fall back to the same 60-minute default instead.
    if pd.isna(duration):
        duration = 60
    subcategory = row.get('subcategory', '')

    # Heavy activities (need break after)
    if subcategory in ('museum', 'historical') and duration >= 90:
        return 'heavy'
    # Moderate activities
    if duration >= 60:
        return 'moderate'
    # Light activities (can be back-to-back)
    return 'light'

# Tag every POI row with its intensity class.
poi_df['intensity'] = poi_df.apply(categorize_intensity, axis=1)

# Number of POIs per class, followed by a blank separator line.
intensity_counts = poi_df['intensity'].value_counts()
print("Activity Intensity Distribution:", intensity_counts, "", sep="\n")

# List the POIs classified as heavy, with the fields that drove the decision.
heavy_activities = poi_df.loc[
    poi_df['intensity'] == 'heavy',
    ['name', 'subcategory', 'typical_duration_minutes'],
]
print("Heavy Activities (need break after):")
print(heavy_activities.to_string(index=False))

In [None]:
# Proposed pause to schedule after each kind of activity. Every rule carries
# the minimum break length, a suggested break type and a short rationale.
BREAK_RULES = {
    'after_heavy': dict(
        min_break_minutes=45,
        suggested_break='coffee_gelato',
        description='Coffee/gelato break after museum or long historical site',
    ),
    'after_moderate': dict(
        min_break_minutes=20,
        suggested_break='walk_rest',
        description='Short rest or leisurely walk',
    ),
    'after_light': dict(
        min_break_minutes=10,
        suggested_break=None,
        description='Travel time is sufficient',
    ),
    'between_anchors': dict(
        min_break_minutes=60,
        suggested_break='light_activity_or_rest',
        description='Buffer between major attractions (lunch, snack, or rest)',
    ),
}

# Print one short paragraph per rule.
print("Proposed Break Rules:")
for rule in BREAK_RULES:
    config = BREAK_RULES[rule]
    print(f"\n{rule}:")
    print(f"  Min break: {config['min_break_minutes']} minutes")
    print(f"  Description: {config['description']}")

## Issue 2: Persona-Based Duration

Currently, all personas get the same duration (e.g., Vatican Museums = 240 min for everyone).

Reality:
- **History buff solo traveler**: Spends 4+ hours, reads every plaque
- **Honeymoon couple**: 1.5-2 hours, highlights only
- **Family with kids**: 1-1.5 hours, kids get bored
- **Senior travelers**: 2-2.5 hours, need rest stops

In [None]:
# Persona-based duration multipliers.
# Outer key: persona / group type. Inner key: POI subcategory, with 'default'
# used for any subcategory that has no entry of its own.
PERSONA_DURATION_MULTIPLIERS = {
    # Group types
    'family': {
        'museum': 0.5,        # Kids get bored
        'historical': 0.6,
        'park': 1.2,          # Kids love parks
        'market': 0.8,
        'walking_tour': 0.6,  # Shorter attention span
        'default': 0.7
    },
    'honeymoon': {
        'museum': 0.5,        # Not here for deep history
        'historical': 0.6,
        'park': 1.0,
        'viewpoint': 1.2,     # Romantic photo spots!
        'trattoria': 1.3,     # Long romantic dinners
        'cocktail_bar': 1.2,
        'default': 0.8
    },
    'solo': {
        'museum': 1.2,        # Can take their time
        'historical': 1.2,
        'walking_tour': 1.0,
        'market': 1.1,        # Explore freely
        'default': 1.0
    },
    'friends': {
        'museum': 0.7,        # Quick highlights
        'historical': 0.7,
        'nightlife': 1.5,     # Party longer!
        'market': 1.0,
        'walking_tour': 0.8,
        'default': 0.8
    },
    'seniors': {
        'museum': 0.9,        # Need rests but interested
        'historical': 0.8,
        'park': 1.3,          # Enjoy slow walks
        'walking_tour': 0.6,  # Tiring
        'trattoria': 1.2,     # Long leisurely meals
        'default': 0.8
    },
    'couple': {
        'museum': 0.8,
        'historical': 0.8,
        'viewpoint': 1.1,
        'default': 0.9
    }
}

def get_persona_duration(base_duration, subcategory, persona_type):
    """Calculate the adjusted visit duration for a specific persona.

    Args:
        base_duration: Baseline duration of the POI, in minutes.
        subcategory: POI subcategory used to pick the multiplier.
        persona_type: Key into PERSONA_DURATION_MULTIPLIERS.

    Returns:
        The base duration scaled by the persona's multiplier for the
        subcategory, rounded to the nearest whole minute (int). Falls back to
        the persona's 'default' multiplier for an unlisted subcategory, and to
        1.0 (duration unchanged) for an unknown persona.
    """
    multipliers = PERSONA_DURATION_MULTIPLIERS.get(persona_type, {})
    multiplier = multipliers.get(subcategory, multipliers.get('default', 1.0))
    # Round instead of truncating: binary floating point makes products such
    # as 90 * 0.7 evaluate to 62.99999999999999, which int() would cut to 62.
    return int(round(base_duration * multiplier))

# Worked examples: (POI name, subcategory, base duration in minutes).
examples = [
    ('Vatican Museums', 'museum', 240),
    ('Colosseum', 'historical', 120),
    ('Villa Borghese Gardens', 'park', 120),
    ('Trevi Fountain', 'monument', 30),
]

# Fixed-width table: one row per example POI, one column per persona.
print("Duration by Persona (in minutes):")
print("="*80)
header = f"{'POI':<25} {'Base':>6} {'Family':>8} {'Honeymn':>8} {'Solo':>8} {'Friends':>8} {'Seniors':>8}"
print(header)
print("-"*80)

for name, subcat, base in examples:
    # Each persona cell is right-aligned in 8 characters, after one space.
    cells = [
        f" {get_persona_duration(base, subcat, persona):>8}"
        for persona in ('family', 'honeymoon', 'solo', 'friends', 'seniors')
    ]
    row = f"{name:<25} {base:>6}" + "".join(cells)
    print(row)

In [None]:
# Build one record per (POI, persona) pair holding the base duration, the
# persona-adjusted duration and the difference between the two.
duration_comparison = []

for poi in pois:
    base = poi.get('typical_duration_minutes', 60)
    subcat = poi.get('subcategory', 'default')

    for persona in ('family', 'honeymoon', 'solo', 'friends', 'seniors'):
        adjusted = get_persona_duration(base, subcat, persona)
        record = dict(
            poi=poi['name'],
            subcategory=subcat,
            persona=persona,
            base_duration=base,
            adjusted_duration=adjusted,
            difference=adjusted - base,
        )
        duration_comparison.append(record)

duration_df = pd.DataFrame(duration_comparison)

# Heatmap of the persona-adjusted durations (POI rows x persona columns).
pivot_df = duration_df.pivot_table(
    values='adjusted_duration',
    index='poi',
    columns='persona',
    aggfunc='first',
)
# Keep only the first 15 rows of the pivot so the figure stays readable; this
# is a slice of the pivot's row order, not a ranking of the POIs.
pivot_df = pivot_df.head(15)

fig = px.imshow(
    pivot_df,
    title="Persona-Adjusted Durations (in minutes)",
    labels={'x': "Persona", 'y': "POI", 'color': "Duration (min)"},
    color_continuous_scale='RdYlGn_r',
    aspect='auto',
)
fig.update_layout(height=600)
fig.show()

## Issue 3: Colosseum Not Appearing for Honeymoon

Let's analyze why iconic attractions like the Colosseum might not appear in honeymoon itineraries.

In [None]:
# Persona used for the ranking analysis: its group type, the vibes it cares
# about, and how the four score components are weighted (weights sum to 1).
honeymoon_persona = dict(
    group_type='honeymoon',
    primary_vibes=['romantic', 'foodie', 'cultural'],
    weight_config=dict(
        group=0.35,
        vibe=0.40,
        similarity=0.15,
        practical=0.10,
    ),
)

def calculate_final_score(poi, persona):
    """Calculate the final ranking score of a POI for a given persona.

    Args:
        poi: POI record (dict). Reads 'name' (required) and, optionally,
            'persona_scores', 'attributes', 'category' and 'subcategory'.
        persona: Dict with 'group_type', 'primary_vibes' (list of vibe names)
            and 'weight_config' (weights for 'group', 'vibe', 'similarity'
            and 'practical').

    Returns:
        Dict with the POI name, its group score, the vibe score and final
        score (both rounded to 3 decimals), the must-see flag, the category
        and the subcategory.

    Raises:
        KeyError: If the POI has no 'name' or the persona lacks a required key.
    """
    # `or {}` rather than a `.get` default: a JSON null loads as None, which
    # is present-but-unusable and would otherwise crash on the lookups below.
    scores = poi.get('persona_scores') or {}
    attributes = poi.get('attributes') or {}
    weights = persona['weight_config']
    vibes = persona['primary_vibes']
    group_type = persona['group_type']

    # Group score (neutral 0.5 when the POI has no score for this group)
    group_score = scores.get(f'score_{group_type}', 0.5)

    # Vibe score (average of selected vibes, neutral 0.5 when none selected)
    vibe_scores = [scores.get(f'score_{v}', 0.5) for v in vibes]
    vibe_score = sum(vibe_scores) / len(vibe_scores) if vibe_scores else 0.5

    # Similarity score (assume 0.5 for this analysis)
    similarity = 0.5

    # Practical score (assume 0.8 for most)
    practical = 0.8

    # Final score: weighted sum of the four components
    final = (
        group_score * weights['group'] +
        vibe_score * weights['vibe'] +
        similarity * weights['similarity'] +
        practical * weights['practical']
    )

    return {
        'name': poi['name'],
        'group_score': group_score,
        'vibe_score': round(vibe_score, 3),
        'final_score': round(final, 3),
        'is_must_see': attributes.get('is_must_see', False),
        'category': poi.get('category'),
        'subcategory': poi.get('subcategory')
    }

# Score every POI for the honeymoon persona and rank them best-first.
honeymoon_scores = [calculate_final_score(poi, honeymoon_persona) for poi in pois]
scores_df = pd.DataFrame(honeymoon_scores).sort_values('final_score', ascending=False)

# Show the 20 highest-scoring POIs.
print("Honeymoon Persona Scores (Top 20):")
print("=" * 90)
print(scores_df.iloc[:20].to_string(index=False))

In [None]:
# Renumber the rows so that the index label equals the 0-based rank, then
# locate the Colosseum row.
scores_df = scores_df.reset_index(drop=True)
colosseum_idx = scores_df.index[scores_df['name'] == 'Colosseum'][0]

print(f"\nColosseum ranks #{colosseum_idx + 1} out of {len(scores_df)} POIs for honeymoon!")
print()

# Break the Colosseum's score down into its components.
colosseum = scores_df.loc[colosseum_idx]
print("Colosseum Scores:")
print(f"  Group (honeymoon): {colosseum['group_score']}")
print(f"  Vibe (romantic, foodie, cultural avg): {colosseum['vibe_score']}")
print(f"  Final Score: {colosseum['final_score']}")
print(f"  Is Must-See: {colosseum['is_must_see']}")

# Same breakdown for the best-ranked POI, for comparison.
top = scores_df.iloc[0]
print(f"\nTop Honeymoon Attraction: {top['name']}")
print(f"  Group (honeymoon): {top['group_score']}")
print(f"  Vibe avg: {top['vibe_score']}")
print(f"  Final Score: {top['final_score']}")

In [None]:
# Bar chart of the 15 best-scoring POIs, coloured by the must-see flag.
fig = px.bar(
    scores_df.head(15),
    x='name',
    y='final_score',
    color='is_must_see',
    color_discrete_map={True: '#FF6B6B', False: '#4ECDC4'},
    labels={'final_score': 'Final Score', 'name': 'POI'},
    title='Honeymoon Persona: Top 15 POIs by Score',
)
fig.update_layout(xaxis_tickangle=-45, height=500)

# When the Colosseum falls outside the plotted rows, say so above the chart.
if not colosseum_idx < 15:
    fig.add_annotation(
        text=f"Colosseum ranks #{colosseum_idx + 1} (not shown)",
        xref='paper', yref='paper',
        x=0.5, y=1.1,
        showarrow=False,
        font={'size': 14, 'color': 'red'},
    )

fig.show()

## Solution: Must-See Guarantee

Iconic attractions should be included regardless of persona score.

Proposed solution:
1. **Must-See Boost**: Add +0.15 to the final score for `is_must_see=True` attractions (capped at 1.0)
2. **Guaranteed Slots**: Reserve 1 must-see slot per day
3. **Persona-Adjusted Duration**: Shorten duration for honeymoon at Colosseum

In [None]:
# Solution 1: Must-See Boost
# Flat bonus added to the final score of must-see POIs.
MUST_SEE_BOOST = 0.15

def calculate_boosted_score(poi, persona):
    """Score a POI and attach a 'boosted_score' entry to the result.

    Must-see POIs get MUST_SEE_BOOST added to their final score, capped at
    1.0; every other POI keeps its final score unchanged.
    """
    scored = calculate_final_score(poi, persona)
    unboosted = scored['final_score']

    scored['boosted_score'] = (
        min(1.0, unboosted + MUST_SEE_BOOST) if scored['is_must_see'] else unboosted
    )
    return scored

# Re-score every POI with the must-see boost and rank by the boosted score.
boosted_scores = [calculate_boosted_score(poi, honeymoon_persona) for poi in pois]
boosted_df = pd.DataFrame(boosted_scores).sort_values('boosted_score', ascending=False)

# Show the 15 best POIs with the original and the boosted score side by side.
print("With Must-See Boost (+0.15):")
print("=" * 100)
print(
    boosted_df.head(15)[
        ['name', 'group_score', 'vibe_score', 'final_score', 'is_must_see', 'boosted_score']
    ].to_string(index=False)
)

In [None]:
# Renumber the boosted ranking so the index label equals the 0-based rank,
# then look up the Colosseum's new position.
boosted_df = boosted_df.reset_index(drop=True)
new_colosseum_idx = boosted_df.index[boosted_df['name'] == 'Colosseum'][0]

# Report the new rank next to the rank from the unboosted ranking.
print(f"\nColosseum now ranks #{new_colosseum_idx + 1} (was #{colosseum_idx + 1})")
print(f"Improvement: Moved up {colosseum_idx - new_colosseum_idx} positions!")

In [None]:
# Side-by-side bar charts of the top 10 POIs without and with the boost.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Before (No Boost)', 'After (Must-See Boost)'])

before_df = scores_df.head(10)
after_df = boosted_df.head(10)

# One panel per ranking: (trace name, data, score column to plot).
panels = [
    ('Before', before_df, 'final_score'),
    ('After', after_df, 'boosted_score'),
]
for panel_col, (trace_name, panel_df, score_column) in enumerate(panels, start=1):
    # Must-see POIs are drawn in red, all others in teal.
    bar_colors = ['#FF6B6B' if flag else '#4ECDC4' for flag in panel_df['is_must_see']]
    fig.add_trace(
        go.Bar(
            x=panel_df['name'],
            y=panel_df[score_column],
            marker_color=bar_colors,
            name=trace_name,
        ),
        row=1, col=panel_col,
    )

fig.update_layout(
    title='Impact of Must-See Boost on Honeymoon Rankings',
    height=500,
    showlegend=False,
)
fig.update_xaxes(tickangle=-45)
fig.show()

## Proposed Data Model Changes

Based on this analysis, here are the recommended changes:

In [None]:
# 1. New POI field: duration_by_persona
# Illustrative description of the proposed field plus example values, in
# minutes, for one POI.
NEW_POI_FIELD = {
    "duration_by_persona": dict(
        description="Persona-specific duration overrides",
        example=dict(
            family=60,       # 1 hour for families
            honeymoon=75,    # 1.25 hours for honeymoon
            solo=150,        # 2.5 hours for solo
            default=120,     # 2 hours default
        ),
    )
}

# 2. New POI field: needs_break_after
# Proposed schema for the break-related fields attached to each POI.
NEW_BREAK_FIELD = {
    "needs_break_after": {
        "description": "Whether a relaxation break should follow this activity",
        "type": "boolean",
        "auto_calculate": "subcategory in ['museum', 'historical'] and duration >= 90"
    },
    "suggested_break_minutes": {
        "description": "Recommended break duration after this activity",
        "type": "integer",
        "default": 30
    },
    "break_type": {
        "description": "Type of break activity to insert",
        "type": "string",
        # 'coffee_gelato' is listed because the export cell of this notebook
        # writes that value as `break_type` for heavy activities; without it
        # the exported data would not conform to the options declared here.
        "options": ["coffee", "gelato", "coffee_gelato", "rest", "light_walk", "shopping"]
    }
}

# 3. Updated PACING_CONFIG
# The proposal is laid out as a table: one tuple of values per pace, in the
# column order given by _PACING_FIELDS.
_PACING_FIELDS = (
    'anchors_per_day',
    'max_activities',
    'min_buffer_minutes',
    'meal_duration_minutes',
    'must_include_breaks',
    'break_after_heavy',
    'break_after_moderate',
)
_PACING_VALUES = {
    'slow':     (1, 3, 60, 90, True, 60, 30),
    'moderate': (2, 5, 30, 75, True, 45, 15),
    'fast':     (3, 7, 15, 60, False, 20, 0),
}

UPDATED_PACING_CONFIG = {
    pace: dict(zip(_PACING_FIELDS, values))
    for pace, values in _PACING_VALUES.items()
}

print("Proposed Configuration Updates:")
print(json.dumps(UPDATED_PACING_CONFIG, indent=2))

## Summary of Changes Needed

### 1. POI Data Updates
- Add `duration_by_persona` field to each POI
- Add `needs_break_after` and `break_type` fields

### 2. Scorer Updates (`app/services/rag/scorer.py`)
- Add `MUST_SEE_BOOST = 0.15` to final score for must-see attractions
- Ensure iconic attractions are included regardless of persona

### 3. Assembler Updates (`app/services/rag/assembler.py`)
- Use `duration_by_persona[group_type]` instead of `typical_duration_minutes`
- Insert break slots after heavy activities
- Add `_insert_break()` method

### 4. PACING_CONFIG Updates (`app/core/config.py`)
- Add `break_after_heavy` and `break_after_moderate` settings
- Add `must_include_breaks` flag

In [None]:
# Export enhanced POI data with persona-based durations

# Break metadata written for each intensity class. The class itself comes from
# categorize_intensity() so that the thresholds used for the export cannot
# drift away from the ones used in the intensity analysis.
break_info_by_intensity = {
    'heavy': {
        'needs_break_after': True,
        'suggested_break_minutes': 45,
        'break_type': 'coffee_gelato',
    },
    'moderate': {
        'needs_break_after': True,
        'suggested_break_minutes': 20,
        'break_type': 'rest',
    },
    'light': {
        'needs_break_after': False,
        'suggested_break_minutes': 0,
        'break_type': None,
    },
}

enhanced_pois = []

for poi in pois:
    # Shallow copy: new top-level keys are added without touching `poi`.
    enhanced = poi.copy()
    base_duration = poi.get('typical_duration_minutes', 60)
    subcat = poi.get('subcategory', 'default')

    # Add duration_by_persona: one adjusted duration per persona, plus the
    # unadjusted base duration under 'default'.
    duration_by_persona = {
        persona: get_persona_duration(base_duration, subcat, persona)
        for persona in ('family', 'honeymoon', 'couple', 'solo', 'friends', 'seniors')
    }
    duration_by_persona['default'] = base_duration
    enhanced['duration_by_persona'] = duration_by_persona

    # Add break info
    enhanced.update(break_info_by_intensity[categorize_intensity(poi)])

    enhanced_pois.append(enhanced)

# Save enhanced data: the enhanced POIs, the unchanged persona templates and a
# small metadata record describing what was added.
output_path = Path('../data/seed/rome_pois_enhanced.json')
enhanced_data = {
    'pois': enhanced_pois,
    'persona_templates': personas,
    'metadata': {
        'enhancements': ['duration_by_persona', 'break_scheduling'],
        'generated_by': 'itinerary_improvements_eda.ipynb'
    }
}

# Pin the encoding so the file is written identically on every platform
# instead of depending on the locale's default encoding.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(enhanced_data, f, indent=2)

print(f"Enhanced POI data saved to: {output_path}")
print("\nExample enhanced POI:")
# Guard the sample print: indexing [0] would raise IndexError after the file
# has already been written if the seed data contained no POIs.
if enhanced_pois:
    print(json.dumps(enhanced_pois[0], indent=2))
else:
    print("(no POIs to show)")

## Next Steps

1. **Review** the enhanced POI data (`rome_pois_enhanced.json`)
2. **Update DB schema** to add `duration_by_persona` JSONB field
3. **Update Scorer** to add must-see boost
4. **Update Assembler** to:
   - Use persona-based durations
   - Insert break slots
5. **Test** with honeymoon persona to verify Colosseum appears