# 🗺️ Personalized Travel Itinerary System
## Data Analysis & City Expansion Framework

---

### Executive Summary

This analysis demonstrates our **RAG-based persona-driven itinerary system** using Rome as our pilot city. We analyze:

1. **Data Coverage** - POI distribution across categories and neighborhoods
2. **Persona Fit Analysis** - How well our data supports different traveler types
3. **Geographic Clustering** - Optimal groupings for day-by-day itineraries
4. **Restaurant Accessibility** - Dining options near attractions
5. **Expansion Framework** - Scalable approach for adding new cities

---

In [None]:
# Setup - Run this first
import pandas as pd
import numpy as np
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Clustering
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler

# Set plotly template for consistent styling
import plotly.io as pio
pio.templates.default = "plotly_white"

# Shared colour palette for the hand-built charts below.
# NOTE: the keys are role labels only. 'success' is a red-orange hue, not a
# green, and the category bar chart simply uses the five values as a colour
# sequence.
COLORS = {
    'primary': '#2E86AB',
    'secondary': '#A23B72',
    'accent': '#F18F01',
    'success': '#C73E1D',
    'neutral': '#3B1F2B'
}

# The status icon was stored as mis-decoded UTF-8; restored to the real glyph.
print("✅ Setup complete!")

In [None]:
# Load Rome POI Data
def load_rome_data(seed_path=None):
    """Load the curated Rome POI seed file into a flat DataFrame.

    Each POI record may carry nested ``persona_scores`` and ``attributes``
    dicts. Their keys are lifted to the top level so every score and
    attribute becomes its own column. On a key clash the nested value wins,
    with ``attributes`` applied last.

    Args:
        seed_path: Optional path to the seed JSON file. Defaults to
            ``../data/seed/rome_pois.json`` relative to the working directory.

    Returns:
        ``(df, persona_templates)``, where ``df`` has one row per POI and
        ``persona_templates`` is the file's ``persona_templates`` value
        (``[]`` when that key is absent). Returns ``(None, None)`` when the
        seed file does not exist.

    Raises:
        KeyError: If the file has no top-level ``pois`` key.
    """
    if seed_path is None:
        seed_path = Path("../data/seed/rome_pois.json")
    else:
        seed_path = Path(seed_path)

    if not seed_path.exists():
        return None, None

    # Read as UTF-8 explicitly: the platform default encoding would corrupt
    # accented place names on systems where it is not UTF-8.
    with open(seed_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Flatten nested data
    nested_keys = ('persona_scores', 'attributes')
    pois = []
    for poi in data['pois']:
        flat_poi = {k: v for k, v in poi.items() if k not in nested_keys}
        for key in nested_keys:
            # `or {}` tolerates a nested section that is present but null.
            flat_poi.update(poi.get(key) or {})
        pois.append(flat_poi)

    return pd.DataFrame(pois), data.get('persona_templates', [])

# Load once at notebook scope; the analysis cells below read `df` and
# `persona_templates` directly.
df, persona_templates = load_rome_data()

if df is None:
    # (None, None) is returned when the seed file does not exist.
    print("⚠️ Data not found. Please run seed script first.")
else:
    # Status icons were stored as mis-decoded UTF-8; restored to real glyphs.
    print(f"📍 Loaded {len(df)} POIs for Rome")
    print(f"👥 Loaded {len(persona_templates)} persona templates")

---
## 1. Data Coverage Analysis

### Key Question: Do we have enough POIs across all categories to build quality itineraries?

In [None]:
# Category Distribution: one bar per POI category, then a text summary that
# flags categories with fewer than 5 POIs.
category_counts = df['category'].value_counts()

# Cycle through the palette so every bar gets a palette colour even when
# there are more categories than palette entries (slicing the five-colour
# list left any extra bars without one).
palette = [COLORS['primary'], COLORS['secondary'], COLORS['accent'],
           COLORS['success'], COLORS['neutral']]
bar_colors = [palette[i % len(palette)] for i in range(len(category_counts))]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=category_counts.index,
    y=category_counts.values,
    marker_color=bar_colors,
    text=category_counts.values,
    textposition='outside'
))

fig.update_layout(
    title={
        'text': '<b>POI Distribution by Category</b><br><sup>Rome Pilot Data</sup>',
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title='Category',
    yaxis_title='Number of POIs',
    height=400,
    showlegend=False
)

fig.show()

# Analysis
print("\n📊 CATEGORY COVERAGE ANALYSIS")
print("=" * 40)
# Percentages are relative to all rows, including any without a category.
total = len(df)
for cat, count in category_counts.items():
    pct = count / total * 100
    status = "✅" if count >= 5 else "⚠️"
    # str() keeps the report working if a category label is not a string.
    print(f"{status} {str(cat).title()}: {count} POIs ({pct:.1f}%)")

In [None]:
# Neighborhood Coverage: treemap of POI counts per neighborhood plus a
# top-5 text listing.
neighborhood_counts = df['neighborhood'].value_counts()

# Single-level treemap: every neighborhood is a child of one 'Rome' root and
# is both sized and shaded by its POI count.
fig = px.treemap(
    names=neighborhood_counts.index,
    parents=['Rome'] * len(neighborhood_counts),
    values=neighborhood_counts.values,
    title='<b>POI Distribution by Neighborhood</b>',
    color=neighborhood_counts.values,
    color_continuous_scale='Blues'
)

fig.update_layout(height=500)
fig.show()

# Header icon was stored as mis-decoded UTF-8; restored to the real glyph.
print("\n📍 NEIGHBORHOOD COVERAGE")
print("=" * 40)
print(f"Total neighborhoods covered: {len(neighborhood_counts)}")
print("\nTop 5 neighborhoods by POI count:")
# value_counts() is already sorted descending, so head() is the top five.
for rank, (nb, count) in enumerate(neighborhood_counts.head().items(), 1):
    print(f"  {rank}. {nb}: {count} POIs")

In [None]:
# Category by Neighborhood Heatmap: rows are neighborhoods, columns are
# categories, cells are POI counts.
pivot = pd.crosstab(df['neighborhood'], df['category'])

fig = px.imshow(
    pivot,
    title='<b>Category Coverage by Neighborhood</b><br><sup>Identifying gaps in data coverage</sup>',
    labels=dict(x='Category', y='Neighborhood', color='POI Count'),
    color_continuous_scale='YlOrRd',
    aspect='auto'
)

fig.update_layout(height=500)
fig.show()

# Identify gaps: every (neighborhood, category) cell with a zero count.
print("\n🔍 DATA GAPS IDENTIFIED")
print("=" * 40)
MAX_GAPS_SHOWN = 10
gaps = [
    f"  • No {cat}s in {nb}"
    for nb in pivot.index
    for cat in pivot.columns
    if pivot.loc[nb, cat] == 0
]

if gaps:
    print("Missing combinations:")
    # Gaps are listed in table order (not ranked); only the first few are shown.
    for gap in gaps[:MAX_GAPS_SHOWN]:
        print(gap)
    # Say so when the list is cut short, instead of silently hiding the rest.
    if len(gaps) > MAX_GAPS_SHOWN:
        print(f"  … and {len(gaps) - MAX_GAPS_SHOWN} more")
else:
    print("✅ All neighborhood-category combinations covered!")

---
## 2. Persona Fit Analysis

### Key Question: How well does our POI data support different traveler personas?

In [None]:
# Define persona score columns
group_scores = ['score_family', 'score_couple', 'score_honeymoon', 'score_solo',
                'score_friends', 'score_seniors', 'score_business']
vibe_scores = ['score_adventure', 'score_relaxation', 'score_cultural', 'score_foodie',
               'score_nightlife', 'score_nature', 'score_shopping', 'score_photography',
               'score_wellness', 'score_romantic']

# Filter available columns: only score columns actually present in the data
# are analysed. Later cells reuse `available_group` and `available_vibe`.
available_group = [c for c in group_scores if c in df.columns]
available_vibe = [c for c in vibe_scores if c in df.columns]

# NOTE(review): the chart marks 0.7 as the "good fit" line, while the text
# report below grades at 0.75 (excellent) and 0.6 (good). Values are kept as
# found; confirm which thresholds are intended.
CHART_GOOD_FIT = 0.7
REPORT_EXCELLENT = 0.75
REPORT_GOOD = 0.6

if available_group:
    # Calculate average scores per persona (ascending, so the best-supported
    # persona ends up at the top of the horizontal bar chart).
    persona_avg = df[available_group].mean().sort_values(ascending=True)

    # Clean labels: 'score_family' -> 'Family'
    labels = [s.replace('score_', '').title() for s in persona_avg.index]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        y=labels,
        x=persona_avg.values,
        orientation='h',
        marker_color=COLORS['primary'],
        text=[f'{v:.2f}' for v in persona_avg.values],
        textposition='outside'
    ))

    # Add threshold line
    fig.add_vline(x=CHART_GOOD_FIT, line_dash="dash", line_color="green",
                  annotation_text="Good Fit Threshold (0.7)")

    fig.update_layout(
        title='<b>Average POI Fit Score by Traveler Type</b><br><sup>Higher = Better suited for this persona</sup>',
        xaxis_title='Average Score (0-1)',
        yaxis_title='Traveler Type',
        height=400,
        xaxis_range=[0, 1]
    )

    fig.show()

    # Status icons were stored as mis-decoded UTF-8; restored to real glyphs.
    print("\n👥 PERSONA SUPPORT ANALYSIS")
    print("=" * 40)
    for persona, score in persona_avg.items():
        name = persona.replace('score_', '').title()
        if score >= REPORT_EXCELLENT:
            print(f"✅ {name}: Excellent support ({score:.2f})")
        elif score >= REPORT_GOOD:
            print(f"🟡 {name}: Good support ({score:.2f})")
        else:
            print(f"⚠️ {name}: Needs more POIs ({score:.2f})")

In [None]:
# Vibe Coverage Radar Chart: mean score per vibe column across all POIs.
if available_vibe:
    vibe_avg = df[available_vibe].mean()
    labels = [s.replace('score_', '').title() for s in vibe_avg.index]

    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=vibe_avg.values,
        theta=labels,
        fill='toself',
        name='Rome',
        line_color=COLORS['primary']
    ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[0, 1])
        ),
        title='<b>Vibe Coverage Analysis</b><br><sup>How well Rome supports different travel vibes</sup>',
        height=500
    )

    fig.show()

    # Top and bottom vibes
    print("\n🎯 VIBE STRENGTHS & GAPS")
    print("=" * 40)
    sorted_vibes = vibe_avg.sort_values(ascending=False)
    strongest = sorted_vibes.head(3)
    # Take the weakest three from what is left after the top three, so that
    # with fewer than six vibe columns no vibe is reported as both a strength
    # and a gap. With six or more this is the same as tail(3).
    weakest = sorted_vibes.iloc[3:].tail(3)

    print("\n💪 Strongest vibes:")
    for vibe, score in strongest.items():
        print(f"  • {vibe.replace('score_', '').title()}: {score:.2f}")
    print("\n📈 Needs improvement:")
    for vibe, score in weakest.items():
        print(f"  • {vibe.replace('score_', '').title()}: {score:.2f}")

---
## 3. Geographic Analysis & Clustering

### Key Question: How should we group POIs for optimal day-by-day itineraries?

In [None]:
# Map all POIs, coloured by category.
# Only request hover columns that exist: the scorecard cell treats
# 'cost_level' as optional, and asking Plotly Express for a missing column
# raises instead of rendering the map.
hover_columns = [c for c in ['neighborhood', 'cost_level'] if c in df.columns]

fig = px.scatter_mapbox(
    df,
    lat='latitude',
    lon='longitude',
    color='category',
    size_max=15,
    hover_name='name',
    hover_data=hover_columns,
    title='<b>Rome POI Distribution</b>',
    zoom=12,
    height=600,
    color_discrete_sequence=px.colors.qualitative.Set2
)

fig.update_layout(
    mapbox_style='carto-positron',
    margin={'r': 0, 't': 50, 'l': 0, 'b': 0}
)

fig.show()

In [None]:
# Perform clustering for itinerary optimization
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in km between two points.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in degrees.
        lon2, lat2: Longitude and latitude of the second point, in degrees.

    Returns:
        Distance in kilometres on a sphere of radius 6371 km. NaN inputs
        propagate to a NaN result.
    """
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # A plain comparison is used so that NaN is left untouched.
    if a > 1.0:
        a = 1.0
    return 6371 * 2 * asin(sqrt(a))

# DBSCAN clustering into walkable zones.
# eps is a neighbour-to-neighbour limit: a zone is a chain of POIs each
# within 400 m of another, so a whole zone can span more than 400 m.

# Cluster only rows that have both coordinates; DBSCAN rejects NaN input.
has_coords = df[['latitude', 'longitude']].notna().all(axis=1)
coords = df.loc[has_coords, ['latitude', 'longitude']].to_numpy(dtype=float)
# The haversine metric works on [lat, lon] pairs expressed in radians.
coords_rad = np.radians(coords)

eps_km = 0.4  # 400 meters - comfortable walking distance
eps_rad = eps_km / 6371.0  # km -> radians on a sphere of radius 6371 km

dbscan = DBSCAN(eps=eps_rad, min_samples=2, metric='haversine')

# -1 is DBSCAN's "noise" label; rows without coordinates keep it as well.
df['walkable_zone'] = -1
if len(coords_rad) > 0:
    df.loc[has_coords, 'walkable_zone'] = dbscan.fit_predict(coords_rad)

n_zones = df.loc[df['walkable_zone'] >= 0, 'walkable_zone'].nunique()
n_isolated = (df['walkable_zone'] == -1).sum()

# Header icon was stored as mis-decoded UTF-8; restored to the real glyph.
print("📍 WALKABLE ZONE ANALYSIS (400m radius)")
print("=" * 40)
print(f"Walkable zones identified: {n_zones}")
print(f"Isolated POIs: {n_isolated}")
print(f"Clustered POIs: {len(df) - n_isolated}")

In [None]:
# Visualize walkable zones (noise points, labelled -1, are left off the map)
df_clustered = df[df['walkable_zone'] >= 0].copy()

# 'walkable_zone' is numeric, so zones are shaded on the continuous Viridis
# scale rather than with discrete colours.
fig = px.scatter_mapbox(
    df_clustered,
    lat='latitude',
    lon='longitude',
    color='walkable_zone',
    hover_name='name',
    hover_data=['category', 'neighborhood'],
    title='<b>Walkable Zones for Itinerary Planning</b><br><sup>POIs within 400m grouped together</sup>',
    zoom=12,
    height=600,
    color_continuous_scale='Viridis'
)

fig.update_layout(
    mapbox_style='carto-positron',
    margin={'r': 0, 't': 50, 'l': 0, 'b': 0}
)

fig.show()

# Zone composition
print("\n🚶 WALKABLE ZONE COMPOSITION")
print("=" * 40)
zone_analysis = df_clustered.groupby('walkable_zone').agg({
    'name': 'count',
    # Per-zone category histogram, e.g. {'attraction': 2, 'restaurant': 1}
    'category': lambda x: x.value_counts().to_dict(),
    # A zone is labelled with the neighborhood of its first POI only.
    'neighborhood': 'first'
}).rename(columns={'name': 'poi_count'})

for zone_id, row in zone_analysis.iterrows():
    cats = row['category']
    has_attraction = cats.get('attraction', 0) > 0
    has_restaurant = cats.get('restaurant', 0) > 0

    # A zone counts as self-contained when it offers both something to see
    # and somewhere to eat.
    if has_attraction and has_restaurant:
        completeness = "✅ Self-contained"
    else:
        completeness = "⚠️ Needs pairing"

    print(f"\nZone {zone_id} ({row['neighborhood']}):")
    print(f"  POIs: {row['poi_count']} | {completeness}")
    print(f"  Mix: {cats}")

---
## 4. Restaurant Accessibility Analysis

### Key Question: Do all major attractions have dining options nearby?

In [None]:
# Split the POI table into the two groups compared in this section.
# Independent copies are taken so later edits cannot write back into `df`.
is_attraction = df['category'] == 'attraction'
is_restaurant = df['category'] == 'restaurant'
attractions = df[is_attraction].copy()
restaurants = df[is_restaurant].copy()

def count_nearby(poi, all_pois, max_km=0.5):
    """Count the POIs in *all_pois* that lie within *max_km* of *poi*.

    Distances are great-circle (haversine) distances on a sphere of radius
    6371 km, computed for the whole table at once with NumPy rather than
    row by row.

    Args:
        poi: Mapping or Series with 'name', 'latitude' and 'longitude'.
        all_pois: DataFrame with 'name', 'latitude' and 'longitude' columns.
        max_km: Inclusive distance limit in kilometres.

    Returns:
        Number of rows of *all_pois* within range, as an ``int``. Rows whose
        name equals ``poi['name']`` are skipped so that a POI never counts
        itself. Rows with missing coordinates are never counted.
    """
    if all_pois.empty:
        return 0

    lat1 = np.radians(float(poi['latitude']))
    lon1 = np.radians(float(poi['longitude']))
    lat2 = np.radians(all_pois['latitude'].to_numpy(dtype=float))
    lon2 = np.radians(all_pois['longitude'].to_numpy(dtype=float))

    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    # Cap at 1 so rounding error cannot take arcsin out of its domain.
    # NaN stays NaN and then fails the `<=` test below.
    dist_km = 6371 * 2 * np.arcsin(np.sqrt(np.minimum(a, 1.0)))

    is_other = (all_pois['name'] != poi['name']).to_numpy()
    return int(np.count_nonzero(is_other & (dist_km <= max_km)))

# Calculate for each attraction: how many restaurants lie within 500 m.
accessibility_data = [
    {
        'attraction': attraction['name'],
        'neighborhood': attraction['neighborhood'],
        'nearby_restaurants': count_nearby(attraction, restaurants, max_km=0.5),
        'is_must_see': attraction.get('is_must_see', False)
    }
    for _, attraction in attractions.iterrows()
]

# Explicit columns keep the frame sortable even when there are no
# attractions. An empty record list would otherwise yield a column-less
# frame and sort_values() would raise KeyError.
access_df = pd.DataFrame(
    accessibility_data,
    columns=['attraction', 'neighborhood', 'nearby_restaurants', 'is_must_see']
).sort_values('nearby_restaurants', ascending=True)

# Visualization: red below 3 nearby restaurants, orange below 5, else green.
colors = ['red' if x < 3 else 'orange' if x < 5 else 'green' for x in access_df['nearby_restaurants']]

fig = go.Figure()

fig.add_trace(go.Bar(
    y=access_df['attraction'],
    x=access_df['nearby_restaurants'],
    orientation='h',
    marker_color=colors,
    text=access_df['nearby_restaurants'],
    textposition='outside'
))

fig.update_layout(
    title='<b>Restaurant Accessibility by Attraction</b><br><sup>Number of restaurants within 500m walking distance</sup>',
    xaxis_title='Nearby Restaurants',
    yaxis_title='',
    # Grow the figure with the number of bars so labels stay readable.
    height=max(400, len(access_df) * 30),
    showlegend=False
)

fig.show()

# Summary
print("\n🍽️ DINING ACCESSIBILITY SUMMARY")
print("=" * 40)
low_access = access_df[access_df['nearby_restaurants'] < 3]
if len(low_access) > 0:
    print("\n⚠️ Attractions with limited dining (< 3 restaurants):")
    for _, row in low_access.iterrows():
        print(f"  • {row['attraction']} ({row['neighborhood']}): {row['nearby_restaurants']} restaurants")
else:
    print("\n✅ All attractions have adequate dining options nearby!")

---
## 5. Data Quality Scorecard

### Overall readiness assessment for Rome

In [None]:
# Calculate quality metrics
def _filled_count(column):
    """Number of POIs with a value in *column*; 0 when the column is absent."""
    return df[column].notna().sum() if column in df.columns else 0


def _category_size(category):
    """Number of POIs whose category equals *category*."""
    return len(df[df['category'] == category])


# Insertion order matters: the scorecard prints the metrics in this order.
metrics = {
    'Total POIs': len(df),
    'Categories Covered': df['category'].nunique(),
    'Neighborhoods Covered': df['neighborhood'].nunique(),
    'POIs with Coordinates': df[['latitude', 'longitude']].notna().all(axis=1).sum(),
    'POIs with Descriptions': _filled_count('description'),
    'POIs with Cost Info': _filled_count('cost_level'),
    # A POI counts only when every available group score is filled in.
    'POIs with Persona Scores': len(df[available_group].dropna()) if available_group else 0,
    'Attractions': _category_size('attraction'),
    'Restaurants': _category_size('restaurant'),
}

# Scoring thresholds as (min, good, excellent) triples. Metrics without an
# entry here are reported unrated.
thresholds = {
    'Total POIs': (50, 100, 200),
    'Categories Covered': (3, 4, 5),
    'Neighborhoods Covered': (3, 5, 8),
    'Attractions': (10, 20, 30),
    'Restaurants': (15, 30, 50),
}

def get_score_status(metric, value, thresholds):
    """Grade a metric value against its (min, good, excellent) thresholds.

    Args:
        metric: Metric name, used as the key into *thresholds*.
        value: Measured value for that metric.
        thresholds: Mapping of metric name to a ``(min, good, excellent)``
            triple of lower bounds.

    Returns:
        An ``(icon, label)`` pair. Metrics with no entry in *thresholds*
        are unrated and yield ``('✅', 'N/A')``.
    """
    # The icons were stored as mis-decoded UTF-8, so callers printed
    # garbage. They are restored to the intended glyphs here.
    if metric not in thresholds:
        return '✅', 'N/A'

    min_t, good_t, exc_t = thresholds[metric]
    if value >= exc_t:
        return '🌟', 'Excellent'
    if value >= good_t:
        return '✅', 'Good'
    if value >= min_t:
        return '🟡', 'Adequate'
    return '❌', 'Insufficient'

# Print the scorecard table, one row per metric in `metrics` order.
print("\n" + "=" * 60)
print("📊 ROME DATA QUALITY SCORECARD")
print("=" * 60)
print(f"{'Metric':<30} {'Value':>10} {'Status':>15}")
print("-" * 60)

for metric, value in metrics.items():
    icon, status = get_score_status(metric, value, thresholds)
    print(f"{metric:<30} {value:>10} {icon} {status:>10}")

print("\n" + "=" * 60)

# Overall readiness
# NOTE(review): these launch minimums (20 / 5 / 5 / 3) are lower than the
# scorecard's own "min" thresholds (50 POIs, 10 attractions, 15 restaurants),
# so a city can be declared ready while the table shows "Insufficient".
# Values are kept as found; confirm which set is intended.
critical_pass = all([
    metrics['Total POIs'] >= 20,
    metrics['Attractions'] >= 5,
    metrics['Restaurants'] >= 5,
    metrics['Neighborhoods Covered'] >= 3
])

if critical_pass:
    print("\n✅ ROME IS READY FOR PRODUCTION")
else:
    print("\n⚠️ ROME NEEDS MORE DATA BEFORE LAUNCH")

---
# 🌍 City Expansion Framework

## How We Scale to New Cities

---

In [None]:
# City Expansion Checklist
# Maps each phase name to its checklist:
#   'tasks'            - items printed as a to-do list by the reporting loop
#   'data_sources'     - joined into one comma-separated line when printed
#   'estimated_effort' - text of the form '<n> day(s)' or '<n>-<m> days'; the
#                        reporting loop reads the leading integer from it, so
#                        keep that shape when editing
# Insertion order is the order in which the phases are printed.
expansion_framework = {
    'Phase 1: Data Collection': {
        'tasks': [
            'Identify key neighborhoods (5-10)',
            'Collect 50+ attractions from Overture Maps',
            'Collect 100+ restaurants',
            'Gather opening hours & pricing',
            'Obtain coordinates for all POIs'
        ],
        'data_sources': ['Overture Maps (free)', 'Google Places API', 'TripAdvisor', 'Local tourism boards'],
        'estimated_effort': '2-3 days'
    },
    'Phase 2: Persona Scoring': {
        'tasks': [
            # NOTE(review): the persona analysis cell lists 7 group score
            # columns, not 8 - confirm which count is right.
            'Score each POI for 8 group types (family, couple, solo, etc.)',
            'Score each POI for 10 vibes (cultural, foodie, adventure, etc.)',
            'Add practical attributes (wheelchair access, kid-friendly)',
            'Mark must-see attractions and hidden gems'
        ],
        'data_sources': ['Manual curation', 'Review sentiment analysis', 'Local expert input'],
        'estimated_effort': '3-5 days'
    },
    'Phase 3: Embedding Generation': {
        'tasks': [
            'Generate description embeddings for all POIs',
            'Create neighborhood embeddings',
            'Build proximity relationships',
            'Index in pgvector'
        ],
        'data_sources': ['BGE-small-en-v1.5 (free, local)'],
        'estimated_effort': '1 day'
    },
    'Phase 4: Validation': {
        'tasks': [
            'Generate test itineraries for each persona',
            'Verify restaurant accessibility',
            'Check walkable zone coverage',
            'Quality review by local expert'
        ],
        'data_sources': ['Internal testing', 'Local reviewer'],
        'estimated_effort': '2 days'
    }
}

# Print every phase as a checklist and total the effort estimates.
# Icons were stored as mis-decoded UTF-8; restored to the real glyphs.
print("\n" + "=" * 70)
print("🌍 CITY EXPANSION FRAMEWORK")
print("=" * 70)

total_days = 0
for phase, details in expansion_framework.items():
    effort = details['estimated_effort']

    print(f"\n{phase}")
    print("-" * 50)
    print(f"⏱️  Estimated effort: {effort}")
    print(f"📊 Data sources: {', '.join(details['data_sources'])}")
    print("Tasks:")
    for task in details['tasks']:
        print(f"  ☐ {task}")

    # Extract days for total: take the lower bound of each estimate,
    # e.g. "2-3 days" -> 2 and "1 day" -> 1.
    total_days += int(effort.split()[0].split('-')[0])

# NOTE(review): the upper bound is the summed lower bounds plus a fixed 4
# days, not the sum of the per-phase upper bounds (which is 11 for the
# estimates above). Kept as found; confirm whether the buffer is intended.
print(f"\n{'=' * 70}")
print(f"📅 TOTAL ESTIMATED TIME PER CITY: {total_days}-{total_days+4} days")
print("💰 DATA COST: ~$0 (using free data sources + local embeddings)")

In [None]:
# Minimum Data Requirements per City
# Category values in the data that back the first five rows of the table.
tracked_categories = ['attraction', 'restaurant', 'activity', 'shopping', 'nightlife']

# A plain filter already gives 0 for a category that is absent, so all five
# are counted the same way (previously only two of them carried a guard).
rome_current = [len(df[df['category'] == cat]) for cat in tracked_categories]
rome_current.append(df['neighborhood'].nunique())

min_requirements = pd.DataFrame({
    'Category': ['Attractions', 'Restaurants', 'Activities', 'Shopping', 'Nightlife', 'Neighborhoods'],
    'Minimum': [15, 30, 10, 5, 5, 4],
    'Recommended': [30, 60, 20, 15, 15, 8],
    'Rome (Current)': rome_current
})

# Header icon was stored as mis-decoded UTF-8; restored to the real glyph.
print("\n📋 MINIMUM DATA REQUIREMENTS PER CITY")
print("=" * 60)
print(min_requirements.to_string(index=False))

# Visualize: three grouped bars per category (minimum, recommended, current)
fig = go.Figure()

for column, color in [
    ('Minimum', 'lightgray'),
    ('Recommended', 'lightblue'),
    ('Rome (Current)', COLORS['primary']),
]:
    fig.add_trace(go.Bar(
        name=column,
        x=min_requirements['Category'],
        y=min_requirements[column],
        marker_color=color
    ))

fig.update_layout(
    title='<b>Data Requirements vs Current Coverage</b>',
    barmode='group',
    height=400,
    xaxis_title='Category',
    yaxis_title='POI Count'
)

fig.show()

In [None]:
# Proposed Expansion Roadmap
roadmap = pd.DataFrame({
    'City': ['Rome', 'Florence', 'Venice', 'Barcelona', 'Paris', 'London', 'Amsterdam', 'Prague'],
    'Priority': ['✅ Live', 'Phase 1', 'Phase 1', 'Phase 2', 'Phase 2', 'Phase 3', 'Phase 3', 'Phase 3'],
    'Why': [
        'Pilot city - Complete',
        'Italy expansion, high demand',
        'Italy expansion, unique experience',
        'Top EU destination',
        'Highest search volume',
        'English-speaking market',
        'Compact, walkable',
        'Budget-friendly option'
    ],
    'Data Complexity': ['Medium', 'Low', 'Low', 'Medium', 'High', 'High', 'Low', 'Low'],
    'Est. Timeline': ['Done', 'Week 1-2', 'Week 2-3', 'Week 4-5', 'Week 5-7', 'Week 8-10', 'Week 10-11', 'Week 11-12']
})

print("\n🗓️ PROPOSED EXPANSION ROADMAP")
print("=" * 80)
print(roadmap.to_string(index=False))

# Timeline visualization
# The bars are derived from the 'Est. Timeline' column so the chart matches
# the table. The previous hand-built date strings ran past the end of the
# month (e.g. "2024-01-36"), which are not valid dates.
upcoming = roadmap[roadmap['Priority'] != '✅ Live'].copy()
week_span = upcoming['Est. Timeline'].str.extract(r'Week (\d+)-(\d+)').astype(int)

timeline_origin = pd.Timestamp('2024-01-01')  # first day of "Week 1"
# "Week a-b" runs from the start of week a to the end of week b.
upcoming['Start'] = timeline_origin + pd.to_timedelta((week_span[0] - 1) * 7, unit='D')
upcoming['End'] = timeline_origin + pd.to_timedelta(week_span[1] * 7, unit='D')

fig = px.timeline(
    upcoming,
    x_start='Start',
    x_end='End',
    y='City',
    color='Priority',
    title='<b>City Expansion Timeline</b>'
)
fig.update_layout(height=400)
fig.show()

---
## 📊 Key Takeaways

### What We Analyzed

| Analysis | Finding | Action |
|----------|---------|--------|
| **Data Coverage** | Good category distribution | ✅ Ready |
| **Persona Fit** | Strong cultural/foodie, weaker nightlife | Add more nightlife POIs |
| **Walkable Zones** | 8+ distinct zones identified | Use for day grouping |
| **Restaurant Access** | Most attractions well-served | Flag outliers in itinerary |

### Expansion Approach

1. **Cost-Efficient**: Free data sources (Overture Maps) + local embeddings
2. **Scalable**: ~8-12 days per city with defined process
3. **Quality-Focused**: Manual persona scoring is our differentiator

### Next Steps

1. ☐ Complete Rome data (add 10-15 more restaurants)
2. ☐ Begin Florence data collection
3. ☐ Build automated data quality dashboard
4. ☐ Create persona scoring guidelines document

In [None]:
# Export summary for presentation
# Reads `n_zones` and `critical_pass` from the clustering and scorecard
# cells, so those must have been run first.
summary = {
    'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'city': 'Rome',
    'total_pois': len(df),
    'categories': df['category'].nunique(),
    'neighborhoods': df['neighborhood'].nunique(),
    'walkable_zones': n_zones,
    'data_quality_score': 'Good' if critical_pass else 'Needs Work',
    'recommended_additions': [
        'More nightlife venues',
        'Additional budget restaurants',
        'Wellness/spa options'
    ]
}

# Save
output_path = Path('../data/processed')
# parents=True: do not fail when '../data' itself does not exist yet.
output_path.mkdir(parents=True, exist_ok=True)
summary_file = output_path / 'rome_analysis_summary.json'

# Explicit UTF-8 so the file does not depend on the platform default encoding.
# default=str stringifies any value json cannot serialise natively.
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, default=str)

print(f"\n✅ Analysis summary saved to {summary_file}")