In [None]:
# EU HICP Package Holidays Price Forecast - Phase 2: Exploratory Data Analysis

## Overview
This notebook implements **Phase 2** of the EU HICP Package Holidays Price Forecast project, focusing on comprehensive exploratory data analysis using interactive Plotly visualizations to understand seasonal patterns, cross-country dynamics, and key economic relationships.

### Objectives
1. **Interactive Time Series Analysis**: Visualize HICP trends with seasonal highlighting
2. **Seasonal Pattern Discovery**: Create heatmaps and decomposition analysis
3. **Cross-Country Comparison**: Analyze EU vs Germany package holiday dynamics
4. **Economic Indicator Relationships**: Explore correlations with oil prices, exchange rates, etc.
5. **Holiday Period Impact**: Statistical analysis of summer tourism effects
6. **Outlier Detection**: Identify and analyze unusual price movements

### Key Insights We'll Discover
- **Summer Seasonality**: How strong are June-August price increases?
- **Cross-Country Divergence**: Do Germany and EU-wide patterns align?
- **Economic Drivers**: Which indicators best predict holiday price changes?
- **Historical Patterns**: What can past data tell us about July 2025?

### Visualization Technologies
- **Interactive Charts**: Plotly for dynamic exploration
- **Statistical Analysis**: Polars for efficient computation
- **Dashboard Creation**: Multi-panel comprehensive views


In [None]:
# Import required libraries
import polars as pl
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
from datetime import datetime
import sys
import os
from scipy import stats
import json

# Add project root to path for imports
sys.path.append('.')

# Import project modules
from visualization_utils import HICPVisualizer, create_statistical_summary, COLORS
from data_collector import DataCollector

# Configure settings
pl.Config.set_tbl_rows(15)
pl.Config.set_tbl_cols(12)

# Silence only deprecation-style chatter. A blanket filterwarnings('ignore')
# would also hide numerical RuntimeWarnings (e.g. a correlation or t-test
# computed on constant / non-finite input), which are exactly the warnings
# that indicate a statistic printed further down cannot be trusted.
for _noisy_category in (DeprecationWarning, PendingDeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

print("‚úì All libraries imported successfully")
print(f"Polars version: {pl.__version__}")
print(f"NumPy version: {np.__version__}")

# Initialize visualizer
viz = HICPVisualizer(theme='plotly_white')
print("‚úì HICP Visualizer initialized")


In [None]:
## Step 1: Load and Validate Phase 1 Data

First, we'll load the cleaned data from Phase 1 and validate its structure and quality for our exploratory analysis.


In [None]:
# Load cleaned data from Phase 1
def load_phase1_data():
    """Load the cleaned Phase 1 tables and the optional metadata file.

    Parquet files are tried first; if either is missing, both tables are
    re-read from the CSV exports and the columns that CSV cannot type
    (dates and the numeric value / change columns) are cast explicitly.

    Returns:
        tuple: ``(long_data, wide_data, metadata)``. The two frames are polars
        DataFrames, or ``(None, None, {})`` when neither the parquet nor the
        CSV files exist. ``metadata`` is the parsed JSON content, or ``{}``
        when the metadata file is missing or cannot be parsed.
    """

    try:
        # Try to load parquet files first (faster)
        long_data = pl.read_parquet('data/clean_long_format.parquet')
        wide_data = pl.read_parquet('data/clean_wide_format.parquet')

        print("‚úì Successfully loaded parquet data files")

    except FileNotFoundError:
        try:
            # Fallback to CSV files
            long_data = pl.read_csv('data/clean_long_format.csv')
            wide_data = pl.read_csv('data/clean_wide_format.csv')

            # Ensure proper data types for CSV
            long_data = long_data.with_columns([
                pl.col('date').str.strptime(pl.Date, format='%Y-%m-%d'),
                pl.col('value').cast(pl.Float64),
                pl.col('value_filled').cast(pl.Float64),
                pl.col('mom_pct_change').cast(pl.Float64),
                pl.col('yoy_pct_change').cast(pl.Float64)
            ])

            wide_data = wide_data.with_columns([
                pl.col('date').str.strptime(pl.Date, format='%Y-%m-%d')
            ])

            print("‚úì Successfully loaded CSV data files")

        except FileNotFoundError:
            print("‚ùå No cleaned data files found!")
            print("Please run Phase 1 (01_data_collection_and_cleaning.ipynb) first")
            return None, None, {}

    # Load metadata if available. The metadata is optional, so neither a
    # missing file nor an unreadable one may abort the data load.
    try:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open('data/data_metadata.json', 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        print("‚úì Loaded metadata file")
    except FileNotFoundError:
        metadata = {}
        print("‚ö†Ô∏è No metadata file found")
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        metadata = {}
        print(f"Metadata file could not be parsed ({exc}); continuing without metadata")

    return long_data, wide_data, metadata

# Load the data
long_data, wide_data, metadata = load_phase1_data()

# Validate data structure: bail out early when either table failed to load,
# otherwise print a short shape / coverage report.
if long_data is None or wide_data is None:
    print("‚ùå Cannot proceed without data. Please run Phase 1 first.")
else:
    rule = "=" * 50
    print("\n" + rule)
    print("DATA VALIDATION SUMMARY")
    print(rule)

    print(f"Long format data: {long_data.height:,} rows √ó {long_data.width} columns")
    print(f"Wide format data: {wide_data.height:,} rows √ó {wide_data.width} columns")

    series_column = long_data['series_name']

    if not long_data.is_empty():
        print(f"Date range: {long_data['date'].min()} to {long_data['date'].max()}")
        print(f"Available series: {series_column.n_unique()}")

        # One line per series with its row count
        series_list = series_column.unique().to_list()
        print("Series names:")
        for series in series_list:
            count = long_data.filter(pl.col('series_name') == series).height
            print(f"  ‚Ä¢ {series}: {count:,} observations")

    # The two package-holiday series every later cell relies on
    key_series = ['eu_package_holidays', 'germany_package_holidays']
    names_present = set(series_column.unique().to_list())
    missing_series = [name for name in key_series if name not in names_present]

    if not missing_series:
        print("\n‚úì All key HICP series are available")
    else:
        print(f"\n‚ö†Ô∏è Missing key series: {missing_series}")
        print("Note: Analysis will be limited without core HICP data")


In [None]:
## Step 2: Interactive Time Series Visualization

Let's create comprehensive interactive time series plots to understand the overall trends and patterns in our HICP data. These visualizations will highlight seasonal periods and allow for detailed exploration of the data.


In [None]:
# Create interactive time series visualizations
def _season_trend_figure(frame, column, heading, axis_label, zero_line=False):
    """Build one trend plot with season highlighting; optionally add a dashed y=0 guide.

    The figure is returned un-shown so the caller can still decorate it.
    """
    figure = viz.create_time_series_plot(
        df=frame,
        value_col=column,
        title=heading,
        yaxis_title=axis_label,
        show_trend=True,
        highlight_seasons=True
    )
    if zero_line:
        figure.add_hline(y=0, line_dash="dash", line_color="gray", opacity=0.5)
    return figure


if long_data is None or long_data.is_empty():
    print("‚ùå No data available for time series visualization")
else:
    print("Creating interactive time series visualizations...")

    # 1. Index levels (no zero guide: levels are not centred on zero)
    print("\n1. HICP Index Levels Over Time")
    fig_levels = _season_trend_figure(
        long_data, 'value_filled',
        "EU HICP Package Holidays Index Levels", "Index Value"
    )
    fig_levels.show()

    # 2. Month-over-month changes, with a zero reference line
    print("\n2. Month-over-Month Percentage Changes")
    fig_mom = _season_trend_figure(
        long_data, 'mom_pct_change',
        "HICP Package Holidays - Month-over-Month Changes (%)", "MoM Change (%)",
        zero_line=True
    )
    fig_mom.show()

    # 3. Year-over-year changes, with a zero reference line
    print("\n3. Year-over-Year Percentage Changes")
    fig_yoy = _season_trend_figure(
        long_data, 'yoy_pct_change',
        "HICP Package Holidays - Year-over-Year Changes (%)", "YoY Change (%)",
        zero_line=True
    )
    fig_yoy.show()

    # 4. Side-by-side view restricted to the package-holiday series that exist
    hicp_series = ['eu_package_holidays', 'germany_package_holidays']
    names_in_data = long_data['series_name'].unique().to_list()
    available_hicp = [name for name in hicp_series if name in names_in_data]

    if not available_hicp:
        print("‚ö†Ô∏è No HICP series available for comparison")
    else:
        print("\n4. EU vs Germany HICP Comparison")
        hicp_data = long_data.filter(pl.col('series_name').is_in(available_hicp))

        fig_comparison = _season_trend_figure(
            hicp_data, 'mom_pct_change',
            "EU vs Germany Package Holidays HICP - MoM Changes Comparison", "MoM Change (%)",
            zero_line=True
        )

        # Small corner note explaining the shaded bands
        season_note = dict(
            text="Summer Tourism Season Highlighted",
            xref="paper", yref="paper",
            x=0.02, y=0.98,
            showarrow=False,
            font=dict(size=10, color="orange"),
            bgcolor="rgba(255,255,255,0.8)"
        )
        fig_comparison.update_layout(annotations=[season_note])
        fig_comparison.show()


In [None]:
## Step 3: Seasonal Pattern Analysis

Now let's dive deep into seasonal patterns using heatmaps and decomposition analysis. This will help us understand the recurring patterns that are crucial for forecasting July 2025.


In [None]:
# Seasonal pattern analysis
#
# For each available package-holiday series this cell shows a seasonal heatmap
# and a decomposition, then box plots (by month and by season) for the first
# available series, and finally prints per-season and per-month MoM statistics
# with July singled out as the forecast target.
if long_data is not None and not long_data.is_empty():

    print("Analyzing seasonal patterns...")

    # Get available HICP series for analysis
    hicp_series = ['eu_package_holidays', 'germany_package_holidays']
    series_in_data = set(long_data['series_name'].unique().to_list())
    available_series = [s for s in hicp_series if s in series_in_data]

    # 1. Seasonal Heatmaps for MoM Changes
    for series_name in available_series:
        print(f"\n1. Seasonal Heatmap - {series_name}")

        fig_heatmap = viz.create_seasonal_heatmap(
            df=long_data,
            value_col='mom_pct_change',
            series_name=series_name,
            title=f"Seasonal Patterns: {series_name.replace('_', ' ').title()} - MoM Changes (%)"
        )

        # Add annotations for insights
        fig_heatmap.update_layout(
            annotations=[
                dict(
                    text="Red = Price Increases | Blue = Price Decreases<br>Look for consistent summer (Jun-Aug) patterns",
                    xref="paper", yref="paper",
                    x=0.5, y=-0.1,
                    showarrow=False,
                    font=dict(size=12),
                    xanchor="center"
                )
            ]
        )
        fig_heatmap.show()

    # 2. Seasonal Decomposition
    for series_name in available_series:
        print(f"\n2. Seasonal Decomposition - {series_name}")

        fig_decomp = viz.create_seasonal_decomposition(
            df=long_data,
            series_name=series_name,
            value_col='value_filled',
            title="Seasonal Decomposition Analysis"
        )
        fig_decomp.show()

    # 3. Monthly Box Plots for Seasonality
    print("\n3. Monthly Distribution Analysis")

    # Create monthly box plots for MoM changes
    if available_series:
        # Focus on the primary series
        primary_series = available_series[0]

        fig_monthly = viz.create_box_plot(
            df=long_data.filter(pl.col('series_name') == primary_series),
            value_col='mom_pct_change',
            group_col='month',
            title=f"Monthly MoM Changes Distribution - {primary_series.replace('_', ' ').title()}"
        )

        # Update x-axis labels to month names
        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        fig_monthly.update_layout(
            xaxis=dict(
                tickmode='array',
                tickvals=list(range(1, 13)),
                ticktext=month_names
            )
        )
        fig_monthly.show()

    # 4. Seasonal Analysis by Quarter
    print("\n4. Quarterly Seasonal Analysis")

    if available_series:
        fig_quarterly = viz.create_box_plot(
            df=long_data.filter(pl.col('series_name') == available_series[0]),
            value_col='mom_pct_change',
            group_col='season',
            title=f"Seasonal MoM Changes by Quarter - {available_series[0].replace('_', ' ').title()}"
        )
        fig_quarterly.show()

    # 5. Calculate and Display Seasonal Statistics
    print("\n5. Seasonal Statistics Summary")

    for series_name in available_series:
        series_data = long_data.filter(pl.col('series_name') == series_name)

        if series_data.is_empty():
            continue

        series_label = series_name.replace('_', ' ').title()
        # Null MoM values (e.g. the first observation) are excluded up front
        valid_changes = series_data.filter(pl.col('mom_pct_change').is_not_null())

        # Calculate seasonal statistics
        seasonal_stats = (
            valid_changes
            .group_by(['season'])
            .agg([
                pl.col('mom_pct_change').mean().alias('mean_mom_change'),
                pl.col('mom_pct_change').std().alias('std_mom_change'),
                pl.col('mom_pct_change').median().alias('median_mom_change'),
                pl.col('mom_pct_change').count().alias('observations')
            ])
            .sort('mean_mom_change', descending=True)
        )

        print(f"\n{series_label} - Seasonal Statistics:")
        print(seasonal_stats)

        # Monthly statistics
        monthly_stats = (
            valid_changes
            .group_by('month')
            .agg([
                pl.col('mom_pct_change').mean().alias('mean_mom_change'),
                pl.col('mom_pct_change').std().alias('std_mom_change')
            ])
            .sort('month')
        )

        print(f"\nMonthly Statistics for {series_label}:")
        print(monthly_stats)

        # Highlight July specifically (our forecast target)
        july_stats = monthly_stats.filter(pl.col('month') == 7)
        if not july_stats.is_empty():
            july_mean = july_stats['mean_mom_change'].item()
            july_std = july_stats['std_mom_change'].item()
            print(f"\nüéØ JULY HISTORICAL PATTERN:")
            print(f"   Mean MoM Change: {july_mean:.2f}%")
            # A group holding a single July yields a null std; formatting None
            # with ':.2f' would raise TypeError, so report it explicitly.
            if july_std is None:
                print("   Standard Deviation: n/a (only one July observation)")
            else:
                print(f"   Standard Deviation: {july_std:.2f}%")
            print(f"   This provides a baseline for July 2025 forecasting")

else:
    print("‚ùå No data available for seasonal analysis")


In [None]:
## Step 4: Cross-Country and Economic Indicator Analysis

Let's analyze the relationships between different countries and economic indicators to understand what drives package holiday price movements.


In [None]:
# Cross-country and economic indicator analysis
#
# Works on the wide table: correlation matrices, stacked line plots of HICP
# MoM changes against selected economic indicators, pairwise Pearson
# correlations, and a statistical summary of the primary HICP series.
if wide_data is not None and not wide_data.is_empty():

    print("Analyzing cross-country relationships and economic indicators...")

    # Every column that is not a calendar / flag column is treated as a variable
    calendar_columns = {'date', 'year', 'month', 'quarter', 'season', 'is_holiday_season'}
    numeric_columns = [col for col in wide_data.columns if col not in calendar_columns]

    print(f"Available variables for analysis: {len(numeric_columns)}")

    # MoM-change columns of the package-holiday series (built once, reused below)
    hicp_mom_vars = [col for col in numeric_columns if 'package_holidays' in col and '_mom_pct' in col]
    hicp_vars = hicp_mom_vars

    # 1. Correlation Matrix for all series
    if len(numeric_columns) >= 2:
        print("\n1. Comprehensive Correlation Matrix")

        # Focus on key variables (MoM changes)
        mom_columns = [col for col in numeric_columns if '_mom_pct' in col or 'mom_pct_change' in col]
        value_columns = [col for col in numeric_columns if col in ['eu_package_holidays', 'germany_package_holidays']]

        # Combine key columns for analysis
        key_columns = mom_columns + value_columns

        if len(key_columns) >= 2:
            fig_corr = viz.create_correlation_matrix(
                df=wide_data,
                variables=key_columns[:10],  # Limit to first 10 for readability
                title="Correlation Matrix: HICP Series and Economic Indicators"
            )
            fig_corr.show()

        # 2. Focus on HICP series correlations
        if len(hicp_vars) >= 2:
            print("\n2. HICP Package Holidays Cross-Country Correlation")
            fig_hicp_corr = viz.create_correlation_matrix(
                df=wide_data,
                variables=hicp_vars,
                title="Cross-Country HICP Package Holidays Correlations"
            )
            fig_hicp_corr.show()

    # 3. Time series comparison of key indicators
    print("\n3. Economic Indicators vs HICP Analysis")

    # Find economic indicators (non-HICP series) by name fragment
    indicator_fragments = ('consumer_confidence', 'eur_usd', 'oil_price', 'gdp_growth')
    econ_indicators = [
        col for col in numeric_columns
        if '_mom_pct' in col
        and (any(fragment in col for fragment in indicator_fragments) or 'travel' in col.lower())
    ]

    if hicp_mom_vars and econ_indicators:
        # Select key variables for plotting
        plot_vars = hicp_mom_vars[:2] + econ_indicators[:3]  # Limit for readability

        # One stacked panel per variable
        fig_econ = make_subplots(
            rows=len(plot_vars), cols=1,
            subplot_titles=[var.replace('_', ' ').title() for var in plot_vars],
            vertical_spacing=0.05
        )

        colors = COLORS['palette']

        for i, var in enumerate(plot_vars):
            # Get non-null data
            var_data = wide_data.select(['date', var]).filter(pl.col(var).is_not_null())

            if var_data.is_empty():
                continue

            fig_econ.add_trace(
                go.Scatter(
                    x=var_data['date'].to_list(), y=var_data[var].to_list(),
                    mode='lines',
                    name=var.replace('_', ' ').title(),
                    line=dict(color=colors[i % len(colors)]),
                    showlegend=False
                ),
                row=i+1, col=1
            )

            # Add zero line for percentage changes
            if '_mom_pct' in var or '_yoy_pct' in var:
                fig_econ.add_hline(
                    y=0, line_dash="dash", line_color="gray",
                    opacity=0.3, row=i+1, col=1
                )

        fig_econ.update_layout(
            title="Economic Indicators vs HICP Package Holidays",
            height=200 * len(plot_vars),
            width=1000
        )
        fig_econ.show()

    # 4. Statistical analysis of relationships
    print("\n4. Statistical Relationship Analysis")

    if hicp_mom_vars and econ_indicators:
        # Calculate cross-correlations
        print("Cross-correlation analysis between HICP and economic indicators:")

        for hicp_var in hicp_mom_vars[:2]:  # Focus on main HICP series
            print(f"\n{hicp_var.replace('_', ' ').title()}:")

            hicp_data = wide_data.select(['date', hicp_var]).filter(pl.col(hicp_var).is_not_null())

            for econ_var in econ_indicators[:3]:  # Limit for readability
                econ_data = wide_data.select(['date', econ_var]).filter(pl.col(econ_var).is_not_null())

                # Keep only the dates on which both series are observed
                merged_data = hicp_data.join(econ_data, on='date', how='inner')

                # Require more than 10 paired points before quoting a correlation
                if len(merged_data) <= 10:
                    continue

                correlation = np.corrcoef(
                    merged_data[hicp_var].to_numpy(),
                    merged_data[econ_var].to_numpy()
                )[0, 1]

                econ_label = econ_var.replace('_', ' ').title()

                # np.corrcoef yields NaN for constant or NaN-containing input;
                # every threshold comparison below is False for NaN, which
                # would otherwise mislabel it as "Very Weak".
                if np.isnan(correlation):
                    print(f"  vs {econ_label}: undefined (constant or non-finite input)")
                    continue

                # Interpret correlation strength
                magnitude = abs(correlation)
                if magnitude >= 0.7:
                    strength = "Strong"
                elif magnitude >= 0.4:
                    strength = "Moderate"
                elif magnitude >= 0.2:
                    strength = "Weak"
                else:
                    strength = "Very Weak"

                print(f"  vs {econ_label}: {correlation:.3f} ({strength})")

    # 5. Create summary dashboard for key relationships
    print("\n5. Summary Dashboard")

    # Focus on the primary HICP series if available. Prefix matching accepts
    # both '<series>_mom_pct' and '<series>_mom_pct_change' column names; the
    # previous exact match after replace('_mom_pct', '') missed the latter and
    # silently skipped the summary.
    primary_hicp = None
    for series in ['eu_package_holidays', 'germany_package_holidays']:
        if any(col.startswith(f"{series}_mom_pct") for col in hicp_mom_vars):
            primary_hicp = series
            break

    if primary_hicp and long_data is not None:
        primary_rows = long_data.filter(pl.col('series_name') == primary_hicp)

        if primary_rows.is_empty():
            print(f"No long-format rows found for {primary_hicp}; summary skipped")
        else:
            # Create summary statistics
            summary_stats = create_statistical_summary(primary_rows)

            print(f"\nSummary Statistics for {primary_hicp.replace('_', ' ').title()}:")
            print(summary_stats)

else:
    print("‚ùå No wide format data available for correlation analysis")


In [None]:
## Step 5: Holiday Period Impact Assessment

Now let's conduct statistical analysis of summer tourism effects and identify periods of unusual price movements that could inform our July 2025 forecast.


In [None]:
# Holiday period impact assessment and statistical analysis
#
# Per package-holiday series: summer vs non-summer comparison of MoM changes,
# July-only statistics with intervals, IQR-based outlier listing, and a linear
# trend over the yearly July values. Ends with a dashboard for the first
# available series.
def _sample_std(values):
    """Sample standard deviation (ddof=1); NaN when fewer than two values.

    ddof=1 matches scipy.stats.sem used below; np.std's default ddof=0 would
    report a smaller spread for the same data.
    """
    return float(np.std(values, ddof=1)) if len(values) > 1 else float('nan')


if long_data is not None and not long_data.is_empty():

    print("Conducting holiday period impact assessment...")

    # Get available HICP series
    hicp_series = ['eu_package_holidays', 'germany_package_holidays']
    series_in_data = set(long_data['series_name'].unique().to_list())
    available_series = [s for s in hicp_series if s in series_in_data]

    for series_name in available_series:
        print(f"\n" + "="*60)
        print(f"HOLIDAY IMPACT ANALYSIS: {series_name.replace('_', ' ').title()}")
        print("="*60)

        series_data = long_data.filter(pl.col('series_name') == series_name)

        if series_data.is_empty():
            continue

        # 1. Summer vs Non-Summer Statistical Test
        print(f"\n1. Summer vs Non-Summer Statistical Analysis")

        summer_data = series_data.filter(pl.col('is_holiday_season') == True)['mom_pct_change'].drop_nulls().to_list()
        non_summer_data = series_data.filter(pl.col('is_holiday_season') == False)['mom_pct_change'].drop_nulls().to_list()

        if summer_data and non_summer_data:
            # Calculate statistics
            summer_mean = np.mean(summer_data)
            non_summer_mean = np.mean(non_summer_data)
            summer_std = _sample_std(summer_data)
            non_summer_std = _sample_std(non_summer_data)

            print(f"Summer months (Jun-Aug):")
            print(f"  Mean MoM change: {summer_mean:.3f}%")
            print(f"  Std deviation: {summer_std:.3f}%")
            print(f"  Observations: {len(summer_data)}")

            print(f"Non-summer months:")
            print(f"  Mean MoM change: {non_summer_mean:.3f}%")
            print(f"  Std deviation: {non_summer_std:.3f}%")
            print(f"  Observations: {len(non_summer_data)}")

            # Perform t-test
            if len(summer_data) > 1 and len(non_summer_data) > 1:
                try:
                    # Welch's variant: the two groups differ in size and need
                    # not share a variance, so the pooled-variance test is
                    # not appropriate here.
                    t_stat, p_value = stats.ttest_ind(summer_data, non_summer_data, equal_var=False)

                    print(f"\nStatistical Test Results:")
                    print(f"  T-statistic: {t_stat:.3f}")
                    print(f"  P-value: {p_value:.3f}")

                    if p_value < 0.05:
                        print(f"  Result: SIGNIFICANT difference between summer and non-summer (p < 0.05)")
                        effect_size = summer_mean - non_summer_mean
                        print(f"  Summer effect: {effect_size:+.3f} percentage points")
                    else:
                        print(f"  Result: No significant difference (p >= 0.05)")

                except ValueError as e:
                    print(f"  Error in statistical test: {e}")

        # 2. July-specific analysis
        print(f"\n2. July-Specific Historical Analysis")

        july_data = series_data.filter(pl.col('month') == 7)

        if not july_data.is_empty():
            july_changes = july_data['mom_pct_change'].drop_nulls().to_list()

            if july_changes:
                n_july = len(july_changes)
                july_mean = np.mean(july_changes)
                july_std = _sample_std(july_changes)
                july_median = np.median(july_changes)
                july_min = min(july_changes)
                july_max = max(july_changes)

                print(f"July Historical Statistics ({n_july} observations):")
                print(f"  Mean: {july_mean:.3f}%")
                print(f"  Median: {july_median:.3f}%")
                print(f"  Std Dev: {july_std:.3f}%")
                print(f"  Range: {july_min:.3f}% to {july_max:.3f}%")

                # Calculate confidence intervals
                if n_july > 1:
                    # Uncertainty of the historical MEAN (scale = standard error)
                    confidence_interval = stats.t.interval(
                        0.68, n_july - 1,
                        loc=july_mean,
                        scale=stats.sem(july_changes)
                    )

                    # Uncertainty of ONE NEW July value: the spread of a single
                    # observation plus the uncertainty of the mean. The
                    # confidence interval above is narrower by about sqrt(n)
                    # and would understate the forecast range.
                    prediction_interval = stats.t.interval(
                        0.68, n_july - 1,
                        loc=july_mean,
                        scale=july_std * np.sqrt(1 + 1 / n_july)
                    )

                    print(f"  68% Confidence Interval: [{confidence_interval[0]:.3f}%, {confidence_interval[1]:.3f}%]")

                    # This gives us a baseline for July 2025 forecasting
                    print(f"\nüéØ JULY 2025 BASELINE FORECAST:")
                    print(f"   Expected range: {prediction_interval[0]:.2f}% to {prediction_interval[1]:.2f}%")
                    print(f"   (68% prediction interval for a single new July observation)")
                    print(f"   Central estimate: {july_mean:.2f}%")

        # 3. Outlier detection and analysis
        print(f"\n3. Outlier Detection and Analysis")

        mom_changes = series_data['mom_pct_change'].drop_nulls().to_list()

        if len(mom_changes) > 10:
            # Calculate IQR for outlier detection (1.5 * IQR fences)
            q1 = np.percentile(mom_changes, 25)
            q3 = np.percentile(mom_changes, 75)
            iqr = q3 - q1

            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            # Find outliers
            outlier_data = series_data.filter(
                (pl.col('mom_pct_change') < lower_bound) |
                (pl.col('mom_pct_change') > upper_bound)
            )

            print(f"Outlier thresholds: [{lower_bound:.2f}%, {upper_bound:.2f}%]")
            print(f"Number of outliers detected: {len(outlier_data)}")

            if not outlier_data.is_empty():
                print(f"\nOutlier events:")
                outlier_summary = (
                    outlier_data
                    .select(['date', 'mom_pct_change', 'season'])
                    .sort('mom_pct_change', descending=True)
                )
                print(outlier_summary.head(10))

        # 4. Trend analysis over time
        print(f"\n4. Trend Analysis Over Time")

        # Group by year to see if seasonal effects are changing
        yearly_july = (
            series_data
            .filter(pl.col('month') == 7)
            .filter(pl.col('mom_pct_change').is_not_null())
            .group_by('year')
            .agg([
                pl.col('mom_pct_change').mean().alias('july_mom_change')
            ])
            .sort('year')
        )

        # More than three yearly points are required before fitting a line
        if len(yearly_july) > 3:
            print(f"July MoM changes by year:")
            print(yearly_july)

            # Simple linear trend analysis
            years = yearly_july['year'].to_numpy()
            july_values = yearly_july['july_mom_change'].to_numpy()

            # Calculate linear trend
            slope, intercept, r_value, p_value, std_err = stats.linregress(years, july_values)

            print(f"\nJuly Trend Analysis:")
            print(f"  Linear trend slope: {slope:.4f} percentage points per year")
            print(f"  R-squared: {r_value**2:.3f}")
            print(f"  P-value: {p_value:.3f}")

            if p_value < 0.05:
                if slope > 0:
                    print(f"  Result: SIGNIFICANT increasing trend in July changes")
                else:
                    print(f"  Result: SIGNIFICANT decreasing trend in July changes")

                # Project to 2025
                projected_2025 = intercept + slope * 2025
                print(f"  Linear projection for July 2025: {projected_2025:.2f}%")
            else:
                print(f"  Result: No significant trend detected")

    # 5. Create comprehensive holiday impact visualization
    print(f"\n" + "="*60)
    print("COMPREHENSIVE HOLIDAY IMPACT VISUALIZATION")
    print("="*60)

    if available_series:
        primary_series = available_series[0]

        # Create a comprehensive dashboard
        fig_holiday = viz.create_summary_dashboard(
            df=long_data.filter(pl.col('series_name') == primary_series),
            series_focus=primary_series
        )
        fig_holiday.show()

else:
    print("‚ùå No data available for holiday impact assessment")


In [None]:
## Step 6: Key Insights and Phase 2 Summary

Let's consolidate our findings from the exploratory data analysis and prepare key insights for the next phases of modeling and forecasting.


In [None]:
# Phase 2 Summary and Key Insights
#
# Prints a dataset overview, the seasonal effect and July baseline for the
# package-holiday target series, the EU/Germany MoM correlation, and a data
# quality note. Completion messages are only printed when data was analysed.
print("üéØ PHASE 2: EXPLORATORY DATA ANALYSIS - SUMMARY")
print("="*70)

# Reset the July baseline explicitly so that re-running this cell can never
# report values left in the kernel by an earlier run.
july_historical_mean = None
july_historical_std = None
july_data_all = None
forecast_lower = None
forecast_upper = None

data_analysed = long_data is not None and not long_data.is_empty()

if data_analysed:

    # Generate final summary statistics
    print("\nüìä DATASET OVERVIEW:")
    print(f"  ‚Ä¢ Total observations: {len(long_data):,}")
    print(f"  ‚Ä¢ Date range: {long_data['date'].min()} to {long_data['date'].max()}")
    print(f"  ‚Ä¢ Series analyzed: {long_data['series_name'].n_unique()}")

    series_list = long_data['series_name'].unique().to_list()
    for series in series_list:
        count = long_data.filter(pl.col('series_name') == series).height
        print(f"    - {series}: {count:,} observations")

    hicp_series = ['eu_package_holidays', 'germany_package_holidays']
    available_hicp = [s for s in hicp_series if s in series_list]

    # Seasonal and July figures describe the forecast target, so they use the
    # first available package-holiday series only. Pooling every series in the
    # long table would mix unrelated MoM changes into the baseline. When no
    # package-holiday series exists, fall back to the full table.
    if available_hicp:
        baseline_data = long_data.filter(pl.col('series_name') == available_hicp[0])
        baseline_label = available_hicp[0].replace('_', ' ').title()
    else:
        baseline_data = long_data
        baseline_label = "all series (pooled)"

    # Key insights from analysis
    print(f"\nüîç KEY INSIGHTS DISCOVERED:")
    print(f"  (seasonal and July figures are based on: {baseline_label})")

    # Calculate overall seasonal effect
    if 'is_holiday_season' in baseline_data.columns:
        summer_overall = baseline_data.filter(pl.col('is_holiday_season') == True)['mom_pct_change'].drop_nulls()
        non_summer_overall = baseline_data.filter(pl.col('is_holiday_season') == False)['mom_pct_change'].drop_nulls()

        if not summer_overall.is_empty() and not non_summer_overall.is_empty():
            summer_mean = summer_overall.mean()
            non_summer_mean = non_summer_overall.mean()
            seasonal_effect = summer_mean - non_summer_mean

            print(f"  ‚Ä¢ Seasonal Effect: Summer months show {seasonal_effect:+.2f} percentage points difference")

            if abs(seasonal_effect) > 0.5:
                print(f"    ‚Üí STRONG seasonal pattern detected - crucial for forecasting")
            else:
                print(f"    ‚Üí Moderate seasonal pattern - important but not dominant")

    # July specific insights
    july_data_all = baseline_data.filter(pl.col('month') == 7)['mom_pct_change'].drop_nulls()
    if not july_data_all.is_empty():
        july_historical_mean = july_data_all.mean()
        # Null when only one July observation exists
        july_historical_std = july_data_all.std()

        print(f"  ‚Ä¢ July Historical Pattern:")
        print(f"    - Mean MoM change: {july_historical_mean:.2f}%")
        if july_historical_std is None:
            print("    - Standard deviation: n/a (only one July observation)")
        else:
            print(f"    - Standard deviation: {july_historical_std:.2f}%")
        print(f"    - Sample size: {len(july_data_all)} July observations")

        # Simple forecast range (mean +/- one standard deviation)
        if july_historical_std is not None:
            forecast_lower = july_historical_mean - july_historical_std
            forecast_upper = july_historical_mean + july_historical_std
            print(f"    ‚Üí July 2025 preliminary range: {forecast_lower:.2f}% to {forecast_upper:.2f}%")

    # Cross-series correlation insights
    if len(available_hicp) >= 2:
        eu_data = (
            long_data
            .filter(pl.col('series_name') == 'eu_package_holidays')
            .select(['date', pl.col('mom_pct_change').alias('eu_mom')])
            .drop_nulls()
        )
        de_data = (
            long_data
            .filter(pl.col('series_name') == 'germany_package_holidays')
            .select(['date', pl.col('mom_pct_change').alias('de_mom')])
            .drop_nulls()
        )

        if not eu_data.is_empty() and not de_data.is_empty():
            print(f"  ‚Ä¢ Cross-Country Dynamics:")
            print(f"    - EU and Germany series both available")

            # Correlation on the months where both series are observed
            paired = eu_data.join(de_data, on='date', how='inner')
            if paired.height > 2:
                eu_de_correlation = float(
                    np.corrcoef(paired['eu_mom'].to_numpy(), paired['de_mom'].to_numpy())[0, 1]
                )
                # NaN means constant or non-finite input: nothing to report
                if not np.isnan(eu_de_correlation):
                    print(f"    - MoM correlation over {paired.height} overlapping months: {eu_de_correlation:.3f}")

            print(f"    ‚Üí Enables cross-country validation of forecasts")

    # Data quality assessment (share of rows in the full long table without a MoM value)
    total_missing = long_data['mom_pct_change'].null_count()
    total_observations = len(long_data)
    missing_pct = (total_missing / total_observations) * 100

    print(f"  ‚Ä¢ Data Quality:")
    print(f"    - Missing MoM data: {missing_pct:.1f}%")

    if missing_pct < 10:
        print(f"    ‚Üí Excellent data coverage for reliable forecasting")
    elif missing_pct < 25:
        print(f"    ‚Üí Good data coverage with minor gaps")
    else:
        print(f"    ‚Üí Data gaps may require interpolation strategies")

# The hand-over and completion messages claim results, so they are only
# printed when the analysis above actually ran.
if data_analysed:
    # Prepare outputs for next phases
    print(f"\nüìã OUTPUTS FOR NEXT PHASES:")
    print(f"  ‚Ä¢ Seasonal patterns identified and quantified")
    print(f"  ‚Ä¢ July historical baseline established")
    print(f"  ‚Ä¢ Statistical relationships documented")
    print(f"  ‚Ä¢ Outlier events catalogued")
    print(f"  ‚Ä¢ Data quality validated")

    print(f"\nüöÄ READY FOR PHASE 3: SEASONAL ADJUSTMENT")
    print(f"   Next steps:")
    print(f"   1. Implement X-13ARIMA-SEATS equivalent seasonal adjustment")
    print(f"   2. Create custom seasonal factors based on discovered patterns")
    print(f"   3. Develop seasonally adjusted series for modeling")
    print(f"   4. Validate seasonal adjustment quality")

print(f"\nüìà PRELIMINARY JULY 2025 INSIGHTS:")
if july_historical_mean is not None:
    print(f"   ‚Ä¢ Historical July average: {july_historical_mean:.2f}%")
    if forecast_lower is not None and forecast_upper is not None:
        print(f"   ‚Ä¢ Expected range: {forecast_lower:.2f}% to {forecast_upper:.2f}%")
    print(f"   ‚Ä¢ Confidence: Based on {len(july_data_all)} historical observations")
else:
    print(f"   ‚Ä¢ Baseline will be established in subsequent phases")

print(f"\n" + "="*70)
if data_analysed:
    print(f"‚úÖ PHASE 2 COMPLETED SUCCESSFULLY")
else:
    print("PHASE 2 NOT COMPLETED - no data was available; run Phase 1 first")
print("="*70)
