# 0 - QB - Crossvals

## Importation des modules

In [None]:
# Importation des modules
# Modules de base
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings

# Display configuration
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
np.random.seed(42)  # Seed NumPy's global RNG so the synthetic data is reproducible

# Importation des classes de validation crois√©e
import sys
sys.path.append('../')
from tsforecast.crossvals import (
    TSOutOfSampleSplit, TSInSampleSplit,
    PanelOutOfSampleSplit, PanelInSampleSplit,
    PanelOutOfSampleSplitPerEntity, PanelInSampleSplitPerEntity
)

## Génération de données synthétiques

Ce notebook illustre l'utilisation des classes de validation croisée pour les séries temporelles et les données de panel. Nous commençons par générer des données synthétiques avec différentes caractéristiques pour démontrer le comportement des différentes méthodes.

In [None]:
### 1. Génération de séries temporelles synthétiques

def generate_time_series(start_date='2020-01-01', periods=252, freq='D', trend_strength=0.02,
                        seasonal_period=30, seasonal_strength=0.5, noise_std=1.0,
                        ar_coefficient=0.7, add_outliers=False, outlier_prob=0.05):
    """
    Generate a synthetic time series as trend + seasonality + AR(1) noise.

    All randomness comes from NumPy's global RNG (``np.random``), so call
    ``np.random.seed(...)`` beforehand to obtain reproducible output.

    Args:
        start_date: Start date for the time series
        periods: Number of time periods; 0 yields an empty series
        freq: pandas frequency alias passed to ``pd.date_range``
            (e.g. 'D' for daily)
        trend_strength: Slope of the linear trend (added per time step)
        seasonal_period: Period, in time steps, of the sinusoidal pattern
        seasonal_strength: Amplitude of the sinusoidal component
        noise_std: Standard deviation of the AR(1) innovations
        ar_coefficient: Autoregressive coefficient of the AR(1) process
        add_outliers: Whether to add random outliers
        outlier_prob: Per-observation probability of being an outlier

    Returns:
        pd.Series: Series named 'value' indexed by a DatetimeIndex
    """
    # Time index and the matching integer time steps
    dates = pd.date_range(start=start_date, periods=periods, freq=freq)
    steps = np.arange(periods)

    # Deterministic components: linear trend and sinusoidal seasonality
    trend = trend_strength * steps
    seasonal = seasonal_strength * np.sin(2 * np.pi * steps / seasonal_period)

    # AR(1) process for persistence. Starting the recursion from 0.0 makes
    # the first value a pure innovation and, unlike indexing element 0
    # directly, also works when periods == 0.
    ar_process = np.zeros(periods)
    previous = 0.0
    for i in range(periods):
        previous = ar_coefficient * previous + np.random.normal(0, noise_std)
        ar_process[i] = previous

    values = trend + seasonal + ar_process

    # Outliers are additive shocks with five times the noise standard deviation
    if add_outliers:
        outlier_mask = np.random.random(periods) < outlier_prob
        outlier_values = np.random.normal(0, 5 * noise_std, size=np.sum(outlier_mask))
        values[outlier_mask] += outlier_values

    return pd.Series(values, index=dates, name='value')

# Generate several time series with different characteristics
print("üìä G√©n√©ration de s√©ries temporelles avec diff√©rentes caract√©ristiques...")

# Series 1: strong trend, weak seasonality
ts_trend = generate_time_series(
    start_date='2020-01-01', periods=200, freq='D',
    trend_strength=0.05, seasonal_strength=0.2, noise_std=0.5,
    ar_coefficient=0.8
)

# Series 2: strong seasonality, weak trend
ts_seasonal = generate_time_series(
    start_date='2020-01-01', periods=200, freq='D',
    trend_strength=0.01, seasonal_strength=1.5, seasonal_period=50,
    noise_std=0.3, ar_coefficient=0.6
)

# Series 3: very noisy series with outliers
ts_noisy = generate_time_series(
    start_date='2020-01-01', periods=200, freq='D',
    trend_strength=0.02, seasonal_strength=0.5, noise_std=2.0,
    ar_coefficient=0.3, add_outliers=True, outlier_prob=0.08
)

# Series 4: no trend component (trend_strength=0.0); seasonality is still present
ts_stationary = generate_time_series(
    start_date='2020-01-01', periods=200, freq='D',
    trend_strength=0.0, seasonal_strength=0.8, noise_std=1.0,
    ar_coefficient=0.5
)

# Report the size (and, for the first series, the date range) of what was generated
print(f"‚úÖ G√©n√©ration termin√©e:")
print(f"  - S√©rie avec trend: {len(ts_trend)} observations de {ts_trend.index[0].date()} √† {ts_trend.index[-1].date()}")
print(f"  - S√©rie saisonni√®re: {len(ts_seasonal)} observations")
print(f"  - S√©rie bruit√©e: {len(ts_noisy)} observations")
print(f"  - S√©rie stationnaire: {len(ts_stationary)} observations")

In [None]:
# Plot the four generated series on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('S√©ries temporelles synth√©tiques avec diff√©rentes caract√©ristiques', fontsize=16)

# (series, line colour, subplot title), listed in row-major grid order
panels = [
    (ts_trend, 'blue', 'S√©rie avec trend fort'),
    (ts_seasonal, 'green', 'S√©rie avec saisonnalit√© forte'),
    (ts_noisy, 'red', 'S√©rie bruit√©e avec outliers'),
    (ts_stationary, 'purple', 'S√©rie stationnaire'),
]

for position, (series, line_color, panel_title) in enumerate(panels):
    row, col = divmod(position, 2)
    ax = axes[row, col]
    ax.plot(series.index, series.values, linewidth=1.5, color=line_color)
    ax.set_title(panel_title)
    ax.set_ylabel('Valeur')
    if row == 1:
        # Only the bottom row carries the x-axis label
        ax.set_xlabel('Date')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 2. Génération de données de panel synthétiques

Les données de panel combinent plusieurs entités observées sur plusieurs périodes temporelles. Nous allons créer des datasets de panel avec différentes caractéristiques pour illustrer le comportement des classes de validation croisée.

In [None]:
def generate_panel_data(entities=None, start_date='2020-01-01', periods=100,
                        freq='D', heterogeneous_effects=True, common_trend=True,
                        entity_specific_seasonality=True, cross_sectional_correlation=0.3,
                        missing_data_prob=0.0):
    """
    Generate synthetic panel data with various characteristics.

    Each entity's series is the sum of an entity fixed effect, a common
    linear trend, a sinusoidal seasonal term, an AR(1) process and a shock
    term. All randomness comes from NumPy's global RNG (``np.random``), so
    call ``np.random.seed(...)`` beforehand to obtain reproducible output.

    Args:
        entities: Iterable of entity identifiers; defaults to ['A', 'B', 'C']
        start_date: Start date for the panel
        periods: Number of time periods per entity
        freq: pandas frequency alias passed to ``pd.date_range``
        heterogeneous_effects: Whether entities have different baseline levels
        common_trend: Whether to include a common trend across entities
        entity_specific_seasonality: Whether seasonality patterns differ by entity
        cross_sectional_correlation: Weight of the common shock, in [0, 1];
            values <= 0 give independent shocks per entity
        missing_data_prob: Probability that an observation is set to NaN

    Returns:
        pd.DataFrame: Columns 'value', 'lag_value', 'trend', 'month' on a
        MultiIndex with levels (entity, date)

    Raises:
        ValueError: If cross_sectional_correlation is greater than 1
    """
    # Local import: only needed for the stable per-entity hash below
    import zlib

    entities = ['A', 'B', 'C'] if entities is None else list(entities)
    if cross_sectional_correlation > 1:
        # sqrt(1 - correlation) would be NaN and silently poison every value
        raise ValueError(
            "cross_sectional_correlation must be <= 1, "
            f"got {cross_sectional_correlation}"
        )

    # Time index, integer time steps and the (entity, date) MultiIndex
    dates = pd.date_range(start=start_date, periods=periods, freq=freq)
    steps = np.arange(periods)
    index = pd.MultiIndex.from_product([entities, dates], names=['entity', 'date'])

    # Entity fixed effects (heterogeneity)
    if heterogeneous_effects:
        entity_effects = {entity: np.random.normal(0, 2) for entity in entities}
    else:
        entity_effects = {entity: 0 for entity in entities}

    # Common trend shared by all entities
    common_trend_values = 0.02 * steps if common_trend else np.zeros(periods)

    # Shocks: one common component plus one idiosyncratic component per entity
    use_common_shocks = cross_sectional_correlation > 0
    if use_common_shocks:
        common_shocks = np.random.normal(0, 1, periods)
        idiosyncratic_shocks = {
            entity: np.random.normal(0, 1, periods)
            for entity in entities
        }

    per_entity_values = []
    for entity in entities:
        # Built-in hash() of a str is salted per interpreter process, which
        # would make the entity-specific parameters change from run to run
        # even with a fixed NumPy seed; CRC32 of the label is stable.
        entity_key = zlib.crc32(str(entity).encode('utf-8'))

        if entity_specific_seasonality:
            seasonal_period = 20 + entity_key % 40  # 20 to 59
            seasonal_amplitude = 0.5 + (entity_key % 100) / 200  # 0.5 to 0.995
        else:
            seasonal_period = 30
            seasonal_amplitude = 0.5
        seasonal_values = seasonal_amplitude * np.sin(2 * np.pi * steps / seasonal_period)

        # Entity-specific AR(1) process; starting from 0.0 makes the first
        # value a pure innovation and also works when periods == 0
        ar_coef = 0.5 + (entity_key % 50) / 100  # 0.5 to 0.99
        ar_process = np.zeros(periods)
        previous = 0.0
        for t in range(periods):
            previous = ar_coef * previous + np.random.normal(0, 0.5)
            ar_process[t] = previous

        if use_common_shocks:
            # Mix common and idiosyncratic shocks so the result has unit variance
            correlated_shocks = (
                np.sqrt(cross_sectional_correlation) * common_shocks +
                np.sqrt(1 - cross_sectional_correlation) * idiosyncratic_shocks[entity]
            )
        else:
            correlated_shocks = np.random.normal(0, 1, periods)

        entity_values = (
            entity_effects[entity] +
            common_trend_values +
            seasonal_values +
            ar_process +
            correlated_shocks
        )

        # Randomly blank out observations
        if missing_data_prob > 0:
            missing_mask = np.random.random(periods) < missing_data_prob
            entity_values[missing_mask] = np.nan

        per_entity_values.append(entity_values)

    values = np.concatenate(per_entity_values) if per_entity_values else np.empty(0)
    panel_data = pd.DataFrame({'value': values}, index=index)

    # Explanatory variables. These are derived from the index levels rather
    # than from extra 'entity'/'date' columns: a column sharing its name with
    # an index level makes groupby('entity') raise an ambiguity ValueError.
    panel_data['lag_value'] = panel_data.groupby(level='entity')['value'].shift(1)
    panel_data['trend'] = np.tile(steps, len(entities))
    panel_data['month'] = np.asarray(index.get_level_values('date').month)

    return panel_data[['value', 'lag_value', 'trend', 'month']]

# Generate several panel datasets with different characteristics
print("üìä G√©n√©ration de donn√©es de panel avec diff√©rentes caract√©ristiques...")

# Panel 1: balanced data with heterogeneous entity effects
entities_small = ['AAPL', 'GOOGL', 'MSFT', 'AMZN']
panel_balanced = generate_panel_data(
    entities=entities_small,
    start_date='2020-01-01',
    periods=120,
    freq='D',
    heterogeneous_effects=True,
    common_trend=True,
    entity_specific_seasonality=True,
    cross_sectional_correlation=0.4
)

# Panel 2: large panel with many entities
entities_large = [f'Entity_{i:02d}' for i in range(1, 21)]  # 20 entities
panel_large = generate_panel_data(
    entities=entities_large,
    start_date='2020-01-01',
    periods=100,
    freq='D',
    heterogeneous_effects=True,
    common_trend=True,
    entity_specific_seasonality=False,  # Same seasonality for every entity
    cross_sectional_correlation=0.6
)

# Panel 3: data with missing observations
panel_missing = generate_panel_data(
    entities=['Entity_A', 'Entity_B', 'Entity_C'],
    start_date='2020-01-01',
    periods=80,
    freq='D',
    heterogeneous_effects=True,
    common_trend=False,
    entity_specific_seasonality=True,
    cross_sectional_correlation=0.2,
    missing_data_prob=0.05  # 5% of observations set to NaN
)

# Report row counts; for the third panel also the number of non-NaN values
print(f"‚úÖ G√©n√©ration de panels termin√©e:")
print(f"  - Panel √©quilibr√©: {panel_balanced.shape[0]} observations, {len(entities_small)} entit√©s")
print(f"  - Grand panel: {panel_large.shape[0]} observations, {len(entities_large)} entit√©s")
print(f"  - Panel avec donn√©es manquantes: {panel_missing.shape[0]} observations, {panel_missing['value'].notna().sum()} valides")

# Show the first rows of the balanced panel and a few rows whose value is NaN
print(f"\nüìã Aper√ßu des donn√©es:")
print(f"\nPanel √©quilibr√© (premi√®res 10 observations):")
print(panel_balanced.head(10))
print(f"\nPanel avec donn√©es manquantes (aper√ßu):")
print(panel_missing[panel_missing['value'].isna()].head())

In [None]:
# Visualise the synthetic panel datasets on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Balanced panel: one line per entity, for a sample of entities
entities_to_plot = entities_small[:3]
for entity in entities_to_plot:
    entity_data = panel_balanced.xs(entity, level='entity')
    axes[0, 0].plot(entity_data.index, entity_data['value'],
                   label=entity, linewidth=1.5, alpha=0.8)
axes[0, 0].set_title('Panel √©quilibr√© (√©chantillon d\'entit√©s)')
axes[0, 0].set_ylabel('Valeur')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Distribution of values per entity (balanced panel)
panel_balanced.reset_index().boxplot(column='value', by='entity', ax=axes[0, 1])
axes[0, 1].set_title('Distribution par entit√© (panel √©quilibr√©)')
axes[0, 1].set_ylabel('Valeur')
axes[0, 1].set_xlabel('Entit√©')

# DataFrame.boxplot(by=...) replaces the figure-level title with its own
# "Boxplot grouped by ..." text, so the figure title is set after that call
fig.suptitle('Donn√©es de panel synth√©tiques', fontsize=16)

# Large panel: cross-sectional mean and standard deviation per date
large_panel_means = panel_large.groupby('date')['value'].agg(['mean', 'std']).reset_index()
axes[1, 0].plot(large_panel_means['date'], large_panel_means['mean'],
               color='blue', linewidth=1.5, label='Moyenne')
axes[1, 0].fill_between(large_panel_means['date'],
                       large_panel_means['mean'] - large_panel_means['std'],
                       large_panel_means['mean'] + large_panel_means['std'],
                       alpha=0.3, color='blue', label='¬±1 √©cart-type')
axes[1, 0].set_title('Grand panel (20 entit√©s) - Statistiques agr√©g√©es')
axes[1, 0].set_ylabel('Valeur')
axes[1, 0].set_xlabel('Date')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Panel with missing data: percentage of non-NaN values per entity
missing_stats = panel_missing.groupby('entity')['value'].apply(
    lambda x: x.notna().sum() / len(x) * 100
).reset_index()
missing_stats.columns = ['entity', 'completeness_pct']
bars = axes[1, 1].bar(missing_stats['entity'], missing_stats['completeness_pct'])
axes[1, 1].set_title('Compl√©tude des donn√©es par entit√© (%)')
axes[1, 1].set_ylabel('Pourcentage de donn√©es valides')
axes[1, 1].set_xlabel('Entit√©')
axes[1, 1].set_ylim(0, 100)
# Annotate each bar with its completeness percentage
for bar, value in zip(bars, missing_stats['completeness_pct']):
    axes[1, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                   f'{value:.1f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Démonstration des classes de validation croisée pour séries temporelles

Les classes `TSOutOfSampleSplit` et `TSInSampleSplit` sont spécialement conçues pour les séries temporelles. Elles respectent l'ordre temporel et permettent différentes configurations selon les besoins d'évaluation.

### 3.1 TSOutOfSampleSplit - Validation hors échantillon

La validation **out-of-sample** (hors échantillon) est la méthode standard pour évaluer les modèles de prévision. L'entraînement se fait **strictement sur le passé** et le test sur le **futur**, respectant ainsi l'ordre temporel naturel.

In [None]:
def analyze_split_characteristics(X, train_indices, test_indices, split_name):
    """
    Print and return summary statistics for one train/test split.

    Args:
        X: Object whose ``index`` holds the timestamps of the observations
            (elements must support ``.date()`` and subtraction)
        train_indices: Integer positions of the training observations
        test_indices: Integer positions of the test observations
        split_name: Label printed as the heading of the report

    Returns:
        dict: 'train_size', 'test_size', 'train_start', 'train_end',
        'test_start', 'test_end' and 'gap_days'. The date entries of an
        empty side, and 'gap_days' when either side is empty, are None.
        'gap_days' is the calendar-day difference between the first test
        timestamp and the last train timestamp, so adjacent daily
        observations give 1, not 0.
    """
    train_dates = X.index[train_indices]
    test_dates = X.index[test_indices]
    has_train = len(train_dates) > 0
    has_test = len(test_dates) > 0

    # First/last timestamps, or None for an empty side (a splitter can yield
    # an empty train set, e.g. when the test window starts at the beginning)
    train_start = train_dates[0] if has_train else None
    train_end = train_dates[-1] if has_train else None
    test_start = test_dates[0] if has_test else None
    test_end = test_dates[-1] if has_test else None

    train_period = f"{train_start.date()} √† {train_end.date()}" if has_train else "n/a"
    test_period = f"{test_start.date()} √† {test_end.date()}" if has_test else "n/a"

    print(f"\nüìä {split_name}:")
    print(f"  - Taille d'entra√Ænement: {len(train_indices)} observations")
    print(f"  - Taille de test: {len(test_indices)} observations")
    print(f"  - P√©riode d'entra√Ænement: {train_period}")
    print(f"  - P√©riode de test: {test_period}")

    # Temporal-order check: distance from the end of train to the start of test
    if has_train and has_test:
        gap_days = (test_start - train_end).days
        print(f"  - Gap entre train et test: {gap_days} jours")
    else:
        gap_days = None
        print("  - Gap entre train et test: n/a")

    return {
        'train_size': len(train_indices),
        'test_size': len(test_indices),
        'train_start': train_start,
        'train_end': train_end,
        'test_start': test_start,
        'test_end': test_end,
        'gap_days': gap_days
    }

# Demonstration banner
print("üîç D√âMONSTRATION: TSOutOfSampleSplit avec diff√©rents param√®tres")
print("="*70)

# The trend series, as a one-column DataFrame, is used for all demonstrations
X = ts_trend.to_frame('value')
print(f"S√©rie utilis√©e: {len(X)} observations de {X.index[0].date()} √† {X.index[-1].date()}")

# Configuration 1: basic split driven by n_splits
print(f"\n{'='*50}")
print("CONFIGURATION 1: Split basique avec n_splits")
print(f"{'='*50}")

splitter1 = TSOutOfSampleSplit(n_splits=3, test_size=20)
splits_info = []

# Keep the per-split summaries of this first configuration only
for i, (train_idx, test_idx) in enumerate(splitter1.split(X)):
    split_info = analyze_split_characteristics(X, train_idx, test_idx, f"Split {i+1}")
    splits_info.append(split_info)

# Configuration 2: with a gap to avoid data leakage
print(f"\n{'='*50}")
print("CONFIGURATION 2: Avec gap pour √©viter le data leakage")
print(f"{'='*50}")

# NOTE(review): the message below describes gap=5 as five days between train
# and test; the exact semantics live in tsforecast.crossvals -- confirm there
splitter2 = TSOutOfSampleSplit(n_splits=3, test_size=15, gap=5)
print("‚ö†Ô∏è  Gap = 5 jours entre l'entra√Ænement et le test")

for i, (train_idx, test_idx) in enumerate(splitter2.split(X)):
    analyze_split_characteristics(X, train_idx, test_idx, f"Split avec gap {i+1}")

# Configuration 3: limited training window (rolling window)
print(f"\n{'='*50}")
print("CONFIGURATION 3: Fen√™tre d'entra√Ænement limit√©e (rolling window)")
print(f"{'='*50}")

splitter3 = TSOutOfSampleSplit(n_splits=3, test_size=15, max_train_size=50, gap=2)
print("üìè max_train_size = 50 observations (fen√™tre glissante)")

for i, (train_idx, test_idx) in enumerate(splitter3.split(X)):
    analyze_split_characteristics(X, train_idx, test_idx, f"Rolling window {i+1}")

# Configuration 4: test windows anchored on specific dates
print(f"\n{'='*50}")
print("CONFIGURATION 4: Test sur des dates sp√©cifiques")
print(f"{'='*50}")

# Dates are passed as strings; presumably the splitter resolves them against
# X.index -- TODO confirm in tsforecast.crossvals
specific_test_dates = ['2020-03-01', '2020-04-15', '2020-06-01']
splitter4 = TSOutOfSampleSplit(test_indices=specific_test_dates, test_size=10, gap=3)
print(f"üéØ Dates de test sp√©cifiques: {specific_test_dates}")

for i, (train_idx, test_idx) in enumerate(splitter4.split(X)):
    analyze_split_characteristics(X, train_idx, test_idx, f"Test sp√©cifique {i+1}")

In [None]:
# Visualise the four TSOutOfSampleSplit configurations on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Visualisation des diff√©rentes configurations TSOutOfSampleSplit', fontsize=16)

# (splitter, subplot title, target axes) for each configuration
configurations = [
    (splitter1, "Split basique (n_splits=3)", axes[0, 0]),
    (splitter2, "Avec gap=5", axes[0, 1]), 
    (splitter3, "Fen√™tre limit√©e (max_train_size=50)", axes[1, 0]),
    (splitter4, "Dates sp√©cifiques", axes[1, 1])
]

# One colour per split, looked up by split number: a splitter yielding more
# than five splits would raise IndexError here
colors = ['blue', 'red', 'green', 'orange', 'purple']

for splitter, title, ax in configurations:
    # Full series as a light grey background
    ax.plot(X.index, X['value'], color='lightgray', alpha=0.5, linewidth=1, label='Donn√©es compl√®tes')
    
    # Overlay every split produced by this splitter
    for i, (train_idx, test_idx) in enumerate(splitter.split(X)):
        train_data = X.iloc[train_idx]
        test_data = X.iloc[test_idx]
        
        # Training data as a line (only the first three splits get a legend label)
        ax.plot(train_data.index, train_data['value'], 
               color=colors[i], alpha=0.7, linewidth=2, 
               label=f'Train {i+1}' if i < 3 else None)
        
        # Test data as markers in the same colour
        ax.scatter(test_data.index, test_data['value'], 
                  color=colors[i], s=30, alpha=0.9, marker='o',
                  edgecolors='black', linewidth=0.5,
                  label=f'Test {i+1}' if i < 3 else None)
    
    ax.set_title(title)
    ax.set_ylabel('Valeur')
    ax.grid(True, alpha=0.3)
    # Only the bottom row carries the x-axis label
    if ax in [axes[1, 0], axes[1, 1]]:
        ax.set_xlabel('Date')
    
    # Legend on the first subplot only
    if ax == axes[0, 0]:
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Summary of the key characteristics
print("\n" + "="*70)
print("üìã R√âSUM√â DES CARACT√âRISTIQUES IMPORTANTES")
print("="*70)
print("\n‚úÖ Points cl√©s √† retenir:")
print("  1. Out-of-sample: l'entra√Ænement pr√©c√®de TOUJOURS le test temporellement")
print("  2. Gap: permet d'√©viter le data leakage en laissant un intervalle")
print("  3. max_train_size: limite la fen√™tre d'entra√Ænement (rolling window)")
print("  4. test_indices: permet de tester sur des p√©riodes sp√©cifiques")
print("  5. Les splits respectent l'ordre chronologique des donn√©es")

### 3.2 TSInSampleSplit - Validation dans l'échantillon

La validation **in-sample** (dans l'échantillon) inclut la période de test dans les données d'entraînement. Cette approche est utile pour l'évaluation historique et la calibration de modèles, où l'information future est disponible.

In [None]:
# Banner announcing the TSInSampleSplit demonstration
print("üîç D√âMONSTRATION: TSInSampleSplit - Validation dans l'√©chantillon")
print("="*70)

def analyze_insample_split(X, train_indices, test_indices, split_name):
    """
    Print a summary of one in-sample split and check test-in-train inclusion.

    Args:
        X: Object whose ``index`` holds the timestamps of the observations
            (elements must support ``.date()``)
        train_indices: Integer positions of the training observations
        test_indices: Integer positions of the test observations
        split_name: Label printed as the heading of the report

    Returns:
        bool: True when every test position also appears in train_indices
        (vacuously True for an empty test set)
    """
    train_dates = X.index[train_indices]
    test_dates = X.index[test_indices]

    # Date ranges, with a placeholder when a side of the split is empty
    if len(train_dates) > 0:
        train_period = f"{train_dates[0].date()} √† {train_dates[-1].date()}"
    else:
        train_period = "n/a"
    if len(test_dates) > 0:
        test_period = f"{test_dates[0].date()} √† {test_dates[-1].date()}"
    else:
        test_period = "n/a"

    print(f"\nüìä {split_name}:")
    print(f"  - Taille d'entra√Ænement: {len(train_indices)} observations")
    print(f"  - Taille de test: {len(test_indices)} observations")
    print(f"  - P√©riode d'entra√Ænement: {train_period}")
    print(f"  - P√©riode de test: {test_period}")

    # Check that the test set is contained in the training set. Building the
    # set once makes each lookup O(1) instead of scanning train_indices for
    # every test position.
    train_positions = set(train_indices)
    test_in_train = all(idx in train_positions for idx in test_indices)
    print(f"  - Test inclus dans train: {'‚úÖ Oui' if test_in_train else '‚ùå Non'}")

    return test_in_train

# Configuration 1: basic in-sample split
print(f"\n{'='*50}")
print("CONFIGURATION 1: In-sample basique")
print(f"{'='*50}")

insample_splitter1 = TSInSampleSplit(test_size=20)
print("üìñ Les donn√©es de test sont incluses dans l'entra√Ænement")

for i, (train_idx, test_idx) in enumerate(insample_splitter1.split(X)):
    analyze_insample_split(X, train_idx, test_idx, f"In-sample split {i+1}")

# Configuration 2: in-sample with a limited training window
print(f"\n{'='*50}")
print("CONFIGURATION 2: In-sample avec max_train_size")
print(f"{'='*50}")

# NOTE(review): the message below states the limited window still includes
# the test period; that is TSInSampleSplit behaviour not visible here -- the
# "Test inclus dans train" line printed for each split verifies it
insample_splitter2 = TSInSampleSplit(test_size=15, max_train_size=80)
print("üìè Entra√Ænement limit√© mais inclut toujours la p√©riode de test")

for i, (train_idx, test_idx) in enumerate(insample_splitter2.split(X)):
    analyze_insample_split(X, train_idx, test_idx, f"Limited in-sample {i+1}")

# Configuration 3: in-sample on a specific date
print(f"\n{'='*50}")
print("CONFIGURATION 3: In-sample sur dates sp√©cifiques")
print(f"{'='*50}")

specific_dates = ['2020-04-01']
insample_splitter3 = TSInSampleSplit(test_indices=specific_dates, test_size=14)
print(f"üéØ Test sur p√©riode sp√©cifique: {specific_dates[0]} (14 jours)")

for i, (train_idx, test_idx) in enumerate(insample_splitter3.split(X)):
    analyze_insample_split(X, train_idx, test_idx, f"Specific in-sample {i+1}")

# Configuration 4: several specific dates
print(f"\n{'='*50}")
print("CONFIGURATION 4: Multiples dates sp√©cifiques")
print(f"{'='*50}")

multiple_dates = ['2020-02-15', '2020-03-15', '2020-05-01']
insample_splitter4 = TSInSampleSplit(test_indices=multiple_dates, test_size=7)
print(f"üéØ Tests sur: {multiple_dates}")

for i, (train_idx, test_idx) in enumerate(insample_splitter4.split(X)):
    analyze_insample_split(X, train_idx, test_idx, f"Multiple dates {i+1}")

In [None]:
# Side-by-side visualisation: out-of-sample vs in-sample
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Comparaison Out-of-sample vs In-sample', fontsize=16)

# Same test date and test size for both splitters so the plots are comparable
test_date_comparison = '2020-04-01'
test_size_comparison = 10

# One splitter of each kind (the out-of-sample one additionally uses gap=2)
out_splitter = TSOutOfSampleSplit(test_indices=[test_date_comparison], test_size=test_size_comparison, gap=2)
in_splitter = TSInSampleSplit(test_indices=[test_date_comparison], test_size=test_size_comparison)

# Out-of-sample plot (top-left). The background series is re-plotted for
# every split, so a splitter yielding several splits would duplicate legend
# entries; a single test date is passed here.
for train_idx, test_idx in out_splitter.split(X):
    train_data = X.iloc[train_idx]
    test_data = X.iloc[test_idx]
    
    axes[0, 0].plot(X.index, X['value'], color='lightgray', alpha=0.5, linewidth=1, label='Donn√©es compl√®tes')
    axes[0, 0].plot(train_data.index, train_data['value'], color='blue', alpha=0.8, linewidth=2, label='Train')
    axes[0, 0].scatter(test_data.index, test_data['value'], color='red', s=40, alpha=0.9, 
                      edgecolors='black', linewidth=0.5, label='Test')
    
    # Highlight the span between the last train date and the first test date
    if len(train_data) > 0 and len(test_data) > 0:
        gap_start = train_data.index[-1]
        gap_end = test_data.index[0]
        axes[0, 0].axvspan(gap_start, gap_end, alpha=0.3, color='yellow', label='Gap')

axes[0, 0].set_title('Out-of-sample: Train ‚Üí Gap ‚Üí Test')
axes[0, 0].set_ylabel('Valeur')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# In-sample plot (top-right)
for train_idx, test_idx in in_splitter.split(X):
    train_data = X.iloc[train_idx]
    test_data = X.iloc[test_idx]
    
    axes[0, 1].plot(X.index, X['value'], color='lightgray', alpha=0.5, linewidth=1, label='Donn√©es compl√®tes')
    axes[0, 1].plot(train_data.index, train_data['value'], color='blue', alpha=0.8, linewidth=2, label='Train')
    axes[0, 1].scatter(test_data.index, test_data['value'], color='red', s=40, alpha=0.9, 
                      edgecolors='black', linewidth=0.5, label='Test')
    
    # Highlight the positions that belong to both train and test
    overlap_indices = np.intersect1d(train_idx, test_idx)
    if len(overlap_indices) > 0:
        overlap_data = X.iloc[overlap_indices]
        axes[0, 1].scatter(overlap_data.index, overlap_data['value'], color='purple', s=60, 
                          alpha=0.7, marker='s', edgecolors='black', linewidth=1, label='Overlap (Test dans Train)')

axes[0, 1].set_title('In-sample: Test inclus dans Train')
axes[0, 1].set_ylabel('Valeur')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Distribution plot of simulated errors
print("\nüßÆ Simulation d'√©valuation avec diff√©rents mod√®les...")

# The errors below are drawn directly from fixed normal distributions; no
# model is fitted, so the figures illustrate the expected pattern only.
# Re-seeding here also resets NumPy's global RNG for any later cell.
np.random.seed(42)
n_simulations = 1000

# Out-of-sample errors: wider spread (std 1.5)
out_sample_errors = np.random.normal(0, 1.5, n_simulations)  # More uncertainty

# In-sample errors: narrower spread (std 0.8)
in_sample_errors = np.random.normal(0, 0.8, n_simulations)   # Less uncertainty

# Overlaid density histograms (bottom-left)
axes[1, 0].hist(out_sample_errors, bins=50, alpha=0.7, color='red', label='Out-of-sample', density=True)
axes[1, 0].hist(in_sample_errors, bins=50, alpha=0.7, color='blue', label='In-sample', density=True)
axes[1, 0].set_title('Distribution des erreurs de pr√©diction')
axes[1, 0].set_xlabel('Erreur')
axes[1, 0].set_ylabel('Densit√©')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Comparative statistics: mean absolute error and mean squared error
out_mae = np.mean(np.abs(out_sample_errors))
in_mae = np.mean(np.abs(in_sample_errors))
out_mse = np.mean(out_sample_errors**2)
in_mse = np.mean(in_sample_errors**2)

metrics = ['MAE', 'MSE']
out_values = [out_mae, out_mse]
in_values = [in_mae, in_mse]

# Grouped bar chart (bottom-right): one pair of bars per metric
x = np.arange(len(metrics))
width = 0.35

bars1 = axes[1, 1].bar(x - width/2, out_values, width, label='Out-of-sample', color='red', alpha=0.7)
bars2 = axes[1, 1].bar(x + width/2, in_values, width, label='In-sample', color='blue', alpha=0.7)

axes[1, 1].set_title('M√©triques d\'erreur comparatives')
axes[1, 1].set_ylabel('Valeur de l\'erreur')
axes[1, 1].set_xticks(x)
axes[1, 1].set_xticklabels(metrics)
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

# Annotate each bar with its value, placed 1% of the bar height above the bar
for bar in bars1:
    height = bar.get_height()
    axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                   f'{height:.3f}', ha='center', va='bottom')
for bar in bars2:
    height = bar.get_height()
    axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                   f'{height:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Textual summary; the percentage is the out-of-sample MAE relative to the in-sample MAE
print("\n" + "="*70)
print("üìã COMPARAISON OUT-OF-SAMPLE vs IN-SAMPLE")
print("="*70)
print("\nüìä R√©sultats simul√©s:")
print(f"  Out-of-sample MAE: {out_mae:.3f}")
print(f"  In-sample MAE: {in_mae:.3f}")
print(f"  Diff√©rence: {((out_mae - in_mae) / in_mae * 100):+.1f}%")
print("\n‚úÖ Points cl√©s:")
print("  1. Out-of-sample: √©valuation r√©aliste de la capacit√© pr√©dictive")
print("  2. In-sample: √©valuation optimiste, utile pour la calibration")
print("  3. L'√©cart refl√®te le challenge r√©el de la pr√©diction")
print("  4. In-sample inclut information future ‚Üí erreurs plus faibles")

## Démonstration des classes de validation croisée pour données de panel

Les données de panel combinent plusieurs entités observées dans le temps. Les classes `PanelOutOfSampleSplit` et `PanelInSampleSplit` gèrent cette complexité en appliquant la logique de validation à chaque entité tout en permettant l'agrégation des résultats.

### 4.1 PanelOutOfSampleSplit - Validation hors échantillon pour données de panel

Cette classe applique la logique out-of-sample à chaque entité du panel, respectant l'ordre temporel au sein de chaque entité.

In [None]:
def analyze_panel_split(X, train_indices, test_indices, split_name, max_entities_display=5):
    """Print a summary of one train/test split of a panel dataset.

    Args:
        X: DataFrame whose index has an 'entity' level and a 'date' level
            (the date level must hold timestamps, since ``.date()`` is called
            on its values).
        train_indices: Positional row indices of the training observations.
        test_indices: Positional row indices of the test observations.
        split_name: Label printed at the top of the summary.
        max_entities_display: Maximum number of entity names listed; the
            per-entity detail is printed only when the test set contains at
            most this many entities.

    Returns:
        None. The summary is written to standard output.
    """
    # Slice once; only the index is needed for the whole analysis.
    train_index = X.iloc[train_indices].index
    test_index = X.iloc[test_indices].index

    train_entity_level = train_index.get_level_values('entity')
    test_entity_level = test_index.get_level_values('entity')
    train_entities = train_entity_level.unique()
    test_entities = test_entity_level.unique()

    print(f"\nüìä {split_name}:")
    print(f"  - Observations d'entra√Ænement: {len(train_indices)}")
    print(f"  - Observations de test: {len(test_indices)}")
    print(f"  - Entit√©s dans train: {len(train_entities)} {list(train_entities[:max_entities_display])}")
    print(f"  - Entit√©s dans test: {len(test_entities)} {list(test_entities[:max_entities_display])}")

    # Per-entity detail is only shown for small panels
    if len(test_entities) > max_entities_display:
        return

    train_date_level = train_index.get_level_values('date')
    test_date_level = test_index.get_level_values('date')

    for entity in test_entities:
        # Boolean masks (rather than .xs) so that an entity missing from the
        # training set yields an empty selection instead of a KeyError.
        train_dates = train_date_level[train_entity_level == entity]
        test_dates = test_date_level[test_entity_level == entity]

        if len(train_dates) == 0 or len(test_dates) == 0:
            continue

        # Days between the last training row and the first test row
        gap_days = (test_dates[0] - train_dates[-1]).days
        print(f"    {entity}: Train {train_dates[0].date()}‚Üí{train_dates[-1].date()}, Test {test_dates[0].date()}‚Üí{test_dates[-1].date()}, Gap {gap_days}j")

print("üîç D√âMONSTRATION: PanelOutOfSampleSplit")
print("="*70)

# Utilisation du panel √©quilibr√© pour les d√©monstrations
X_panel = panel_balanced[['value']]
print(f"Panel utilis√©: {X_panel.shape[0]} observations, {len(X_panel.index.get_level_values('entity').unique())} entit√©s")
print(f"P√©riode: {X_panel.index.get_level_values('date').min().date()} √† {X_panel.index.get_level_values('date').max().date()}")

# Configuration 1: Split basique avec n_splits
print(f"\n{'='*50}")
print("CONFIGURATION 1: Panel out-of-sample basique")
print(f"{'='*50}")

panel_splitter1 = PanelOutOfSampleSplit(n_splits=3, test_size=10)
print("üìä Validation crois√©e avec 3 splits, 10 observations de test par entit√©")

for i, (train_idx, test_idx) in enumerate(panel_splitter1.split(X_panel)):
    analyze_panel_split(X_panel, train_idx, test_idx, f"Panel split {i+1}")
    if i >= 2:  # Limiter l'affichage
        break

# Configuration 2: Avec gap
print(f"\n{'='*50}")
print("CONFIGURATION 2: Panel avec gap")
print(f"{'='*50}")

panel_splitter2 = PanelOutOfSampleSplit(n_splits=2, test_size=8, gap=5)
print("‚ö†Ô∏è  Gap de 5 jours entre train et test pour chaque entit√©")

for i, (train_idx, test_idx) in enumerate(panel_splitter2.split(X_panel)):
    analyze_panel_split(X_panel, train_idx, test_idx, f"Panel avec gap {i+1}")

# Configuration 3: Test sur dates sp√©cifiques
print(f"\n{'='*50}")
print("CONFIGURATION 3: Test sur dates sp√©cifiques (panel)")
print(f"{'='*50}")

specific_panel_dates = ['2020-03-01', '2020-04-15']
panel_splitter3 = PanelOutOfSampleSplit(test_indices=specific_panel_dates, test_size=7, gap=2)
print(f"üéØ Tests sur: {specific_panel_dates} pour toutes les entit√©s")

for i, (train_idx, test_idx) in enumerate(panel_splitter3.split(X_panel)):
    analyze_panel_split(X_panel, train_idx, test_idx, f"Dates sp√©cifiques {i+1}")

# Configuration 4: Fen√™tre d'entra√Ænement limit√©e
print(f"\n{'='*50}")
print("CONFIGURATION 4: Fen√™tre d'entra√Ænement limit√©e (panel)")
print(f"{'='*50}")

panel_splitter4 = PanelOutOfSampleSplit(n_splits=2, test_size=6, max_train_size=30, gap=1)
print("üìè max_train_size = 30 observations par entit√©")

for i, (train_idx, test_idx) in enumerate(panel_splitter4.split(X_panel)):
    analyze_panel_split(X_panel, train_idx, test_idx, f"Rolling window panel {i+1}")

In [None]:
# Visualize the four PanelOutOfSampleSplit configurations on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Visualisation des splits PanelOutOfSampleSplit', fontsize=16)

# Only the first three entities are drawn, to keep the plots readable
entities_to_plot = entities_small[:3]
colors_entities = ['blue', 'red', 'green']


def _entity_rows(frame, entity):
    """Return the rows of ``frame`` belonging to ``entity``.

    An empty frame is returned when the entity is absent, instead of letting
    the KeyError raised by ``DataFrame.xs`` propagate.
    """
    try:
        return frame.xs(entity, level='entity', drop_level=False)
    except KeyError:
        return frame.iloc[0:0]


def _draw_panel_splits(ax, splitter, X, entities, colors, max_splits=1,
                       with_labels=False, shade_gap=False, test_markers=None):
    """Draw the first ``max_splits`` splits of ``splitter`` on ``ax``.

    For every entity the full series is drawn faintly, the training rows as a
    thick line and the test rows as outlined markers.

    Args:
        ax: Matplotlib axes to draw on.
        splitter: Object whose ``split(X)`` yields (train_idx, test_idx) pairs.
        X: Panel DataFrame with 'entity' and 'date' index levels and a
            'value' column.
        entities: Entities to draw.
        colors: One color per entity, in the same order.
        max_splits: Number of splits to draw.
        with_labels: Attach legend labels to the artists of the first split.
        shade_gap: Shade the span between the last training date and the
            first test date of each entity.
        test_markers: Optional sequence of scatter markers, one per split;
            the last one is reused if there are more splits than markers.
    """
    for split_no, (train_idx, test_idx) in enumerate(splitter.split(X)):
        if split_no >= max_splits:
            break

        # Slice once per split rather than once per entity
        train_rows = X.iloc[train_idx]
        test_rows = X.iloc[test_idx]
        show_labels = with_labels and split_no == 0

        for entity, color in zip(entities, colors):
            try:
                entity_data = X.xs(entity, level='entity')
            except KeyError:
                # Entity not present in the panel: nothing to draw
                continue

            full_kwargs = {'label': f'{entity} (complet)'} if show_labels else {}
            ax.plot(entity_data.index, entity_data['value'],
                    color=color, alpha=0.3, linewidth=1, **full_kwargs)

            entity_train_data = _entity_rows(train_rows, entity)
            train_dates = entity_train_data.index.get_level_values('date')
            if len(entity_train_data) > 0:
                train_kwargs = {'label': f'{entity} Train'} if show_labels else {}
                ax.plot(train_dates, entity_train_data['value'],
                        color=color, alpha=0.8, linewidth=2, **train_kwargs)

            entity_test_data = _entity_rows(test_rows, entity)
            if len(entity_test_data) == 0:
                continue

            test_dates = entity_test_data.index.get_level_values('date')
            test_kwargs = {'label': f'{entity} Test'} if show_labels else {}
            if test_markers:
                test_kwargs['marker'] = test_markers[min(split_no, len(test_markers) - 1)]
            ax.scatter(test_dates, entity_test_data['value'],
                       color=color, s=40, alpha=0.9,
                       edgecolors='black', linewidth=0.5, **test_kwargs)

            # Highlight the gap between the end of train and the start of test
            if shade_gap and len(entity_train_data) > 0:
                ax.axvspan(train_dates[-1], test_dates[0], alpha=0.2, color='yellow')


# Configuration 1: basic split (first split only, with legend)
ax = axes[0, 0]
_draw_panel_splits(ax, panel_splitter1, X_panel, entities_to_plot, colors_entities,
                   with_labels=True)
ax.set_title('Panel split basique')
ax.set_ylabel('Valeur')
ax.grid(True, alpha=0.3)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)

# Configuration 2: with gap (first split only, gap shaded)
ax = axes[0, 1]
_draw_panel_splits(ax, panel_splitter2, X_panel, entities_to_plot, colors_entities,
                   shade_gap=True)
ax.set_title('Panel avec gap=5')
ax.set_ylabel('Valeur')
ax.grid(True, alpha=0.3)

# Configuration 3: specific dates (two splits, squares then circles)
ax = axes[1, 0]
_draw_panel_splits(ax, panel_splitter3, X_panel, entities_to_plot, colors_entities,
                   max_splits=2, test_markers=('s', 'o'))
ax.set_title('Dates sp√©cifiques')
ax.set_ylabel('Valeur')
ax.set_xlabel('Date')
ax.grid(True, alpha=0.3)

# Configuration 4: bounded training window (first split only)
ax = axes[1, 1]
_draw_panel_splits(ax, panel_splitter4, X_panel, entities_to_plot, colors_entities)
ax.set_title('Fen√™tre limit√©e (max_train_size=30)')
ax.set_ylabel('Valeur')
ax.set_xlabel('Date')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 4.2 Classes spécialisées pour le traitement par entité

Les classes `PanelOutOfSampleSplitPerEntity` et `PanelInSampleSplitPerEntity` permettent de traiter chaque entité séparément, ce qui est utile pour l'analyse individuelle et le traitement parallèle.

In [None]:
print("üîç D√âMONSTRATION: Classes sp√©cialis√©es par entit√©")
print("="*70)

# Comparaison des approches PanelOutOfSampleSplit vs PanelOutOfSampleSplitPerEntity
print("\nüìä Comparaison: Agr√©g√© vs Par entit√©")
print("-" * 50)

# Splitter agr√©g√© (standard)
panel_agg_splitter = PanelOutOfSampleSplit(n_splits=2, test_size=5, gap=2)

# Splitter par entit√©
panel_per_entity_splitter = PanelOutOfSampleSplitPerEntity(n_splits=2, test_size=5, gap=2)

print("\n1Ô∏è‚É£  APPROCHE AGR√âG√âE (PanelOutOfSampleSplit):")
split_count = 0
for train_idx, test_idx in panel_agg_splitter.split(X_panel):
    split_count += 1
    entities_in_test = X_panel.iloc[test_idx].index.get_level_values('entity').unique()
    print(f"  Split {split_count}: {len(test_idx)} observations de test, {len(entities_in_test)} entit√©s")
    if split_count >= 2:
        break

print(f"\n  üìà Total: {split_count} splits avec toutes les entit√©s m√©lang√©es")

print("\n2Ô∏è‚É£  APPROCHE PAR ENTIT√â (PanelOutOfSampleSplitPerEntity):")
split_count = 0
entity_splits = {}

for train_idx, test_idx in panel_per_entity_splitter.split(X_panel):
    split_count += 1
    # Identifier l'entit√© de ce split
    test_entity = X_panel.iloc[test_idx].index.get_level_values('entity').unique()[0]
    train_entity = X_panel.iloc[train_idx].index.get_level_values('entity').unique()[0] if len(train_idx) > 0 else "N/A"
    
    if test_entity not in entity_splits:
        entity_splits[test_entity] = 0
    entity_splits[test_entity] += 1
    
    print(f"  Split {split_count}: Entit√© {test_entity}, {len(train_idx)} train, {len(test_idx)} test")
    
    if split_count >= 8:  # Limiter l'affichage
        print("  ... (splits suppl√©mentaires)")
        break

print(f"\n  üìà Total: {split_count}+ splits individuels par entit√©")
print(f"  üìä R√©partition par entit√©: {dict(entity_splits)}")

# D√©monstration avec In-sample per entity
print(f"\n{'='*50}")
print("D√âMONSTRATION: PanelInSampleSplitPerEntity")
print(f"{'='*50}")

panel_insample_per_entity = PanelInSampleSplitPerEntity(test_indices=['2020-03-01'], test_size=7)

print("üéØ Test sur 2020-03-01 avec validation in-sample par entit√©:")
entity_results = {}

for train_idx, test_idx in panel_insample_per_entity.split(X_panel):
    test_entity = X_panel.iloc[test_idx].index.get_level_values('entity').unique()[0]
    train_data = X_panel.iloc[train_idx]
    test_data = X_panel.iloc[test_idx]
    
    # V√©rification que le test est inclus dans l'entra√Ænement
    test_in_train = all(idx in train_idx for idx in test_idx)
    
    entity_results[test_entity] = {
        'train_size': len(train_idx),
        'test_size': len(test_idx),
        'test_in_train': test_in_train,
        'train_period': (train_data.index.get_level_values('date').min().date(), 
                        train_data.index.get_level_values('date').max().date()),
        'test_period': (test_data.index.get_level_values('date').min().date(),
                       test_data.index.get_level_values('date').max().date())
    }
    
    print(f"  {test_entity}: Train {entity_results[test_entity]['train_size']} obs, "
          f"Test {entity_results[test_entity]['test_size']} obs, "
          f"Test in Train: {'‚úÖ' if test_in_train else '‚ùå'}")

# Statistiques de comparaison
print(f"\n{'='*50}")
print("üìä STATISTIQUES COMPARATIVES")
print(f"{'='*50}")

def calculate_split_statistics(splitter, X, split_type_name):
    """Summarize the first (at most 10) splits produced by ``splitter``.

    Args:
        splitter: Object whose ``split(X)`` yields (train_idx, test_idx) pairs.
        X: DataFrame whose index has an 'entity' level.
        split_type_name: Label stored under the 'type' key of the result.

    Returns:
        dict with the label, the number of splits examined, the average train
        and test sizes (0 when there were no splits) and the number of
        distinct entities seen in the test sets.
    """
    max_splits = 10  # cap to keep the computation short
    train_sizes = []
    test_sizes = []
    seen_entities = set()

    for train_idx, test_idx in splitter.split(X):
        train_sizes.append(len(train_idx))
        test_sizes.append(len(test_idx))
        seen_entities.update(X.iloc[test_idx].index.get_level_values('entity').unique())

        if len(train_sizes) >= max_splits:
            break

    n_splits = len(train_sizes)
    if n_splits > 0:
        mean_train = sum(train_sizes) / n_splits
        mean_test = sum(test_sizes) / n_splits
    else:
        mean_train = 0
        mean_test = 0

    return {
        'type': split_type_name,
        'total_splits': n_splits,
        'avg_train_size': mean_train,
        'avg_test_size': mean_test,
        'unique_entities': len(seen_entities)
    }

# Compute the statistics for both approaches before printing anything
stats_agg = calculate_split_statistics(panel_agg_splitter, X_panel, "Agr√©g√©")
stats_per_entity = calculate_split_statistics(panel_per_entity_splitter, X_panel, "Par entit√©")

print(f"\nüìà R√©sultats (sur {min(10, stats_agg['total_splits'])} premiers splits):")

# Same four-line report for each approach
for heading, stats in (
    ("  Approche agr√©g√©e:", stats_agg),
    ("  Approche par entit√©:", stats_per_entity),
):
    print(heading)
    print(f"    - Splits: {stats['total_splits']}")
    print(f"    - Taille moyenne train: {stats['avg_train_size']:.1f}")
    print(f"    - Taille moyenne test: {stats['avg_test_size']:.1f}")
    print(f"    - Entit√©s uniques vues: {stats['unique_entities']}")

print("\n‚úÖ Avantages par approche:")
print("  üìä Agr√©g√©e: Moins de splits, √©valuation globale, plus rapide")
print("  üéØ Par entit√©: Analyse d√©taill√©e, traitement parall√®le possible, contr√¥le granulaire")

## Guide pratique et meilleures pratiques

Cette section présente un guide pratique pour choisir la méthode de validation croisée appropriée selon différents scénarios et objectifs d'analyse.

In [None]:
print("üìö GUIDE PRATIQUE: Choisir la bonne m√©thode de validation crois√©e")
print("="*80)

# Matrice de d√©cision
decision_matrix = {
    "Type de donn√©es": {
        "S√©rie temporelle unique": ["TSOutOfSampleSplit", "TSInSampleSplit"],
        "Panel data (multi-entit√©s)": ["PanelOutOfSampleSplit", "PanelInSampleSplit", 
                                      "PanelOutOfSampleSplitPerEntity", "PanelInSampleSplitPerEntity"]
    },
    "Objectif d'√©valuation": {
        "Pr√©diction future (production)": ["OutOfSampleSplit"],
        "Analyse historique/calibration": ["InSampleSplit"],
        "√âvaluation comparative": ["OutOfSampleSplit", "InSampleSplit"]
    },
    "Contraintes temporelles": {
        "√âviter data leakage": ["gap > 0"],
        "Fen√™tre glissante": ["max_train_size < inf"],
        "Dates sp√©cifiques": ["test_indices=[dates]"]
    },
    "Analyse par entit√©": {
        "√âvaluation globale": ["PanelOutOfSampleSplit", "PanelInSampleSplit"],
        "Analyse individuelle": ["PanelOutOfSampleSplitPerEntity", "PanelInSampleSplitPerEntity"],
        "Traitement parall√®le": ["PerEntity classes"]
    }
}\n\n# Affichage du guide de d√©cision\nfor category, options in decision_matrix.items():\n    print(f\"\\nüéØ {category.upper()}:\")\n    for scenario, methods in options.items():\n        print(f\"  ‚Ä¢ {scenario}: {', '.join(methods)}\")\n\n# Exemples de configurations recommand√©es\nprint(f\"\\n{'='*50}\")\nprint(\"CONFIGURATIONS RECOMMAND√âES\")\nprint(f\"{'='*50}\")\n\nrecommended_configs = [\n    {\n        \"scenario\": \"Pr√©diction de s√©ries temporelles financi√®res\",\n        \"config\": \"TSOutOfSampleSplit(n_splits=5, test_size=22, gap=1)\",\n        \"rationale\": \"Gap d'1 jour pour √©viter le look-ahead bias, test sur ~1 mois\"\n    },\n    {\n        \"scenario\": \"√âvaluation de mod√®les sur panel d'entreprises\",\n        \"config\": \"PanelOutOfSampleSplit(test_size=30, gap=5, max_train_size=252)\",\n        \"rationale\": \"Fen√™tre d'1 an, gap de 5 jours, test sur 1 mois\"\n    },\n    {\n        \"scenario\": \"Backtesting historique avec calibration\",\n        \"config\": \"TSInSampleSplit(test_indices=['2020-03-01'], test_size=14)\",\n        \"rationale\": \"Test sur p√©riode de crise sp√©cifique, entra√Ænement inclut le futur\"\n    },\n    {\n        \"scenario\": \"Analyse de robustesse par entit√©\",\n        \"config\": \"PanelOutOfSampleSplitPerEntity(n_splits=3, test_size=10)\",\n        \"rationale\": \"√âvaluation individuelle de chaque entit√© avec 3 p√©riodes de test\"\n    },\n    {\n        \"scenario\": \"Validation avec donn√©es haute fr√©quence\",\n        \"config\": \"TSOutOfSampleSplit(test_size=50, gap=10, max_train_size=1000)\",\n        \"rationale\": \"Gap plus important, fen√™tre limit√©e pour donn√©es intraday\"\n    }\n]\n\nfor i, config in enumerate(recommended_configs, 1):\n    print(f\"\\n{i}Ô∏è‚É£  {config['scenario']}:\")\n    print(f\"   üìù Configuration: {config['config']}\")\n    print(f\"   üí° Rationale: {config['rationale']}\")\n\n# M√©triques de performance 
simul√©es\nprint(f\"\\n{'='*50}\")\nprint(\"EXEMPLE DE PIPELINE D'√âVALUATION\")\nprint(f\"{'='*50}\")\n\ndef simulate_model_evaluation(X, splitter, model_name=\"Mod√®le simple\"):\n    \"\"\"Simule l'√©valuation d'un mod√®le avec cross-validation.\"\"\"\n    mae_scores = []\n    mse_scores = []\n    \n    split_count = 0\n    for train_idx, test_idx in splitter.split(X):\n        # Simulation d'entra√Ænement et pr√©diction\n        X_train = X.iloc[train_idx]\n        X_test = X.iloc[test_idx]\n        \n        # Pr√©diction naive (moyenne mobile) pour simulation\n        if len(X_train) > 0:\n            if hasattr(X_train.index, 'get_level_values'):\n                # Panel data - moyenne par groupe\n                pred = X_train.groupby(level='entity')['value'].mean().mean()\n            else:\n                # S√©rie temporelle - moyenne des derni√®res valeurs\n                pred = X_train['value'].tail(min(10, len(X_train))).mean()\n        else:\n            pred = 0\n        \n        # Calcul des erreurs simul√©es\n        true_values = X_test['value'].values\n        predictions = np.full_like(true_values, pred)\n        \n        # Ajout de bruit r√©aliste selon le type de validation\n        if 'OutOfSample' in splitter.__class__.__name__:\n            noise_std = 0.8  # Plus d'incertitude out-of-sample\n        else:\n            noise_std = 0.4  # Moins d'incertitude in-sample\n        \n        predictions += np.random.normal(0, noise_std, len(predictions))\n        \n        mae = np.mean(np.abs(true_values - predictions))\n        mse = np.mean((true_values - predictions)**2)\n        \n        mae_scores.append(mae)\n        mse_scores.append(mse)\n        \n        split_count += 1\n        if split_count >= 5:  # Limite pour la d√©monstration\n            break\n    \n    return {\n        'mae_mean': np.mean(mae_scores),\n        'mae_std': np.std(mae_scores),\n        'mse_mean': np.mean(mse_scores),\n        'mse_std': 
np.std(mse_scores),\n        'n_splits': len(mae_scores)\n    }\n\n# Comparaison de performance entre diff√©rentes m√©thodes\nprint(\"\\nüî¨ Simulation d'√©valuation comparative:\")\n\nsplitters_to_compare = [\n    (TSOutOfSampleSplit(n_splits=3, test_size=15, gap=2), \"TS Out-of-Sample\"),\n    (TSInSampleSplit(test_size=15), \"TS In-Sample\"),\n    (PanelOutOfSampleSplit(n_splits=2, test_size=8), \"Panel Out-of-Sample\"),\n    (PanelInSampleSplit(test_size=8), \"Panel In-Sample\")\n]\n\nresults = []\nfor splitter, name in splitters_to_compare:\n    if 'Panel' in name:\n        data = X_panel\n    else:\n        data = X\n    \n    try:\n        result = simulate_model_evaluation(data, splitter, name)\n        result['method'] = name\n        results.append(result)\n        \n        print(f\"\\n  üìä {name}:\")\n        print(f\"     MAE: {result['mae_mean']:.3f} ¬± {result['mae_std']:.3f}\")\n        print(f\"     MSE: {result['mse_mean']:.3f} ¬± {result['mse_std']:.3f}\")\n        print(f\"     Splits: {result['n_splits']}\")\n    except Exception as e:\n        print(f\"\\n  ‚ùå {name}: Erreur - {str(e)[:50]}...\")\n\nprint(f\"\\n{'='*50}\")\nprint(\"POINTS CL√âS √Ä RETENIR\")\nprint(f\"{'='*50}\")\n\nkey_points = [\n    \"Out-of-sample donne une √©valuation plus conservative et r√©aliste\",\n    \"In-sample est utile pour l'analyse historique et la calibration\",\n    \"Le gap pr√©vient le data leakage dans les donn√©es haute fr√©quence\",\n    \"max_train_size permet une validation en fen√™tre glissante\",\n    \"Les classes Panel g√®rent automatiquement la structure multi-entit√©s\",\n    \"PerEntity permet l'analyse granulaire et le traitement parall√®le\",\n    \"test_indices permet de tester sur des √©v√©nements sp√©cifiques\"\n]\n\nfor i, point in enumerate(key_points, 1):\n    print(f\"  {i}. 
{point}\")\n\nprint(f\"\\n‚úÖ Le choix de la m√©thode d√©pend de votre objectif:\")\nprint(f\"   üéØ Production: Out-of-sample avec gap appropri√©\")\nprint(f\"   üìä Recherche: In-sample pour analyse historique\")\nprint(f\"   üî¨ Robustesse: Comparaison des deux approches\")"

## Conclusion

Ce notebook a présenté de manière exhaustive les classes de validation croisée du module `tsforecast.crossvals`. Voici les points essentiels à retenir :

### Fonctionnalit√©s principales

1. **Classes pour séries temporelles** : `TSOutOfSampleSplit` et `TSInSampleSplit`
   - Respectent l'ordre chronologique
   - Gèrent les gaps pour éviter le data leakage
   - Supportent les fenêtres glissantes et les dates spécifiques

2. **Classes pour données de panel** : `PanelOutOfSampleSplit`, `PanelInSampleSplit` et leurs variantes `PerEntity`
   - Traitent les données multi-entités automatiquement
   - Appliquent la logique temporelle au sein de chaque entité
   - Permettent l'analyse granulaire par entité

3. **Flexibilité de configuration** :
   - Paramètres `n_splits`, `test_size`, `gap`, `max_train_size`
   - Support des `test_indices` pour des périodes spécifiques
   - Compatible avec l'API sklearn

### Recommandations d'usage

- **Out-of-sample** : Pour l'évaluation réaliste de modèles de production
- **In-sample** : Pour l'analyse historique et la calibration
- **Gap** : Essentiel pour les données haute fréquence
- **PerEntity** : Pour l'analyse détaillée et le traitement parallèle

Ces classes offrent une base solide pour l'évaluation rigoureuse de modèles sur données temporelles et de panel, en respectant les contraintes inhérentes à ce type de données.