In [None]:
import matplotlib.pyplot as plt

from sktime.benchmarking.forecasting import (
    ModelComparisonBenchmark,
    TimeSeriesSimulator,
)
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.trend import PolynomialTrendForecaster

## 1. Generating Time Series with Built-in Distributions

The simulator supports 7 built-in distributions: normal, poisson, exponential, gamma, uniform, binomial, and lognormal.

In [None]:
# Simulate 200 Poisson draws - a natural fit for count data such as daily sales
sim_poisson = TimeSeriesSimulator(
    length=200,
    distribution="poisson",
    dist_params={"lam": 25},  # rate parameter: 25 events on average
    random_state=42,
)
poisson_data = sim_poisson.simulate()

# Draw the simulated counts on an explicit Axes object
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(poisson_data)
ax.set_title("Poisson Distribution (λ=25) - Count Data")
ax.set_ylabel("Value")
plt.show()

# Sample mean and standard deviation of the simulated series
print(f"Mean: {poisson_data.mean():.2f}, Std: {poisson_data.std():.2f}")

## 2. Adding Trend and Seasonality

Real-world time series often have trends and seasonal patterns. The simulator supports linear, quadratic, and exponential trends, plus seasonal components.

In [None]:
# Normal noise around 100, plus a linear trend and a repeating seasonal pattern
sim_complex = TimeSeriesSimulator(
    length=200,
    distribution="normal",
    dist_params={"loc": 100, "scale": 10},
    trend="linear",
    trend_params={"slope": 0.5, "intercept": 0},
    seasonality=12,  # seasonal period of 12 observations
    seasonality_strength=15.0,
    random_state=42,
)
complex_data = sim_complex.simulate()

# Draw the combined series on an explicit Axes object
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(complex_data)
ax.set_title("Time Series with Linear Trend and Seasonality")
ax.set_ylabel("Value")
plt.show()

## 3. Using Custom Distribution Functions

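For specialized use cases, you can pass a custom distribution function instead of a built-in distribution name. The function receives the number of samples to draw (`size`) and a random state object (`random_state`), and returns the generated values.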

In [None]:
# User-defined sampler: Beta(2, 5) draws for bounded quantities such as
# conversion rates, expressed as percentages
def beta_distribution(size, random_state):
    """Draw ``size`` Beta(a=2, b=5) samples and rescale them to percentages.

    Parameters
    ----------
    size : int or tuple of int
        Number (or shape) of samples, forwarded to ``random_state.beta``.
    random_state : object exposing ``beta(a=..., b=..., size=...)``
        Source of randomness (presumably a NumPy ``RandomState``-like
        object supplied by the simulator).

    Returns
    -------
    The sampled values multiplied by 100.
    """
    fractions = random_state.beta(a=2, b=5, size=size)
    return fractions * 100


# Plug the user-defined sampler into the simulator in place of a built-in name
sim_custom = TimeSeriesSimulator(
    length=200,
    distribution=beta_distribution,
    seasonality=7,  # seasonal period of 7 observations
    seasonality_strength=5.0,
    random_state=42,
)
custom_data = sim_custom.simulate()

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(custom_data)
ax.set_title("Custom Beta Distribution (Conversion Rates)")
ax.set_ylabel("Conversion Rate (%)")
# Faint dashed reference lines at the 0% and 100% levels
for bound in (0, 100):
    ax.axhline(y=bound, color="r", linestyle="--", alpha=0.3)
plt.show()

## 4. Benchmarking Models on Different Distributions

Test which forecasting models perform best on specific data distributions.

In [None]:
# Poisson counts with a gentle upward linear trend serve as the benchmark series
trend_poisson_sim = TimeSeriesSimulator(
    length=200,
    distribution="poisson",
    dist_params={"lam": 30},
    trend="linear",
    trend_params={"slope": 0.1},
    random_state=42,
)
test_data = trend_poisson_sim.simulate()

# Candidate forecasters as (name, estimator) pairs: three naive strategies
# and two polynomial trend fits
candidate_models = [
    ("naive_last", NaiveForecaster(strategy="last")),
    ("naive_mean", NaiveForecaster(strategy="mean")),
    ("naive_drift", NaiveForecaster(strategy="drift")),
    ("linear_trend", PolynomialTrendForecaster(degree=1)),
    ("quadratic_trend", PolynomialTrendForecaster(degree=2)),
]

# Run every candidate on the same series with a 0.2 test fraction
benchmark = ModelComparisonBenchmark(
    models=candidate_models,
    test_size=0.2,
    verbose=True,
    random_state=42,
)
results = benchmark.run(test_data)

print("\nBenchmark Results:")
print(results)

## 5. Comparing Model Performance Across Distributions

Test the same models on different data distributions to understand their strengths and weaknesses.

In [None]:
import pandas as pd

# Each entry: (display label, simulator distribution name, distribution parameters)
distributions_to_test = [
    ("Poisson", "poisson", {"lam": 25}),
    ("Normal", "normal", {"loc": 50, "scale": 10}),
    ("Exponential", "exponential", {"scale": 10}),
]

comparison_results = []

for dist_name, dist_type, dist_params in distributions_to_test:
    # Same length and seed for every distribution, so only the distribution varies
    data = TimeSeriesSimulator(
        length=200, distribution=dist_type, dist_params=dist_params, random_state=42
    ).simulate()

    # Build a fresh benchmark (with fresh forecaster instances) per distribution.
    # A distinct name keeps the `benchmark`/`results` pair from the previous
    # section intact instead of silently overwriting it.
    dist_benchmark = ModelComparisonBenchmark(
        models=[
            ("naive_last", NaiveForecaster(strategy="last")),
            ("naive_mean", NaiveForecaster(strategy="mean")),
        ],
        test_size=0.2,
        verbose=False,
        random_state=42,
    )
    # Only the winning model is needed below, so the returned table is not kept
    dist_benchmark.run(data)

    # Winner by mean absolute error; the middle element of the tuple is unused
    best_model, _, mae = dist_benchmark.get_best_model("mae")
    # Keep MAE numeric so the column can be sorted/aggregated; format on display
    comparison_results.append(
        {"Distribution": dist_name, "Best Model": best_model, "MAE": float(mae)}
    )

comparison_df = pd.DataFrame(comparison_results)
print("\nModel Performance Across Distributions:")
# Show MAE with four decimals without converting the column to strings
print(comparison_df.to_string(float_format="{:.4f}".format))

## 6. Automatic Sizing for Seasonal Data

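When a seasonal period is set, the simulator automatically extends the series to cover at least 3 full seasonal cycles, even if a shorter `length` is requested, so that models have enough data for robust training.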

In [None]:
# Request fewer points than three seasonal cycles would require
requested_length = 20
seasonal_period = 12
min_cycles = 3  # number of full cycles the simulator is expected to guarantee

sim_auto = TimeSeriesSimulator(
    length=requested_length,
    distribution="normal",
    seasonality=seasonal_period,
    seasonality_strength=10.0,
    random_state=42,
)
auto_sized_data = sim_auto.simulate()

# Report the values actually passed to the simulator rather than hard-coded
# copies, so the printout cannot drift from the configuration above
actual_length = len(auto_sized_data)
print(f"Requested length: {requested_length}")
print(f"Seasonal period: {seasonal_period}")
print(f"Actual generated length: {actual_length}")
print(f"Number of seasonal cycles: {actual_length / seasonal_period:.1f}")

# Only claim success when the generated series really spans enough cycles
if actual_length >= min_cycles * seasonal_period:
    print(f"✓ Auto-sized to ensure at least {min_cycles} full seasonal cycles!")
else:
    print(
        f"✗ Generated series covers fewer than {min_cycles} full seasonal cycles; "
        "auto-sizing did not take effect."
    )

## Summary

The `TimeSeriesSimulator` provides a flexible framework for:
- Generating synthetic time series with specific statistical properties
- Testing model performance on different data distributions
- Creating reproducible benchmarking studies
- Understanding which forecasting models work best for different data types

Combined with `ModelComparisonBenchmark`, it enables comprehensive model evaluation across various data characteristics.