## 1. Basic Usage

The `TimeSeriesSimulator` can generate time series data from various distributions.

In [None]:
import matplotlib.pyplot as plt
from sktime.benchmarking import TimeSeriesSimulator

# Generate a simple normally distributed time series (loc=50, scale=10).
# The fixed random_state makes the example reproducible.
sim = TimeSeriesSimulator(
    length=100,
    distribution="normal",
    dist_params={"loc": 50, "scale": 10},
    random_state=42,
)
y = sim.simulate()
print(f"Generated series: mean={y.mean():.2f}, std={y.std():.2f}")
y.plot(title="Normal Distribution", figsize=(10, 4))
# Render the figure explicitly, consistent with the other plotting cells,
# instead of relying on the notebook echoing the returned Axes object.
plt.show()

## 2. Different Distributions

The simulator supports multiple built-in distributions.

In [None]:
import matplotlib.pyplot as plt

# Distribution name -> parameters forwarded to the simulator via ``dist_params``
distributions = dict(
    poisson={"lam": 10},
    exponential={"scale": 5.0},
    gamma={"shape": 2.0, "scale": 3.0},
    uniform={"low": 0, "high": 20},
)

# One panel per distribution, laid out on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()

for panel, dist_name in zip(axes, distributions):
    # Same length and seed for every distribution so the panels are comparable
    sim = TimeSeriesSimulator(
        length=100,
        distribution=dist_name,
        dist_params=distributions[dist_name],
        random_state=42,
    )
    y = sim.simulate()
    panel.plot(y)
    panel.set_title(f"{dist_name.title()} Distribution")
    panel.set_xlabel("Time")

plt.tight_layout()
plt.show()

## 3. Adding Trend and Seasonality

You can add trend and seasonal components to make the data more realistic.

In [None]:
# Normal draws combined with a linear trend and a period-7 (weekly) seasonal term
weekly_trend_config = {
    "length": 365,
    "distribution": "normal",
    "dist_params": {"loc": 50, "scale": 5},
    "trend": "linear",
    "trend_params": {"slope": 0.1, "intercept": 0},
    "seasonality": 7,
    "seasonality_strength": 10.0,
    "random_state": 42,
}
sim = TimeSeriesSimulator(**weekly_trend_config)
y = sim.simulate()
y.plot(figsize=(12, 4), title="Time Series with Trend and Weekly Seasonality")
plt.show()

In [None]:
# Two seasonal components at once: period 7 (weekly) and period 30 (monthly),
# each paired with its own strength
seasonal_periods = [7, 30]
seasonal_strengths = [5.0, 10.0]

sim = TimeSeriesSimulator(
    length=365,
    distribution="poisson",
    dist_params={"lam": 20},
    trend="linear",
    trend_params={"slope": 0.05},
    seasonality=seasonal_periods,
    seasonality_strength=seasonal_strengths,
    random_state=42,
)
y = sim.simulate()
y.plot(figsize=(12, 4), title="Multiple Seasonalities (Weekly + Monthly)")
plt.show()

## 4. Custom Distributions

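You can define your own distribution function for specialized use cases. As in the examples below, the function takes `size` and `random_state` arguments and returns an array of `size` sampled values.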

In [None]:
import numpy as np


def beta_distribution(size, random_state):
    """Draw ``size`` Beta(2, 5) samples and rescale them to percentages.

    Parameters
    ----------
    size : int
        Number of values to draw.
    random_state : object exposing ``beta(a, b, size=...)``
        Random number generator used for sampling.

    Returns
    -------
    Array of ``size`` values, each the Beta(2, 5) draw multiplied by 100.
    """
    fractions = random_state.beta(2, 5, size=size)
    return fractions * 100


# Pass the callable itself as the distribution instead of a built-in name
sim = TimeSeriesSimulator(length=100, distribution=beta_distribution, random_state=42)
y = sim.simulate()

# Summarise the simulated values before plotting them
lowest, highest = y.min(), y.max()
print(f"Beta distribution: mean={y.mean():.2f}%, range=[{lowest:.2f}, {highest:.2f}]")
y.plot(figsize=(10, 4), title="Custom Beta Distribution (Conversion Rates)")
plt.show()

In [None]:
def bimodal_distribution(size, random_state):
    """Sample a two-component normal mixture with modes near 20 and 80.

    The first ``size // 2`` values come from N(20, 3) and the remainder from
    N(80, 5); the combined array is shuffled in place so the two modes are
    interleaved.

    Parameters
    ----------
    size : int
        Total number of values to draw.
    random_state : object exposing ``normal`` and ``shuffle``
        Random number generator used for sampling and shuffling.

    Returns
    -------
    numpy.ndarray of length ``size``.
    """
    lower_count = size // 2
    upper_count = size - lower_count  # takes the extra sample when size is odd

    lower_mode = random_state.normal(20, 3, size=lower_count)
    upper_mode = random_state.normal(80, 5, size=upper_count)

    samples = np.concatenate((lower_mode, upper_mode))
    random_state.shuffle(samples)
    return samples


# Simulate 200 points from the custom two-mode mixture and plot them
sim = TimeSeriesSimulator(length=200, distribution=bimodal_distribution, random_state=42)
y = sim.simulate()
y.plot(figsize=(10, 4), title="Bimodal Distribution")
plt.show()

## 5. Using with ForecastingBenchmark

The simulated data can be directly used with sktime's `ForecastingBenchmark`
to evaluate how models perform on data with specific characteristics.

In [None]:
from sktime.benchmarking import TimeSeriesSimulator
from sktime.benchmarking.forecasting import ForecastingBenchmark
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.trend import TrendForecaster
from sktime.performance_metrics.forecasting import (
    MeanAbsoluteError,
    MeanAbsolutePercentageError,
)
from sktime.split import ExpandingWindowSplitter

# Poisson-distributed series (e.g., daily count data) with a mild linear trend
poisson_trend_config = {
    "length": 200,
    "distribution": "poisson",
    "dist_params": {"lam": 25},
    "trend": "linear",
    "trend_params": {"slope": 0.05},
    "random_state": 42,
}
sim = TimeSeriesSimulator(**poisson_trend_config)
y = sim.simulate()

# Quick sanity check on the generated series
n_obs = len(y)
print(f"Generated data: {n_obs} observations")
print(f"Mean: {y.mean():.2f}, Std: {y.std():.2f}")

In [None]:
# Create the benchmark and register the forecasters to compare
benchmark = ForecastingBenchmark()

forecasters = {
    "naive_last": NaiveForecaster(strategy="last"),
    "naive_mean": NaiveForecaster(strategy="mean"),
    "naive_drift": NaiveForecaster(strategy="drift"),
    "trend": TrendForecaster(),
}
for estimator_id, forecaster in forecasters.items():
    benchmark.add_estimator(forecaster, estimator_id=estimator_id)

# Expanding-window evaluation on the simulated series, scoring 1-3 steps ahead
splitter = ExpandingWindowSplitter(fh=[1, 2, 3], initial_window=100, step_length=10)
benchmark.add_task(
    dataset_loader=y,
    cv_splitter=splitter,
    scorers=[MeanAbsoluteError(), MeanAbsolutePercentageError()],
    task_id="poisson_with_trend",
)

# Execute all estimator/task combinations and show the result table
results = benchmark.run()
print(results)

## 6. Comparing Models Across Different Data Distributions

One powerful use case is comparing how models perform on series with different characteristics, such as the underlying distribution, a trend, or seasonality.

In [None]:
# Simulator settings that differ between scenarios, keyed by scenario name
scenario_configs = {
    "normal": {
        "distribution": "normal",
        "dist_params": {"loc": 50, "scale": 10},
    },
    "poisson": {
        "distribution": "poisson",
        "dist_params": {"lam": 20},
    },
    "trending": {
        "distribution": "normal",
        "dist_params": {"loc": 10, "scale": 3},
        "trend": "linear",
        "trend_params": {"slope": 0.5},
    },
    "seasonal": {
        "distribution": "normal",
        "dist_params": {"loc": 50, "scale": 5},
        "seasonality": 7,
        "seasonality_strength": 15.0,
    },
}

# Every scenario shares the same length and seed; only the settings above vary
data_scenarios = {
    name: TimeSeriesSimulator(length=150, random_state=42, **config).simulate()
    for name, config in scenario_configs.items()
}

# Plot each scenario in its own panel of a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 8))
axes = axes.flatten()

for panel, scenario in zip(axes, data_scenarios):
    panel.plot(data_scenarios[scenario])
    panel.set_title(f"{scenario.title()} Data")
    panel.set_xlabel("Time")

plt.tight_layout()
plt.show()

In [None]:
# Benchmark models on each scenario
from sktime.forecasting.naive import NaiveForecaster

# (estimator_id, NaiveForecaster strategy) pairs evaluated on every scenario
naive_variants = (
    ("naive_last", "last"),
    ("naive_mean", "mean"),
    ("naive_drift", "drift"),
)

for scenario_name, y_data in data_scenarios.items():
    # A fresh benchmark per scenario keeps each result table separate
    benchmark = ForecastingBenchmark()

    for estimator_id, strategy in naive_variants:
        benchmark.add_estimator(
            NaiveForecaster(strategy=strategy), estimator_id=estimator_id
        )

    splitter = ExpandingWindowSplitter(fh=[1, 2, 3], initial_window=80, step_length=10)
    benchmark.add_task(
        dataset_loader=y_data,
        cv_splitter=splitter,
        scorers=[MeanAbsoluteError()],
        task_id=scenario_name,
    )

    results = benchmark.run()
    print(f"\n=== {scenario_name.upper()} ===")
    print(results[["model_id", "MeanAbsoluteError_mean"]])

## 7. Real-World Use Case: Demand Spikes

A common challenge in demand forecasting is handling sudden spikes caused by promotional
events, viral moments, or supply chain disruptions. Forecasting models react to these
anomalies in different ways. Let's simulate demand data with random spikes and benchmark
which forecasting approach is most robust.

In [None]:
def demand_with_spikes(size, random_state):
    """Generate gamma-distributed demand with occasional multiplicative spikes.

    Models a retail-style series: demand is usually stable, but some points
    are inflated to mimic promotions, viral events, or other external shocks.

    Parameters
    ----------
    size : int
        Number of demand values to generate.
    random_state : object exposing ``gamma``, ``random`` and ``uniform``
        Random number generator used for all draws.

    Returns
    -------
    Array of ``size`` demand values. Each point is independently selected as
    a spike with probability 0.05 and, if selected, multiplied by a factor
    drawn uniformly from [2.5, 5.0).
    """
    spike_rate = 0.05

    # Baseline demand: gamma draws are always positive and slightly skewed
    baseline = random_state.gamma(shape=5, scale=10, size=size)

    # Choose which points spike and how strongly. A multiplier is drawn for
    # every point, although only the selected ones are used.
    is_spike = random_state.random(size) < spike_rate
    boost = random_state.uniform(2.5, 5.0, size=size)

    # Scale only the selected points; work on a copy of the baseline
    spiked = baseline.copy()
    spiked[is_spike] *= boost[is_spike]
    return spiked


# Simulate spike-prone demand with slight growth and a weekly pattern
demand_config = {
    "length": 300,
    "distribution": demand_with_spikes,
    "trend": "linear",
    "trend_params": {"slope": 0.02},
    "seasonality": 7,
    "seasonality_strength": 5.0,
    "random_state": 42,
}
sim = TimeSeriesSimulator(**demand_config)
demand_data = sim.simulate()

# Reference statistics reused by the plot and the printed summary
median_demand = demand_data.median()
spike_threshold = demand_data.quantile(0.95)
spike_indices = demand_data[demand_data > spike_threshold].index

# Plot the series with its median as a dashed reference line
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(demand_data, label="Demand", alpha=0.8)
ax.axhline(y=median_demand, color="r", linestyle="--", label="Median Demand")

# Mark the observations above the 95th percentile
ax.scatter(
    spike_indices,
    demand_data[spike_indices],
    color="red",
    s=50,
    label=f"Spikes (>{spike_threshold:.0f})",
    zorder=5,
)

ax.set_title("Simulated Demand with Random Promotional Spikes")
ax.set_xlabel("Date")
ax.set_ylabel("Units Sold")
ax.legend()
plt.tight_layout()
plt.show()

# Text summary of the simulated series
n_obs = len(demand_data)
n_spikes = len(spike_indices)
print(f"Total observations: {n_obs}")
print(f"Mean demand: {demand_data.mean():.1f}, Median: {median_demand:.1f}")
spike_pct = 100 * n_spikes / n_obs
print(f"Spike days (>95th percentile): {n_spikes} ({spike_pct:.1f}%)")

In [None]:
# Benchmark different forecasting strategies on spike-prone data
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.trend import TrendForecaster
from sktime.performance_metrics.forecasting import (
    MeanAbsoluteError,
    MeanSquaredError,
)

benchmark = ForecastingBenchmark()

# Register the forecasting strategies to compare, in a fixed order
candidates = (
    ("naive_last", NaiveForecaster(strategy="last")),
    ("naive_mean", NaiveForecaster(strategy="mean")),
    ("naive_drift", NaiveForecaster(strategy="drift")),
    ("trend", TrendForecaster()),
)
for estimator_id, forecaster in candidates:
    benchmark.add_estimator(forecaster, estimator_id=estimator_id)

# Expanding-window evaluation: forecast 1, 2, 3, and 7 days ahead,
# moving the cutoff forward two weeks at a time
splitter = ExpandingWindowSplitter(
    fh=[1, 2, 3, 7],
    initial_window=150,
    step_length=14,
)
benchmark.add_task(
    dataset_loader=demand_data,
    cv_splitter=splitter,
    scorers=[MeanAbsoluteError(), MeanSquaredError()],
    task_id="demand_with_spikes",
)

# Run the benchmark
results = benchmark.run()

# Report the mean score of each model under both metrics
banner = "=" * 60
print(banner)
print("BENCHMARK RESULTS: Forecasting Demand with Spikes")
print(banner)
print("\nModel Performance (lower is better):")
summary_columns = ["model_id", "MeanAbsoluteError_mean", "MeanSquaredError_mean"]
print(results[summary_columns])

### Key Insights

When dealing with demand spikes:

- **Naive Mean** tends to be more robust as it averages out spikes
- **Naive Last** can overreact to recent spikes, causing large errors
- **MSE penalizes large errors more** - useful for identifying models that struggle with spikes

This type of analysis helps practitioners choose the right forecasting strategy
for their specific data characteristics before deploying to production.

## 8. Summary

The `TimeSeriesSimulator` provides:

- **Multiple distributions**: normal, poisson, exponential, gamma, uniform, binomial, lognormal
- **Custom distributions**: Define your own distribution function (e.g., demand with spikes)
- **Trend components**: linear, quadratic, exponential, or custom
- **Seasonality**: Single or multiple seasonal periods
- **Noise**: Add Gaussian noise to the series
- **Reproducibility**: Control random state for reproducible experiments

This integrates seamlessly with `ForecastingBenchmark` for systematic model evaluation
on data with known characteristics - helping practitioners choose the right model
before deploying to production.