## Прогноз ринку (three-phase linear)

Зошит повторює ключові кроки `3p_linear_model`: базовий Holt-Winters, сезонні та лагові ознаки, фінальна модель XGBoost для кожної товарної групи.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from three_phase_linear import ForecastConfig, run_three_phase_forecast

# Source dataset read by the data-preparation cell, and the CSV written by the export cell.
DATA_PATH = Path("forecast_of_market_dataset.csv")
OUTPUT_PATH = Path("market_three_phase_forecast_accuracy_calculation.csv")

# Columns that identify one series, and the target columns processed for each series.
GROUP_COLS = [
    "product_group_id",
]
TARGET_COLUMNS = [
    "market_revenue",
    "revenue_amazon",
]


In [2]:
# Prepare a back-test: load the dataset, derive the forecast horizon from the
# rows whose targets are missing, then hide the last `forecast_horizon` known
# observations of every group so they can be forecast and scored later.

# Horizon used when the dataset has no rows with missing targets.
DEFAULT_FORECAST_HORIZON = 12

df = pd.read_csv(DATA_PATH)
df['month'] = pd.to_datetime(df['month'])
df = df.sort_values(GROUP_COLS + ['month']).reset_index(drop=True)

# A row counts as "future" when at least one target value is missing.
future_mask = df[TARGET_COLUMNS].isna().any(axis=1)
future_counts = df[future_mask].groupby(GROUP_COLS).size()
# The horizon is the largest number of future rows found in any single group.
forecast_horizon = int(future_counts.max()) if not future_counts.empty else 0
if forecast_horizon <= 0:
    forecast_horizon = DEFAULT_FORECAST_HORIZON

# `df` is already sorted by group and month and boolean filtering keeps row
# order, so only the index has to be rebuilt here.
history_df = df[~future_mask].copy().reset_index(drop=True)

# Flag the last `forecast_horizon` rows of each group as the evaluation period.
# Groups shorter than the horizon are flagged entirely.
holdout_index = history_df.groupby(GROUP_COLS).tail(forecast_horizon).index
history_df['is_evaluation_period'] = history_df.index.isin(holdout_index)

is_holdout = history_df['is_evaluation_period']
for target in TARGET_COLUMNS:
    # Keep the true values for scoring before they are hidden.
    history_df[f'{target}_actual'] = history_df[target]
    # Cast to float first so NaN can be stored without an implicit dtype
    # upcast (an integer column cannot hold NaN), then blank the hold-out rows.
    history_df[target] = history_df[target].astype(float).mask(is_holdout)

df = history_df

print(f'Forecast horizon for accuracy: {forecast_horizon} periods')


Forecast horizon for accuracy: 12 periods


In [3]:
# Run the three-phase forecast once per target column and collect both the
# predictions and a per-group summary table.

# Settings shared by every run; only `target_col` differs between targets.
shared_settings = dict(
    time_col='month',
    group_cols=GROUP_COLS,
    freq='MS',
    forecast_horizon=forecast_horizon,
    seasonal_periods=12,
    min_history=24,
    lags=(1, 2, 3, 6, 12, 18, 24),
    rolling_windows=(3, 6, 12, 24),
    random_search_iterations=10,
    n_splits=4,
    random_state=46,
)

# Summary column name -> attribute read from each summary object.
summary_fields = {
    'train_rows': 'train_rows',
    'cv_mae': 'best_score',
    'skipped_reason': 'skipped_reason',
}

prediction_frames = {}
summary_frames = []

for target in TARGET_COLUMNS:
    # Pass only the time, grouping and current target columns to the model.
    model_input = df.loc[:, ['month'] + GROUP_COLS + [target]].copy()
    config = ForecastConfig(target_col=target, **shared_settings)

    preds, summaries = run_three_phase_forecast(model_input, config)

    # Give the generic output columns target-specific names.
    column_names = {
        'prediction': f'{target}_forecast',
        f'{target}_holtwinters': f'{target}_baseline',
    }
    preds = preds.rename(columns=column_names)
    prediction_frames[target] = preds

    # group_key is indexed with [0]: only its first element is kept.
    summary_columns = {'group_key': [s.group_key[0] for s in summaries]}
    for column, attribute in summary_fields.items():
        summary_columns[column] = [getattr(s, attribute) for s in summaries]
    summary_frames.append(pd.DataFrame(summary_columns).assign(target=target))

summary_report = pd.concat(summary_frames, ignore_index=True)
summary_report.head()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

Unnamed: 0,group_key,train_rows,cv_mae,skipped_reason,target
0,1,60,1657743000.0,,market_revenue
1,2,60,1048261000.0,,market_revenue
2,3,60,236006600.0,,market_revenue
3,4,60,1042845000.0,,market_revenue
4,5,60,228181700.0,,market_revenue


In [4]:
# Attach the forecasts to the prepared data, fill the hidden target values with
# them, and export the rows whose targets were originally missing.
result_df = df.copy()
# Record which rows had a missing target BEFORE forecasts are filled in.
original_masks = {target: result_df[target].isna() for target in TARGET_COLUMNS}

merge_cols = [*GROUP_COLS, 'month']
for target, preds in prediction_frames.items():
    forecast_col = f'{target}_forecast'
    # validate='many_to_one' raises if the forecast frame has duplicate
    # (group, month) keys; duplicates would multiply rows in result_df and
    # break the positional alignment with `original_masks`.
    result_df = result_df.merge(
        preds[merge_cols + [forecast_col]],
        on=merge_cols,
        how='left',
        validate='many_to_one',
    )
    # Keep known values and use the forecast only where the target is missing.
    result_df[target] = result_df[target].astype(float).fillna(result_df[forecast_col])

output_columns = ['month', 'product_group_id', 'product_group_name', 'market_revenue', 'revenue_amazon']
# A row is exported when any target was missing originally. The masks are
# combined by position, which matches because the left merges above keep the
# row count and order of result_df.
forecast_mask = np.zeros(len(result_df), dtype=bool)
for mask in original_masks.values():
    forecast_mask |= mask.to_numpy()
final_output = (
    result_df.loc[forecast_mask, output_columns]
    .sort_values(['product_group_id', 'month'])
    .reset_index(drop=True)
)
final_output.to_csv(OUTPUT_PATH, index=False)

final_output.tail()


Unnamed: 0,month,product_group_id,product_group_name,market_revenue,revenue_amazon
343,2024-08-01,43,Communication Gadgets,1680206000.0,338989504.0
344,2024-09-01,43,Communication Gadgets,1522272000.0,309769344.0
345,2024-10-01,43,Communication Gadgets,1621392000.0,309769344.0
346,2024-11-01,43,Communication Gadgets,1520555000.0,349304416.0
347,2024-12-01,43,Communication Gadgets,1573590000.0,314651520.0


MAE — середня абсолютна помилка

In [None]:
# Mean absolute error per target, computed on the rows flagged as evaluation period.
evaluation_df = result_df.loc[result_df['is_evaluation_period']].copy()

for target in TARGET_COLUMNS:
    observed = evaluation_df[f'{target}_actual'].astype(float)
    predicted = evaluation_df[f'{target}_forecast'].astype(float)
    # Score only rows where both the actual value and the forecast exist.
    usable = ~(observed.isna() | predicted.isna())
    if not usable.any():
        print(f'{target} MAE: not enough data to calculate')
        continue
    mae = (observed[usable] - predicted[usable]).abs().mean()
    print(f'{target} MAE: {mae:.4f}')


MAPE — середня абсолютна відсоткова помилка

In [5]:
# Mean absolute percentage error per target, computed on the rows flagged as
# evaluation period.
evaluation_df = result_df.loc[result_df['is_evaluation_period']].copy()

for target in TARGET_COLUMNS:
    observed = evaluation_df[f'{target}_actual'].astype(float)
    predicted = evaluation_df[f'{target}_forecast'].astype(float)
    # Rows with a zero actual are excluded because the error is divided by the actual.
    usable = observed.notna() & predicted.notna() & observed.ne(0)
    if not usable.any():
        print(f'{target} MAPE: not enough data to calculate')
        continue
    relative_errors = (observed[usable] - predicted[usable]) / observed[usable]
    mape = (relative_errors.abs() * 100).mean()
    print(f'{target} MAPE: {mape:.4f}%')


market_revenue MAPE: 15.5316%
revenue_amazon MAPE: 19.5304%


WMAPE — зважена середня абсолютна відсоткова помилка

In [None]:
# Weighted MAPE per target: total absolute error divided by total absolute
# actuals, computed on the rows flagged as evaluation period.
evaluation_df = result_df.loc[result_df['is_evaluation_period']].copy()

for target in TARGET_COLUMNS:
    observed = evaluation_df[f'{target}_actual'].astype(float)
    predicted = evaluation_df[f'{target}_forecast'].astype(float)
    usable = observed.notna() & predicted.notna()
    denominator = observed[usable].abs().sum()
    # Without usable rows, or with a non-positive total of actuals, the ratio is undefined.
    if not usable.any() or not (denominator > 0):
        print(f'{target} WMAPE: not enough data to calculate')
        continue
    total_abs_error = (observed[usable] - predicted[usable]).abs().sum()
    wmape = total_abs_error / denominator * 100
    print(f'{target} WMAPE: {wmape:.4f}%')


In [None]:
# Display the complete per-group summary table (the model-run cell showed only its head()).
summary_report