In [1]:
import warnings
warnings.filterwarnings("ignore")


import itertools
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as ple
import plotly.subplots as pls
import statsmodels.api as smapi

from IPython.display import clear_output
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_percentage_error
from plotly.graph_objects import Figure

### Загрузка и обработка данных

In [4]:
# Load the raw sales log and derive a calendar-date column from the timestamp.
data = pd.read_csv('data_hakaton1.csv', index_col='Unnamed: 0')
data['Sale_Date'] = pd.to_datetime(data['Sale_Date'])
data['Date'] = data['Sale_Date'].dt.date

# Count sales per (date, product) and pivot products into columns.
# `unstack(fill_value=0)` writes 0 for products with no sales on a given date;
# the resulting columns are in sorted (alphabetical) order.
daily_counts = (
    data.groupby(['Date', 'Product_Name'])
    .size()
    .unstack(fill_value=0)
    .rename_axis(columns=None)
    .sort_index()
)
# resample() needs a DatetimeIndex; the index keeps its name 'Date'.
daily_counts.index = pd.to_datetime(daily_counts.index)

# Aggregate the daily counts into weekly totals (despite the name, this frame is weekly).
daily_data_df = daily_counts.resample('W').sum()

plot = ple.line(daily_data_df, x=daily_data_df.index, y=daily_data_df.columns)
plot.update_layout(xaxis_title="Date", yaxis_title="Num sales")

plot

In [None]:
# Drop everything from 2024 onwards and keep the rows in chronological order.
before_2024 = daily_data_df.index.year < 2024
daily_data_df = daily_data_df.loc[before_2024].sort_index()

# Weeks before 2023-12-01 are used for fitting, the rest for evaluation.
in_train_period = daily_data_df.index < '2023-12-01'
in_test_period = daily_data_df.index >= '2023-12-01'
train = daily_data_df.loc[in_train_period]
test = daily_data_df.loc[in_test_period]

train_len, test_len = len(train), len(test)

### Обучение моделей с подбором параметров

In [9]:
def search_optimal_arima(time_series: pd.Series) -> tuple[int, int, int]:
    """Grid-search ARIMA ``(p, d, q)`` orders and return the best by log-likelihood.

    Every combination of ``p`` in 5..14 and ``d``, ``q`` in 1..4 is fitted to
    ``time_series`` with stationarity and invertibility enforcement disabled.
    Combinations whose fit raises an exception are skipped. Progress is printed
    as ``"<current>/<total>"`` after clearing the cell output.

    Args:
        time_series: Series to fit the candidate models on.

    Returns:
        The ``(p, d, q)`` tuple whose fit had the highest log-likelihood.

    Raises:
        ValueError: If none of the candidate orders could be fitted.
    """
    order_vals = range(5, 15)
    diff_vals = ma_vals = range(1, 5)
    pdq_combinations = list(itertools.product(order_vals, diff_vals, ma_vals))
    total = len(pdq_combinations)

    best_llf = -np.inf
    best_order = None

    for i, order_param in enumerate(pdq_combinations, start=1):
        clear_output()
        print(f"{i}/{total}")

        try:
            candidate_llf = ARIMA(
                time_series,
                order=order_param,
                enforce_stationarity=False,
                enforce_invertibility=False,
            ).fit().llf
        except Exception:
            # Some orders fail to fit; skip them. `Exception` (rather than a
            # bare `except`) lets KeyboardInterrupt stop the search.
            continue

        if candidate_llf > best_llf:
            best_llf = candidate_llf
            best_order = order_param

    if best_order is None:
        raise ValueError("No ARIMA order from the search grid could be fitted to the series")

    return best_order


def get_mape(model: "ARIMAResults", col: str, n_weeks: int = 6) -> float:
    """Compute the MAPE of a fitted model's forecast on the global ``test`` frame.

    Args:
        model: Fitted model results object exposing ``forecast(steps)``.
        col: Column of ``test`` holding the true values for this model.
        n_weeks: Minimum number of steps to forecast.

    Returns:
        Mean absolute percentage error between ``test[col]`` and the forecast
        values at the same index labels.
    """
    true = test[col]
    # Forecast at least one step per test row; a shorter horizon would make
    # the label lookup below raise KeyError.
    horizon = max(n_weeks, len(true))
    forecast = model.forecast(horizon)
    pred = forecast.loc[true.index]

    return mean_absolute_percentage_error(true, pred)


def get_plot(model_fit: "ARIMAResults", col: str) -> Figure:
    """Build a two-row figure comparing predictions with true values for one column.

    Row 1 shows the out-of-sample forecast against ``test[col]``; row 2 shows the
    in-sample predictions against ``train[col]``. The x-axis title carries the
    column name and the MAPE on the test rows. Reads the notebook globals
    ``train``, ``test``, ``train_len`` and ``test_len``.

    Args:
        model_fit: Fitted model results object (``get_prediction`` / ``predict``).
        col: Column of ``train`` / ``test`` the model was fitted on.

    Returns:
        The assembled plotly figure.
    """
    # `end` is inclusive, so this spans the test rows plus two further steps.
    forecast = model_fit.get_prediction(start=train_len, end=train_len + test_len + 1)
    forecast_mean = forecast.predicted_mean
    # Only the in-sample part is plotted in the train panel, so only that is requested.
    in_sample = model_fit.predict(start=0, end=train_len - 1)

    true = test[col]
    pred = forecast_mean.loc[true.index]
    mape = mean_absolute_percentage_error(true, pred)

    test_predictions = pd.DataFrame({'pred': forecast_mean, 'true': true}, index=forecast_mean.index)
    train_predictions = pd.DataFrame({'pred': in_sample, 'true': train[col]}, index=train.index)

    # `.4f` prints four decimals (the previous `4f` was a field width, not a precision).
    plot = pls.make_subplots(rows=2, x_title=f'Товар - {col}, MAPE - {mape:.4f}')
    plot.add_traces(ple.line(test_predictions, x=test_predictions.index, y=test_predictions.columns)['data'], rows=1, cols=1)
    plot.add_traces(ple.line(train_predictions, x=train_predictions.index, y=train_predictions.columns)['data'], rows=2, cols=1)

    return plot

In [5]:
# For every product: search for an order on the training series, then fit a
# model with that order and keep both the fitted result and the order.
models_full = {}

for col in train.columns:
    series = train[col]
    best_order = search_optimal_arima(series)
    fitted = ARIMA(series, order=best_order).fit()
    models_full[col] = {'model': fitted, 'order': best_order}
    # Wipe the progress counter printed by the search.
    clear_output()

In [11]:
# Render the forecast / in-sample comparison figure for each product.
for col in train.columns:
    fitted = models_full[col]['model']
    figure = get_plot(fitted, col)
    display(figure)

In [15]:
# Test-set MAPE per product, sorted from lowest to highest error.
mape_by_product = {
    product: get_mape(entry['model'], product)
    for product, entry in models_full.items()
}
mapes = pd.Series(mape_by_product, name='MAPE').to_frame().sort_values('MAPE')

# Show the ten lowest-error and the ten highest-error products.
display(mapes.head(10))
display(mapes.tail(10))

Unnamed: 0,MAPE
Курица,0.028229
Молоко,0.047147
Морковь,0.048006
Свекла,0.050157
Огурцы маринованные,0.05372
Капуста,0.05373
Оливковое масло,0.054293
Сливочное масло,0.05523
Свинина,0.058626
Масло,0.060769


Unnamed: 0,MAPE
Огурцы,0.063853
Кефир,0.067342
Мед,0.067416
Творог,0.067985
Хлеб,0.068557
Куриное яйцо,0.070219
Рис,0.071911
Яблоки,0.074533
Сыр,0.076121
Картофель,0.107587


### Обучение моделей на полном наборе данных и сохранение

In [None]:
# Refit every product's model on train + test with the order found earlier
# and pickle the fitted results, one file per product.
full_data = pd.concat([train, test])

# Create the output directory up front; saving fails if it does not exist.
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)

for model_name, model_data in models_full.items():
    model = ARIMA(full_data[model_name], order=model_data['order'])
    model_fit = model.fit()
    model_fit.save(str(models_dir / (model_name + '.pickle')))

In [29]:
# Load one saved model and plot its 4-step forecast next to the history.
# NOTE: pickle files execute code on load; only load files produced by this notebook.
kefir_model = smapi.load_pickle('models/Кефир.pickle')
kefir_model_forecast = kefir_model.get_forecast(4)

# Prepend the last observed value so the forecast line connects to the history line.
last_date = full_data.index.max()
last_observed = pd.Series({last_date: full_data.loc[last_date, 'Кефир']})
# `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
prognosis = pd.concat([last_observed, kefir_model_forecast.predicted_mean])
plot_data = pd.DataFrame({'History': full_data['Кефир'], 'Prognosis': prognosis})

ple.line(plot_data, x=plot_data.index, y=['History', 'Prognosis'])