# Лабораторная работа №3 - Store Sales - Time Series Forecasting

In [53]:
import os

import numpy as np
import pandas as pd

# Seed passed to the models trained later in the notebook.
RANDOM_STATE = 1337

# Directory that holds the competition CSV files.
input_path = "../input/store-sales-time-series-forecasting"

# Print the path of every file found under the input directory.
for root, _subdirs, files in os.walk(input_path):
    for file_name in files:
        print(os.path.join(root, file_name))

../input/store-sales-time-series-forecasting\100_score.csv
../input/store-sales-time-series-forecasting\holidays_events.csv
../input/store-sales-time-series-forecasting\oil.csv
../input/store-sales-time-series-forecasting\sample_submission.csv
../input/store-sales-time-series-forecasting\stores.csv
../input/store-sales-time-series-forecasting\test.csv
../input/store-sales-time-series-forecasting\train_1.csv
../input/store-sales-time-series-forecasting\train_2.csv
../input/store-sales-time-series-forecasting\transactions.csv


### Загрузка данных

In [None]:
# The training set is stored as two CSV parts (the original author's note
# refers to a 100 MB per-file limit); both are read and stacked here.
train_1_part = pd.read_csv(f"{input_path}/train_1.csv", parse_dates=['date'])
train_2_part = pd.read_csv(f"{input_path}/train_2.csv", parse_dates=['date'])

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each part keeps its own 0-based labels, so the result carries duplicate
# index labels, which label-aligned operations downstream may reject.
train_data = pd.concat([train_1_part, train_2_part], ignore_index=True)

# Remaining competition tables; every table with a 'date' column has it
# parsed to datetime so they can be joined on it later.
test_data = pd.read_csv(f"{input_path}/test.csv", parse_dates=['date'])
oil = pd.read_csv(f"{input_path}/oil.csv", parse_dates=['date'])
holiday_events = pd.read_csv(f"{input_path}/holidays_events.csv", parse_dates=['date'])
stores = pd.read_csv(f"{input_path}/stores.csv")
# NOTE: the variable name is misspelled ("transcations") but is used under
# this spelling throughout the notebook, so it is kept as is.
transcations = pd.read_csv(f"{input_path}/transactions.csv", parse_dates=['date'])

train_data.head()

### Визуализация данных

Микроанализ:\
Объём продаж имеет возрастающий тренд и явную сезонность (праздники, особенно Новый год).\
Количество транзакций в целом постоянно, но резко возрастает в период праздников, особенно перед Новым годом.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid")

# Draw both series on one shared axis so their shapes can be compared over time.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=train_data, x="date", y="sales", label="sales", ax=ax)
sns.lineplot(
    data=transcations, x="date", y="transactions", label="transactions", ax=ax
)
ax.set_title("Sales and transactions over time")
plt.show()

### Проверка гипотезы о том, что во время землетрясения объём продаж падает

16 апреля 2016 года произошло землетрясение.

In [None]:
# Window around the earthquake; both bounds are exclusive.
quake_start, quake_end = "2016-04-10", "2016-05-30"

sales_window = train_data[
    (train_data["date"] > quake_start) & (train_data["date"] < quake_end)
]
transactions_window = transcations[
    (transcations["date"] > quake_start) & (transcations["date"] < quake_end)
]

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=sales_window, x="date", y="sales", ax=ax)
sns.lineplot(data=transactions_window, x="date", y="transactions", ax=ax)
ax.set_title("Sales and Transactions over time (Earthquake period)")
plt.show()

In [None]:
# Distinct product families present in the training data, in order of first appearance.
family_list = pd.unique(train_data["family"])
family_list

### Препроцессинг

### Целевые и статик данные

Цели - 'sales' (продажи)\
Статик: \
'city'      - город, в котором находится магазин\
'state'     - штат, в котором находится магазин\
'type'      - тип продаваемых товаров\
'cluster'   - группа похожих магазинов


In [None]:
# Attach the per-store columns from `stores` to every training row (inner join
# on store_nbr) and index the result by date.
static_merged = train_data.merge(stores, on='store_nbr').set_index('date')
static_merged.head()

### Изменяемые данные
Данные, меняющиеся во времени\
День, день недели, месяц и т.д.

Цена на бензин, праздники

In [None]:
# Stack train and test rows so the covariates also cover the forecast period,
# then attach the oil table by date. The left join keeps rows whose date has
# no oil entry (their oil value is NaN).
future_merged = (
    pd.concat([train_data, test_data])
    .merge(oil, on="date", how="left")
    .set_index("date")
)

# Calendar features read from the date index.
for part in ("day", "dayofweek", "dayofyear", "month", "year"):
    future_merged[part] = getattr(future_merged.index, part)

# Day of week has no linear relationship with sales, only a categorical one,
# so it is replaced by one indicator column per weekday value.
day_of_week_dummies = pd.get_dummies(future_merged.pop("dayofweek"))
future_merged = pd.concat([future_merged, day_of_week_dummies], axis=1)

future_merged.head()

### Масштабирование признаков

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric covariates that get min-max scaled.
scaled_cols = ['dcoilwtico', 'onpromotion', 'day', 'dayofyear', 'month', 'year']

# The weekday indicator columns were created with non-string labels; make every
# column name a string so they can be referenced as '0'..'6' later.
future_merged.columns = future_merged.columns.map(str)

# `scaler` stays a notebook-level name: the transactions cell re-fits it later.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(future_merged[scaled_cols])
future_merged[scaled_cols] = scaled_values

future_merged.head()

### Разбиваем праздники на категории

In [None]:
# Collects one holiday-flag DataFrame per store; the frames are concatenated
# into a single table after the per-store loop below.
holiday_store_list = []

def filter_column(data, column, value):
    """Flag the rows of ``data`` whose text in ``column`` contains ``value``.

    Args:
        data: DataFrame holding the text column.
        column: Name of the string column to search.
        value: Pattern to look for; it is passed to ``Series.str.contains``
            unchanged, so it is interpreted as a regular expression.

    Returns:
        A NumPy integer array with one entry per row: 1 where the pattern is
        found, 0 otherwise.
    """
    # na=False: without it a missing value yields NaN in the mask, and NaN is
    # truthy for np.where, so rows with no text would be flagged as matches.
    matches = data[column].str.contains(value, na=False)
    return np.where(matches, 1, 0)

# Flags that do not depend on the store are computed once, outside the loop.
is_holiday = holiday_events["type"] == "Holiday"
is_event = holiday_events["type"] == "Event"
is_national = holiday_events["locale"] == "National"

# Earthquake (sales expected to be much lower?) and football rows get their own
# columns, so they are excluded from the generic national-event flag.
earthquake_flag = filter_column(holiday_events, "description", "Terremoto Manabi")
football_flag = filter_column(holiday_events, "description", "futbol")

common_holiday_flags = pd.DataFrame({
    "date": holiday_events["date"],
    "national_holiday": (is_holiday & is_national).astype(int),
    "national_event": (
        is_event & is_national & (earthquake_flag == 0) & (football_flag == 0)
    ).astype(int),
    "earthquake": earthquake_flag,
    # Holidays with the largest influence.
    "christmas": filter_column(holiday_events, "description", "Navidad"),
    "football": football_flag,
    # Working day that falls on a holiday (sales expected to be lower?).
    "work_day": filter_column(holiday_events, "type", "Work Day"),
})

# Read each store's number, city and state from its own row instead of
# assuming that row position i corresponds to store number i + 1.
for store_nbr, city, state in zip(stores["store_nbr"], stores["city"], stores["state"]):
    df_holiday_dummies = common_holiday_flags.copy()
    df_holiday_dummies["store_nbr"] = store_nbr

    # A holiday is local for this store when its locale name is the store's
    # city or state.
    df_holiday_dummies["local_holiday"] = (
        is_holiday & holiday_events["locale_name"].isin([city, state])
    ).astype(int)

    # Several holiday rows can share one date. Keeping only the first row would
    # silently drop flags set on the later ones, so collapse each date with
    # max(): a flag is 1 if any row for that date sets it.
    df_holiday_dummies = df_holiday_dummies.groupby(
        ["date", "store_nbr"], as_index=False
    ).max()

    holiday_store_list.append(df_holiday_dummies)

holiday_store_df = pd.concat(holiday_store_list)

# NOTE(review): this is a left join, so dates with no holiday row get NaN (not 0)
# in every flag column - confirm that is what the model should see.
future_merged = pd.merge(future_merged, holiday_store_df, on=['date','store_nbr'], how='left')
future_merged.head()

### Транзакции

In [None]:
# Pair every training row with the transaction count of its store and date
# (inner join: rows without a matching transactions entry are dropped).
past_merged = train_data.merge(transcations, on=['date', 'store_nbr']).set_index('date')

# Rescale transactions with the shared `scaler`; fit_transform re-fits it on
# this single column (min-max scaling when the cells are run top to bottom).
transactions_scaled = scaler.fit_transform(past_merged[['transactions']])
past_merged['transactions'] = transactions_scaled

past_merged.head()

### Разбиение на обучающую и валидационную выборки

In [None]:
# Last date that belongs to the training part; later rows form the validation part.
split_date = '2017-07-30'

train_static_cov = static_merged.loc[static_merged.index <= split_date]
val_static_cov = static_merged.loc[static_merged.index > split_date]

# NOTE(review): train_static_cov and train_past_cov are not referenced by the
# later cells visible here (the series are built from the full frames) - confirm
# whether the models were meant to be fitted on these subsets.
train_past_cov = past_merged.loc[past_merged.index <= split_date]

### Преобразование в TimeSeries

In [None]:
from darts import TimeSeries

# Options shared by every family: one series per (store_nbr, family) group with
# 'sales' as the value, store attributes as static covariates, a daily
# frequency, and gaps in the date range filled with 0.
sales_series_options = dict(
    group_cols=['store_nbr', 'family'],
    value_cols='sales',
    static_cols=['city', 'state', 'type', 'cluster'],
    fill_missing_dates=True,
    freq='D',
    fillna_value=0,
)

# Maps each product family to its list of per-store sales series.
family_ts_dict = {}

for family in family_list:
    family_rows = static_merged[static_merged['family'] == family]
    family_ts_dict[family] = TimeSeries.from_group_dataframe(
        df=family_rows, **sales_series_options
    )

In [None]:
from darts.dataprocessing.transformers import StaticCovariatesTransformer, Scaler, InvertibleMapper
from darts.dataprocessing import Pipeline

# Fitted pipeline per family (needed later to invert the predictions) and the
# transformed sales series per family.
family_pipeline_dict = {}
family_ts_transformed_dict = {}

for family, family_series in family_ts_dict.items():
    # Each family gets its own pipeline: static covariates are transformed,
    # values go through log1p (inverted with expm1), then a darts Scaler.
    # The transformers are built inline rather than bound to notebook-level
    # names: binding the darts Scaler to `scaler` would overwrite the sklearn
    # MinMaxScaler of the same name and break re-running the transactions cell.
    train_pipeline = Pipeline([
        StaticCovariatesTransformer(),
        InvertibleMapper(np.log1p, np.expm1),
        Scaler(),
    ])

    family_ts_transformed_dict[family] = train_pipeline.fit_transform(family_series)
    family_pipeline_dict[family] = train_pipeline

In [None]:
# Columns exposed to the model as future covariates: promotion and oil values,
# calendar features, weekday indicators ('0'..'6') and the holiday flags.
future_value_cols = [
    'onpromotion', 'dcoilwtico', 'day',
    'dayofyear', 'month', 'year',
    '0', '1', '2', '3', '4', '5', '6',
    'national_holiday', 'earthquake', 'christmas',
    'football', 'national_event', 'work_day',
    'local_holiday',
]

# Maps each product family to its list of per-store future-covariate series.
family_future_cov_dict = {}

for family in family_list:
    family_rows = future_merged[future_merged['family'] == family]

    # 'date' is a regular column here, so it is passed explicitly as time_col.
    family_future_cov_dict[family] = TimeSeries.from_group_dataframe(
        df=family_rows,
        group_cols=['store_nbr', 'family'],
        time_col='date',
        value_cols=future_value_cols,
        fill_missing_dates=True,
        freq='D',
    )

In [50]:
# Maps each product family to its list of per-store past-covariate series
# (the scaled transaction counts).
family_past_cov_dict = {}

for family in family_list:
    family_rows = past_merged[past_merged['family'] == family]

    family_past_cov_dict[family] = TimeSeries.from_group_dataframe(
        df=family_rows,
        group_cols=["store_nbr", "family"],
        value_cols=["transactions"],
        fill_missing_dates=True,
        freq="D",
    )



### LGBM Model

In [54]:
from darts.models import LightGBMModel
from tqdm import tqdm

def train_model(days=7):
    """Fit one LightGBM forecasting model per product family.

    Args:
        days: Value passed as ``lags`` to each model (number of target lags).

    Returns:
        Dict mapping each family in ``family_list`` to its fitted model.
    """
    fitted_models = {}

    for family in tqdm(family_list):
        model = LightGBMModel(
            lags=days,
            # Past covariates are used at lags 16 to 22 steps back.
            lags_past_covariates=list(range(-16, -23, -1)),
            lags_future_covariates=(14, 1),
            random_state=RANDOM_STATE,
        )
        model.fit(
            series=family_ts_transformed_dict[family],
            past_covariates=family_past_cov_dict[family],
            future_covariates=family_future_cov_dict[family],
        )
        fitted_models[family] = model

    return fitted_models

In [55]:
def predict_model(lgbm_model_dict, horizon=16):
    """Forecast every family/store series and return the results as one DataFrame.

    Args:
        lgbm_model_dict: Dict mapping each family in ``family_list`` to a
            fitted model, as returned by ``train_model``.
        horizon: Number of steps to forecast after the end of each series.
            Defaults to 16, the value that was previously hard-coded.

    Returns:
        DataFrame of the concatenated per-series forecasts, back on the
        original sales scale, with added 'family' and 'store_nbr' columns and
        negative 'sales' values replaced by 0.
    """
    pred_df_list = []

    for family in tqdm(family_list):
        forecasts = lgbm_model_dict[family].predict(
            n=horizon,
            series=family_ts_transformed_dict[family],
            past_covariates=family_past_cov_dict[family],
            future_covariates=family_future_cov_dict[family],
        )
        # Undo the per-family pipeline to get back to the sales scale.
        forecasts = family_pipeline_dict[family].inverse_transform(forecasts)

        # Forecasts come back in the order of the input series, so pair each
        # one with its untransformed source series and read the store number
        # from that series' static covariates rather than assuming that
        # position i in the list is store i + 1.
        for source_series, forecast in zip(family_ts_dict[family], forecasts):
            pred_df = forecast.pd_dataframe()
            pred_df['family'] = family
            pred_df['store_nbr'] = int(
                source_series.static_covariates['store_nbr'].iloc[0]
            )
            pred_df_list.append(pred_df)

    final_preds = pd.concat(pred_df_list)

    # Sales cannot be negative.
    final_preds['sales'] = final_preds['sales'].clip(lower=0)

    return final_preds

In [56]:
def train_predict_model(days=7):
    """Train the per-family models with ``days`` target lags and return their forecasts.

    Args:
        days: Forwarded to ``train_model`` as its ``days`` argument.

    Returns:
        Whatever ``predict_model`` returns for the freshly trained models.
    """
    trained_models = train_model(days=days)
    return predict_model(trained_models)

In [None]:
# Lag windows (in days) of the ensemble members: one month, three months,
# half a year, three quarters of a year and a full year.
lag_windows = (31, 93, 365 // 2, 365 * 3 // 4, 365)

# Train and predict once per window, in the order listed above.
(
    pred_month,
    pred_three_month,
    pred_half_year,
    pred_3_4_year,
    pred_year,
) = [train_predict_model(days=window) for window in lag_windows]

## Собираем модели

In [None]:
# Ensemble: plain average of the 'sales' forecasts of the five models. The
# remaining columns are taken from the first model's frame.
ensemble_members = (
    pred_month,
    pred_three_month,
    pred_half_year,
    pred_3_4_year,
    pred_year,
)

predictions = pred_month.copy()
predictions["sales"] = sum(member["sales"] for member in ensemble_members) / len(
    ensemble_members
)

In [None]:
from sklearn.metrics import mean_squared_log_error

# Both frames are put in the same (store, family, date) order because the
# metric compares its two inputs row by row.
sort_keys = ['store_nbr', 'family', 'date']

# NOTE(review): val_static_cov holds the training rows dated after 2017-07-30,
# while `predictions` are forecasts for the steps that follow the end of the
# series the models were fitted on - confirm both cover the same dates.
actual = val_static_cov.sort_values(sort_keys)['sales']
preds = predictions.sort_values(sort_keys)['sales']

score = np.sqrt(mean_squared_log_error(actual, preds))
print('Combined RMSLE : ', score)

actual_submission = pd.read_csv(f"{input_path}/100_score.csv")
if 'id' in actual_submission.columns:
    # The reference file is in its own row order, while `preds` is sorted by
    # store/family/date; a positional comparison would pair unrelated rows.
    # Look up each prediction's id through test_data and join on it instead.
    by_id = (
        pd.merge(test_data, predictions, on=sort_keys)[['id', 'sales']]
        .merge(actual_submission[['id', 'sales']], on='id', suffixes=('_pred', '_true'))
    )
    score = np.sqrt(mean_squared_log_error(by_id['sales_true'], by_id['sales_pred']))
else:
    # No id column to align on: fall back to the positional comparison.
    score = np.sqrt(mean_squared_log_error(actual_submission['sales'], preds))
print('Submission RMSLE : ', score)

In [None]:
# Match each forecast to its test row to recover the submission id, keep only
# the two columns of the submission format, and write the file.
submission_columns = ['id', 'sales']
submit = (
    test_data
    .merge(predictions, on=['store_nbr', 'family', 'date'])
    .reindex(columns=submission_columns)
)
submit.to_csv('submission.csv', index=False)