# Заполнение sales_submission.csv

In [1]:
!pip install catboost



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd

import catboost
from catboost import CatBoostRegressor

import pickle

from tqdm import tqdm

In [4]:
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

## Загрузка обработанных данных, модели и sales_submission

In [5]:
with open('/content/drive/MyDrive/hackathon_lenta/df_train.pkl', 'rb') as df_train:
    df_train = pickle.load(df_train)

In [6]:
with open('/content/drive/MyDrive/hackathon_lenta/df_test.pkl', 'rb') as df_test:
    df_test = pickle.load(df_test)

In [7]:
with open('/content/drive/MyDrive/hackathon_lenta/catboost_model_40.0.pkl', 'rb') as m:
    model = pickle.load(m)

In [8]:
sales_submission = pd.read_csv('/content/drive/MyDrive/hackathon_lenta/sales_submission.csv')

In [9]:
sales_submission_original = sales_submission.copy()

In [10]:
sales_submission.head()

Unnamed: 0,st_id,pr_sku_id,date,target
0,16a5cdae362b8d27a1d8f8c7b78b4330,0045ebdb1069ff4b3dd3efe628c39cd3,2023-07-20,0
1,16a5cdae362b8d27a1d8f8c7b78b4330,00661699f543753ec7e911a64b9fd2f6,2023-07-20,0
2,16a5cdae362b8d27a1d8f8c7b78b4330,0094042bfeae507dc7f62acc8e5ed03a,2023-07-20,0
3,16a5cdae362b8d27a1d8f8c7b78b4330,0169529ff660adcac9b7e354e0c4b882,2023-07-20,0
4,16a5cdae362b8d27a1d8f8c7b78b4330,01e4734745e97e52d3213449e1a05dd7,2023-07-20,0


## Создание признаков для предсказания

In [11]:
def make_features(data, cos_sin_columns = ['month','day_of_week','day'], holiday_dict = {1:[i for i in range(1,10)],
                                                                                         2:[23],
                                                                                         3:[8],
                                                                                         5:[1,9],
                                                                                         6:[12],
                                                                                         11:[4],
                                                                                         12:[31]}):
    data['date'] = pd.to_datetime(data['date'])
    data['month'] = data['date'].dt.month

    #  столбец 'season' с указанием времени года
    seasons = {
        1: 0,
        2: 0,
        3: 1,
        4: 1,
        5: 1,
        6: 2,
        7: 2,
        8: 2,
        9: 3,
        10: 3,
        11: 3,
        12: 0
    }

    data['season'] = data['month'].map(seasons)
    data['day'] = data['date'].dt.day
    data['day_of_week'] = data['date'].dt.dayofweek

    def set_flag_holiday(row, holiday_dict):
        '''
        Функция размечает даты - праздник / не праздник
        :param row: Таблица
        :param holiday_dct: Словарь месяц:[число(1),...,число(i)]
        '''

        month = row['month']
        day = row['date'].day
        weekday = row['day_of_week']
        if (month in holiday_dict and day in holiday_dict[month]):
            return 1
        return 0


    data['holiday'] = data.apply(lambda row: set_flag_holiday(row, holiday_dict), axis=1)

    def after_holidays_n_days(row, n_before=14):
        if row['holiday'] == 1:
            return 1
        for i in range(1, n_before + 1):
            if row.name - i < 0:
                break
            if unique_rec.at[row.name - i, 'holiday'] == 1:
                return 1
        return 0

    def before_holidays_n_days(row, n_after=7):
        if row['holiday'] == 1:
            return 1
        for i in range(1, n_after + 1):
            if row.name + i >= len(unique_rec):
                break
            if unique_rec.at[row.name + i, 'holiday'] == 1:
                return 1
        return 0

    data = data.sort_values(by='date')
    tmp=data.copy()
    unique_rec = data.drop_duplicates(subset=['date']).copy()
    unique_rec.reset_index(drop=False,inplace=True)
    unique_rec['before_holidays_n_days'] = unique_rec.apply(lambda row: before_holidays_n_days(row), axis=1)
    unique_rec['after_holidays_n_days'] = unique_rec.apply(lambda row: after_holidays_n_days(row), axis=1)
    before_after_day = unique_rec.loc[:,['date','before_holidays_n_days','after_holidays_n_days']]
    data = data.merge(before_after_day, on='date', how='left')


    def cos_sin_categorise(df,cos_column_name,sin_column_name,column_to_categorise):
        """
        Применяет cos-sin тригонометрическую категоризацию

        :param data: DataFrame содержащий данные
        :param cos_column_name: Название будущей колонки с cos
        :param sin_column_name: Название будущей колонки с sin
        :param column_to_categorise: Назване колонки, которое хотим категоризовать
        """
        df[cos_column_name] = np.cos((2*np.pi *df[column_to_categorise])/df[column_to_categorise].nunique())
        df[sin_column_name] = np.sin((2*np.pi *df[column_to_categorise])/df[column_to_categorise].nunique())

    for i in cos_sin_columns:
        cos_sin_categorise(data,f'cos_{i}',f'sin_{i}',i)

    return data

In [12]:
sales_submission = make_features(sales_submission)

Возьмем pr_uom_id из df_train.

Примем, что если у нас нет значений значит продукт продается по категории 1.

In [13]:
uom_dict = df_train.drop_duplicates(subset=['pr_sku_id']).set_index('pr_sku_id')['pr_uom_id'].to_dict()

In [14]:
sales_submission['pr_uom_id'] = sales_submission['pr_sku_id'].map(uom_dict).fillna(np.nan)

In [15]:
sales_submission['pr_uom_id'].isna().sum()

2702

In [16]:
sales_submission['pr_uom_id'] = sales_submission['pr_uom_id'].fillna(1)

In [17]:
sales_submission['pr_uom_id'].isna().sum()

0

In [18]:
sales_submission['pr_uom_id'] = sales_submission['pr_uom_id'].astype('int')

Предположим, что все товары продаются без промо.

In [19]:
sales_submission['pr_sales_type_id'] = 0

Возьмем те лаги которые возможно из df_test.

In [20]:
def compute_lags_rolling_means(df_test, sales_submission):

    df_test = df_test.sort_values(by='date').set_index(['st_id', 'pr_sku_id', 'date'])
    sales_submission = sales_submission.sort_values(by='date').set_index(['st_id', 'pr_sku_id', 'date'])

    combined = pd.concat([df_test, sales_submission], axis=0)

    for lag in [1, 2, 7, 14]:
        sales_submission['lag_{}_sales'.format(lag)] = combined['pr_sales_in_units'].groupby(level=[0, 1]).shift(lag).loc[sales_submission.index]

    for window in [2, 7, 14]:
        sales_submission['rolling_mean_{}'.format(window)] = combined['pr_sales_in_units'].groupby(level=[0, 1]).rolling(window=window).mean().reset_index(level=[0, 1], drop=True).loc[sales_submission.index]

    sales_submission = sales_submission.fillna(0).reset_index()

    return sales_submission

In [21]:
sales_submission = compute_lags_rolling_means(df_test, sales_submission)

Заполним признак выбросов.



In [22]:
sales_submission['is_outlier'] = 0

## Подготовка датафрейма к предсказанию

Уберем ненужные признаки.

In [23]:
sales_submission = sales_submission.drop(['month', 'day', 'day_of_week', 'target'], axis=1)

Приведем порядок признаков к тому, который был на обучении модели.

In [24]:
original_column_order = ['date',
                         'st_id',
                         'pr_sku_id',
                         'holiday',
                         'season',
                         'cos_month',
                         'sin_month',
                         'cos_day_of_week',
                         'sin_day_of_week',
                         'cos_day',
                         'sin_day',
                         'before_holidays_n_days',
                         'after_holidays_n_days',
                         'pr_sales_type_id',
                         'pr_uom_id',
                         'lag_1_sales',
                         'lag_2_sales',
                         'lag_7_sales',
                         'lag_14_sales',
                         'rolling_mean_2',
                         'rolling_mean_7',
                         'rolling_mean_14',
                         'is_outlier']

sales_submission = sales_submission[original_column_order]

## Рекурсивное предсказание с заполнением lag и rolling_mean

In [25]:
from tqdm import tqdm

def recursive_forecasting(model, sales_submission):

    sales_submission = sales_submission.sort_values(by=['st_id', 'pr_sku_id', 'date'])

    temp_date = sales_submission['date'].copy()

    sales_submission = sales_submission.drop('date', axis=1)

    for (store, product), group in tqdm(sales_submission.groupby(['st_id', 'pr_sku_id']), desc="Processing store-product pairs"):
        for index, row in group.iterrows():

            prediction = model.predict(row.dropna())

            sales_submission.loc[index, 'target'] = prediction

            next_idx = group.index.get_loc(index) + 1
            if next_idx < len(group):
                next_row_index = group.index[next_idx]

                for lag in [1, 2, 7, 14]:
                    if sales_submission.loc[next_row_index, 'lag_{}_sales'.format(lag)] == 0:
                        shifted_value = sales_submission['target'].shift(lag)
                        sales_submission.loc[next_row_index, 'lag_{}_sales'.format(lag)] = shifted_value[next_row_index] if next_row_index in shifted_value.index else 0

                        sales_submission['lag_{}_sales'.format(lag)].fillna(0, inplace=True)

                for window in [2, 7, 14]:
                    if sales_submission.loc[next_row_index, 'rolling_mean_{}'.format(window)] == 0:
                        rolling_value = sales_submission['target'].rolling(window=window).mean()
                        sales_submission.loc[next_row_index, 'rolling_mean_{}'.format(window)] = rolling_value[next_row_index] if next_row_index in rolling_value.index else 0

                        sales_submission['rolling_mean_{}'.format(window)].fillna(0, inplace=True)


    sales_submission['date'] = temp_date
    sales_submission['target'] = sales_submission['target'].apply(lambda x: max(0, x))

    return sales_submission.reset_index()

In [26]:
sales_submission_prediction = recursive_forecasting(model, sales_submission)

Processing store-product pairs: 100%|██████████| 3121/3121 [08:32<00:00,  6.08it/s]


## Выгрузка sales_submission

In [27]:
sales_submission_prediction = sales_submission_prediction.drop(['holiday',
                                                                'season',
                                                                'cos_month',
                                                                'sin_month',
                                                                'cos_day_of_week',
                                                                'sin_day_of_week',
                                                                'cos_day',
                                                                'sin_day',
                                                                'before_holidays_n_days',
                                                                'after_holidays_n_days',
                                                                'pr_sales_type_id',
                                                                'pr_uom_id',
                                                                'lag_1_sales',
                                                                'lag_2_sales',
                                                                'lag_7_sales',
                                                                'lag_14_sales',
                                                                'rolling_mean_2',
                                                                'rolling_mean_7',
                                                                'rolling_mean_14',
                                                                'is_outlier',
                                                                'index'], axis=1)

In [28]:
sales_submission_prediction.columns

Index(['st_id', 'pr_sku_id', 'target', 'date'], dtype='object')

In [29]:
sales_submission_original.columns

Index(['st_id', 'pr_sku_id', 'date', 'target'], dtype='object')

In [30]:
sales_submission_prediction = sales_submission_prediction[['st_id', 'pr_sku_id', 'date', 'target']]

In [31]:
sales_submission_prediction.head()

Unnamed: 0,st_id,pr_sku_id,date,target
0,16a5cdae362b8d27a1d8f8c7b78b4330,0045ebdb1069ff4b3dd3efe628c39cd3,2023-07-19,1.561
1,16a5cdae362b8d27a1d8f8c7b78b4330,0045ebdb1069ff4b3dd3efe628c39cd3,2023-07-20,1.892
2,16a5cdae362b8d27a1d8f8c7b78b4330,0045ebdb1069ff4b3dd3efe628c39cd3,2023-07-21,1.949
3,16a5cdae362b8d27a1d8f8c7b78b4330,0045ebdb1069ff4b3dd3efe628c39cd3,2023-07-22,1.359
4,16a5cdae362b8d27a1d8f8c7b78b4330,0045ebdb1069ff4b3dd3efe628c39cd3,2023-07-23,1.596


In [32]:
sales_submission_prediction.to_csv('/content/drive/MyDrive/hackathon_lenta/sales_submission_filled.csv')