# Обучение модели предсказания продаж

In [1]:
!pip install catboost



In [2]:
from google.colab import drive

# Mount Google Drive; the CSV inputs are read from below this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd

import catboost
from catboost import CatBoostRegressor, Pool

import pickle

In [4]:
# Notebook-level random seed constant.
RANDOM_SEED = 12_345

## Загрузка обработанных данных и создание признаков


In [5]:
# Read the train and test CSV files (train first, then test) from the
# mounted drive.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/hackathon_lenta/df_train.csv',
        '/content/drive/MyDrive/hackathon_lenta/df_test.csv',
    )
)

In [6]:
# Target is the `pr_sales_in_units` column; the features are every remaining
# column except 'Unnamed: 0' and 'date'.
target_train = df_train['pr_sales_in_units']

features_train = df_train.drop(
    columns=['pr_sales_in_units', 'Unnamed: 0', 'date'],
)

In [7]:
# Same split as for the training frame: `pr_sales_in_units` is the target,
# and it is dropped from the features together with 'Unnamed: 0' and 'date'.
target_test = df_test['pr_sales_in_units']

features_test = df_test.drop(
    columns=['pr_sales_in_units', 'Unnamed: 0', 'date'],
)

## Обучение модели

In [8]:
# Names of the columns passed to CatBoost as categorical features.
columns_to_encode = [
    'st_id',
    'pr_sku_id',
    'holiday',
    'season',
    'before_holidays_n_days',
    'after_holidays_n_days',
    'pr_uom_id',
    'pr_sales_type_id',
    'is_outlier',
]

In [9]:
# CatBoost regressor that optimises and reports RMSE.
# The seed is taken from the notebook-level RANDOM_SEED constant instead of a
# repeated literal, so changing the constant actually changes the model seed.
model = CatBoostRegressor(
    iterations=1500,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=RANDOM_SEED,
    cat_features=columns_to_encode,
    verbose=50,  # print a progress line every 50 iterations
)

In [10]:
# Train on the training features and target; the fitted model is the cell's
# last expression, so its repr is displayed.
model.fit(X=features_train, y=target_train)

Learning rate set to 0.082651
0:	learn: 14.8723265	total: 1.13s	remaining: 28m 7s
50:	learn: 7.0970828	total: 22.9s	remaining: 10m 49s
100:	learn: 6.7088149	total: 38.6s	remaining: 8m 53s
150:	learn: 6.4722270	total: 55s	remaining: 8m 11s
200:	learn: 6.2833267	total: 1m 13s	remaining: 7m 52s
250:	learn: 6.1441768	total: 1m 29s	remaining: 7m 23s
300:	learn: 6.0364274	total: 1m 44s	remaining: 6m 56s
350:	learn: 5.9333736	total: 2m	remaining: 6m 35s
400:	learn: 5.8014239	total: 2m 18s	remaining: 6m 18s
450:	learn: 5.6837514	total: 2m 34s	remaining: 5m 59s
500:	learn: 5.5947280	total: 2m 50s	remaining: 5m 39s
550:	learn: 5.5053099	total: 3m 8s	remaining: 5m 23s
600:	learn: 5.4354962	total: 3m 26s	remaining: 5m 8s
650:	learn: 5.3719273	total: 3m 43s	remaining: 4m 51s
700:	learn: 5.3193296	total: 4m	remaining: 4m 34s
750:	learn: 5.2726901	total: 4m 17s	remaining: 4m 16s
800:	learn: 5.2332271	total: 4m 35s	remaining: 4m
850:	learn: 5.1912527	total: 4m 52s	remaining: 3m 42s
900:	learn: 5.15030

<catboost.core.CatBoostRegressor at 0x7d8a6ef03670>

In [11]:
# Predict the target for the test features.
predictions = model.predict(data=features_test)

In [12]:
# WAPE = sum(|actual - predicted|) / sum(|actual|).
# The denominator sums absolute values so that any negative target values
# cannot shrink it or flip the sign of the metric.
absolute_error = np.abs(target_test - predictions)
wape = np.sum(absolute_error) / np.sum(np.abs(target_test))
print(f'CatBoost WAPE: {wape:.4}')

CatBoost WAPE: 0.4005


Важность признаков.

In [13]:
# Ask the fitted model for per-feature importances of type FeatureImportance.
importance_type = catboost.EFstrType.FeatureImportance
feature_importance = model.get_feature_importance(type=importance_type)

In [15]:
# Pair each training column with its importance and rank from most to least
# important, renumbering the rows from 0.
importance_df = (
    pd.DataFrame({
        'Признак': features_train.columns,
        'Важность': feature_importance,
    })
    .sort_values(by='Важность', ascending=False)
    .reset_index(drop=True)
)

In [16]:
# Show the ten highest-ranked features.
importance_df.head(n=10)

Unnamed: 0,Признак,Важность
0,lag_1_sales,23.168108
1,is_outlier,19.202784
2,rolling_mean_14,13.038671
3,rolling_mean_7,10.19362
4,lag_7_sales,6.344089
5,pr_sku_id,4.582081
6,cos_day_of_week,3.604708
7,lag_14_sales,3.5711
8,rolling_mean_2,3.39921
9,pr_uom_id,2.456832


## Сохранение модели (cbm, pkl) и данных (pkl) для backend

In [17]:
# Save the trained model in CatBoost's cbm format.
model.save_model(
    fname='/content/drive/MyDrive/hackathon_lenta/catboost_model_40.0.cbm',
    format='cbm',
)

In [18]:
# Also store the model object itself as a pickle file.
with open('/content/drive/MyDrive/hackathon_lenta/catboost_model_40.0.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [19]:
# Pickle the training DataFrame.
# The file handle needs its own name: binding it to `df_train` would replace
# the DataFrame with the file object, and pickle.dump would then try to
# serialise the file into itself and raise TypeError.
with open('/content/drive/MyDrive/hackathon_lenta/df_train.pkl', 'wb') as train_file:
    pickle.dump(df_train, train_file)

In [20]:
# Pickle the test DataFrame.
# The file handle needs its own name: binding it to `df_test` would replace
# the DataFrame with the file object, and pickle.dump would then try to
# serialise the file into itself and raise TypeError.
with open('/content/drive/MyDrive/hackathon_lenta/df_test.pkl', 'wb') as test_file:
    pickle.dump(df_test, test_file)

In [21]:
# Pickle the test feature matrix.
with open('/content/drive/MyDrive/hackathon_lenta/features_test.pkl', 'wb') as features_file:
    pickle.dump(features_test, features_file)

In [22]:
# Pickle the test target values.
with open('/content/drive/MyDrive/hackathon_lenta/target_test.pkl', 'wb') as target_file:
    pickle.dump(target_test, target_file)