CatBoost - Продвинутая библиотека градиентного бустинга на деревьях решений с открытым исходным кодом.

Заявляется, что хорошо относится к категориальным признакам и не требует их предварительной обработки.

In [1]:
# !pip install catboost

In [2]:
# !pip install ipywidgets

In [3]:
import pandas as pd
import numpy as np

from catboost import Pool, CatBoostRegressor

from sklearn.model_selection import train_test_split

# Загружаем данные

In [20]:
features_to_drop = ['description', 
                    'sell_id',
                    'time_ownership',
                   ]

X = pd.read_csv('train_preprocessed.csv').drop(columns='price').drop(columns=features_to_drop)
y = pd.read_csv('train_preprocessed.csv', usecols=['price'])

X_test = pd.read_csv('test_preprocessed.csv').drop(columns='price').drop(columns=features_to_drop)

In [21]:
X.columns

Index(['body_type', 'brand', 'color', 'engine_displacement', 'engine_power',
       'fuel_type', 'mileage', 'model_year', 'n_doors', 'production_year',
       'vehicle_transmission', 'n_owners', 'drive_type',
       'is_original_techpass', 'is_lefthand_drive', 'full_model_name'],
      dtype='object')

In [22]:
# категориальные признаки
cat_features_list = ['body_type', 
                     'brand', 
                     'color', 
                     'fuel_type', 
                     'n_doors', 
                     'vehicle_transmission', 
                     'drive_type', 
                     'n_owners', 
                     'is_original_techpass', 
                     'is_lefthand_drive', 
                     'full_model_name',
                    ]

In [18]:
# посмотрим сколько уникальных значений по кат-признакам
pd.concat([X, X_test])[cat_features_list].nunique()
# full_model_name точно будет слишком накладно делать one-hot, остальное можно проверить

body_type                11
brand                     3
color                    16
fuel_type                 4
n_doors                   4
vehicle_transmission      4
drive_type                3
n_owners                  4
is_original_techpass      2
is_lefthand_drive         2
full_model_name         122
dtype: int64

In [8]:
# разделим на обучающую и валидационную
test_size = 0.2
seed = 73
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=seed)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

(5345, 16) (5345, 1)
(1337, 16) (1337, 1)


In [9]:
# готовим для CatBoost
train_pool = Pool(X_train, y_train, cat_features=cat_features_list)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features_list)
full_pool = Pool(X, y, cat_features=cat_features_list)

In [10]:
# готовим тюнинг гиперпараметров по сетке #1
model_grid = CatBoostRegressor(iterations=1000, 
                               loss_function='MAE', 
                               eval_metric='MAPE',
                               logging_level='Silent')
grid = {'depth': [4, 6, 8],
        'rsm': [0.5, 0.75, 1.0],
        'one_hot_max_size': [4, 11, 16],
        'random_strength': [1, 2]}

# grid_search_result = model_grid.grid_search(grid, full_pool, cv=3, plot=True)

Выводы:
- one_hot_max_size = 11
- depth = 8+
- rsm = 0.5-
- random_strength - неважно

In [11]:
# готовим тюнинг гиперпараметров по сетке #2
model_grid_2 = CatBoostRegressor(iterations=1000, 
                               loss_function='MAE', 
                               eval_metric='MAPE',
                               one_hot_max_size=11,
                               logging_level='Silent')
grid_2 = {'depth': [8, 10, 12, 14, 16],
        'rsm': [0.3, 0.4, 0.5]
       }

# grid_search_result2 = model_grid_2.grid_search(grid_2, full_pool, cv=3, plot=True)

Выводы:
- one_hot_max_size = 11 - перепроверим
- depth = 8+-
- rsm = 0.5+-

In [12]:
# готовим тюнинг гиперпараметров по сетке #3
model_grid_3 = CatBoostRegressor(iterations=1000, 
                               loss_function='MAE', 
                               eval_metric='MAPE',
                               logging_level='Silent')
grid_3 = {'depth': [6, 7, 8, 9],
        'rsm': [0.40, 0.45, 0.50, 0.55],
        'one_hot_max_size': [3, 4, 11, 16],
       }

grid_search_result3 = model_grid_3.grid_search(grid_3, full_pool, cv=3, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	loss: 0.1260603	best: 0.1260603 (0)	total: 35.7s	remaining: 37m 30s
1:	loss: 0.1231594	best: 0.1231594 (1)	total: 1m 3s	remaining: 32m 52s
2:	loss: 0.1235969	best: 0.1231594 (1)	total: 1m 25s	remaining: 28m 53s
3:	loss: 0.1226935	best: 0.1226935 (3)	total: 1m 40s	remaining: 25m 10s
4:	loss: 0.1238229	best: 0.1226935 (3)	total: 2m 18s	remaining: 27m 15s
5:	loss: 0.1214369	best: 0.1214369 (5)	total: 2m 48s	remaining: 27m 7s
6:	loss: 0.1216724	best: 0.1214369 (5)	total: 3m 14s	remaining: 26m 22s
7:	loss: 0.1231307	best: 0.1214369 (5)	total: 3m 32s	remaining: 24m 44s
8:	loss: 0.1227620	best: 0.1214369 (5)	total: 4m 15s	remaining: 26m 2s
9:	loss: 0.1235839	best: 0.1214369 (5)	total: 4m 49s	remaining: 26m 5s
10:	loss: 0.1234280	best: 0.1214369 (5)	total: 5m 19s	remaining: 25m 38s
11:	loss: 0.1241031	best: 0.1214369 (5)	total: 5m 40s	remaining: 24m 34s
12:	loss: 0.1258569	best: 0.1214369 (5)	total: 6m 30s	remaining: 25m 29s
13:	loss: 0.1229895	best: 0.1214369 (5)	total: 7m 10s	remaining: 2

Выводы:
- one_hot_max_size = 11
- depth = 8
- rsm = 0.5

In [None]:
# учим модель
# model = CatBoostRegressor(iterations=5000, 
#                           loss_function='MAE', 
#                           eval_metric='MAPE', 
#                           custom_metric='RMSE', 
#                           one_hot_max_size=4)
# model.fit(train_pool,
#           eval_set=valid_pool, 
#           logging_level='Silent', plot=True)

In [28]:
# лучший результат
# display(model.get_best_score())
# число деревьев
# display(model.tree_count_)

# Результат для Kaggle

In [29]:
# X = pd.read_csv('EDAv2_Train.zip').drop(columns='price').drop(columns='description')
# y = pd.read_csv('EDAv2_Train.zip', usecols=['price'])
display(X.shape, y.shape)
# X_test = pd.read_csv('EDAv2_Test.zip').drop(columns='description')
y_test = pd.read_csv('sample_submission.csv', usecols=['price']) # это пустой вектор, но нужен для полного обучения
display(X_test.shape, y_test.shape)

(6682, 16)

(6682, 1)

(1671, 16)

(1671, 1)

In [30]:
# надо сделать одинаковый порядок фичей в train и test
# для этого сперва склеим их, а потом разделим
# X['is_train'] = 1
# X_test['is_train'] = 0
# X_ = X.append(X_test)
# X = X_[X_['is_train'] == 1].drop(columns=['is_train'])
# X_test = X_[X_['is_train'] == 0].drop(columns=['is_train'])

In [31]:
# категориальные признаки
cat_features_list = ['body_type', 
                     'brand', 
                     'color', 
                     'fuel_type', 
                     'n_doors', 
                     'vehicle_transmission', 
                     'drive_type', 
                     'n_owners', 
                     'is_original_techpass', 
                     'is_lefthand_drive', 
                     'full_model_name',
                    ]

In [32]:
display(X.columns)
display(X_test.columns)

Index(['body_type', 'brand', 'color', 'engine_displacement', 'engine_power',
       'fuel_type', 'mileage', 'model_year', 'n_doors', 'production_year',
       'vehicle_transmission', 'n_owners', 'drive_type',
       'is_original_techpass', 'is_lefthand_drive', 'full_model_name'],
      dtype='object')

Index(['body_type', 'brand', 'color', 'engine_displacement', 'engine_power',
       'fuel_type', 'mileage', 'model_year', 'n_doors', 'production_year',
       'vehicle_transmission', 'n_owners', 'drive_type',
       'is_original_techpass', 'is_lefthand_drive', 'full_model_name'],
      dtype='object')

In [33]:
# готовим для CatBoost
train_pool = Pool(X, y, cat_features=cat_features_list)
test_pool = Pool(X_test, y_test, cat_features=cat_features_list)

In [34]:
# учим модель
model = CatBoostRegressor(iterations=6000, 
                          loss_function='MAE', 
                          eval_metric='MAPE', 
                          custom_metric='RMSE',
                          one_hot_max_size=11,
                          depth=8,
                          rsm=0.5)

model.fit(train_pool, eval_set=test_pool, logging_level='Silent', plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1b318e1d3d0>

In [35]:
# лучший результат
display(model.get_best_score())
# число деревьев

display(model.tree_count_)

{'learn': {'MAE': 102873.68418196445,
  'MAPE': 0.05088361766052508,
  'RMSE': 374261.3531721225},
 'validation': {'MAE': 1312476.6244957072,
  'MAPE': 1312476.624496705,
  'RMSE': 1312983.8137653768}}

6000

In [36]:
# предсказываем результат
y_test_predict = model.predict(test_pool)

In [38]:
submission = pd.read_csv('sample_submission.csv')
submission['price'] = y_test_predict
submission.to_csv('submission_catboost.csv', index=False)

In [39]:
# Тюнинг хорошо помог. До него был результат 15+, а стал 13+. Но всё равно хуже бэггинга.
# Сохраним модель для стекинга.
model.save_model('model_catboost.cbm')