CatBoost — продвинутая библиотека градиентного бустинга на деревьях решений с открытым исходным кодом.

Заявляется, что библиотека хорошо работает с категориальными признаками и не требует их предварительной обработки.

In [1]:
# !pip install catboost

In [2]:
# !pip install ipywidgets

In [3]:
import pandas as pd
import numpy as np

from catboost import Pool, CatBoostRegressor

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Загружаем данные

In [4]:
# Parse the training archive once and derive both the feature matrix and the
# target from the same frame (the original read and decompressed it twice).
train_df = pd.read_csv('EDAv1_Train.zip')
X = train_df.drop(columns=['price', 'model_name'])
# Double brackets keep y a one-column DataFrame, matching the previous
# read_csv(..., usecols=['price']) result.
y = train_df[['price']]

# The test set carries no target; drop the same unused 'model_name' column.
X_test = pd.read_csv('EDAv1_Test.zip').drop(columns='model_name')

In [5]:
# Show the feature columns left after dropping 'price' and 'model_name'.
X.columns

Index(['body_type', 'brand', 'color', 'fuel_type', 'model_year', 'n_doors',
       'production_year', 'vehicle_transmission', 'engine_power', 'mileage',
       'drive_type', 'n_owners', 'is_original_techpass', 'is_lefthand_drive',
       'engine_displacement'],
      dtype='object')

In [6]:
# Columns to be treated as categorical features.
cat_features_list = [
    'body_type',
    'brand',
    'color',
    'fuel_type',
    'n_doors',
    'vehicle_transmission',
    'drive_type',
    'n_owners',
    'is_original_techpass',
    'is_lefthand_drive',
]

In [7]:
# Split into training and validation subsets; the fixed random_state makes
# the split repeatable between runs.
test_size = 0.2
seed = 73
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)
# Print the (rows, columns) shape of features and target for each subset.
print(*(part.shape for part in (X_train, y_train)))
print(*(part.shape for part in (X_valid, y_valid)))

(69482, 15) (69482, 1)
(17371, 15) (17371, 1)


In [8]:
# Wrap every subset in a CatBoost Pool, declaring the categorical columns.
# The test pool is built without a label.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_list)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features_list)
test_pool = Pool(data=X_test, cat_features=cat_features_list)

In [9]:
# Train a baseline model as-is: MAE is the optimised loss, MAPE is the metric
# evaluated on the validation pool.
model = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAPE',
    iterations=5000,
)
# Silence the text log and show the interactive training plot instead.
model.fit(
    train_pool,
    eval_set=valid_pool,
    plot=True,
    logging_level='Silent',
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1d7e876cca0>

In [10]:
# Predict the target for the test pool with the fitted model.
y_test_predict = model.predict(data=test_pool)

In [11]:
# Load the empty submission template, set its 'price' column to the
# predictions, and save the result without the index column.
submission = pd.read_csv('sample_submission_empty.csv').assign(price=y_test_predict)
submission.to_csv('sample_submission_catboost.csv', index=False)