# Библиотеки

In [1]:
# !pip install catboost

In [2]:
# !pip install ipywidgets

In [3]:
import pandas as pd
import numpy as np

from catboost import Pool, CatBoostRegressor

from sklearn.model_selection import train_test_split

# Данные

## Грузим

In [4]:
# Columns excluded from the feature matrix.
features_to_drop = ['description',
                    'sell_id',
                   ]

# Parse the training CSV once and derive both the feature matrix and the
# target from the same frame (the file was previously read twice).
train_df = pd.read_csv('train_preprocessed.csv')
X = train_df.drop(columns=['price', *features_to_drop])
y = train_df[['price']]  # kept as a one-column DataFrame, as usecols=['price'] produced

# The test file gets the same column treatment as the training features.
X_test = pd.read_csv('test_preprocessed.csv').drop(columns=['price', *features_to_drop])

In [5]:
# Show one randomly drawn row, transposed so each feature is printed on its own line.
X.sample(1).T

Unnamed: 0,4326
body_type,внедорожник
brand,BMW
color,красный
engine_displacement,3.0
engine_power,510
fuel_type,бензин
mileage,2965
model_year,2019
n_doors,5
production_year,2019


In [6]:
# List the feature columns that remain after dropping the target and excluded columns.
X.columns

Index(['body_type', 'brand', 'color', 'engine_displacement', 'engine_power',
       'fuel_type', 'mileage', 'model_year', 'n_doors', 'production_year',
       'vehicle_transmission', 'n_owners', 'ti_own', 'drive_type',
       'is_original_techpass', 'is_lefthand_drive', 'full_model_name'],
      dtype='object')

In [7]:
# Names of the columns that are handed to CatBoost as categorical features.
cat_features_list = [
    'body_type', 'brand', 'color', 'fuel_type',
    'n_doors', 'vehicle_transmission', 'drive_type', 'n_owners',
    'is_original_techpass', 'is_lefthand_drive',
    'full_model_name',
]

In [8]:
# Count the distinct values of each categorical feature over train and test combined.
pd.concat([X, X_test])[cat_features_list].nunique()
# full_model_name is certainly too expensive to one-hot encode; the others can be tried

body_type                11
brand                     3
color                    16
fuel_type                 4
n_doors                   4
vehicle_transmission      4
drive_type                3
n_owners                  4
is_original_techpass      2
is_lefthand_drive         2
full_model_name         122
dtype: int64

## Делим для обучения

In [9]:
# разделим на обучающую и валидационную
# test_size = 0.2
# seed = 73
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=seed)
# print(X_train.shape, y_train.shape)
# print(X_valid.shape, y_valid.shape)

In [10]:
# Wrap the full training data in a CatBoost Pool, marking the categorical columns.
full_pool = Pool(X, y, cat_features=cat_features_list)

In [11]:
# CatBoostRegressor parameters.
# common_kwargs: settings shared by every model built in this notebook.
common_kwargs = dict(
    loss_function='MAE',
    eval_metric='MAPE',
    logging_level='Silent',
)
# tuned_kwargs: starting values of the hyperparameters being tuned;
# later cells overwrite individual entries between tuning rounds.
tuned_kwargs = dict(
    depth=1,
    rsm=1.0,
    one_hot_max_size=2,
    random_strength=1,
    iterations=500,
)

# Тюним CatBoost

## Раунд 1

In [12]:
# Hyperparameter grid search, round 1.
model_grid = CatBoostRegressor(**common_kwargs,
                               **tuned_kwargs,
                              )

# grid_search expects every value to be a list of candidate settings, so a
# parameter that stays fixed is written as a one-element list (a bare 500
# is not iterable and would make the search fail once it is re-enabled).
grid = {'depth': [4, 6, 8],
        'rsm': [0.5, 0.75, 1.0],
        'one_hot_max_size': [4, 11, 16],
        'random_strength': [1, 2],
        'iterations': [500],
       }

# The search itself is disabled; uncomment to re-run it.
# grid_search_result = model_grid.grid_search(grid, full_pool, cv=3, plot=True)

In [13]:
# Settings carried into the next tuning round.
tuned_kwargs.update({
    'depth': 8,
    'rsm': 1,
    'one_hot_max_size': 11,
    'random_strength': 1,
})

## Раунд 2

In [14]:
# Hyperparameter grid search, round 2.
model_grid_2 = CatBoostRegressor(**common_kwargs,
                                 **tuned_kwargs,
                                )

# grid_search expects every value to be a list of candidate settings, so a
# parameter that stays fixed is written as a one-element list (a bare 500
# is not iterable and would make the search fail once it is re-enabled).
grid_2 = {'depth': [8, 10, 12, 14, 16],
          'rsm': [0.9, 0.95, 1.0],
          'one_hot_max_size': [11],
          'random_strength': [1],
          'iterations': [500],
         }

# The search itself is disabled; uncomment to re-run it.
# grid_search_result2 = model_grid_2.grid_search(grid_2, full_pool, cv=3, plot=True)

In [15]:
# Settings carried into the next tuning round.
tuned_kwargs.update(depth=8, rsm=1)

## Раунд 3

In [16]:
# Hyperparameter grid search, round 3.
model_grid_3 = CatBoostRegressor(**common_kwargs,
                                 **tuned_kwargs,
                                )

# grid_search expects every value to be a list of candidate settings, so a
# parameter that stays fixed is written as a one-element list (a bare 500
# is not iterable and would make the search fail once it is re-enabled).
grid_3 = {'depth': [7, 8, 9],
          'rsm': [0.9, 1.0],
          'one_hot_max_size': [11],
          'random_strength': [1],
          'iterations': [500],
         }

# The search itself is disabled; uncomment to re-run it.
# grid_search_result3 = model_grid_3.grid_search(grid_3, full_pool, cv=3, plot=True)

In [17]:
# Setting carried into the next tuning round.
tuned_kwargs.update(depth=8)

## Раунд последний

In [18]:
# меняем рандом
# test_size = 0.2
# seed = 37
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=seed)

# full_pool = Pool(X, y, cat_features=cat_features_list)

In [19]:
# Print the fixed and the tuned parameter sets, one dict per line.
print(common_kwargs, tuned_kwargs, sep='\n')

{'loss_function': 'MAE', 'eval_metric': 'MAPE', 'logging_level': 'Silent'}
{'depth': 8, 'rsm': 1, 'one_hot_max_size': 11, 'random_strength': 1, 'iterations': 500}


In [20]:
# Look for a number of trees that does not push the model into overfitting.
model = CatBoostRegressor(**common_kwargs, **tuned_kwargs)

# Only the tree count varies in this last round.
grid = {'iterations': [900, 1000, 1100, 1400]}

grid_search_result = model.grid_search(grid, full_pool, cv=3, plot=True)

# Disabled alternative, kept for reference:
# model.fit(train_pool, eval_set=valid_pool, logging_level='Silent', plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	loss: 0.1201410	best: 0.1201410 (0)	total: 35.7s	remaining: 1m 47s
1:	loss: 0.1199297	best: 0.1199297 (1)	total: 1m 16s	remaining: 1m 16s
2:	loss: 0.1198494	best: 0.1198494 (2)	total: 2m 3s	remaining: 41.2s
3:	loss: 0.1198494	best: 0.1198494 (2)	total: 2m 58s	remaining: 0us
Estimating final quality...


In [22]:
tuned_kwargs['iterations'] = 1400  # this gave the best result; reproducing it ran into a problem

# Результат для Kaggle

In [23]:
# Show the shapes of the training features and target.
display(X.shape, y.shape)
y_test = pd.read_csv('sample_submission.csv', usecols=['price']) # an empty (placeholder) vector, but needed for the full training run
# Show the shapes of the test features and the placeholder target.
display(X_test.shape, y_test.shape)

(6682, 17)

(6682, 1)

(1671, 17)

(1671, 1)

In [24]:
# Build the CatBoost pools for the final fit and for prediction.
train_pool = Pool(X, y, cat_features=cat_features_list)
# NOTE(review): y_test is read from sample_submission.csv, so the labels in this
# pool are placeholders rather than real prices.
test_pool = Pool(X_test, y_test, cat_features=cat_features_list)

In [25]:
# Train the final model on the complete training set.
model = CatBoostRegressor(**common_kwargs,
                          **tuned_kwargs,
                         )

# No eval_set is passed: the only pool available for it, test_pool, is
# labelled with the placeholder prices from sample_submission.csv. Metrics
# computed against those labels are meaningless, and when an eval_set is
# given CatBoost may keep only the iteration that scores best on it
# (use_best_model), which would truncate the model based on fake targets.
model.fit(train_pool, logging_level='Silent', plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1b282dd1ca0>

In [26]:
# Predict prices for the test pool with the fitted model.
y_test_predict = model.predict(test_pool)

In [27]:
# Load the sample submission, overwrite its price column with the predictions,
# and write the result without the index column.
submission = pd.read_csv('sample_submission.csv').assign(price=y_test_predict)
submission.to_csv('submission_catboost.csv', index=False)

In [28]:
# Save the trained model to disk.
model.save_model('model_catboost.cbm')