# Библиотеки

In [1]:
# !pip install catboost

In [2]:
# !pip install ipywidgets

In [3]:
import pandas as pd
import numpy as np

from catboost import Pool, CatBoostRegressor

from sklearn.model_selection import train_test_split

# Данные

## Грузим

In [4]:
# Columns that are excluded from the feature matrices.
features_to_drop = ['description',
                    'sell_id',
                   ]

# Parse the training file once and split it into features and target
# (previously the same CSV was read twice). The target is kept as a
# single-column DataFrame, exactly as before.
train_df = pd.read_csv('train_preprocessed.csv')
X = train_df.drop(columns=['price', *features_to_drop])
y = train_df[['price']]

# The same columns are removed from the test file.
X_test = pd.read_csv('test_preprocessed.csv').drop(columns=['price', *features_to_drop])

In [5]:
# Peek at one randomly drawn row, transposed so each feature sits on its own line.
X.sample(n=1).transpose()

Unnamed: 0,6045
body_type,внедорожник
brand,BMW
color,чёрный
engine_displacement,2.0
engine_power,5.252273
fuel_type,дизель
mileage,83896
model_year,2014
n_doors,5
production_year,2016


In [6]:
# Show the column names of the training feature frame.
X.columns

Index(['body_type', 'brand', 'color', 'engine_displacement', 'engine_power',
       'fuel_type', 'mileage', 'model_year', 'n_doors', 'production_year',
       'vehicle_transmission', 'n_owners', 'ti_own', 'drive_type',
       'is_original_techpass', 'is_lefthand_drive', 'full_model_name'],
      dtype='object')

In [7]:
# Categorical features
cat_features_list = [
    'body_type',
    'brand',
    'color',
    'fuel_type',
    'n_doors',
    'vehicle_transmission',
    'drive_type',
    'n_owners',
    'is_original_techpass',
    'is_lefthand_drive',
    'full_model_name',
]

In [8]:
# Count the distinct values of each categorical feature over train and test together.
# full_model_name is clearly too expensive to one-hot encode; the others can be tried.
pd.concat([X, X_test]).loc[:, cat_features_list].nunique()

body_type                11
brand                     3
color                    16
fuel_type                 4
n_doors                   4
vehicle_transmission      4
drive_type                3
n_owners                  4
is_original_techpass      2
is_lefthand_drive         2
full_model_name         122
dtype: int64

In [9]:
# Log-transform the target; predictions are made in log space and converted back at the end.
# NOTE(review): y is overwritten with its own log, so re-running this cell applies the log again.
y = np.log(y)

## Делим для обучения

In [10]:
# Wrap the training features and target in a CatBoost Pool.
full_pool = Pool(data=X, label=y, cat_features=cat_features_list)

In [11]:
# CatBoostRegressor parameters shared by every model built below.
common_kwargs = dict(
    loss_function='MAE',
    eval_metric='MAPE',
    logging_level='Silent',
)
# Starting values of the hyperparameters that the tuning rounds overwrite.
tuned_kwargs = dict(
    depth=1,
    rsm=1.0,
    one_hot_max_size=2,
    random_strength=1,
    iterations=500,
)

# Тюним CatBoost

## Раунд 1

In [12]:
# Hyperparameter grid search, round 1.
model_grid = CatBoostRegressor(**common_kwargs,
                               **tuned_kwargs,
                              )
# grid_search expects every grid value to be a list of candidates, so a
# parameter that is held fixed is given as a one-element list rather than
# a bare scalar.
grid = {'depth': [4, 6, 8],
        'rsm': [0.5, 0.75, 1.0],
        'one_hot_max_size': [4, 11, 16],
        'random_strength': [1, 2],
        'iterations': [500],
       }

# grid_search_result = model_grid.grid_search(grid, full_pool, cv=3, plot=True)

In [13]:
# Values carried into the next round.
tuned_kwargs.update(depth=8, rsm=1, one_hot_max_size=11, random_strength=1)

## Раунд 2

In [14]:
# Hyperparameter grid search, round 2.
model_grid_2 = CatBoostRegressor(**common_kwargs,
                                 **tuned_kwargs,
                                )
# grid_search expects every grid value to be a list of candidates, so the
# fixed iteration count is given as a one-element list rather than a scalar.
grid_2 = {'depth': [8, 10, 12, 14, 16],
          'rsm': [0.9, 0.95, 1.0],
          'one_hot_max_size': [11],
          'random_strength': [1],
          'iterations': [500],
         }

# grid_search_result2 = model_grid_2.grid_search(grid_2, full_pool, cv=3, plot=True)

In [15]:
# Values carried into the next round.
tuned_kwargs.update(depth=8, rsm=1)

## Раунд 3

In [16]:
# Hyperparameter grid search, round 3.
model_grid_3 = CatBoostRegressor(**common_kwargs,
                                 **tuned_kwargs,
                                )
# grid_search expects every grid value to be a list of candidates, so the
# fixed iteration count is given as a one-element list rather than a scalar.
grid_3 = {'depth': [7, 8, 9],
          'rsm': [0.9, 1.0],
          'one_hot_max_size': [11],
          'random_strength': [1],
          'iterations': [500],
         }

# grid_search_result3 = model_grid_3.grid_search(grid_3, full_pool, cv=3, plot=True)

In [17]:
# Value carried into the next round.
tuned_kwargs.update(depth=8)

## Раунд последний

In [18]:
# Show the fixed and the tuned parameters, one dict per line.
print(common_kwargs, tuned_kwargs, sep='\n')

{'loss_function': 'MAE', 'eval_metric': 'MAPE', 'logging_level': 'Silent'}
{'depth': 8, 'rsm': 1, 'one_hot_max_size': 11, 'random_strength': 1, 'iterations': 500}


In [19]:
# Look for a tree count that does not run into overfitting.
model = CatBoostRegressor(**common_kwargs, **tuned_kwargs)

# Candidate iteration counts: 1200 to 2400 in steps of 200.
grid = {'iterations': list(range(1200, 2401, 200))}

# grid_search_result = model.grid_search(grid, full_pool, cv=3, plot=True, partition_random_seed=37)

In [20]:
# Iteration count used for the final model.
tuned_kwargs.update(iterations=2000)

# Результат для Kaggle

In [21]:
# Shapes of the training features and target.
display(X.shape, y.shape)
# Price column of the sample submission. The original note describes it as an
# empty vector that is needed for the full training run.
# NOTE(review): these are placeholder values, not real test labels.
y_test = pd.read_csv('sample_submission.csv')[['price']]
# Shapes of the test features and the placeholder target.
display(X_test.shape, y_test.shape)

(6682, 17)

(6682, 1)

(1671, 17)

(1671, 1)

In [22]:
# Build the CatBoost Pools for the final fit and for prediction.
train_pool = Pool(data=X, label=y, cat_features=cat_features_list)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features_list)

In [23]:
# Show the parameters the final model is built with, one dict per line.
print(common_kwargs, tuned_kwargs, sep='\n')

{'loss_function': 'MAE', 'eval_metric': 'MAPE', 'logging_level': 'Silent'}
{'depth': 8, 'rsm': 1, 'one_hot_max_size': 11, 'random_strength': 1, 'iterations': 2000}


In [24]:
# Train the final model on the whole training set.
model = CatBoostRegressor(**common_kwargs,
                          **tuned_kwargs,
                         )

# No eval_set is passed. test_pool only carries the placeholder price column
# from sample_submission.csv, so metrics evaluated on it are meaningless, and
# CatBoost's default use_best_model behaviour with an eval set could cut the
# model back to an arbitrary iteration.
model.fit(train_pool, logging_level='Silent', plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x2c1100d7130>

In [25]:
# Predict in log space and undo the log transform to get prices on the original scale.
y_test_predict = np.exp(model.predict(test_pool))

In [26]:
# Put the predicted prices into the sample submission layout and save it.
submission = pd.read_csv('sample_submission.csv').assign(price=y_test_predict)
submission.to_csv('submission_catboost.csv', index=False)
submission['price'].head()

0    5.576804e+05
1    2.413823e+06
2    6.310907e+05
3    4.967028e+05
4    2.669023e+06
Name: price, dtype: float64

In [27]:
# Persist the trained model to disk.
model.save_model(fname='best_catboost.cbm')

In [28]:
# Post-processing: floor every predicted price to a whole thousand and save a second file.
submission['price'] = submission['price'].floordiv(1000).mul(1000)
submission.to_csv('submission_catboost_post.csv', index=False)
submission['price'].head()

0     557000.0
1    2413000.0
2     631000.0
3     496000.0
4    2669000.0
Name: price, dtype: float64

In [29]:
# model = CatBoostRegressor()
# model.load_model('best_catboost.cbm')

In [30]:
# Feature importances of the fitted model, largest first.
(
    pd.Series(data=model.get_feature_importance(), index=X.columns)
    .sort_values(ascending=False)
)

production_year         18.973441
engine_power            15.290441
model_year              12.864236
mileage                 12.753227
engine_displacement      9.116506
full_model_name          8.498752
fuel_type                4.565574
brand                    3.211787
color                    3.189138
body_type                3.095768
drive_type               2.040362
n_doors                  2.030079
n_owners                 1.868826
vehicle_transmission     1.515610
ti_own                   0.660638
is_original_techpass     0.325451
is_lefthand_drive        0.000163
dtype: float64