# Градиентный бустинг на решающих деревьях

## Подготовка датасета


In [4]:
%matplotlib inline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll import scope

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

import time

In [5]:
test_parameters = {"n_estimators": 1000, "max_depth": 5, "learning_rate":0.1}

df = pd.read_csv('dataframe_YesIndex_YesHeader_C.csv', index_col=0)
df.head()

Unnamed: 0,Engine Capacity,Cylinders,Drive Type,Fuel Tank Capacity,Fuel Economy,Fuel Type,Horsepower,Torque,Transmission,Top Speed,...,Acceleration,Length,Width,Height,Wheelbase,Trunk Capacity,name,price,currency,Country
0,1.2,3,0,42.0,4.9,0,76,100.0,0,170,...,14.0,4.245,1.67,1.515,2.55,450.0,Mitsubishi Attrage 2021 1.2 GLX (Base),34099.0,0,0
1,1.2,3,0,42.0,4.9,0,76,100.0,0,170,...,14.0,4.245,1.67,1.515,2.55,450.0,Mitsubishi Attrage 2021 1.2 GLX (Base),34099.0,0,0
2,1.4,4,0,45.0,6.3,0,75,118.0,1,156,...,16.0,3.864,1.716,1.721,2.513,2800.0,Fiat Fiorino 2021 1.4L Standard,41250.0,0,0
3,1.6,4,0,50.0,6.4,0,102,145.0,0,180,...,11.0,4.354,1.994,1.529,2.635,510.0,Renault Symbol 2021 1.6L PE,44930.0,0,0
4,1.5,4,0,48.0,5.8,0,112,150.0,0,170,...,10.9,4.314,1.809,1.624,2.585,448.0,MG ZS 2021 1.5L STD,57787.0,0,0


In [6]:
len(df) # сколько всего строк

4560

In [7]:
len(df.groupby(["name"])) # => сколько уникальных названий машин

1734

In [8]:
df.isnull().any().any() # => NaN нет в таблице

False

In [9]:
df1 = df.copy()

In [10]:
import re
df1['year'] = df1['name'].map(lambda name: re.search('(20\d{2})', name).group())

In [11]:
len(df1.groupby(["year"])) # => сколько уникальных годов? Тк 1, то год учитывать не будем!  :)

1

In [12]:
df1 = df1.drop('year', axis=1)

In [13]:
df1['brand'] = df1['name'].map(lambda name: ' '.join(name.split()[:1]) )

In [14]:
df1 = df1.drop('name', axis=1)

In [15]:
df1.drop_duplicates()

Unnamed: 0,Engine Capacity,Cylinders,Drive Type,Fuel Tank Capacity,Fuel Economy,Fuel Type,Horsepower,Torque,Transmission,Top Speed,...,Acceleration,Length,Width,Height,Wheelbase,Trunk Capacity,price,currency,Country,brand
0,1.2,3,0,42.0,4.9,0,76,100.0,0,170,...,14.0,4.245,1.670,1.515,2.550,450.0,34099.0,0,0,Mitsubishi
2,1.4,4,0,45.0,6.3,0,75,118.0,1,156,...,16.0,3.864,1.716,1.721,2.513,2800.0,41250.0,0,0,Fiat
3,1.6,4,0,50.0,6.4,0,102,145.0,0,180,...,11.0,4.354,1.994,1.529,2.635,510.0,44930.0,0,0,Renault
4,1.5,4,0,48.0,5.8,0,112,150.0,0,170,...,10.9,4.314,1.809,1.624,2.585,448.0,57787.0,0,0,MG
5,1.4,4,0,35.0,5.1,0,98,127.0,0,170,...,12.0,3.636,1.597,1.483,2.385,314.0,53790.0,0,0,Chevrolet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5662,6.6,12,2,82.0,14.1,0,563,820.0,0,250,...,5.0,5.285,1.947,1.502,3.112,283.0,1300000.0,2,2,Rolls
5663,6.8,8,2,96.0,15.0,0,530,1100.0,0,305,...,4.9,5.575,1.926,1.521,3.266,443.0,1600000.0,2,2,Bentley
5664,6.7,12,2,100.0,14.8,0,563,900.0,0,250,...,5.3,5.762,2.018,1.646,3.552,548.0,1950000.0,2,2,Rolls
5665,6.5,12,1,90.0,16.0,0,770,720.0,0,350,...,2.8,4.343,2.098,1.136,2.700,140.0,1850000.0,2,2,Lamborghini


In [16]:
(df1["currency"]!=df1["Country"]).any()  # -> можно удалить столбец currency

False

In [17]:
df1 = df1.drop('currency', axis=1)

In [18]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4560 entries, 0 to 5667
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Engine Capacity     4560 non-null   float64
 1   Cylinders           4560 non-null   int64  
 2   Drive Type          4560 non-null   int64  
 3   Fuel Tank Capacity  4560 non-null   float64
 4   Fuel Economy        4560 non-null   float64
 5   Fuel Type           4560 non-null   int64  
 6   Horsepower          4560 non-null   int64  
 7   Torque              4560 non-null   float64
 8   Transmission        4560 non-null   int64  
 9   Top Speed           4560 non-null   int64  
 10  Seating Capacity    4560 non-null   int64  
 11  Acceleration        4560 non-null   float64
 12  Length              4560 non-null   float64
 13  Width               4560 non-null   float64
 14  Height              4560 non-null   float64
 15  Wheelbase           4560 non-null   float64
 16  Trunk 

In [19]:
df2 = df1.copy()

In [20]:
for (columnName, columnData) in df2.iteritems():
    print(str(columnName),len(df2.groupby([columnName])))  

Engine Capacity 51
Cylinders 8
Drive Type 3
Fuel Tank Capacity 75
Fuel Economy 134
Fuel Type 3
Horsepower 277
Torque 234
Transmission 3
Top Speed 143
Seating Capacity 8
Acceleration 121
Length 456
Width 250
Height 341
Wheelbase 263
Trunk Capacity 306
price 2011
Country 6
brand 66


  for (columnName, columnData) in df2.iteritems():


In [21]:
for (columnName, columnData) in df2.iteritems():
    if len(df2.groupby([columnName])) < 9:
        df2[columnName] = df2[columnName].astype("category")

  for (columnName, columnData) in df2.iteritems():


In [22]:
df2['brand'] = df2['brand'].astype("category")
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4560 entries, 0 to 5667
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Engine Capacity     4560 non-null   float64 
 1   Cylinders           4560 non-null   category
 2   Drive Type          4560 non-null   category
 3   Fuel Tank Capacity  4560 non-null   float64 
 4   Fuel Economy        4560 non-null   float64 
 5   Fuel Type           4560 non-null   category
 6   Horsepower          4560 non-null   int64   
 7   Torque              4560 non-null   float64 
 8   Transmission        4560 non-null   category
 9   Top Speed           4560 non-null   int64   
 10  Seating Capacity    4560 non-null   category
 11  Acceleration        4560 non-null   float64 
 12  Length              4560 non-null   float64 
 13  Width               4560 non-null   float64 
 14  Height              4560 non-null   float64 
 15  Wheelbase           4560 non-null   fl

## датасет А с категориальными признаками в виде категорий. 

In [23]:
A = df2.copy()

In [24]:
target_variable_A = A['price']

In [25]:
A = A.drop('price', axis=1)

In [26]:
category_vars_and_ind = {}
for (columnName, columnData) in A.iteritems():
    if A[columnName].dtypes == 'category':
        index_no = A.columns.get_loc(columnName)
        category_vars_and_ind[columnName] = index_no
category_vars_and_ind

  for (columnName, columnData) in A.iteritems():


{'Cylinders': 1,
 'Drive Type': 2,
 'Fuel Type': 5,
 'Transmission': 8,
 'Seating Capacity': 10,
 'Country': 17,
 'brand': 18}

In [27]:
cat_features = []
for (columnName, columnData) in A.iteritems():
    if A[columnName].dtypes == 'category':
        cat_features.append(columnName)
cat_features

  for (columnName, columnData) in A.iteritems():


['Cylinders',
 'Drive Type',
 'Fuel Type',
 'Transmission',
 'Seating Capacity',
 'Country',
 'brand']

In [28]:
cat_features_A = cat_features

In [29]:
for (columnName, columnData) in A.iteritems():
    if A[columnName].dtypes == 'category':
        A[columnName] = pd.factorize(A[columnName])[0]

  for (columnName, columnData) in A.iteritems():


## датасет B, с удаленными категориальными признаками.

In [30]:
B = df2.copy()

In [31]:
target_variable_B = B['price']

In [32]:
B = B.drop('price', axis=1)

In [33]:
for (columnName, columnData) in B.iteritems():
    if B[columnName].dtypes == 'category':
        B = B.drop(columnName, axis=1)

  for (columnName, columnData) in B.iteritems():


In [34]:
B.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4560 entries, 0 to 5667
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Engine Capacity     4560 non-null   float64
 1   Fuel Tank Capacity  4560 non-null   float64
 2   Fuel Economy        4560 non-null   float64
 3   Horsepower          4560 non-null   int64  
 4   Torque              4560 non-null   float64
 5   Top Speed           4560 non-null   int64  
 6   Acceleration        4560 non-null   float64
 7   Length              4560 non-null   float64
 8   Width               4560 non-null   float64
 9   Height              4560 non-null   float64
 10  Wheelbase           4560 non-null   float64
 11  Trunk Capacity      4560 non-null   float64
dtypes: float64(10), int64(2)
memory usage: 463.1 KB


## датасет C с категориальными признаками в виде one-hot encoding

In [35]:
C = df2.copy()

In [36]:
C.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4560 entries, 0 to 5667
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Engine Capacity     4560 non-null   float64 
 1   Cylinders           4560 non-null   category
 2   Drive Type          4560 non-null   category
 3   Fuel Tank Capacity  4560 non-null   float64 
 4   Fuel Economy        4560 non-null   float64 
 5   Fuel Type           4560 non-null   category
 6   Horsepower          4560 non-null   int64   
 7   Torque              4560 non-null   float64 
 8   Transmission        4560 non-null   category
 9   Top Speed           4560 non-null   int64   
 10  Seating Capacity    4560 non-null   category
 11  Acceleration        4560 non-null   float64 
 12  Length              4560 non-null   float64 
 13  Width               4560 non-null   float64 
 14  Height              4560 non-null   float64 
 15  Wheelbase           4560 non-null   fl

In [37]:
target_variable_C = C['price']

In [38]:
C = C.drop('price', axis=1)

In [39]:
C.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4560 entries, 0 to 5667
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Engine Capacity     4560 non-null   float64 
 1   Cylinders           4560 non-null   category
 2   Drive Type          4560 non-null   category
 3   Fuel Tank Capacity  4560 non-null   float64 
 4   Fuel Economy        4560 non-null   float64 
 5   Fuel Type           4560 non-null   category
 6   Horsepower          4560 non-null   int64   
 7   Torque              4560 non-null   float64 
 8   Transmission        4560 non-null   category
 9   Top Speed           4560 non-null   int64   
 10  Seating Capacity    4560 non-null   category
 11  Acceleration        4560 non-null   float64 
 12  Length              4560 non-null   float64 
 13  Width               4560 non-null   float64 
 14  Height              4560 non-null   float64 
 15  Wheelbase           4560 non-null   fl

In [40]:
C_one_hot = pd.get_dummies(C)
C_one_hot

Unnamed: 0,Engine Capacity,Fuel Tank Capacity,Fuel Economy,Horsepower,Torque,Top Speed,Acceleration,Length,Width,Height,...,brand_Rolls,brand_Seat,brand_Skoda,brand_SsangYong,brand_Subaru,brand_Suzuki,brand_Toyota,brand_Volkswagen,brand_Volvo,brand_ZNA
0,1.2,42.0,4.9,76,100.0,170,14.0,4.245,1.670,1.515,...,0,0,0,0,0,0,0,0,0,0
1,1.2,42.0,4.9,76,100.0,170,14.0,4.245,1.670,1.515,...,0,0,0,0,0,0,0,0,0,0
2,1.4,45.0,6.3,75,118.0,156,16.0,3.864,1.716,1.721,...,0,0,0,0,0,0,0,0,0,0
3,1.6,50.0,6.4,102,145.0,180,11.0,4.354,1.994,1.529,...,0,0,0,0,0,0,0,0,0,0
4,1.5,48.0,5.8,112,150.0,170,10.9,4.314,1.809,1.624,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5662,6.6,82.0,14.1,563,820.0,250,5.0,5.285,1.947,1.502,...,1,0,0,0,0,0,0,0,0,0
5663,6.8,96.0,15.0,530,1100.0,305,4.9,5.575,1.926,1.521,...,0,0,0,0,0,0,0,0,0,0
5664,6.7,100.0,14.8,563,900.0,250,5.3,5.762,2.018,1.646,...,1,0,0,0,0,0,0,0,0,0
5665,6.5,90.0,16.0,770,720.0,350,2.8,4.343,2.098,1.136,...,0,0,0,0,0,0,0,0,0,0


In [41]:
datasets = {'A' : A, 'B': B, 'C': C_one_hot}

# my awesome code here:

A_train, A_test, target_variable_A_train, target_variable_A_test = \
    train_test_split(A, target_variable_A, test_size=0.3, random_state=42)

B_train, B_test, target_variable_B_train, target_variable_B_test = \
    train_test_split(B, target_variable_B, test_size=0.3, random_state=42)

C_train, C_test, target_variable_C_train, target_variable_C_test = \
    train_test_split(C_one_hot, target_variable_C, test_size=0.3, random_state=42)

1. использование CatBoost предсказания стоимости автомобиля на всех построенных датасетах (A, B и C)
2. Подбор оптимального набор параметров модели с помощью библиотеки hyperopt

In [42]:
# my cool code here:

# A dataset:

clf = CatBoostRegressor(cat_features = cat_features_A, n_estimators=1000, max_depth=5, learning_rate=0.1)

In [43]:
clf.fit(A_train, target_variable_A_train)

0:	learn: 297349.3884788	total: 63.4ms	remaining: 1m 3s
1:	learn: 294515.5415768	total: 68.1ms	remaining: 34s
2:	learn: 291815.8138635	total: 69.1ms	remaining: 23s
3:	learn: 289177.6810620	total: 71.8ms	remaining: 17.9s
4:	learn: 286694.2201328	total: 72.8ms	remaining: 14.5s
5:	learn: 284313.5007777	total: 73.7ms	remaining: 12.2s
6:	learn: 281965.2282704	total: 77.2ms	remaining: 10.9s
7:	learn: 267490.8044030	total: 80.1ms	remaining: 9.93s
8:	learn: 265299.4654561	total: 81ms	remaining: 8.92s
9:	learn: 252178.7267968	total: 83.9ms	remaining: 8.31s
10:	learn: 250052.2142924	total: 86ms	remaining: 7.74s
11:	learn: 248038.8600784	total: 87ms	remaining: 7.16s
12:	learn: 246044.8166216	total: 90.2ms	remaining: 6.85s
13:	learn: 244189.1102547	total: 91.2ms	remaining: 6.43s
14:	learn: 242418.7913826	total: 91.9ms	remaining: 6.04s
15:	learn: 230982.4312035	total: 93.9ms	remaining: 5.77s
16:	learn: 229263.5917003	total: 96ms	remaining: 5.55s
17:	learn: 216079.2847159	total: 101ms	remaining: 5.5

187:	learn: 39418.2751160	total: 457ms	remaining: 1.97s
188:	learn: 39227.4223457	total: 460ms	remaining: 1.97s
189:	learn: 39128.7015449	total: 462ms	remaining: 1.97s
190:	learn: 38869.7323658	total: 464ms	remaining: 1.96s
191:	learn: 38806.2676267	total: 466ms	remaining: 1.96s
192:	learn: 38688.7848600	total: 469ms	remaining: 1.96s
193:	learn: 38422.0720816	total: 471ms	remaining: 1.96s
194:	learn: 38349.9397338	total: 473ms	remaining: 1.95s
195:	learn: 38247.4677662	total: 475ms	remaining: 1.95s
196:	learn: 38173.2958376	total: 478ms	remaining: 1.95s
197:	learn: 38010.4448659	total: 483ms	remaining: 1.96s
198:	learn: 37853.5712704	total: 485ms	remaining: 1.95s
199:	learn: 37800.8683242	total: 489ms	remaining: 1.95s
200:	learn: 37714.4488838	total: 491ms	remaining: 1.95s
201:	learn: 37636.6027988	total: 493ms	remaining: 1.95s
202:	learn: 37501.8192492	total: 495ms	remaining: 1.94s
203:	learn: 37454.9572411	total: 509ms	remaining: 1.99s
204:	learn: 37403.3566676	total: 511ms	remaining

414:	learn: 26546.1678337	total: 1.05s	remaining: 1.48s
415:	learn: 26542.6592530	total: 1.05s	remaining: 1.48s
416:	learn: 26528.5647963	total: 1.05s	remaining: 1.47s
417:	learn: 26472.0712314	total: 1.06s	remaining: 1.47s
418:	learn: 26452.5547406	total: 1.06s	remaining: 1.47s
419:	learn: 26438.8700717	total: 1.06s	remaining: 1.47s
420:	learn: 26408.7067717	total: 1.06s	remaining: 1.46s
421:	learn: 26403.3788575	total: 1.07s	remaining: 1.46s
422:	learn: 26380.1848809	total: 1.07s	remaining: 1.46s
423:	learn: 26375.0345544	total: 1.07s	remaining: 1.45s
424:	learn: 26347.1189925	total: 1.07s	remaining: 1.45s
425:	learn: 26321.5482652	total: 1.07s	remaining: 1.45s
426:	learn: 26279.8228695	total: 1.07s	remaining: 1.44s
427:	learn: 26254.8337639	total: 1.08s	remaining: 1.44s
428:	learn: 26225.0961459	total: 1.08s	remaining: 1.44s
429:	learn: 26177.4223088	total: 1.08s	remaining: 1.44s
430:	learn: 26118.4457151	total: 1.08s	remaining: 1.43s
431:	learn: 26049.8325299	total: 1.09s	remaining

566:	learn: 22401.3550440	total: 1.45s	remaining: 1.11s
567:	learn: 22391.8176671	total: 1.45s	remaining: 1.1s
568:	learn: 22384.2165970	total: 1.46s	remaining: 1.1s
569:	learn: 22355.7245702	total: 1.46s	remaining: 1.1s
570:	learn: 22339.5964122	total: 1.46s	remaining: 1.1s
571:	learn: 22302.0979900	total: 1.46s	remaining: 1.09s
572:	learn: 22282.5756962	total: 1.47s	remaining: 1.09s
573:	learn: 22260.5909769	total: 1.47s	remaining: 1.09s
574:	learn: 22256.6825558	total: 1.48s	remaining: 1.09s
575:	learn: 22231.2887578	total: 1.48s	remaining: 1.09s
576:	learn: 22227.1155487	total: 1.48s	remaining: 1.08s
577:	learn: 22224.4203561	total: 1.48s	remaining: 1.08s
578:	learn: 22173.8610730	total: 1.48s	remaining: 1.08s
579:	learn: 22131.1307544	total: 1.48s	remaining: 1.07s
580:	learn: 22124.4342844	total: 1.49s	remaining: 1.07s
581:	learn: 22103.9231416	total: 1.49s	remaining: 1.07s
582:	learn: 22085.9542913	total: 1.49s	remaining: 1.07s
583:	learn: 22054.0081875	total: 1.49s	remaining: 1.

778:	learn: 19243.3030260	total: 2.08s	remaining: 590ms
779:	learn: 19225.2751506	total: 2.08s	remaining: 588ms
780:	learn: 19206.9039475	total: 2.09s	remaining: 585ms
781:	learn: 19182.8539366	total: 2.09s	remaining: 582ms
782:	learn: 19153.8662651	total: 2.09s	remaining: 580ms
783:	learn: 19151.8112054	total: 2.09s	remaining: 577ms
784:	learn: 19146.4234963	total: 2.1s	remaining: 575ms
785:	learn: 19127.9216877	total: 2.1s	remaining: 573ms
786:	learn: 19114.9436430	total: 2.11s	remaining: 570ms
787:	learn: 19106.7564599	total: 2.11s	remaining: 567ms
788:	learn: 19092.6857210	total: 2.11s	remaining: 564ms
789:	learn: 19053.9894837	total: 2.11s	remaining: 561ms
790:	learn: 19047.9145837	total: 2.11s	remaining: 559ms
791:	learn: 19043.7026610	total: 2.12s	remaining: 556ms
792:	learn: 19010.2555095	total: 2.12s	remaining: 554ms
793:	learn: 19002.9280542	total: 2.12s	remaining: 551ms
794:	learn: 18981.1368249	total: 2.12s	remaining: 548ms
795:	learn: 18967.0088225	total: 2.13s	remaining: 

976:	learn: 17054.8246817	total: 2.69s	remaining: 63.3ms
977:	learn: 17052.9798726	total: 2.69s	remaining: 60.5ms
978:	learn: 17044.2658437	total: 2.69s	remaining: 57.8ms
979:	learn: 17023.0887292	total: 2.69s	remaining: 55ms
980:	learn: 17022.2305320	total: 2.7s	remaining: 52.3ms
981:	learn: 17016.0758031	total: 2.7s	remaining: 49.5ms
982:	learn: 16989.9662397	total: 2.7s	remaining: 46.8ms
983:	learn: 16985.6741062	total: 2.71s	remaining: 44ms
984:	learn: 16974.0407720	total: 2.71s	remaining: 41.3ms
985:	learn: 16962.3309200	total: 2.71s	remaining: 38.5ms
986:	learn: 16961.6325527	total: 2.72s	remaining: 35.8ms
987:	learn: 16953.5558509	total: 2.72s	remaining: 33.1ms
988:	learn: 16942.8420207	total: 2.73s	remaining: 30.3ms
989:	learn: 16938.2776123	total: 2.73s	remaining: 27.6ms
990:	learn: 16894.8561360	total: 2.73s	remaining: 24.8ms
991:	learn: 16885.8805758	total: 2.73s	remaining: 22ms
992:	learn: 16882.1672900	total: 2.73s	remaining: 19.3ms
993:	learn: 16860.7805143	total: 2.74s	r

<catboost.core.CatBoostRegressor at 0x179c83340>

In [44]:
A_pred_train = clf.predict(A_train)
A_pred_test = clf.predict(A_test)
print(f'MAE Train: {mean_absolute_error(target_variable_A_train, A_pred_train)}')
print(f'MAE Test: {mean_absolute_error(target_variable_A_test, A_pred_test)}')

MAE Train: 10650.33334242769
MAE Test: 21154.145603094705


In [45]:
from sklearn.metrics import mean_squared_error

In [46]:
print(f'RMSE Train: {mean_squared_error(target_variable_A_train, A_pred_train, squared=False)}')
print(f'RMSE Test: {mean_squared_error(target_variable_A_test, A_pred_test, squared=False)}')

RMSE Train: 24510.9093921808
RMSE Test: 172091.10904630335


In [47]:
from sklearn.metrics import r2_score

In [48]:
A_pred_train = clf.predict(A_train)
A_pred_test = clf.predict(A_test)
print(f'R2 Train: {r2_score(target_variable_A_train, A_pred_train)}')
print(f'R2 Test: {r2_score(target_variable_A_test, A_pred_test)}')

R2 Train: 0.9933371154635534
R2 Test: 0.8439075876036198


In [49]:
# B dataset:

clf_B = CatBoostRegressor(n_estimators=1000, max_depth=5, learning_rate=0.1)

clf_B.fit(B_train, target_variable_B_train)

0:	learn: 297329.8172780	total: 2.56ms	remaining: 2.56s
1:	learn: 294442.7717658	total: 3.5ms	remaining: 1.75s
2:	learn: 291722.6673989	total: 4.34ms	remaining: 1.44s
3:	learn: 289049.9734523	total: 5.71ms	remaining: 1.42s
4:	learn: 286532.2829388	total: 6.64ms	remaining: 1.32s
5:	learn: 284094.9355148	total: 7.7ms	remaining: 1.27s
6:	learn: 281795.3532938	total: 8.51ms	remaining: 1.21s
7:	learn: 273409.9447298	total: 9.52ms	remaining: 1.18s
8:	learn: 271261.0128054	total: 10.6ms	remaining: 1.17s
9:	learn: 269166.6309669	total: 12.2ms	remaining: 1.21s
10:	learn: 262354.1676373	total: 13.6ms	remaining: 1.22s
11:	learn: 260435.7531615	total: 14ms	remaining: 1.15s
12:	learn: 258376.4948375	total: 15.1ms	remaining: 1.15s
13:	learn: 256925.0108161	total: 16.3ms	remaining: 1.15s
14:	learn: 250918.0278899	total: 17.2ms	remaining: 1.13s
15:	learn: 249215.2607501	total: 18.4ms	remaining: 1.13s
16:	learn: 247627.6950191	total: 19.2ms	remaining: 1.11s
17:	learn: 246106.3497301	total: 22.1ms	remai

190:	learn: 164177.4689943	total: 197ms	remaining: 835ms
191:	learn: 164096.7378246	total: 199ms	remaining: 836ms
192:	learn: 164052.9455818	total: 200ms	remaining: 834ms
193:	learn: 163963.2927063	total: 200ms	remaining: 832ms
194:	learn: 163887.4367140	total: 201ms	remaining: 830ms
195:	learn: 163836.4410469	total: 202ms	remaining: 830ms
196:	learn: 163799.8931396	total: 204ms	remaining: 831ms
197:	learn: 163736.0825426	total: 205ms	remaining: 829ms
198:	learn: 163674.9244469	total: 206ms	remaining: 828ms
199:	learn: 163633.7744234	total: 207ms	remaining: 829ms
200:	learn: 163591.7908959	total: 208ms	remaining: 827ms
201:	learn: 163528.0312963	total: 209ms	remaining: 825ms
202:	learn: 163487.1131822	total: 210ms	remaining: 824ms
203:	learn: 163449.6038410	total: 211ms	remaining: 822ms
204:	learn: 163360.9542292	total: 211ms	remaining: 820ms
205:	learn: 163295.3159004	total: 212ms	remaining: 818ms
206:	learn: 163236.3612982	total: 213ms	remaining: 816ms
207:	learn: 163184.5891665	tota

375:	learn: 158526.1739615	total: 402ms	remaining: 667ms
376:	learn: 158519.6082930	total: 403ms	remaining: 666ms
377:	learn: 158493.1530091	total: 404ms	remaining: 664ms
378:	learn: 158470.0265664	total: 405ms	remaining: 663ms
379:	learn: 158447.0431319	total: 406ms	remaining: 662ms
380:	learn: 158436.8402425	total: 407ms	remaining: 661ms
381:	learn: 158424.7915020	total: 408ms	remaining: 660ms
382:	learn: 158412.5863926	total: 408ms	remaining: 658ms
383:	learn: 158400.2092317	total: 409ms	remaining: 656ms
384:	learn: 158392.9118052	total: 410ms	remaining: 655ms
385:	learn: 158377.3380448	total: 411ms	remaining: 654ms
386:	learn: 158359.4746601	total: 412ms	remaining: 653ms
387:	learn: 158348.4282770	total: 413ms	remaining: 651ms
388:	learn: 158340.0479619	total: 414ms	remaining: 650ms
389:	learn: 158322.3702114	total: 414ms	remaining: 648ms
390:	learn: 158300.7232392	total: 415ms	remaining: 647ms
391:	learn: 158287.7143346	total: 416ms	remaining: 645ms
392:	learn: 158273.4591078	tota

578:	learn: 156455.7310972	total: 597ms	remaining: 434ms
579:	learn: 156449.3378746	total: 597ms	remaining: 433ms
580:	learn: 156446.4354292	total: 598ms	remaining: 432ms
581:	learn: 156428.3981185	total: 599ms	remaining: 430ms
582:	learn: 156418.4310655	total: 600ms	remaining: 429ms
583:	learn: 156416.1314850	total: 601ms	remaining: 428ms
584:	learn: 156416.0603945	total: 601ms	remaining: 426ms
585:	learn: 156412.1612564	total: 602ms	remaining: 425ms
586:	learn: 156401.4467681	total: 603ms	remaining: 424ms
587:	learn: 156397.4643053	total: 603ms	remaining: 423ms
588:	learn: 156392.8265337	total: 604ms	remaining: 422ms
589:	learn: 156387.2167556	total: 606ms	remaining: 421ms
590:	learn: 156382.1923275	total: 607ms	remaining: 420ms
591:	learn: 156377.8591531	total: 608ms	remaining: 419ms
592:	learn: 156377.7502950	total: 608ms	remaining: 418ms
593:	learn: 156369.8891612	total: 609ms	remaining: 416ms
594:	learn: 156356.7002853	total: 610ms	remaining: 415ms
595:	learn: 156345.4279735	tota

779:	learn: 155358.7533302	total: 792ms	remaining: 223ms
780:	learn: 155357.7247388	total: 793ms	remaining: 222ms
781:	learn: 155354.2185652	total: 793ms	remaining: 221ms
782:	learn: 155349.7435397	total: 794ms	remaining: 220ms
783:	learn: 155346.9914758	total: 795ms	remaining: 219ms
784:	learn: 155341.4687149	total: 795ms	remaining: 218ms
785:	learn: 155336.4640286	total: 796ms	remaining: 217ms
786:	learn: 155335.8154130	total: 797ms	remaining: 216ms
787:	learn: 155335.7728387	total: 798ms	remaining: 215ms
788:	learn: 155331.8770349	total: 799ms	remaining: 214ms
789:	learn: 155329.7260820	total: 800ms	remaining: 213ms
790:	learn: 155328.4619625	total: 801ms	remaining: 212ms
791:	learn: 155324.9947625	total: 802ms	remaining: 211ms
792:	learn: 155310.5871241	total: 803ms	remaining: 210ms
793:	learn: 155310.4823474	total: 804ms	remaining: 209ms
794:	learn: 155302.9183178	total: 805ms	remaining: 208ms
795:	learn: 155299.5913645	total: 806ms	remaining: 207ms
796:	learn: 155299.5463867	tota

929:	learn: 154896.6739586	total: 987ms	remaining: 74.3ms
930:	learn: 154892.8539161	total: 988ms	remaining: 73.2ms
931:	learn: 154890.7271503	total: 989ms	remaining: 72.2ms
932:	learn: 154890.4253357	total: 990ms	remaining: 71.1ms
933:	learn: 154887.0697322	total: 992ms	remaining: 70.1ms
934:	learn: 154885.3084646	total: 993ms	remaining: 69ms
935:	learn: 154881.9527908	total: 995ms	remaining: 68ms
936:	learn: 154878.0831838	total: 996ms	remaining: 67ms
937:	learn: 154871.9065994	total: 997ms	remaining: 65.9ms
938:	learn: 154868.7667785	total: 999ms	remaining: 64.9ms
939:	learn: 154868.6744240	total: 1000ms	remaining: 63.8ms
940:	learn: 154861.8293553	total: 1s	remaining: 62.8ms
941:	learn: 154861.6602707	total: 1s	remaining: 61.9ms
942:	learn: 154858.1780266	total: 1.01s	remaining: 60.8ms
943:	learn: 154853.6919699	total: 1.01s	remaining: 59.7ms
944:	learn: 154846.9009633	total: 1.01s	remaining: 58.7ms
945:	learn: 154846.8492822	total: 1.01s	remaining: 57.6ms
946:	learn: 154845.266600

<catboost.core.CatBoostRegressor at 0x179dd0a30>

In [50]:
B_pred_train = clf_B.predict(B_train)
B_pred_test = clf_B.predict(B_test)
print(f'MAE Train: {mean_absolute_error(target_variable_B_train, B_pred_train)}')
print(f'MAE Test: {mean_absolute_error(target_variable_B_test, B_pred_test)}')

MAE Train: 94851.12330034138
MAE Test: 127155.67135781076


In [51]:
print(f'RMSE Train: {mean_squared_error(target_variable_B_train, B_pred_train, squared=False)}')
print(f'RMSE Test: {mean_squared_error(target_variable_B_test, B_pred_test, squared=False)}')

RMSE Train: 154729.14079311295
RMSE Test: 328914.80693087773


In [52]:
B_pred_train = clf_B.predict(B_train)
B_pred_test = clf_B.predict(B_test)
print(f'R2 Train: {r2_score(target_variable_B_train, B_pred_train)}')
print(f'R2 Test: {r2_score(target_variable_B_test, B_pred_test)}')

R2 Train: 0.734485852917942
R2 Test: 0.4297940098185955


In [53]:
# C dataset:

clf_C = CatBoostRegressor(n_estimators=1000, max_depth=5, learning_rate=0.1)

clf_C.fit(C_train, target_variable_C_train)

0:	learn: 297306.2072307	total: 1.92ms	remaining: 1.92s
1:	learn: 294986.2226358	total: 3.48ms	remaining: 1.73s
2:	learn: 292162.7370950	total: 5.15ms	remaining: 1.71s
3:	learn: 289455.6382826	total: 6.3ms	remaining: 1.57s
4:	learn: 286895.3963890	total: 7.58ms	remaining: 1.51s
5:	learn: 284445.3217551	total: 8.96ms	remaining: 1.48s
6:	learn: 277887.6474831	total: 10.1ms	remaining: 1.43s
7:	learn: 275617.7935768	total: 11.6ms	remaining: 1.44s
8:	learn: 273458.5222297	total: 12.3ms	remaining: 1.35s
9:	learn: 271397.4608567	total: 13.4ms	remaining: 1.32s
10:	learn: 269254.9132868	total: 14.3ms	remaining: 1.28s
11:	learn: 267312.8811281	total: 18.1ms	remaining: 1.49s
12:	learn: 257651.8691458	total: 19.7ms	remaining: 1.49s
13:	learn: 255823.6539473	total: 20.7ms	remaining: 1.46s
14:	learn: 251141.6210119	total: 21.6ms	remaining: 1.42s
15:	learn: 244873.8224794	total: 22.8ms	remaining: 1.4s
16:	learn: 243256.6840038	total: 23.6ms	remaining: 1.36s
17:	learn: 237701.7335744	total: 25.1ms	rem

161:	learn: 47664.1438847	total: 198ms	remaining: 1.02s
162:	learn: 47331.3439101	total: 204ms	remaining: 1.05s
163:	learn: 47125.4983274	total: 206ms	remaining: 1.05s
164:	learn: 46864.1560249	total: 207ms	remaining: 1.05s
165:	learn: 46542.4602755	total: 209ms	remaining: 1.05s
166:	learn: 46243.0589267	total: 210ms	remaining: 1.05s
167:	learn: 45951.8431520	total: 211ms	remaining: 1.04s
168:	learn: 45745.8404356	total: 212ms	remaining: 1.04s
169:	learn: 45459.4251315	total: 213ms	remaining: 1.04s
170:	learn: 45286.9367710	total: 214ms	remaining: 1.04s
171:	learn: 45141.3868934	total: 216ms	remaining: 1.04s
172:	learn: 44894.8059997	total: 216ms	remaining: 1.03s
173:	learn: 44650.2111165	total: 218ms	remaining: 1.03s
174:	learn: 44402.4146304	total: 219ms	remaining: 1.03s
175:	learn: 44270.3421294	total: 220ms	remaining: 1.03s
176:	learn: 44181.2205791	total: 221ms	remaining: 1.03s
177:	learn: 43949.7228754	total: 223ms	remaining: 1.03s
178:	learn: 43779.9270013	total: 224ms	remaining

311:	learn: 32108.8443964	total: 394ms	remaining: 870ms
312:	learn: 32057.5485213	total: 401ms	remaining: 881ms
313:	learn: 32027.7778729	total: 403ms	remaining: 880ms
314:	learn: 32011.5601912	total: 404ms	remaining: 878ms
315:	learn: 31946.3838178	total: 406ms	remaining: 879ms
316:	learn: 31865.1017821	total: 407ms	remaining: 877ms
317:	learn: 31815.7691689	total: 408ms	remaining: 876ms
318:	learn: 31762.4766236	total: 409ms	remaining: 873ms
319:	learn: 31737.9609436	total: 411ms	remaining: 872ms
320:	learn: 31647.6959747	total: 412ms	remaining: 871ms
321:	learn: 31608.9039621	total: 413ms	remaining: 870ms
322:	learn: 31558.7820522	total: 414ms	remaining: 869ms
323:	learn: 31501.0606609	total: 416ms	remaining: 867ms
324:	learn: 31458.6444327	total: 417ms	remaining: 866ms
325:	learn: 31416.7487791	total: 418ms	remaining: 864ms
326:	learn: 31357.4091556	total: 419ms	remaining: 862ms
327:	learn: 31283.2754577	total: 420ms	remaining: 861ms
328:	learn: 31256.7861195	total: 422ms	remaining

563:	learn: 24269.2712773	total: 789ms	remaining: 610ms
564:	learn: 24244.6166757	total: 790ms	remaining: 609ms
565:	learn: 24238.1996370	total: 792ms	remaining: 607ms
566:	learn: 24222.0775944	total: 793ms	remaining: 606ms
567:	learn: 24202.1662634	total: 794ms	remaining: 604ms
568:	learn: 24183.6082251	total: 796ms	remaining: 603ms
569:	learn: 24135.5086295	total: 797ms	remaining: 601ms
570:	learn: 24130.2322905	total: 798ms	remaining: 600ms
571:	learn: 24106.6938303	total: 799ms	remaining: 598ms
572:	learn: 24099.8720970	total: 800ms	remaining: 596ms
573:	learn: 24069.6408556	total: 801ms	remaining: 595ms
574:	learn: 24056.2758535	total: 803ms	remaining: 593ms
575:	learn: 24033.1892239	total: 804ms	remaining: 592ms
576:	learn: 24025.6487106	total: 807ms	remaining: 591ms
577:	learn: 24005.0246882	total: 808ms	remaining: 590ms
578:	learn: 23988.2425477	total: 809ms	remaining: 588ms
579:	learn: 23963.3906127	total: 810ms	remaining: 587ms
580:	learn: 23933.5108974	total: 811ms	remaining

863:	learn: 19833.8288055	total: 1.19s	remaining: 187ms
864:	learn: 19826.1547914	total: 1.19s	remaining: 185ms
865:	learn: 19810.7683094	total: 1.19s	remaining: 184ms
866:	learn: 19796.4458885	total: 1.19s	remaining: 182ms
867:	learn: 19785.7026193	total: 1.2s	remaining: 182ms
868:	learn: 19775.3882320	total: 1.2s	remaining: 180ms
869:	learn: 19763.8338996	total: 1.2s	remaining: 179ms
870:	learn: 19763.5607377	total: 1.21s	remaining: 178ms
871:	learn: 19747.9107188	total: 1.21s	remaining: 177ms
872:	learn: 19742.9897067	total: 1.21s	remaining: 176ms
873:	learn: 19732.9413527	total: 1.21s	remaining: 174ms
874:	learn: 19717.6933677	total: 1.21s	remaining: 173ms
875:	learn: 19708.2645072	total: 1.21s	remaining: 172ms
876:	learn: 19698.2309254	total: 1.21s	remaining: 170ms
877:	learn: 19694.6074892	total: 1.21s	remaining: 169ms
878:	learn: 19683.3715747	total: 1.22s	remaining: 167ms
879:	learn: 19667.9089031	total: 1.22s	remaining: 166ms
880:	learn: 19657.5164451	total: 1.22s	remaining: 1

<catboost.core.CatBoostRegressor at 0x179dd30d0>

In [54]:
C_pred_train = clf_C.predict(C_train)
C_pred_test = clf_C.predict(C_test)
print(f'MAE Train: {mean_absolute_error(target_variable_C_train, C_pred_train)}')
print(f'MAE Test: {mean_absolute_error(target_variable_C_test, C_pred_test)}')

MAE Train: 10708.356958625072
MAE Test: 27864.710366639745


In [55]:
print(f'RMSE Train: {mean_squared_error(target_variable_C_train, C_pred_train, squared=False)}')
print(f'RMSE Test: {mean_squared_error(target_variable_C_test, C_pred_test, squared=False)}')

RMSE Train: 18504.954764899063
RMSE Test: 202684.99777033206


In [56]:
C_pred_train = clf_C.predict(C_train)
C_pred_test = clf_C.predict(C_test)
print(f'R2 Train: {r2_score(target_variable_C_train, C_pred_train)}')
print(f'R2 Test: {r2_score(target_variable_C_test, C_pred_test)}')

R2 Train: 0.9962023101503417
R2 Test: 0.7834749563471507


## Подберите оптимальный набор параметров модели с помощью библиотеки hyperopt

In [57]:
# A dataset, MAE 
from sklearn.model_selection import KFold

In [61]:
def evaluate_catboost_for_A_dataset(params):
    kf = KFold(n_splits=5, shuffle=True)
    results = []
    for train_ids, val_ids in kf.split(A_train):
        curr_X_train = A_train.iloc[train_ids]
        curr_y_train = target_variable_A_train.iloc[train_ids]
        
        curr_X_val = A_train.iloc[val_ids]
        curr_y_val = target_variable_A_train.iloc[val_ids]
        
        clf = CatBoostRegressor(**params, cat_features=cat_features, verbose=0)
        clf.fit(curr_X_train, curr_y_train)
        y_pred = clf.predict(curr_X_val)
        curr_acc = r2_score(curr_y_val, y_pred)
        
        results.append(curr_acc)
    results = np.array(results) 
    return {'loss': -results.mean(), 'status': STATUS_OK}

In [62]:
space = {
    'max_depth': hp.choice('max_depth', [6, 7, 8, 9, 10]),
    'learning_rate': hp.choice('learning_rate', [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]),
    'n_estimators': hp.choice('n_estimators', [10, 100, 1000])
}

In [63]:
trials = Trials()
best_params = fmin(fn=evaluate_catboost_for_A_dataset, space=space, algo=tpe.suggest, max_evals=100,trials=trials)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [27:34<00:00, 16.54s/trial, best loss: -0.870298854048376]


In [64]:
trials.best_trial

{'state': 2,
 'tid': 66,
 'spec': None,
 'result': {'loss': -0.870298854048376, 'status': 'ok'},
 'misc': {'tid': 66,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'learning_rate': [66], 'max_depth': [66], 'n_estimators': [66]},
  'vals': {'learning_rate': [4], 'max_depth': [0], 'n_estimators': [2]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2023, 5, 5, 15, 11, 34, 882000),
 'refresh_time': datetime.datetime(2023, 5, 5, 15, 11, 51, 10000)}

In [65]:
print('Best parameters:', best_params)

Best parameters: {'learning_rate': 4, 'max_depth': 0, 'n_estimators': 2}


In [66]:
print('Best loss:', min(trials.losses()))

Best loss: -0.870298854048376
