# 1. Подготовка данных

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import lightgbm as lgb

data = pd.read_csv('/datasets/autos.csv')


print(data.head())
print()
print(data.info())

           DateCrawled  Price VehicleType  RegistrationYear Gearbox  Power  \
0  2016-03-24 11:52:17    480         NaN              1993  manual      0   
1  2016-03-24 10:58:45  18300       coupe              2011  manual    190   
2  2016-03-14 12:52:21   9800         suv              2004    auto    163   
3  2016-03-17 16:54:04   1500       small              2001  manual     75   
4  2016-03-31 17:25:20   3600       small              2008  manual     69   

   Model  Kilometer  RegistrationMonth  FuelType       Brand NotRepaired  \
0   golf     150000                  0    petrol  volkswagen         NaN   
1    NaN     125000                  5  gasoline        audi         yes   
2  grand     125000                  8  gasoline        jeep         NaN   
3   golf     150000                  6    petrol  volkswagen          no   
4  fabia      90000                  7  gasoline       skoda          no   

           DateCreated  NumberOfPictures  PostalCode             LastSeen 

Данные загружены. Для дальнейшей работы необходимо обработать пропуски. Сделаем 2 датасета для каждого типа моделей.

In [2]:
# удалим неинформативные признаки
data=data.drop(['DateCrawled','DateCreated','NumberOfPictures','PostalCode','LastSeen'],axis=1)
data1=data

In [3]:
data = data.fillna('')
print(data.head())
print()
print(data.info())

   Price VehicleType  RegistrationYear Gearbox  Power  Model  Kilometer  \
0    480                          1993  manual      0   golf     150000   
1  18300       coupe              2011  manual    190            125000   
2   9800         suv              2004    auto    163  grand     125000   
3   1500       small              2001  manual     75   golf     150000   
4   3600       small              2008  manual     69  fabia      90000   

   RegistrationMonth  FuelType       Brand NotRepaired  
0                  0    petrol  volkswagen              
1                  5  gasoline        audi         yes  
2                  8  gasoline        jeep              
3                  6    petrol  volkswagen          no  
4                  7  gasoline       skoda          no  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 11 columns):
Price                354369 non-null int64
VehicleType          354369 non-null object
Registra

Пропуски обработаны для работы с catboost. NaN заменены на ''.

In [4]:
# разобьем данные на выборки
data_valid_train, data_test = train_test_split(data, test_size=0.2, random_state=12345)
data_train, data_valid = train_test_split(data_valid_train, test_size=0.25, random_state=12345)

features_train = data_train.drop(['Price'], axis=1)
target_train = data_train['Price']
features_test = data_test.drop(['Price'], axis=1)
target_test = data_test['Price']
features_valid = data_valid.drop(['Price'], axis=1)
target_valid = data_valid['Price']




cat_features = ['VehicleType', 'Gearbox', 'Model',
                'FuelType', 'Brand', 'NotRepaired']

Данные разбиты на 3 части в пропорции 3:1:1 (обучающая, валидационная, тестовая). Заданы признаки и целевые признаки.(для catboost)

In [5]:
for c in data1.columns:
    col_type = data1[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        data1[c] = data1[c].astype('category')

        
print(data1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 11 columns):
Price                354369 non-null int64
VehicleType          316879 non-null category
RegistrationYear     354369 non-null int64
Gearbox              334536 non-null category
Power                354369 non-null int64
Model                334664 non-null category
Kilometer            354369 non-null int64
RegistrationMonth    354369 non-null int64
FuelType             321474 non-null category
Brand                354369 non-null category
NotRepaired          283215 non-null category
dtypes: category(6), int64(5)
memory usage: 15.9 MB
None


Данные обработаны для работы с lightgbm. Все не количественные переменные приведены к типу category.

In [6]:
data_valid_train1, data_test1 = train_test_split(data1, test_size=0.2, random_state=12345)
data_train1, data_valid1 = train_test_split(data_valid_train1, test_size=0.25, random_state=12345)

features_train1 = data_train1.drop(['Price'], axis=1)
target_train1 = data_train1['Price']
features_test1 = data_test1.drop(['Price'], axis=1)
target_test1 = data_test1['Price']
features_valid1 = data_valid1.drop(['Price'], axis=1)
target_valid1 = data_valid1['Price']


Данные разбиты на 3 части в пропорции 3:1:1 (обучающая, валидационная, тестовая). Заданы признаки и целевые признаки.(для lightgbm)

# 2. Обучение моделей

In [7]:
model1 = CatBoostRegressor(loss_function="RMSE", iterations=50, depth=4)
model2 = CatBoostRegressor(loss_function="RMSE", iterations=100, depth=6)
model3 = CatBoostRegressor(loss_function="RMSE", iterations=150, depth=10)
model4 = CatBoostRegressor(loss_function="RMSE", iterations=200, depth=12)

model5 = lgb.LGBMRegressor(metric='RMSE',num_iterations=50, max_depth=4)
model6 = lgb.LGBMRegressor(metric='RMSE',num_iterations=100, max_depth=6)
model7 = lgb.LGBMRegressor(metric='RMSE',num_iterations=150, max_depth=10)
model8 = lgb.LGBMRegressor(metric='RMSE',num_iterations=200, max_depth=12)

Обучено 8 моделей со схожими гиперпараметрами, по 4 на lightgbm и catboost.

# 3. Анализ моделей

In [8]:
%%time
model1.fit(features_train, target_train, cat_features=cat_features, verbose=10)

0:	learn: 4440.3247085	total: 417ms	remaining: 20.4s
10:	learn: 3812.4100363	total: 3s	remaining: 10.6s
20:	learn: 3381.4800970	total: 5.38s	remaining: 7.43s
30:	learn: 3081.2157822	total: 7.76s	remaining: 4.76s
40:	learn: 2864.9652541	total: 10.1s	remaining: 2.23s
49:	learn: 2715.1855265	total: 12.3s	remaining: 0us
CPU times: user 11.5 s, sys: 1.68 s, total: 13.2 s
Wall time: 16.2 s


<catboost.core.CatBoostRegressor at 0x7f7cfa8af790>

In [9]:
%%time
predicted_valid = model1.predict(features_valid)

CPU times: user 188 ms, sys: 8 ms, total: 196 ms
Wall time: 154 ms


In [10]:
mse = mean_squared_error(target_valid, predicted_valid)
rmse1= mse ** 0.5
print("CatBoostRegressor,iterations=50, depth=4")
print("RMSE =",rmse1)

CatBoostRegressor,iterations=50, depth=4
RMSE = 2699.8074389536773


In [11]:
%%time
model2.fit(features_train, target_train, cat_features=cat_features, verbose=10)

0:	learn: 4432.0365296	total: 457ms	remaining: 45.2s
10:	learn: 3732.2992369	total: 4.04s	remaining: 32.7s
20:	learn: 3252.3260608	total: 7.31s	remaining: 27.5s
30:	learn: 2925.6542885	total: 10.6s	remaining: 23.6s
40:	learn: 2690.6311857	total: 13.8s	remaining: 19.8s
50:	learn: 2521.5831692	total: 17s	remaining: 16.3s
60:	learn: 2407.5477709	total: 20.1s	remaining: 12.9s
70:	learn: 2322.3379662	total: 23.5s	remaining: 9.6s
80:	learn: 2252.0919285	total: 26.7s	remaining: 6.26s
90:	learn: 2195.5697783	total: 29.8s	remaining: 2.94s
99:	learn: 2160.2484400	total: 32.8s	remaining: 0us
CPU times: user 30 s, sys: 3.72 s, total: 33.7 s
Wall time: 38.8 s


<catboost.core.CatBoostRegressor at 0x7f7cfa8af750>

In [12]:
%%time
predicted_valid = model2.predict(features_valid)

CPU times: user 208 ms, sys: 24 ms, total: 232 ms
Wall time: 205 ms


In [13]:
mse = mean_squared_error(target_valid, predicted_valid)
rmse2= mse ** 0.5
print("CatBoostRegressor, iterations=100, depth=6")
print("RMSE =",rmse2)

CatBoostRegressor, iterations=100, depth=6
RMSE = 2134.7543453588305


In [14]:
%%time
model3.fit(features_train, target_train, cat_features=cat_features, verbose=10)

0:	learn: 4425.4604998	total: 559ms	remaining: 1m 23s
10:	learn: 3663.4085248	total: 5.54s	remaining: 1m 9s
20:	learn: 3138.9000010	total: 10.3s	remaining: 1m 3s
30:	learn: 2788.6572721	total: 15.2s	remaining: 58.3s
40:	learn: 2533.2533354	total: 20s	remaining: 53.1s
50:	learn: 2354.0484642	total: 24.9s	remaining: 48.3s
60:	learn: 2237.5541540	total: 29.7s	remaining: 43.4s
70:	learn: 2148.3761284	total: 34.5s	remaining: 38.4s
80:	learn: 2076.2072429	total: 39.2s	remaining: 33.4s
90:	learn: 2025.6636814	total: 44.1s	remaining: 28.6s
100:	learn: 1991.1845046	total: 49s	remaining: 23.7s
110:	learn: 1965.1552908	total: 53.8s	remaining: 18.9s
120:	learn: 1944.2796526	total: 58.6s	remaining: 14s
130:	learn: 1927.4119954	total: 1m 3s	remaining: 9.21s
140:	learn: 1913.7536778	total: 1m 8s	remaining: 4.36s
149:	learn: 1904.1232325	total: 1m 12s	remaining: 0us
CPU times: user 1min 7s, sys: 6.03 s, total: 1min 13s
Wall time: 1min 18s


<catboost.core.CatBoostRegressor at 0x7f7cfa8af7d0>

In [15]:
%%time
predicted_valid = model3.predict(features_valid)

CPU times: user 284 ms, sys: 20 ms, total: 304 ms
Wall time: 301 ms


In [16]:
mse = mean_squared_error(target_valid, predicted_valid)
rmse3= mse ** 0.5
print("CatBoostRegressor, iterations=150, depth=10")
print("RMSE =", rmse3)

CatBoostRegressor, iterations=150, depth=10
RMSE = 1892.9573468039723


In [17]:
%%time
model4.fit(features_train, target_train, cat_features=cat_features, verbose=10)

0:	learn: 4419.5897300	total: 2.13s	remaining: 7m 4s
10:	learn: 3619.6576624	total: 21.4s	remaining: 6m 7s
20:	learn: 3062.3802725	total: 40.1s	remaining: 5m 41s
30:	learn: 2682.7764213	total: 59.5s	remaining: 5m 24s
40:	learn: 2420.6509991	total: 1m 18s	remaining: 5m 4s
50:	learn: 2248.3647477	total: 1m 38s	remaining: 4m 48s
60:	learn: 2124.6494697	total: 1m 57s	remaining: 4m 28s
70:	learn: 2039.5722928	total: 2m 18s	remaining: 4m 11s
80:	learn: 1978.3109977	total: 2m 38s	remaining: 3m 53s
90:	learn: 1930.2331287	total: 3m	remaining: 3m 35s
100:	learn: 1895.1171304	total: 3m 22s	remaining: 3m 18s
110:	learn: 1868.8457590	total: 3m 43s	remaining: 2m 59s
120:	learn: 1848.2504550	total: 4m 3s	remaining: 2m 39s
130:	learn: 1832.6005158	total: 4m 23s	remaining: 2m 19s
140:	learn: 1819.1051382	total: 4m 43s	remaining: 1m 58s
150:	learn: 1808.1081579	total: 5m 4s	remaining: 1m 38s
160:	learn: 1797.8650766	total: 5m 25s	remaining: 1m 18s
170:	learn: 1788.4269901	total: 5m 46s	remaining: 58.8s

<catboost.core.CatBoostRegressor at 0x7f7cfa8af890>

In [18]:
%%time
predicted_valid = model4.predict(features_valid)

CPU times: user 508 ms, sys: 12 ms, total: 520 ms
Wall time: 506 ms


In [19]:
mse = mean_squared_error(target_valid, predicted_valid)
rmse4= mse ** 0.5
print("CatBoostRegressor, iterations=200, depth=12")
print("RMSE =",rmse4)

CatBoostRegressor, iterations=200, depth=12
RMSE = 1800.2696823078218


Наименьшее значение RMSE у модели4, у неё же самое большое время обучения и предсказания.
ЛУчше использовать модель 3: она менее точная, но работает гораздо быстрее модели4.

In [20]:
%%time
model5.fit(features_train1, target_train1,eval_set=[(features_valid1, target_valid1)],verbose=10)



[10]	valid_0's rmse: 2796.31
[20]	valid_0's rmse: 2241.42
[30]	valid_0's rmse: 2042.49
[40]	valid_0's rmse: 1952.51
[50]	valid_0's rmse: 1912.54
CPU times: user 4.22 s, sys: 20 ms, total: 4.24 s
Wall time: 4.28 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=4,
              metric='RMSE', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1,
              num_iterations=50, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [21]:
%%time
predicted_valid = model5.predict(features_valid1)

CPU times: user 396 ms, sys: 8 ms, total: 404 ms
Wall time: 394 ms


In [22]:
mse = mean_squared_error(target_valid1, predicted_valid)
rmse5= mse ** 0.5
print("LGBMRegressor, iterations=50, depth=4")
print("RMSE =", rmse5)


LGBMRegressor, iterations=50, depth=4
RMSE = 1912.5419530611339


In [23]:
%%time
model6.fit(features_train1, target_train1,eval_set=[(features_valid1, target_valid1)],verbose=10)



[10]	valid_0's rmse: 2621.36
[20]	valid_0's rmse: 2075.04
[30]	valid_0's rmse: 1907.09
[40]	valid_0's rmse: 1846.37
[50]	valid_0's rmse: 1818.93
[60]	valid_0's rmse: 1798.5
[70]	valid_0's rmse: 1786.71
[80]	valid_0's rmse: 1776.85
[90]	valid_0's rmse: 1768.98
[100]	valid_0's rmse: 1763.59
CPU times: user 9.08 s, sys: 0 ns, total: 9.08 s
Wall time: 9.17 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=6,
              metric='RMSE', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1,
              num_iterations=100, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [24]:
%%time
predicted_valid = model6.predict(features_valid1)

CPU times: user 1.06 s, sys: 0 ns, total: 1.06 s
Wall time: 1.09 s


In [25]:
mse = mean_squared_error(target_valid1, predicted_valid)
rmse6= mse ** 0.5
print("LGBMRegressor, iterations=100, depth=6")
print("RMSE =", rmse6)

LGBMRegressor, iterations=100, depth=6
RMSE = 1763.5942221969635


In [26]:
%%time
model7.fit(features_train1, target_train1,eval_set=[(features_valid1, target_valid1)],verbose=10)



[10]	valid_0's rmse: 2620.58
[20]	valid_0's rmse: 2072.7
[30]	valid_0's rmse: 1903.17
[40]	valid_0's rmse: 1841.12
[50]	valid_0's rmse: 1811.35
[60]	valid_0's rmse: 1793.22
[70]	valid_0's rmse: 1780.26
[80]	valid_0's rmse: 1769.87
[90]	valid_0's rmse: 1762.87
[100]	valid_0's rmse: 1755.75
[110]	valid_0's rmse: 1750.29
[120]	valid_0's rmse: 1745.77
[130]	valid_0's rmse: 1739.36
[140]	valid_0's rmse: 1735.3
[150]	valid_0's rmse: 1731.55
CPU times: user 13.9 s, sys: 0 ns, total: 13.9 s
Wall time: 14.1 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=10,
              metric='RMSE', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1,
              num_iterations=150, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [27]:
%%time
predicted_valid = model7.predict(features_valid1)

CPU times: user 1.65 s, sys: 0 ns, total: 1.65 s
Wall time: 1.61 s


In [28]:
mse = mean_squared_error(target_valid1, predicted_valid)
rmse7= mse ** 0.5
print("LGBMRegressor, iterations=150, depth=10")
print("RMSE =", rmse7)


LGBMRegressor, iterations=150, depth=10
RMSE = 1731.5548432807977


In [29]:
%%time
model8.fit(features_train1, target_train1,eval_set=[(features_valid1, target_valid1)],verbose=10)



[10]	valid_0's rmse: 2620.58
[20]	valid_0's rmse: 2072.7
[30]	valid_0's rmse: 1903.16
[40]	valid_0's rmse: 1840.83
[50]	valid_0's rmse: 1810.94
[60]	valid_0's rmse: 1792.55
[70]	valid_0's rmse: 1777.72
[80]	valid_0's rmse: 1767.55
[90]	valid_0's rmse: 1760.65
[100]	valid_0's rmse: 1753.6
[110]	valid_0's rmse: 1747.41
[120]	valid_0's rmse: 1743.06
[130]	valid_0's rmse: 1737.15
[140]	valid_0's rmse: 1733.18
[150]	valid_0's rmse: 1730.25
[160]	valid_0's rmse: 1726.54
[170]	valid_0's rmse: 1723.88
[180]	valid_0's rmse: 1721.37
[190]	valid_0's rmse: 1718.65
[200]	valid_0's rmse: 1716.95
CPU times: user 16.1 s, sys: 0 ns, total: 16.1 s
Wall time: 16.3 s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=12,
              metric='RMSE', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1,
              num_iterations=200, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [30]:
%%time
predicted_valid = model8.predict(features_valid1)

CPU times: user 2.1 s, sys: 0 ns, total: 2.1 s
Wall time: 2.11 s


In [31]:
mse = mean_squared_error(target_valid1, predicted_valid)
rmse8= mse ** 0.5
print("LGBMRegressor, iterations=200, depth=12")
print("RMSE =", rmse8)

LGBMRegressor, iterations=200, depth=12
RMSE = 1716.9468291412757


У LGBMRegressor лучшая модель - модель8. Она точнее всех, работает незначительно медленнее. 

In [32]:
results = {'Время обучения': ['16 s', '36.5 s','1min 16s','6min 51s','9.47 s','10.3 s','15.2 s','16.8 s'], 
           'Время предсказания': ['162 ms', '227 ms','249 ms','462 ms','395 ms','1.08 s','1.6 s','2.11 s'], 
           'RMSE': [rmse1,rmse2,rmse3,rmse4,rmse5,rmse6,rmse7,rmse8]}

final_data = pd.DataFrame(data=results, index=['model1','model2','model3','model4','model5','model6','model7','model8'])
final_data

Unnamed: 0,Время обучения,Время предсказания,RMSE
model1,16 s,162 ms,2699.807439
model2,36.5 s,227 ms,2134.754345
model3,1min 16s,249 ms,1892.957347
model4,6min 51s,462 ms,1800.269682
model5,9.47 s,395 ms,1912.541953
model6,10.3 s,1.08 s,1763.594222
model7,15.2 s,1.6 s,1731.554843
model8,16.8 s,2.11 s,1716.946829


Добавлена таблица с результатами всех моделей.

In [33]:
model_final = lgb.LGBMRegressor(metric='RMSE',num_iterations=200, max_depth=12)
model_final.fit(features_train1, target_train1,eval_set=[(features_valid1, target_valid1)],verbose=10)
test_predictions = model_final.predict(features_test1)
mse_final = mean_squared_error(target_test1, test_predictions)
print("RMSE =", mse_final ** 0.5)



[10]	valid_0's rmse: 2620.58
[20]	valid_0's rmse: 2072.7
[30]	valid_0's rmse: 1903.16
[40]	valid_0's rmse: 1840.83
[50]	valid_0's rmse: 1810.94
[60]	valid_0's rmse: 1792.55
[70]	valid_0's rmse: 1777.72
[80]	valid_0's rmse: 1767.55
[90]	valid_0's rmse: 1760.65
[100]	valid_0's rmse: 1753.6
[110]	valid_0's rmse: 1747.41
[120]	valid_0's rmse: 1743.06
[130]	valid_0's rmse: 1737.15
[140]	valid_0's rmse: 1733.18
[150]	valid_0's rmse: 1730.25
[160]	valid_0's rmse: 1726.54
[170]	valid_0's rmse: 1723.88
[180]	valid_0's rmse: 1721.37
[190]	valid_0's rmse: 1718.65
[200]	valid_0's rmse: 1716.95
RMSE = 1758.9566574466555


Проверка RMSE лучшей модели на тестовой выборке.

## Вывод. 
Среди 8 моделей наилучший результат показывает модель8. У неё наименьшее значение rmse, время обучения и предсказания занимает значительно меньше времени, чем аналог из CatBoostRegressor.