*Подключим необходимые библиотеки*

In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from catboost import CatBoostRegressor

import warnings
warnings.simplefilter("ignore")

*Загрузим данные и посмотрим, сколько пропусков и в каких столбцах*

In [2]:
df = pd.read_csv("data/cars_ml.csv")
df.sample(5)

Unnamed: 0,price,year,mileage,tax,steering,owners,model,volume,power,transmission,...,ecological,co2,engine capacity,cylinders,cylinder,compression,fuel consumption city,fuel consumption highway,fuel consumption combined,log_price
714,785000.0,2013.0,256000.0,4694.0,Левый,2.0,SsangYong,2.3,150.0,Механика,...,Euro 0,,2295.0,4.0,4.0,10.4,14.9,8.7,11.0,13.57344
521,370000.0,2002.0,308000.0,4095.0,Левый,2.0,Audi,2.0,130.0,Механика,...,Euro 0,,1984.0,4.0,5.0,10.3,11.5,6.0,8.0,12.821261
2969,830000.0,2013.0,191000.0,4935.0,Левый,2.0,SsangYong,2.0,141.0,Механика,...,Euro 0,,1998.0,4.0,4.0,17.5,9.9,6.3,7.7,13.629182
3414,1045000.0,2011.0,80000.0,4935.0,Левый,3.0,Nissan,2.0,141.0,Вариатор,...,Euro 5,184.0,1997.0,4.0,4.0,10.2,10.1,6.2,7.6,13.859528
361,800000.0,2004.0,312000.0,18075.0,Левый,3.0,Volkswagen,3.2,241.0,Автомат,...,Euro 4,,3189.0,6.0,4.0,11.3,19.1,10.8,13.8,13.592368


In [3]:
df.shape

(4273, 38)

In [4]:
df.isna().mean().sort_values(ascending=False)

co2                          0.504798
weight                       0.138778
fuel consumption city        0.131055
fuel consumption highway     0.128715
gears                        0.121928
model                        0.068570
compression                  0.065528
maximum speed                0.053592
acceleration                 0.052188
wheel size                   0.042593
fuel consumption combined    0.038147
clearance                    0.032296
fuel capacity                0.025977
fuel                         0.022935
consumption                  0.022467
engine capacity              0.018020
cylinders                    0.018020
cylinder                     0.018020
volume                       0.018020
tax                          0.013574
class                        0.000468
ecological                   0.000000
price                        0.000000
width                        0.000000
wheelbase                    0.000000
height                       0.000000
year        

*Так как пропусков больше половины в столбце выбросов CO2, то удалим его из данных*

In [5]:
df = df.drop("co2", axis=1)

*Создадим два списка, один из которых будет содержать названия столбцов, имеющих тип object, а другой — float64*

In [6]:
columns_obj = [i for i in df.columns if (df[i].dtype == "object")]
columns_num = [i for i in df.columns if (i not in columns_obj)]

*Создадим два алгоритма для заполнения пустых значений: для числовых данных — это медиана, а для категориальных — наиболее встречающееся*

In [7]:
num_imputer = SimpleImputer(strategy="median")
obj_imputer = SimpleImputer(strategy="most_frequent")

preprocessor = ColumnTransformer([
    ("num_imputer", num_imputer, columns_num),
    ("cat_imputer", obj_imputer, columns_obj)
])

In [8]:
imputer_df = preprocessor.fit_transform(df)
df_imputer = pd.DataFrame(imputer_df, columns=columns_num + columns_obj)
df_imputer.sample(5)

Unnamed: 0,price,year,mileage,tax,owners,volume,power,acceleration,consumption,number seats,...,steering,model,transmission,engine,fuel,drive,country,class,wheel size,ecological
2425,1789000.0,2011.0,204000.0,40950.0,2.0,3.5,273.0,8.2,12.4,7.0,...,Левый,Toyota,Автомат,Бензиновый,АИ-95,Полный,Япония,E,R17,Euro 4
371,1320000.0,2010.0,276300.0,17925.0,2.0,3.0,240.0,8.3,9.3,5.0,...,Левый,Volkswagen,Автомат,Дизельный,ДТ,Полный,Германия,E,R18,Euro 0
2629,1780000.0,2018.0,71000.0,8350.0,0.0,2.4,167.0,10.5,7.7,5.0,...,Левый,Mitsubishi,Вариатор,Бензиновый,АИ-92,Передний,Япония,D,R18,Euro 0
301,760000.0,2010.0,201001.0,4631.0,3.0,1.8,147.0,11.1,7.1,5.0,...,Левый,Toyota,Вариатор,Бензиновый,АИ-95,Передний,Япония,M,R16,Euro 4
183,720000.0,2001.0,195000.0,8100.0,3.0,1.8,180.0,7.9,9.4,4.0,...,Левый,BMW,Механика,Бензиновый,АИ-98,Полный,Германия,S,R17,Euro 0


In [9]:
df_imputer.shape

(4273, 37)

*Для категориальных столбцов выполним OneHotEncoding*

In [10]:
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded = encoder.fit_transform(df_imputer[columns_obj])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())
encoded_df.sample(5)

Unnamed: 0,steering_Правый,model_Alfa,model_Aston,model_Audi,model_BAIC,model_BMW,model_BYD,model_Bentley,model_Brilliance,model_Cadillac,...,wheel size_R19,wheel size_R20,wheel size_R21,wheel size_R22,wheel size_R23,ecological_Euro 2,ecological_Euro 3,ecological_Euro 4,ecological_Euro 5,ecological_Euro 6
2957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
encoded_df.shape

(4273, 131)

In [12]:
df_ohe = pd.concat([df_imputer[columns_num], encoded_df], axis=1)
df_ohe.sample(5)

Unnamed: 0,price,year,mileage,tax,owners,volume,power,acceleration,consumption,number seats,...,wheel size_R19,wheel size_R20,wheel size_R21,wheel size_R22,wheel size_R23,ecological_Euro 2,ecological_Euro 3,ecological_Euro 4,ecological_Euro 5,ecological_Euro 6
1464,4500000.0,2023.0,25000.0,18600.0,1.0,2.0,249.0,9.6,8.5,5.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
499,3290000.0,2022.0,75500.0,2980.0,1.0,1.4,150.0,9.7,7.1,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1555,3773000.0,2021.0,4700.0,9200.0,1.0,2.0,184.0,7.1,5.9,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1211,1750000.0,2004.0,236000.0,18675.0,1.0,4.0,249.0,9.5,13.2,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3415,4800000.0,2018.0,78258.0,35360.0,2.0,2.0,272.0,7.2,11.3,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_ohe.shape

(4273, 158)

*Создадим данные для обучения и разделим их на тренировочные и тестовые*

In [14]:
X = df_ohe.drop("price", axis=1)
y = df_ohe["price"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

*Проведём стандартизацию данных для лучшей сходимости*

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

*Попробуем обучить линейную регрессию с добавлением $L_1$- и $L_2$-норм. Подбор гиперпараметров будет происходить с помощью GridSearch*

In [17]:
lasso_params = {'alpha': np.linspace(1e-3, 15, 100)}

model_lasso = Lasso()
lasso_grid = GridSearchCV(model_lasso, lasso_params,
                          scoring="neg_mean_squared_error", cv=5)
lasso_grid.fit(X_train, y_train)

ridge_params = {'alpha': np.linspace(1e-3, 15, 100)}

model_ridge = Ridge()
ridge_grid = GridSearchCV(model_ridge, ridge_params,
                          scoring="neg_mean_squared_error", cv=5)
ridge_grid.fit(X_train, y_train)

*Посмотрим лучшие параметры по моделям*

In [18]:
lasso_grid.best_params_, ridge_grid.best_params_

({'alpha': 0.001}, {'alpha': 15.0})

In [19]:
lasso_best = lasso_grid.best_estimator_
ridge_best = ridge_grid.best_estimator_

lasso_pred = lasso_best.predict(X_test)
ridge_pred = ridge_best.predict(X_test)

np.sqrt(mean_squared_error(y_test, lasso_pred)), np.sqrt(mean_squared_error(y_test, ridge_pred))

(2072822.232580926, 2061094.0980994229)

*Метод k ближайших соседей*

In [20]:
knn_params = {
    'n_neighbors': np.arange(1, 101),
    'weights': ['uniform', 'distance']
}

model_knn = KNeighborsRegressor()
knn_grid = GridSearchCV(
    model_knn, knn_params, scoring='neg_mean_squared_error', cv=5)
knn_grid.fit(X_train, y_train)

In [21]:
knn_grid.best_params_

{'n_neighbors': 6, 'weights': 'distance'}

In [22]:
knn_best = knn_grid.best_estimator_
knn_pred = knn_best.predict(X_test)
np.sqrt(mean_squared_error(y_test, knn_pred))

2567049.0390150584

*Случайный лес*

In [23]:
rf_params = {
    "n_estimators": [10, 20, 30, 40, 50, 100, 200],
    "max_depth": [None, 20, 30],
    "min_samples_leaf": [1, 2, 4, 8],
    "bootstrap": [True, False]
}

model_rf = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(model_rf, rf_params,
                       scoring='neg_mean_squared_error', cv=5)
rf_grid.fit(X_train, y_train)

In [24]:
rf_grid.best_params_

{'bootstrap': False,
 'max_depth': None,
 'min_samples_leaf': 2,
 'n_estimators': 30}

In [25]:
rf_best = rf_grid.best_estimator_
rf_pred = rf_best.predict(X_test)
np.sqrt(mean_squared_error(y_test, rf_pred))

555785.0529354347

*Градиентный бустинг*

In [56]:
cb_params = {
    "iterations": [500, 1000, 1500],
    "learning_rate": [0.05, 0.1],
    "depth": [2, 4, 6, 8],
    "l2_leaf_reg": [1, 5, 10, 15, 20]
}
fit_params = {
    "eval_set": [(X_test, y_test)],
    "early_stopping_rounds": 50
}


model_cb = CatBoostRegressor(verbose=0, random_seed=42)
cb_grid = GridSearchCV(model_cb, cb_params,
                       scoring='neg_mean_squared_error', cv=5)
cb_grid.fit(X_train, y_train, **fit_params)

In [57]:
cb_grid.best_params_

{'depth': 2, 'iterations': 1000, 'l2_leaf_reg': 10, 'learning_rate': 0.1}

In [58]:
cb_best = cb_grid.best_estimator_
cb_pred = cb_best.predict(X_test)
np.sqrt(mean_squared_error(y_test, cb_pred))

1132951.335907063

*Получается, что наилучшей моделью оказался случайный лес, но ошибка всё равно оказалась достаточно большой, вероятно это произошло из-за маленького количества объектов*