## Импорт библиотек

In [1]:
import pandas as pd

import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

## Читаем датасет и оставляем фичи для дальнейшей работы

In [4]:
df = pd.read_csv('../data/used_cars.csv')

In [5]:
features = ['brand', 'model_year', 'milage', 'fuel_type', 'engine', 'ext_col','accident', 'price']

In [9]:
dataset = df[features]

In [10]:
dataset.head()

Unnamed: 0,brand,model_year,milage,fuel_type,engine,ext_col,accident,price
0,Ford,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,Black,At least 1 accident or damage reported,"$10,300"
1,Hyundai,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,Moonlight Cloud,At least 1 accident or damage reported,"$38,005"
2,Lexus,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Blue,None reported,"$54,598"
3,INFINITI,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,Black,None reported,"$15,500"
4,Audi,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,Glacier White Metallic,None reported,"$34,999"


## Предобработка данных

In [11]:
dataset = dataset.rename(columns={'model_year': 'year'})

In [12]:
dataset['milage'] = dataset['milage'].apply(lambda x: x[:-4].replace(',', '.')).map(float) * 1.61

In [13]:
fuel = dataset.fuel_type.replace(['–', 'not supported'], 'Gasoline')
fuel = fuel.fillna('Gasoline')
dataset['fuel_type'] = fuel

In [14]:
l = []
pattern = '\d\.\dL'
for engine in dataset.engine:
    result = re.search(pattern, engine)
    if result:
        l.append(float(result.group()[:-1]))
    else:
        l.append(0)
        
engine = pd.Series(l)
engine = engine.replace(0, engine.median())
dataset['engine'] = engine

In [15]:
selected_colors = dataset.ext_col.value_counts().head(12).index
all_colors = dataset.ext_col.unique()

colors_to_drop = [color for color in all_colors if color not in selected_colors]

color = dataset['ext_col'].replace(colors_to_drop, 'Black')

dataset['ext_col'] = color
dataset = dataset.rename(columns={'ext_col': 'color'})

In [16]:
dataset['accident'] = dataset.accident.map({'At least 1 accident or damage reported': 1, 'None reported': 0}).fillna(0)

In [17]:
dataset['price'] = dataset['price'].map(lambda x: float(x[1:].replace(',', ''))) * 93

## Строю модель

In [18]:
df_for_model = pd.get_dummies(dataset, dtype=float)

In [19]:
X = df_for_model.drop(['price'], axis=1)
y = df_for_model['price']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
clf = RandomForestRegressor(criterion='squared_error', n_estimators=100, n_jobs=-1)

In [22]:
%%time
clf.fit(X_train, y_train)

CPU times: total: 2.3 s
Wall time: 331 ms


In [23]:
predict = clf.predict(X_test)

In [24]:
mse_rf = mean_squared_error(y_test, predict,squared=False)



In [25]:
f'mean_squared_error of random forest {mse_rf}'

'mean_squared_error of random forest 2567882.878559447'

In [26]:
X

Unnamed: 0,year,milage,engine,accident,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,...,color_Blue,color_Brown,color_Gold,color_Gray,color_Green,color_Orange,color_Red,color_Silver,color_White,color_Yellow
0,2013,82.11000,3.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021,55.93462,3.8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022,36.01892,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015,143.12900,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021,15.83435,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4004,2023,1149.54000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4005,2022,17.54900,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4006,2022,3.40676,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4007,2020,53.13000,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
brand = pd.DataFrame(dataset.brand.value_counts()).reset_index()

In [47]:
brand[brand['count'] > 15].brand.sort_values()

19            Acura
35             Alfa
5              Audi
1               BMW
28          Bentley
30            Buick
11         Cadillac
3         Chevrolet
31         Chrysler
14            Dodge
0              Ford
12              GMC
34          Genesis
21            Honda
37           Hummer
17          Hyundai
23         INFINITI
25           Jaguar
8              Jeep
16              Kia
32      Lamborghini
9              Land
7             Lexus
24          Lincoln
29             MINI
27         Maserati
18            Mazda
2     Mercedes-Benz
33       Mitsubishi
10           Nissan
4           Porsche
13              RAM
36           Rivian
20           Subaru
15            Tesla
6            Toyota
22       Volkswagen
26            Volvo
Name: brand, dtype: object

In [48]:
dataset

Unnamed: 0,brand,year,milage,fuel_type,engine,color,accident,price
0,Ford,2013,82.11000,E85 Flex Fuel,3.7,Black,1.0,957900.0
1,Hyundai,2021,55.93462,Gasoline,3.8,Black,1.0,3534465.0
2,Lexus,2022,36.01892,Gasoline,3.5,Blue,0.0,5077614.0
3,INFINITI,2015,143.12900,Hybrid,3.5,Black,0.0,1441500.0
4,Audi,2021,15.83435,Gasoline,2.0,Black,0.0,3254907.0
...,...,...,...,...,...,...,...,...
4004,Bentley,2023,1149.54000,Gasoline,6.0,Black,0.0,32545350.0
4005,Audi,2022,17.54900,Gasoline,3.0,Black,0.0,5012700.0
4006,Porsche,2022,3.40676,Gasoline,3.5,Black,0.0,8462814.0
4007,Ford,2020,53.13000,Gasoline,3.5,Blue,0.0,5858907.0


In [62]:
year = pd.DataFrame(dataset.year.value_counts()).reset_index()

In [72]:
dataset.fuel_type.value_counts()

fuel_type
Gasoline          3526
Hybrid             194
E85 Flex Fuel      139
Diesel             116
Plug-In Hybrid      34
Name: count, dtype: int64

In [77]:
engine = pd.DataFrame(dataset.engine.value_counts()).reset_index()

In [86]:
color = pd.DataFrame(dataset.color.value_counts()).reset_index()

In [88]:
for i in color.color:
    print(f'<option value="{i}">{i}</option>')

<option value="Black">Black</option>
<option value="White">White</option>
<option value="Gray">Gray</option>
<option value="Silver">Silver</option>
<option value="Blue">Blue</option>
<option value="Red">Red</option>
<option value="Green">Green</option>
<option value="Brown">Brown</option>
<option value="Gold">Gold</option>
<option value="Beige">Beige</option>
<option value="Orange">Orange</option>
<option value="Yellow">Yellow</option>
