## Импорт библиотек

In [1]:
import pandas as pd

import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

## Читаем датасет и оставляем фичи для дальнейшей работы

In [3]:
df = pd.read_csv('data/used_cars.csv')

In [4]:
features = ['brand', 'model_year', 'milage', 'fuel_type', 'engine', 'ext_col','accident', 'price']

In [5]:
dataset = df[features]

In [6]:
dataset.head()

Unnamed: 0,brand,model_year,milage,fuel_type,engine,ext_col,accident,price
0,Ford,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,Black,At least 1 accident or damage reported,"$10,300"
1,Hyundai,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,Moonlight Cloud,At least 1 accident or damage reported,"$38,005"
2,Lexus,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Blue,None reported,"$54,598"
3,INFINITI,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,Black,None reported,"$15,500"
4,Audi,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,Glacier White Metallic,None reported,"$34,999"


## Предобработка данных

In [7]:
dataset = dataset.rename(columns={'model_year': 'year'})

In [8]:
dataset['milage'] = dataset['milage'].apply(lambda x: x[:-4].replace(',', '.')).map(float) * 1.61

In [9]:
fuel = dataset.fuel_type.replace(['–', 'not supported'], 'Gasoline')
fuel = fuel.fillna('Gasoline')
dataset['fuel_type'] = fuel

In [10]:
l = []
pattern = '\d\.\dL'
for engine in dataset.engine:
    result = re.search(pattern, engine)
    if result:
        l.append(float(result.group()[:-1]))
    else:
        l.append(0)
        
engine = pd.Series(l)
engine = engine.replace(0, engine.median())
dataset['engine'] = engine

In [11]:
selected_colors = dataset.ext_col.value_counts().head(12).index
all_colors = dataset.ext_col.unique()

colors_to_drop = [color for color in all_colors if color not in selected_colors]

color = dataset['ext_col'].replace(colors_to_drop, 'Black')

dataset['ext_col'] = color
dataset = dataset.rename(columns={'ext_col': 'color'})

In [12]:
dataset['accident'] = dataset.accident.map({'At least 1 accident or damage reported': 1, 'None reported': 0}).fillna(0)

In [13]:
dataset['price'] = dataset['price'].map(lambda x: float(x[1:].replace(',', ''))) * 93

## Строю модель

In [14]:
df_for_model = pd.get_dummies(dataset, dtype=float)

In [15]:
X = df_for_model.drop(['price'], axis=1)
y = df_for_model['price']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
clf = RandomForestRegressor(criterion='squared_error', n_estimators=100, n_jobs=-1)

In [18]:
%%time
clf.fit(X_train, y_train)

CPU times: total: 2.09 s
Wall time: 343 ms


In [19]:
predict = clf.predict(X_test)

In [20]:
mse_rf = mean_squared_error(y_test, predict,squared=False)

In [24]:
f'mean_squared_error of random forest {mse_rf}'

'mean_squared_error of random forest 2909698.8018964212'

In [207]:
sample = {'brand': 'brand_Audi', 'year': '2016', 'milage': '10000', 'fuel_type': 'fuel_type_Gasoline', 'engine': '3.5', 'color': 'color_Orange', 'accident': '0'}

In [208]:
sample = pd.Series(sample)

df_sample = pd.DataFrame(sample, columns=['sample'])

df_sample = df_sample.T

df_sample[['year', 'milage', 'engine', 'accident']] = df_sample[['year', 'milage', 'engine', 'accident']].astype('float64')

In [209]:
sample_X = df_sample.rename(columns={'brand': df_sample['brand'].values[0], 
                                     'color': df_sample['color'].values[0],
                                     'fuel_type': df_sample['fuel_type'].values[0]})

In [210]:
sample_X

Unnamed: 0,brand_Audi,year,milage,fuel_type_Gasoline,engine,color_Orange,accident
sample,brand_Audi,2016.0,10000.0,fuel_type_Gasoline,3.5,color_Orange,0.0


In [211]:
sample_X[[sample['brand'], sample['color'], sample['fuel_type']]] = 1.0

In [212]:
sample_X

Unnamed: 0,brand_Audi,year,milage,fuel_type_Gasoline,engine,color_Orange,accident
sample,1.0,2016.0,10000.0,1.0,3.5,1.0,0.0


In [213]:
df_concat_with_sample = pd.concat([X_train, sample_X]).fillna(0.0)

In [214]:
sample = df_concat_with_sample.loc['sample']

In [215]:
sample = pd.DataFrame(sample).T

In [216]:
clf.predict(sample)

array([6021818.82])