# Определение стоимости автомобилей

Сервис по продаже автомобилей с пробегом «Не бит, не крашен» разрабатывает приложение, чтобы привлечь новых клиентов. В нём можно будет узнать рыночную стоимость своего автомобиля. 
Постройте модель, которая умеет её определять. В вашем распоряжении данные о технических характеристиках, комплектации и ценах других автомобилей.
Критерии, которые важны заказчику:
- качество предсказания;
- время обучения модели;
- время предсказания модели.


In [2]:
!pip install lightgbm
!pip install catboost



In [3]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb

import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import train_test_split, GridSearchCV

## Подготовка данных 

In [4]:
try: 
    df = pd.read_csv('/datasets/autos.csv')
except: 
    df = pd.read_csv('./datasets/autos.csv')

In [5]:
df.head()

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Kilometer          354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  Repaired           283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(

In [7]:
df.describe()

Unnamed: 0,Price,RegistrationYear,Power,Kilometer,RegistrationMonth,NumberOfPictures,PostalCode
count,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0
mean,4416.656776,2004.234448,110.094337,128211.172535,5.714645,0.0,50508.689087
std,4514.158514,90.227958,189.850405,37905.34153,3.726421,0.0,25783.096248
min,0.0,1000.0,0.0,5000.0,0.0,0.0,1067.0
25%,1050.0,1999.0,69.0,125000.0,3.0,0.0,30165.0
50%,2700.0,2003.0,105.0,150000.0,6.0,0.0,49413.0
75%,6400.0,2008.0,143.0,150000.0,9.0,0.0,71083.0
max,20000.0,9999.0,20000.0,150000.0,12.0,0.0,99998.0


In [8]:
df.columns = df.columns.str.replace(r"([A-Z])", r" \1").str.lower().str.replace(' ', '_').str[1:]

  df.columns = df.columns.str.replace(r"([A-Z])", r" \1").str.lower().str.replace(' ', '_').str[1:]


In [9]:
df.head()

Unnamed: 0,date_crawled,price,vehicle_type,registration_year,gearbox,power,model,kilometer,registration_month,fuel_type,brand,repaired,date_created,number_of_pictures,postal_code,last_seen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [10]:
df.isna().sum()

date_crawled              0
price                     0
vehicle_type          37490
registration_year         0
gearbox               19833
power                     0
model                 19705
kilometer                 0
registration_month        0
fuel_type             32895
brand                     0
repaired              71154
date_created              0
number_of_pictures        0
postal_code               0
last_seen                 0
dtype: int64

In [11]:
df = df.drop(['number_of_pictures', 'postal_code', 'date_created', 'last_seen', 'registration_month', 'date_crawled'], axis=1)

In [12]:
print('Доля пропусков в vehicle_type: {:.2%}'.format(df['vehicle_type'].isna().mean()))
print('Доля пропусков в gearbox: {:.2%}'.format(df['gearbox'].isna().mean()))
print('Доля пропусков в model: {:.2%}'.format(df['model'].isna().mean()))
print('Доля пропусков в fuel_type: {:.2%}'.format(df['fuel_type'].isna().mean()))
print('Доля пропусков в repaired: {:.2%}'.format(df['repaired'].isna().mean()))

Доля пропусков в vehicle_type: 10.58%
Доля пропусков в gearbox: 5.60%
Доля пропусков в model: 5.56%
Доля пропусков в fuel_type: 9.28%
Доля пропусков в repaired: 20.08%


In [13]:
df.drop_duplicates(inplace=True)  

Заполним пропуски в vehicle_type.

In [14]:
df.fillna({'vehicle_type':'unknown'}, inplace=True)
df.vehicle_type.value_counts()

sedan          78206
small          67548
wagon          56367
unknown        34559
bus            25830
convertible    18195
coupe          14690
suv            10776
other           3158
Name: vehicle_type, dtype: int64

Удалим пропуски в gearbox и сразу же закодируем.

In [15]:
df.dropna(subset=['gearbox'], inplace=True)
df.loc[(df.gearbox == 'manual'), 'gearbox'] = 1
df.loc[(df.gearbox  == 'auto'), 'gearbox'] = 0

In [16]:
df.dropna(subset=['model'], inplace=True)

Заполним пропуски в fuel_type.

In [17]:
df['fuel_type'] = df['fuel_type'].fillna('other')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 277215 entries, 0 to 354368
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   price              277215 non-null  int64 
 1   vehicle_type       277215 non-null  object
 2   registration_year  277215 non-null  int64 
 3   gearbox            277215 non-null  object
 4   power              277215 non-null  int64 
 5   model              277215 non-null  object
 6   kilometer          277215 non-null  int64 
 7   fuel_type          277215 non-null  object
 8   brand              277215 non-null  object
 9   repaired           228973 non-null  object
dtypes: int64(4), object(6)
memory usage: 23.3+ MB


In [19]:
df.repaired.isna().sum()

48242

Заполним пропуски в repaired и сразу же закодируем.

In [20]:
df.fillna({'repaired':'no info'}, inplace=True)

In [21]:
df.loc[(df.repaired == 'yes'), 'repaired'] = 1
df.loc[(df.repaired == 'no'), 'repaired'] = 0
df.loc[(df.repaired == 'no info'), 'repaired'] = 2

Удалим совсем старые данные.

In [22]:
df = df.query('1970 <= registration_year <= 2024')

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276567 entries, 0 to 354368
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   price              276567 non-null  int64 
 1   vehicle_type       276567 non-null  object
 2   registration_year  276567 non-null  int64 
 3   gearbox            276567 non-null  object
 4   power              276567 non-null  int64 
 5   model              276567 non-null  object
 6   kilometer          276567 non-null  int64 
 7   fuel_type          276567 non-null  object
 8   brand              276567 non-null  object
 9   repaired           276567 non-null  object
dtypes: int64(4), object(6)
memory usage: 23.2+ MB


In [24]:
df.corr()

  df.corr()


Unnamed: 0,price,registration_year,power,kilometer
price,1.0,0.43922,0.143615,-0.365549
registration_year,0.43922,1.0,0.028895,-0.262877
power,0.143615,0.028895,1.0,0.020822
kilometer,-0.365549,-0.262877,0.020822,1.0


**Выводы:**

- цена зависит от года и мощности
- цена зависит обратно пропорциональная от пробега
- в данных бюли устранены пропуски
- были удалены очень старые данные

## Обучение моделей

In [25]:
df_ohe = pd.get_dummies(df, drop_first=True)

y_ohe = df_ohe['price']
X_ohe = df_ohe.drop('price', axis=1)

X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(X_ohe, y_ohe, test_size=0.25, random_state=12345) 

numeric = list(df_ohe)
numeric.remove('price')

scaler = StandardScaler()
scaler.fit(X_train_ohe[numeric])
X_train_ohe[numeric] = scaler.transform(X_train_ohe[numeric])
X_test_ohe[numeric] = scaler.transform(X_test_ohe[numeric])

  df_ohe = pd.get_dummies(df, drop_first=True)


### Линейная регрессия

Обучим модель линейной регрессии

In [26]:
model_lr = LinearRegression()
model_lr.fit(X_train_ohe, y_train_ohe)

y_predict = model_lr.predict(X_test_ohe)

In [27]:
print("Rmse тестовая выборка: ", mean_squared_error(y_test_ohe, y_predict) ** 0.5)


Rmse тестовая выборка:  2760.4372890073805


### Модель решающего дерева

Обучим модель решающего дерева.

In [28]:
enc = OrdinalEncoder()
enc.fit(df[['vehicle_type','registration_year', 'gearbox', 'power', 
            'model', 'kilometer','fuel_type',
            'brand', 'repaired']])
df[['vehicle_type','registration_year', 'gearbox', 'power', 
            'model', 'kilometer','fuel_type',
            'brand', 'not_repaired']] = enc.transform(df[['vehicle_type','registration_year', 'gearbox', 'power', 
            'model', 'kilometer','fuel_type',
            'brand', 'repaired']])

In [29]:
y_oe = df['price']
X_oe = df.drop('price', axis=1)
X_train_oe, X_test_oe, y_train_oe, y_test_oe = train_test_split(X_oe, y_oe, test_size=0.25, random_state=12345) 


In [30]:
for depth in range(3, 20, 3):
    
    model = DecisionTreeRegressor(random_state=12345, max_depth = depth)
    model.fit(X_train_oe, y_train_oe)

    predictions_train_ohe = model.predict(X_train_oe)
    
    rmse = mean_squared_error(y_train_ohe, predictions_train_ohe) ** 0.5
    print('Глубина:', depth)
    print('RMSE для решающего дерева:', rmse)
    print('')
print()

Глубина: 3
RMSE для решающего дерева: 3031.782335955572

Глубина: 6
RMSE для решающего дерева: 2397.106899558086

Глубина: 9
RMSE для решающего дерева: 2090.176790157572

Глубина: 12
RMSE для решающего дерева: 1794.7608914291345

Глубина: 15
RMSE для решающего дерева: 1456.316747780278

Глубина: 18
RMSE для решающего дерева: 1152.6115938364244




### Модель случайного леса

Обучим модель случайного леса.

In [31]:
for est in [100, 500, 1000]:
    model = RandomForestRegressor(random_state=12345, 
                                   n_estimators=est, 
                                   max_depth=9)
    model.fit(X_train_oe, y_train_oe)
    
    predictions_train_oe = model.predict(X_train_oe)
    
    rmse = mean_squared_error(y_train_oe, predictions_train_oe)**0.5
    print('Количество деревьев:', est)
    print('RMSE для случайного леса:', rmse)
    print('')
print()

Количество деревьев: 100
RMSE для случайного леса: 2015.0257020594804

Количество деревьев: 500
RMSE для случайного леса: 2013.9903541653384

Количество деревьев: 1000
RMSE для случайного леса: 2014.204083481018




### LightGBM

Обучим модель LightGBM

In [34]:
param_grid = {'learning_rate':[.2, .4,0.5,0.8], 'num_leaves': [100, 150]}
model = lgb.LGBMRegressor(random_state=12345)

tuning_model=GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          scoring='neg_root_mean_squared_error',
                          cv=3,
                          verbose=3)

tuning_model.fit(X_train_ohe, y_train_ohe)
display(tuning_model.best_params_)
display(tuning_model.best_score_*-1)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1163
[LightGBM] [Info] Number of data points in the train set: 138283, number of used features: 284
[LightGBM] [Info] Start training from score 4679.480659
[CV 1/3] END learning_rate=0.2, num_leaves=100;, score=-1705.450 total time=   2.7s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1163
[LightGBM] [Info] Number of data points in the train set: 138283, number of used features: 284
[LightGBM] [Info] Start training from score 4659.209310
[CV 2/3] END learning_rate=0.2, nu

{'learning_rate': 0.2, 'num_leaves': 150}

1701.4377438067156

**Выводы:**

- 

## Анализ моделей

### Линейная регрессия

In [35]:
%%time

model = LinearRegression()
model.fit(X_train_ohe, y_train_ohe)

CPU times: user 17.5 s, sys: 3.31 s, total: 20.8 s
Wall time: 3.06 s


In [36]:
%%time

y_train_pred = model.predict(X_train_ohe)

CPU times: user 202 ms, sys: 133 ms, total: 335 ms
Wall time: 42 ms


In [37]:
%%time
y_test_pred = model.predict(X_test_ohe)

CPU times: user 159 ms, sys: 370 ms, total: 529 ms
Wall time: 60.2 ms


In [38]:
print("Обучающая выборка: ", mean_squared_error(y_train_oe, y_train_pred) ** 0.5)
print("Тестовая выборка: ", mean_squared_error(y_test_oe, y_test_pred) ** 0.5)

Обучающая выборка:  2745.5228538208235
Тестовая выборка:  2760.4372890073805


### Модель решающего дерева

In [47]:
%%time

model = DecisionTreeRegressor(random_state=12345, max_depth=18)
model.fit(X_train_oe, y_train_oe)

CPU times: user 498 ms, sys: 15.3 ms, total: 514 ms
Wall time: 515 ms


In [48]:
%%time

y_train_pred = model.predict(X_train_oe)

CPU times: user 92.9 ms, sys: 9.82 ms, total: 103 ms
Wall time: 102 ms


In [49]:
%%time 

y_test_pred = model.predict(X_test_oe)

CPU times: user 40.6 ms, sys: 3.87 ms, total: 44.5 ms
Wall time: 44 ms


In [50]:
print("Обучающая выборка: ", mean_squared_error(y_train_oe, y_train_pred) ** 0.5)
print("Тестовая выборка: ", mean_squared_error(y_test_oe, y_test_pred) ** 0.5)

Обучающая выборка:  1152.6115938364244
Тестовая выборка:  2150.6104543688502


### Модель случайного леса

In [43]:
%%time

model = RandomForestRegressor(random_state=12345, max_depth=9, n_estimators=500)
model.fit(X_train_oe, y_train_oe)

CPU times: user 1min 20s, sys: 467 ms, total: 1min 20s
Wall time: 1min 21s


In [44]:
%%time

y_train_pred = model.predict(X_train_oe)

CPU times: user 5.73 s, sys: 32.7 ms, total: 5.76 s
Wall time: 5.77 s


In [45]:
%%time

y_test_pred = model.predict(X_test_oe)

CPU times: user 1.93 s, sys: 10.6 ms, total: 1.94 s
Wall time: 1.94 s


In [46]:
print("Обучающая выборка: ", mean_squared_error(y_train_oe, y_train_pred) ** 0.5)
print("Тестовая выборка: ", mean_squared_error(y_test_oe, y_test_pred) ** 0.5)

Обучающая выборка:  2013.9903541653384
Тестовая выборка:  2082.9767113862354


### LightGBM

In [51]:
%%time

model_lgbmr = LGBMRegressor(learning_rate=0.2, num_leaves=150, random_state=12345)
model_lgbmr.fit(X_train_ohe, y_train_ohe)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1175
[LightGBM] [Info] Number of data points in the train set: 207425, number of used features: 288
[LightGBM] [Info] Start training from score 4678.614964
CPU times: user 4.58 s, sys: 1.72 s, total: 6.3 s
Wall time: 3.34 s


In [52]:
%%time

y_train_pred = model_lgbmr.predict(X_train_ohe)

CPU times: user 3.78 s, sys: 47.8 ms, total: 3.83 s
Wall time: 475 ms


In [53]:
%%time

y_test_pred = model_lgbmr.predict(X_test_ohe)

CPU times: user 1.27 s, sys: 14.1 ms, total: 1.28 s
Wall time: 164 ms


In [54]:
print("Обучающая выборка: ", mean_squared_error(y_train_oe, y_train_pred) ** 0.5)
print("Тестовая выборка: ", mean_squared_error(y_test_oe, y_test_pred) ** 0.5)

Обучающая выборка:  1519.908093175754
Тестовая выборка:  1697.8208351522035


**Выводы:**

Составим таблицу с результатами моделей
| Модель | RMSE на обучающей выборке  |  RMSE на тестовой выборке | Время обучения  | Время работы  | 
|---|---|---|---|---|
| Линейная регрессия        |  2745  |  2760  |  20.8 с      |  529 мс  | 
| Модель решающего дерева  |  1152  |  2150  |  514 мс      |  45 мс   |
|  Модель случайного леса  |  2013  |  2082  |  1 мин 20 с  |  1.94 с  | 
|   LightGBM                |  1519  |  1697  |  3.83 c      |  1.28 с  | 


**Итог:** лучшая модель - LightGBM с rmse на тестовой выборке - 1697.

## Общий вывод

**Вывод:** 
В процессе работы над проектом я провёл анализ и предобработку данных. Разработал четыре модели с различными гиперпараметрами, три из которых соответствовали показателю RMSE менее 2500. После анализа результатов по критериям, важным для заказчика, лучшей моделью была признана LGBMRegressor, которая на тестовой выборке показала RMSE 1697, она имеет приемлимую скорость обучения и самый лучшую метрику RMSE.





