# Описание проекта

## Стоимость поддержанного автомобиля

Многие знают про маркетплейсы где продаются б/у вещи, на которых есть возможность недорого купить качественную и полезную вещь. Но всегда волнует вопрос - кто и как устанавливает цену, и какие его характеристики больше всего влияют на итоговую стоимость продажи?! Вопрос становится особо актуальным, если речь идет про дорогие товары, например про автомобили! В рамках данной задачи необходимо поработать с данными о продажах автомобилей на вторичном рынке. Целью данного проекта будет разработанная модель предсказания стоимости автомобиля на вторичном рынке.

## Основные этапы исследования

- Загрузка и ознакомление с данными, <p>
- Предварительная обработка,<p>
- Полноценный разведочный анализ,<p>
- Разработка новых синтетических признаков,<p>
- Проверка на мультиколлинеарность,<p>
- Отбор финального набора обучающих признаков,<p>
- Выбор и обучение моделей,<p>
- Итоговая оценка качества предсказания лучшей модели,<p>
- Анализ важности ее признаков.

# Представление данных

In [475]:
!pip install imblearn



In [524]:
# Импорт основных библиотек
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

from math import ceil

from pandas.api.types import is_string_dtype

from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import DataConversionWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor

from imblearn.pipeline import Pipeline, make_pipeline

In [477]:
# Отключение лишних предупреждений
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [478]:
# Импорт датасета
try:
    sample_submission = pd.read_csv('datasets/sample_submission.csv')
    test = pd.read_csv('datasets/test.csv')
    train = pd.read_csv('datasets/train.csv')
except Exception as info:
    display(info)
    sample_submission = pd.read_csv('/kaggle/input/used-cars-price-prediction-19ds/sample_submission.csv')
    test = pd.read_csv('/kaggle/input/used-cars-price-prediction-19ds/test.csv')
    train = pd.read_csv('/kaggle/input/used-cars-price-prediction-19ds/train.csv')

In [479]:
# Объявим функцию для изучения датасетов
def describe_dataframe(dataframe):
    display(dataframe.head(10))
    display(dataframe.info())
    display(dataframe.describe(percentiles=[.5]).T)
    print(f"Количество дублированных строк: {dataframe.duplicated().sum()}")

In [480]:
describe_dataframe(train)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,sellingprice,saledate
0,2011,Ford,Edge,SEL,suv,automatic,2fmdk3jc4bba41556,md,4.2,111041.0,black,black,santander consumer,12500,Tue Jun 02 2015 02:30:00 GMT-0700 (PDT)
1,2014,Ford,Fusion,SE,Sedan,automatic,3fa6p0h75er208976,mo,3.5,31034.0,black,black,ars/avis budget group,14500,Wed Feb 25 2015 02:00:00 GMT-0800 (PST)
2,2012,Nissan,Sentra,2.0 SL,sedan,automatic,3n1ab6ap4cl698412,nj,2.2,35619.0,black,black,nissan-infiniti lt,9100,Wed Jun 10 2015 02:30:00 GMT-0700 (PDT)
3,2003,HUMMER,H2,Base,suv,automatic,5grgn23u93h101360,tx,2.8,131301.0,gold,beige,wichita falls ford lin inc,13300,Wed Jun 17 2015 03:00:00 GMT-0700 (PDT)
4,2007,Ford,Fusion,SEL,Sedan,automatic,3fahp08z17r268380,md,2.0,127709.0,black,black,purple heart,1300,Tue Feb 03 2015 04:00:00 GMT-0800 (PST)
5,2013,Lincoln,MKZ,Base,Sedan,automatic,3ln6l2j91dr817800,mi,2.5,14894.0,black,black,"ford motor credit company,llc",22600,Thu May 21 2015 02:00:00 GMT-0700 (PDT)
6,2010,pontiac,g6,4c,,automatic,1g2za5eb4a4157380,nc,3.4,114587.0,silver,black,north state acceptance,5900,Mon Jan 12 2015 09:30:00 GMT-0800 (PST)
7,2013,Ford,Escape,SE,SUV,automatic,1fmcu0gx3duc59421,fl,4.8,26273.0,blue,gray,fields bmw,15200,Tue Feb 03 2015 01:00:00 GMT-0800 (PST)
8,2000,Hyundai,Elantra,GLS,Sedan,automatic,kmhjf35f2yu955691,oh,1.9,182624.0,black,tan,dt inventory,700,Thu Jan 22 2015 01:00:00 GMT-0800 (PST)
9,2005,Ford,Freestyle,Limited,wagon,automatic,1fmdk06135ga45438,oh,1.0,149364.0,black,tan,wells fargo dealer services,325,Tue Jun 16 2015 05:00:00 GMT-0700 (PDT)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440236 entries, 0 to 440235
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          440236 non-null  int64  
 1   make          432193 non-null  object 
 2   model         432113 non-null  object 
 3   trim          431899 non-null  object 
 4   body          429843 non-null  object 
 5   transmission  388775 non-null  object 
 6   vin           440236 non-null  object 
 7   state         440236 non-null  object 
 8   condition     430831 non-null  float64
 9   odometer      440167 non-null  float64
 10  color         439650 non-null  object 
 11  interior      439650 non-null  object 
 12  seller        440236 non-null  object 
 13  sellingprice  440236 non-null  int64  
 14  saledate      440236 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 50.4+ MB


None

Unnamed: 0,count,mean,std,min,50%,max
year,440236.0,2010.040101,3.977945,1982.0,2012.0,2015.0
condition,430831.0,3.425077,0.949973,1.0,3.6,5.0
odometer,440167.0,68344.421604,53542.203908,1.0,52098.0,999999.0
sellingprice,440236.0,13592.209588,9751.479098,1.0,12100.0,230000.0


Количество дублированных строк: 0


In [481]:
describe_dataframe(test)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,saledate
0,2005,Cadillac,CTS,Base,Sedan,automatic,1g6dp567450124779,ca,2.7,116970.0,silver,black,lexus of stevens creek,Wed Jan 14 2015 04:30:00 GMT-0800 (PST)
1,2014,GMC,Savana Cargo,2500,Van,,1gtw7fca7e1902207,pa,4.4,6286.0,white,gray,u-haul,Fri Feb 27 2015 01:00:00 GMT-0800 (PST)
2,2013,Nissan,Murano,S,SUV,automatic,jn8az1mw6dw303497,oh,4.6,11831.0,gray,black,nissan-infiniti lt,Tue Feb 24 2015 01:30:00 GMT-0800 (PST)
3,2013,Chevrolet,Impala,LS Fleet,Sedan,automatic,2g1wf5e34d1160703,fl,2.3,57105.0,silver,black,onemain rem/auto club of miami inc dba north dad,Fri Mar 06 2015 02:00:00 GMT-0800 (PST)
4,2013,Nissan,Titan,SV,Crew Cab,automatic,1n6aa0ec3dn301209,tn,2.9,31083.0,black,black,nissan north america inc.,Wed Jun 03 2015 03:30:00 GMT-0700 (PDT)
5,2003,Volkswagen,Passat,GLS 1.8T,wagon,automatic,wvwvd63b93e175638,nc,2.4,104155.0,silver,black,fred anderson nissan of fayetteville,Tue Jun 09 2015 03:00:00 GMT-0700 (PDT)
6,2013,Hyundai,Sonata,GLS,Sedan,automatic,5npeb4ac4dh809686,il,3.7,30669.0,silver,gray,merchants leasing,Tue Mar 03 2015 02:00:00 GMT-0800 (PST)
7,2013,Ford,Explorer,Base,SUV,automatic,1fm5k7b97dgb16454,nc,3.2,87862.0,black,gray,ge fleet services for itself/servicer,Tue Feb 10 2015 01:15:00 GMT-0800 (PST)
8,2011,Infiniti,G Sedan,G37x,G Sedan,automatic,jn1cv6ar5bm411441,tn,3.5,47028.0,black,beige,nissan infiniti lt,Wed Feb 04 2015 02:30:00 GMT-0800 (PST)
9,2007,Chevrolet,Suburban,1500 LS,SUV,automatic,3gnfc16j77g158033,ga,3.4,191211.0,black,tan,riverside chevrolet inc,Tue Feb 10 2015 04:30:00 GMT-0800 (PST)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110058 entries, 0 to 110057
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          110058 non-null  int64  
 1   make          107997 non-null  object 
 2   model         107979 non-null  object 
 3   trim          107944 non-null  object 
 4   body          107464 non-null  object 
 5   transmission  97047 non-null   object 
 6   vin           110058 non-null  object 
 7   state         110058 non-null  object 
 8   condition     107679 non-null  float64
 9   odometer      110039 non-null  float64
 10  color         109900 non-null  object 
 11  interior      109900 non-null  object 
 12  seller        110058 non-null  object 
 13  saledate      110058 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 11.8+ MB


None

Unnamed: 0,count,mean,std,min,50%,max
year,110058.0,2010.060005,3.96019,1982.0,2012.0,2015.0
condition,107679.0,3.423222,0.951301,1.0,3.6,5.0
odometer,110039.0,68074.331601,53520.988173,1.0,51922.0,999999.0


Количество дублированных строк: 0


In [482]:
# Рассмотрим корреляции численных данных
corr = train[['year', 'condition', 'odometer', 'sellingprice']].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,year,condition,odometer,sellingprice
year,1.0,0.553403,-0.774498,0.586847
condition,0.553403,1.0,-0.540544,0.538906
odometer,-0.774498,-0.540544,1.0,-0.583044
sellingprice,0.586847,0.538906,-0.583044,1.0


# Предобработка данных

In [483]:
# Сохраним первоначальные датафреймы для сравнения чистых и обработанных данных
raw_train = train.copy(deep=True)
raw_test = test.copy(deep=True)

In [484]:
# Все пропущенные значения на экране
train.isna().sum()

year                0
make             8043
model            8123
trim             8337
body            10393
transmission    51461
vin                 0
state               0
condition        9405
odometer           69
color             586
interior          586
seller              0
sellingprice        0
saledate            0
dtype: int64

In [485]:
# Преобразуем колонку с датой продажи к формату datetime
# train['odometer'] = np.floor(pd.to_numeric(train['odometer'], errors='coerce')).astype('Int64')
train['sellingprice'] = np.floor(pd.to_numeric(train['sellingprice'], errors='coerce')).astype('Int32')
train['condition'] = np.floor(pd.to_numeric(train['condition'], errors='coerce')).astype('Float64')

# test['odometer'] = np.floor(pd.to_numeric(test['odometer'], errors='coerce')).astype('Int64')
test['condition'] = np.floor(pd.to_numeric(test['condition'], errors='coerce')).astype('Float64')

train['saledate'] = pd.to_datetime(train['saledate'].str[:-15], format="%a %b %d %Y %H:%M:%S")
test['saledate'] = pd.to_datetime(test['saledate'].str[:-15], format="%a %b %d %Y %H:%M:%S")

# train['year'] = pd.to_datetime(train['year'], format="%Y")
# test['year'] = pd.to_datetime(test['year'], format="%Y")

In [486]:
# Изучим марки автомобилей

train['make'] = train['make'].str.capitalize()
display(f"Первоначальное число уникальных марок автомобилей: {train['make'].value_counts().count()}")
train['make'].value_counts()

'Первоначальное число уникальных марок автомобилей: 62'

Ford         74067
Chevrolet    47818
Nissan       42866
Toyota       31711
Dodge        24294
             ...  
Daewoo           1
Dodge tk         1
Mazda tk         1
Dot              1
Airstream        1
Name: make, Length: 62, dtype: int64

In [527]:
def make_unique(data):
    if not data or data in ['none', 'nan']:
        return 'Other'
    data = str(data)
    if data.find("ford") != -1:
        return "ford"
    elif data.find("gmc") != -1:
        return "gmc"
    elif data.find("land") != -1 and data.find("rover") != -1:
        return "landrover"
    elif data.find("mercedes") != -1:
        return "mercedes"
    elif data == "vw":
        return "volkswagen"
    elif data.find("dodge") != -1:
        return "dodge"
    elif data.find("mazda") != -1:  #  Hyundai
        return "mazda"
    elif data.find("hyundai") != -1:
        return "hyundai"
    else:
        return data

In [488]:
def body_unique(data):
    if not data or data in ['none', 'nan']:
        return 'Other'
    data = str(data)
    if data.find("cab") != -1 or data.find("crew") != -1:
        return "pick-up"
    if data.find("convertible") != -1:
        return "convertible"
    if data.find("coupe") != -1 or data.find("koup") != -1:
        return "coupe"
    if data.find("wagon") != -1:
        return "wagon"
    if data.find("van") != -1:
        return "van"
    if data.find("sedan") != -1:
        return "sedan"
    else:
        return data

In [489]:
train['make'] = train['make'].str.lower().apply(make_unique).str.capitalize()
train['make'].value_counts()

Ford            74070
Chevrolet       47818
Nissan          42866
Toyota          31711
Dodge           24295
Honda           21600
Hyundai         17187
Bmw             16254
Kia             14325
Chrysler        13796
Mercedes        13408
Infiniti        12123
Jeep            12107
Volkswagen       9901
Lexus            9363
Gmc              8368
Nan              8043
Mazda            6704
Cadillac         5958
Acura            4679
Audi             4635
Lincoln          4541
Buick            4023
Subaru           4019
Pontiac          3596
Ram              3592
Mitsubishi       3413
Volvo            2958
Mini             2473
Saturn           2247
Mercury          1561
Landrover        1476
Scion            1338
Jaguar           1100
Porsche          1094
Suzuki            846
Fiat              672
Hummer            597
Saab              384
Smart             311
Oldsmobile        294
Isuzu             163
Maserati          112
Bentley            91
Aston martin       23
Plymouth  

In [528]:
test['make'] = test['make'].str.lower().apply(make_unique).str.capitalize()
test['make'].value_counts()

Ford            18466
Chevrolet       11801
Nissan          10485
Toyota           7728
Dodge            6188
Honda            5382
Hyundai          4410
Bmw              4136
Kia              3571
Mercedes         3471
Chrysler         3459
Jeep             3069
Infiniti         3002
Volkswagen       2515
Lexus            2430
Other            2061
Gmc              2059
Mazda            1676
Cadillac         1496
Lincoln          1153
Acura            1151
Audi             1124
Buick            1019
Subaru           1015
Ram               889
Pontiac           866
Mitsubishi        806
Volvo             765
Mini              674
Saturn            544
Mercury           434
Landrover         371
Scion             318
Jaguar            297
Porsche           280
Suzuki            222
Fiat              181
Hummer            174
Saab               93
Oldsmobile         88
Smart              81
Isuzu              38
Bentley            23
Maserati           21
Tesla               6
Plymouth  

In [490]:
train['body'] = train['body'].str.lower().apply(body_unique).str.capitalize()
train['body'].value_counts()

Sedan          196720
Suv            113042
Pick-up         37243
Van             25146
Hatchback       20715
Coupe           15791
Wagon           12639
Nan             10393
Convertible      8547
Name: body, dtype: int64

In [530]:
test['body'] = test['body'].str.lower().apply(body_unique).str.capitalize()
test['body'].value_counts()

Sedan          48970
Suv            28295
Pick-up         9202
Van             6414
Hatchback       5152
Coupe           3995
Wagon           3287
Other           2594
Convertible     2149
Name: body, dtype: int64

In [491]:
train['model'] = train['model'].str.capitalize()
train['model'].fillna("Unknown", inplace=True)
train['model'].value_counts()

Altima            15454
F-150             11408
Fusion            10244
Camry             10000
Escape             9447
                  ...  
420-class             1
C230                  1
Rrs                   1
Activehybrid 5        1
G500                  1
Name: model, Length: 840, dtype: int64

In [531]:
test['model'] = test['model'].str.capitalize()
test['model'].fillna("Unknown", inplace=True)
test['model'].value_counts()

Altima           3736
F-150            2737
Fusion           2553
Camry            2423
Escape           2296
                 ... 
Exige               1
1                   1
C240w               1
Accord hybrid       1
Caprice             1
Name: model, Length: 747, dtype: int64

In [492]:
train['trim'] = train['trim'].str.capitalize()
train['trim'].fillna("Unknown", inplace=True)
train['trim'].value_counts()

Base                      43876
Se                        34498
Lx                        16511
Limited                   14516
Lt                        13431
                          ...  
Executive pzev                1
4c base                       1
3500 sh ceiling 158 wb        1
Chevy van base                1
4x4 v6 xlt sport              1
Name: trim, Length: 1851, dtype: int64

In [532]:
test['trim'] = test['trim'].str.capitalize()
test['trim'].fillna("Unknown", inplace=True)
test['trim'].value_counts()

Base                11009
Se                   8725
Lx                   4098
Limited              3536
Lt                   3280
                    ...  
4wd s                   1
Hx                      1
Gr tr gr touring        1
Mr touring              1
Awd xs ll bean          1
Name: trim, Length: 1458, dtype: int64

In [493]:
train['transmission'].value_counts()

automatic    375061
manual        13714
Name: transmission, dtype: int64

In [494]:
# train['transmission'].fillna(train.groupby(['make', 'body', 'trim', 'model'])['transmission'].median(), inplace=True)
train['transmission'].fillna("Unknown", inplace=True)
train['transmission'].value_counts()

automatic    375061
Unknown       51461
manual        13714
Name: transmission, dtype: int64

In [533]:
test['transmission'].fillna("Unknown", inplace=True)
test['transmission'].value_counts()

automatic    93584
Unknown      13011
manual        3463
Name: transmission, dtype: int64

In [495]:
train['color'].fillna("Unknown", inplace=True)
train['color'].value_counts()

black        87115
white        84149
silver       65667
gray         65284
blue         40237
red          34514
—            19520
green         8975
gold          8934
beige         7257
burgundy      7059
brown         5320
orange        1629
purple        1250
off-white     1143
yellow         979
Unknown        586
charcoal       389
turquoise      183
pink            32
lime            14
Name: color, dtype: int64

In [534]:
test['color'].fillna("Unknown", inplace=True)
test['color'].value_counts()

black        22006
white        20928
silver       16360
gray         16348
blue         10180
red           8384
—             4948
green         2270
gold          2207
beige         1826
burgundy      1759
brown         1300
orange         407
purple         284
off-white      275
yellow         274
Unknown        158
charcoal        84
turquoise       49
pink            10
lime             1
Name: color, dtype: int64

In [496]:
train['interior'].fillna("Unknown", inplace=True)
train['interior'].value_counts()

black        192442
gray         140843
beige         46878
tan           34709
—             13563
brown          6818
red            1070
blue            885
silver          844
Unknown         586
off-white       373
purple          276
gold            256
white           215
green           198
burgundy        155
orange          109
yellow           16
Name: interior, dtype: int64

In [535]:
test['interior'].fillna("Unknown", inplace=True)
test['interior'].value_counts()

black        48176
gray         34984
beige        11931
tan           8658
—             3380
brown         1664
red            264
blue           241
silver         225
Unknown        158
off-white      107
gold            64
purple          58
green           44
burgundy        34
orange          33
white           33
yellow           4
Name: interior, dtype: int64

In [497]:
train['condition'].fillna(train.groupby(['make', 'body', 'trim', 'model'])['condition'].transform('median'), inplace=True)
train['condition'].fillna(train.groupby(['make'])['condition'].transform('median'), inplace=True)
train['condition'] = np.round(train['condition'], decimals = 1)
train['condition'].value_counts()

4.0    142593
3.0    139473
2.0    108832
1.0     40302
5.0      8862
2.5        85
1.5        76
3.5        13
Name: condition, dtype: Int64

In [536]:
test['condition'].fillna(train.groupby(['make', 'body', 'trim', 'model'])['condition'].transform('median'), inplace=True)
test['condition'].fillna(train.groupby(['make'])['condition'].transform('median'), inplace=True)
test['condition'] = np.round(train['condition'], decimals = 1)
test['condition'].value_counts()

4.0    35434
3.0    35037
2.0    27153
1.0    10202
5.0     2196
2.5       21
1.5       13
3.5        2
Name: condition, dtype: Int64

In [498]:
train['odometer'].fillna(train.groupby(['make', 'body', 'model'])['odometer'].transform('mean'), inplace=True)

In [537]:
test['odometer'].fillna(train.groupby(['make', 'body', 'model'])['odometer'].transform('mean'), inplace=True)

In [499]:
train.isna().sum()

year            0
make            0
model           0
trim            0
body            0
transmission    0
vin             0
state           0
condition       0
odometer        0
color           0
interior        0
seller          0
sellingprice    0
saledate        0
dtype: int64

In [551]:
test.isna().sum()

year                 0
make                 0
model                0
trim                 0
body                 0
transmission         0
state                0
condition            0
odometer             0
color                0
interior             0
saledate_week_day    0
saledate_month       0
saledate_year        0
car_age              0
dtype: int64

# Добавление синтетических данных

In [500]:
train['saledate_week_day'] = train['saledate'].dt.weekday
train['saledate_month'] = train['saledate'].dt.month
train['saledate_year'] = train['saledate'].dt.year
train['car_age'] = train['saledate'].dt.year - train['year']

In [539]:
test['saledate_week_day'] = test['saledate'].dt.weekday
test['saledate_month'] = test['saledate'].dt.month
test['saledate_year'] = test['saledate'].dt.year
test['car_age'] = test['saledate'].dt.year - test['year']

In [501]:
train.drop(columns=[
    'vin',
    'seller',
    'saledate'
], inplace=True)

In [540]:
test.drop(columns=[
    'vin',
    'seller',
    'saledate'
], inplace=True)

In [521]:
corr = train.corr(method='histogram_intersection')
corr.style.background_gradient(cmap='coolwarm')

  corr = train.corr()


TypeError: Styler.background_gradient() got an unexpected keyword argument 'method'

# Исследовательский анализ данных

Тут пока пусто

# Подбор и обучение моделей машинного обучения

In [560]:
target_train = train['sellingprice']
features_train = train.drop(['sellingprice'], axis=1)

In [562]:
features_test = test

In [504]:
categorical_columns = list(filter(lambda column: is_string_dtype(features_train[column]), features_train.columns))
numeric_columns = list(filter(lambda column: not is_string_dtype(features_train[column]), features_train.columns))

In [505]:
categorical_columns

['make', 'model', 'trim', 'body', 'transmission', 'state', 'color', 'interior']

In [561]:
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(features_train[categorical_columns])

In [507]:
features_train_encoded = pd.DataFrame(
    encoder.transform(features_train[categorical_columns]), # .toarray()
    columns=encoder.get_feature_names_out(categorical_columns),
)
features_train = pd.concat((features_train[numeric_columns], features_train_encoded), axis=1)

In [508]:
features_train.head()

Unnamed: 0,year,condition,odometer,saledate_week_day,saledate_month,saledate_year,car_age,make,model,trim,body,transmission,state,color,interior
0,2011,4.0,111041.0,1,6,2015,4,16.0,242.0,1410.0,6.0,1.0,12.0,2.0,2.0
1,2014,3.0,31034.0,2,2,2015,1,16.0,321.0,1379.0,5.0,1.0,15.0,2.0,2.0
2,2012,2.0,35619.0,2,6,2015,3,38.0,667.0,69.0,5.0,1.0,19.0,2.0,2.0
3,2003,2.0,131301.0,2,6,2015,12,20.0,383.0,581.0,6.0,1.0,33.0,7.0,1.0
4,2007,2.0,127709.0,1,2,2015,8,16.0,321.0,1410.0,5.0,1.0,12.0,2.0,2.0


In [563]:
features_test_encoded = pd.DataFrame(
    encoder.transform(features_test[categorical_columns]), # .toarray()
    columns=encoder.get_feature_names_out(categorical_columns),
)
features_test = pd.concat((features_test[numeric_columns], features_test_encoded), axis=1)

In [564]:
describe_dataframe(features_test)

Unnamed: 0,year,condition,odometer,saledate_week_day,saledate_month,saledate_year,car_age,make,model,trim,body,transmission,state,color,interior
0,2005,4.0,116970.0,2,1,2015,10,7.0,203.0,581.0,5.0,1.0,3.0,16.0,2.0
1,2014,3.0,6286.0,4,2,2015,1,18.0,661.0,177.0,7.0,0.0,28.0,18.0,7.0
2,2013,2.0,11831.0,1,2,2015,2,38.0,513.0,1324.0,6.0,1.0,24.0,8.0,2.0
3,2013,2.0,57105.0,4,3,2015,2,8.0,397.0,1124.0,5.0,1.0,5.0,16.0,2.0
4,2013,2.0,31083.0,2,6,2015,2,38.0,742.0,1563.0,4.0,1.0,32.0,2.0,2.0
5,2003,2.0,104155.0,1,6,2015,12,53.0,540.0,951.0,8.0,1.0,17.0,16.0,2.0
6,2013,3.0,30669.0,1,3,2015,2,21.0,700.0,949.0,5.0,1.0,8.0,16.0,7.0
7,2013,4.0,87862.0,1,2,2015,2,16.0,285.0,581.0,6.0,1.0,17.0,2.0,7.0
8,2011,1.0,47028.0,2,2,2015,4,22.0,330.0,918.0,5.0,1.0,32.0,2.0,1.0
9,2007,1.0,191211.0,1,2,2015,8,8.0,723.0,47.0,6.0,1.0,6.0,2.0,14.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110058 entries, 0 to 110057
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   year               110058 non-null  int64  
 1   condition          110058 non-null  Float64
 2   odometer           110058 non-null  float64
 3   saledate_week_day  110058 non-null  int64  
 4   saledate_month     110058 non-null  int64  
 5   saledate_year      110058 non-null  int64  
 6   car_age            110058 non-null  int64  
 7   make               110058 non-null  float64
 8   model              110058 non-null  float64
 9   trim               110058 non-null  float64
 10  body               110058 non-null  float64
 11  transmission       110058 non-null  float64
 12  state              110058 non-null  float64
 13  color              110058 non-null  float64
 14  interior           110058 non-null  float64
dtypes: Float64(1), float64(9), int64(5)
memory usage: 1

None

Unnamed: 0,count,mean,std,min,50%,max
year,110058.0,2010.060005,3.96019,1982.0,2012.0,2015.0
condition,110058.0,2.929492,1.007287,1.0,3.0,5.0
odometer,110058.0,68075.991622,53517.960035,1.0,51932.5,999999.0
saledate_week_day,110058.0,2.14287,1.044089,0.0,2.0,6.0
saledate_month,110058.0,3.810518,3.241834,1.0,2.0,12.0
saledate_year,110058.0,2014.902751,0.296297,2014.0,2015.0,2015.0
car_age,110058.0,4.842747,3.938529,-1.0,3.0,33.0
make,110058.0,23.027404,14.945815,-1.0,19.0,54.0
model,110058.0,385.438024,236.984876,-1.0,330.0,839.0
trim,110058.0,1044.418434,483.230212,-1.0,1118.0,1850.0


Количество дублированных строк: 0


In [509]:
scaler = StandardScaler()
scaler.fit(features_train)

features_train = pd.DataFrame(
    scaler.transform(features_train),
    columns=features_train.columns,
    index=features_train.index)

In [510]:
describe_dataframe(features_train)

Unnamed: 0,year,condition,odometer,saledate_week_day,saledate_month,saledate_year,car_age,make,model,trim,body,transmission,state,color,interior
0,0.241305,1.059178,0.797397,-1.089094,0.672419,0.328663,-0.21809,-0.527942,-0.605503,0.75341,0.681573,0.228591,-0.361429,-1.258869,-0.705233
1,0.995464,0.066029,-0.696936,-0.131164,-0.560021,0.328663,-0.976742,-0.527942,-0.272387,0.689375,0.036378,0.228591,-0.101577,-1.258869,-0.705233
2,0.492692,-0.92712,-0.611299,-0.131164,0.672419,0.328663,-0.470974,0.965347,1.186576,-2.016637,0.036378,0.228591,0.244892,-1.258869,-0.705233
3,-1.769785,-0.92712,1.175804,-0.131164,0.672419,0.328663,1.804982,-0.256435,-0.010954,-0.95902,0.681573,0.228591,1.457534,-0.508241,-0.940031
4,-0.76424,-0.92712,1.108715,-1.089094,-0.560021,0.328663,0.793446,-0.527942,-0.272387,0.75341,0.036378,0.228591,-0.361429,-1.258869,-0.705233
5,0.744078,-0.92712,-0.998391,0.826766,0.364309,0.328663,-0.723858,0.422333,0.473961,-0.95902,0.036378,0.228591,-0.274811,-1.258869,-0.705233
6,-0.010081,0.066029,0.863628,-2.047024,-0.868131,0.328663,0.034794,1.168977,-0.175404,-1.465106,-1.254011,0.228591,0.071658,0.842889,-0.705233
7,0.744078,1.059178,-0.78586,-1.089094,-0.560021,0.328663,-0.723858,-0.527942,-0.479003,0.689375,0.681573,0.228591,-0.96775,-1.108743,0.468753
8,-2.523944,-1.920269,2.134391,0.826766,-0.868131,0.328663,2.563634,-0.188558,-0.588636,-0.198858,0.036378,0.228591,0.677979,-1.258869,2.112334
9,-1.267013,-1.920269,1.513177,-1.089094,0.672419,0.328663,1.299214,-0.527942,-0.28082,0.110991,1.971961,0.228591,0.677979,-1.258869,2.112334


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440236 entries, 0 to 440235
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   year               440236 non-null  float64
 1   condition          440236 non-null  float64
 2   odometer           440236 non-null  float64
 3   saledate_week_day  440236 non-null  float64
 4   saledate_month     440236 non-null  float64
 5   saledate_year      440236 non-null  float64
 6   car_age            440236 non-null  float64
 7   make               440236 non-null  float64
 8   model              440236 non-null  float64
 9   trim               440236 non-null  float64
 10  body               440236 non-null  float64
 11  transmission       440236 non-null  float64
 12  state              440236 non-null  float64
 13  color              440236 non-null  float64
 14  interior           440236 non-null  float64
dtypes: float64(15)
memory usage: 50.4 MB


None

Unnamed: 0,count,mean,std,min,50%,max
year,440236.0,-2.548067e-14,1.000001,-7.048899,0.492692,1.246851
condition,440236.0,-8.244333000000001e-17,1.000001,-1.920269,0.066029,2.052328
odometer,440236.0,1.329939e-16,1.000001,-1.276556,-0.3034,17.400938
saledate_week_day,440236.0,1.131417e-16,1.000001,-2.047024,-0.131164,3.700556
saledate_month,440236.0,-4.480476e-17,1.000001,-0.868131,-0.560021,2.521078
saledate_year,440236.0,-2.022082e-13,1.000001,-3.042632,0.328663,0.328663
car_age,440236.0,-1.096312e-16,1.000001,-1.48251,-0.470974,7.115546
make,440236.0,6.518963000000001e-17,1.000001,-1.613969,-0.324311,2.051374
model,440236.0,2.441988e-17,1.000001,-1.625934,-0.234437,1.911841
trim,440236.0,-2.056241e-16,1.000001,-2.159167,0.150238,1.6623


Количество дублированных строк: 0


In [565]:
features_test = pd.DataFrame(
    scaler.transform(features_test),
    columns=features_test.columns,
    index=features_test.index)

In [566]:
describe_dataframe(features_test)

Unnamed: 0,year,condition,odometer,saledate_week_day,saledate_month,saledate_year,car_age,make,model,trim,body,transmission,state,color,interior
0,-1.267013,1.059178,0.908137,-0.131164,-0.868131,0.328663,1.299214,-1.138832,-0.769952,-0.95902,0.036378,0.228591,-1.140984,0.842889,-0.705233
1,0.995464,0.066029,-1.159167,1.784696,-0.560021,0.328663,-0.976742,-0.392188,1.161276,-1.793545,1.326767,-2.437424,1.024448,1.14314,0.468753
2,0.744078,-0.92712,-1.0556,-1.089094,-0.560021,0.328663,-0.723858,0.965347,0.537211,0.575764,0.681573,0.228591,0.677979,-0.358116,-0.705233
3,0.744078,-0.92712,-0.209994,1.784696,-0.251911,0.328663,-0.723858,-1.070956,0.048079,0.162632,0.036378,0.228591,-0.96775,0.842889,-0.705233
4,0.744078,-0.92712,-0.696021,-0.131164,0.672419,0.328663,-0.723858,0.965347,1.502825,1.069456,-0.608816,0.228591,1.370917,-1.258869,-0.705233
5,-1.769785,-0.92712,0.668784,-1.089094,0.672419,0.328663,1.804982,1.983498,0.651061,-0.194727,1.971961,0.228591,0.071658,0.842889,-0.705233
6,0.744078,0.066029,-0.703753,-1.089094,-0.251911,0.328663,-0.723858,-0.188558,1.325726,-0.198858,0.036378,0.228591,-0.707898,0.842889,0.468753
7,0.744078,1.059178,0.364471,-1.089094,-0.560021,0.328663,-0.723858,-0.527942,-0.424187,-0.95902,0.681573,0.228591,0.071658,-1.258869,0.468753
8,0.241305,-1.920269,-0.398207,-0.131164,-0.560021,0.328663,-0.21809,-0.120681,-0.234437,-0.262893,0.036378,0.228591,1.370917,-1.258869,-0.940031
9,-0.76424,-1.920269,2.294775,-1.089094,-0.560021,0.328663,0.793446,-1.070956,1.422709,-2.062081,0.681573,0.228591,-0.881132,-1.258869,2.112334


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110058 entries, 0 to 110057
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   year               110058 non-null  float64
 1   condition          110058 non-null  float64
 2   odometer           110058 non-null  float64
 3   saledate_week_day  110058 non-null  float64
 4   saledate_month     110058 non-null  float64
 5   saledate_year      110058 non-null  float64
 6   car_age            110058 non-null  float64
 7   make               110058 non-null  float64
 8   model              110058 non-null  float64
 9   trim               110058 non-null  float64
 10  body               110058 non-null  float64
 11  transmission       110058 non-null  float64
 12  state              110058 non-null  float64
 13  color              110058 non-null  float64
 14  interior           110058 non-null  float64
dtypes: float64(15)
memory usage: 12.6 MB


None

Unnamed: 0,count,mean,std,min,50%,max
year,110058.0,0.005003,0.995538,-7.048899,0.492692,1.246851
condition,110058.0,-0.003996,1.000386,-1.920269,0.066029,2.052328
odometer,110058.0,-0.005083,0.999583,-1.276556,-0.306603,17.400938
saledate_week_day,110058.0,0.005696,1.000164,-2.047024,-0.131164,3.700556
saledate_month,110058.0,-0.002182,0.998841,-0.868131,-0.560021,2.521078
saledate_year,110058.0,0.000809,0.998906,-3.042632,0.328663,0.328663
car_age,110058.0,-0.004973,0.995991,-1.48251,-0.470974,7.115546
make,110058.0,-0.050944,1.014473,-1.681846,-0.324311,2.051374
model,110058.0,-0.000674,0.999284,-1.63015,-0.234437,1.911841
trim,110058.0,-0.001756,0.998188,-2.161233,0.150238,1.6623


Количество дублированных строк: 0


In [518]:
kf = KFold(n_splits=3, random_state=32123, shuffle=True)
params = {
    'criterion': ['absolute_error', 'squared_error', 'poisson', 'friedman_mse'],
    'max_depth': range(2, 10, 4),
    'random_state': [32123]
}
dt_pipeline = make_pipeline(DecisionTreeRegressor())

In [519]:
new_params = {key: params[key] for key in params}  # decisiontreeregressor__
grid_dt = GridSearchCV(
    DecisionTreeRegressor(),
    param_grid=params,
    cv=kf,
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
    return_train_score=True)

In [520]:
grid_dt.fit(features_train, target_train)
grid_dt.best_params_

{'criterion': 'absolute_error', 'max_depth': 16, 'random_state': 32123}

In [522]:
grid_dt.best_score_

-0.24502968885379958

In [523]:
model = DecisionTreeRegressor(
    criterion='absolute_error',
    max_depth=16,
    random_state=32123
)
model.fit(features_train, target_train)

In [546]:
describe_dataframe(sample_submission)

Unnamed: 0,vin,sellingprice
0,1g6dp567450124779,13592.209588
1,1gtw7fca7e1902207,13592.209588
2,jn8az1mw6dw303497,13592.209588
3,2g1wf5e34d1160703,13592.209588
4,1n6aa0ec3dn301209,13592.209588
5,wvwvd63b93e175638,13592.209588
6,5npeb4ac4dh809686,13592.209588
7,1fm5k7b97dgb16454,13592.209588
8,jn1cv6ar5bm411441,13592.209588
9,3gnfc16j77g158033,13592.209588


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110058 entries, 0 to 110057
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   vin           110058 non-null  object 
 1   sellingprice  110058 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.7+ MB


None

Unnamed: 0,count,mean,std,min,50%,max
sellingprice,110058.0,13592.209588,7.275991e-12,13592.209588,13592.209588,13592.209588


Количество дублированных строк: 0


In [567]:
submission = model.predict(features_test)
submission

array([ 3900., 22200., 17400., ...,  3400., 20800., 15800.])

In [572]:
sample_submission['sellingprice'] = submission
sample_submission.to_csv('sample_submission.csv', index=False)