In [151]:
#%pip install vininfo
#%pip install catboost

In [152]:
import pandas as pd
import numpy as np
import pickle

from vininfo import Vin

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

from catboost import Pool, CatBoostRegressor, CatBoostClassifier, cv
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

## Знакомство с данными

### train

In [186]:
train = pd.read_csv("/Users/sirena0789/Desktop/Автомобили/used-cars-price-prediction-22ds/train.csv")

In [154]:
train.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,sellingprice,saledate
0,2011,Ford,Edge,SEL,suv,automatic,2fmdk3jc4bba41556,md,4.2,111041.0,black,black,santander consumer,12500,Tue Jun 02 2015 02:30:00 GMT-0700 (PDT)
1,2014,Ford,Fusion,SE,Sedan,automatic,3fa6p0h75er208976,mo,3.5,31034.0,black,black,ars/avis budget group,14500,Wed Feb 25 2015 02:00:00 GMT-0800 (PST)
2,2012,Nissan,Sentra,2.0 SL,sedan,automatic,3n1ab6ap4cl698412,nj,2.2,35619.0,black,black,nissan-infiniti lt,9100,Wed Jun 10 2015 02:30:00 GMT-0700 (PDT)
3,2003,HUMMER,H2,Base,suv,automatic,5grgn23u93h101360,tx,2.8,131301.0,gold,beige,wichita falls ford lin inc,13300,Wed Jun 17 2015 03:00:00 GMT-0700 (PDT)
4,2007,Ford,Fusion,SEL,Sedan,automatic,3fahp08z17r268380,md,2.0,127709.0,black,black,purple heart,1300,Tue Feb 03 2015 04:00:00 GMT-0800 (PST)


In [155]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440236 entries, 0 to 440235
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          440236 non-null  int64  
 1   make          432193 non-null  object 
 2   model         432113 non-null  object 
 3   trim          431899 non-null  object 
 4   body          429843 non-null  object 
 5   transmission  388775 non-null  object 
 6   vin           440236 non-null  object 
 7   state         440236 non-null  object 
 8   condition     430831 non-null  float64
 9   odometer      440167 non-null  float64
 10  color         439650 non-null  object 
 11  interior      439650 non-null  object 
 12  seller        440236 non-null  object 
 13  sellingprice  440236 non-null  int64  
 14  saledate      440236 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 50.4+ MB


### test

In [187]:
test = pd.read_csv("/Users/sirena0789/Desktop/Автомобили/used-cars-price-prediction-22ds/test.csv")

In [157]:
test.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,saledate
0,2005,Cadillac,CTS,Base,Sedan,automatic,1g6dp567450124779,ca,2.7,116970.0,silver,black,lexus of stevens creek,Wed Jan 14 2015 04:30:00 GMT-0800 (PST)
1,2014,GMC,Savana Cargo,2500,Van,,1gtw7fca7e1902207,pa,4.4,6286.0,white,gray,u-haul,Fri Feb 27 2015 01:00:00 GMT-0800 (PST)
2,2013,Nissan,Murano,S,SUV,automatic,jn8az1mw6dw303497,oh,4.6,11831.0,gray,black,nissan-infiniti lt,Tue Feb 24 2015 01:30:00 GMT-0800 (PST)
3,2013,Chevrolet,Impala,LS Fleet,Sedan,automatic,2g1wf5e34d1160703,fl,2.3,57105.0,silver,black,onemain rem/auto club of miami inc dba north dad,Fri Mar 06 2015 02:00:00 GMT-0800 (PST)
4,2013,Nissan,Titan,SV,Crew Cab,automatic,1n6aa0ec3dn301209,tn,2.9,31083.0,black,black,nissan north america inc.,Wed Jun 03 2015 03:30:00 GMT-0700 (PDT)


In [158]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110058 entries, 0 to 110057
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          110058 non-null  int64  
 1   make          107997 non-null  object 
 2   model         107979 non-null  object 
 3   trim          107944 non-null  object 
 4   body          107464 non-null  object 
 5   transmission  97047 non-null   object 
 6   vin           110058 non-null  object 
 7   state         110058 non-null  object 
 8   condition     107679 non-null  float64
 9   odometer      110039 non-null  float64
 10  color         109900 non-null  object 
 11  interior      109900 non-null  object 
 12  seller        110058 non-null  object 
 13  saledate      110058 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 11.8+ MB


## Предобработка данных

### Вспомогательные функции

In [159]:
# Для понижения регистра
def reg_low(df):
    col_cat = ['make', 'model', 'trim', 'body', 'state', 'color', 'interior', 'seller']
    for i in col_cat:
        df[i] = df[i].str.lower()

In [160]:
# Удаление столбца с датой и добавление возраста машины и дня недели продажи
def change_date(df):
    df['saledate'] = pd.to_datetime(df['saledate'], utc=True)
    df['saleyaer'] = pd.DatetimeIndex(df['saledate']).year
    df['weekday'] = df['saledate'].dt.weekday
    df['machineage'] =  df['saleyaer'] - df['year']
    df = df.drop(['saledate'], axis=1)
    return(df)

### Основная часть

In [161]:
# Проверим данные на выбросы и мин/макс границы

train.describe()

test.describe()

Unnamed: 0,year,condition,odometer
count,110058.0,107679.0,110039.0
mean,2010.060005,3.423222,68074.331601
std,3.96019,0.951301,53520.988173
min,1982.0,1.0,1.0
25%,2007.0,2.7,28313.5
50%,2012.0,3.6,51922.0
75%,2013.0,4.2,98852.5
max,2015.0,5.0,999999.0


In [188]:
# Изменение формата даты и добавление новых признаков

train = change_date(train)

test = change_date(test)

In [189]:
# Изменение 141 строки с machineage = -1 на 0

train.loc[train['machineage'] < 0, 'machineage'] = 0

# Изменение 29 строк с machineage = -1

test.loc[test['machineage'] < 0, 'machineage'] = 0

In [190]:
# Удаление 1 строки с sellingprice = 1
train = train[train['sellingprice'] >= 100]

In [191]:
# Понижение регистра для удаления неявных дубликатов

reg_low(train)

reg_low(test)

## Заполнение пропусков

In [214]:
# Ячейка для обновления данных для моего удобства (не обращай внимания на нее)

train = pd.read_csv("/Users/sirena0789/Desktop/Автомобили/used-cars-price-prediction-22ds/train.csv")
test = pd.read_csv("/Users/sirena0789/Desktop/Автомобили/used-cars-price-prediction-22ds/test.csv")
train = change_date(train)
test = change_date(test)
train.loc[train['machineage'] < 0, 'machineage'] = 0
test.loc[test['machineage'] < 0, 'machineage'] = 0
train = train[train['sellingprice'] >= 100]
reg_low(train)
reg_low(test)

### Вспомогательные функции

Тут я пытаюсь заполнить пропуски. Для ускорения работы у меня прописаны функции для создания выборок, обучения на данных без пропусков для train, запись этой обученной модели в файл и функция, которая вытаскивает модель из файла для предсказаний в test. Так как у нас по большей части мультиклассовая классификация, то колонки  model, trim, body заполняются с помощью логистической регрессии, transmission заполнется кэтбустом (т к это бинарная классификация и для этойй цели кэтбуст лучше подойдет), а condition (регрессия) заполнется CatBoostRegressor.

In [215]:
# Функция для создания и разделения выборок в train для заполения transmission
# На входе датафрейм, на выходе выборки для обучения и конструирования итоговой таблицы

def sample_transmission_train(df):
    name = df.columns
    
    df_transmission_test_new = df.loc[df['transmission'].isnull()]
    ind = df_transmission_test_new.index
    df_transmission_test = df_transmission_test_new.drop(['sellingprice'], axis=1)
    df_transmission_train_new = df.loc[df['transmission'].notna()]
    df_transmission_train = df_transmission_train_new .drop(['sellingprice'], axis=1)
    df_transmission_train_features = df_transmission_train.drop(['transmission'], axis=1)
    df_transmission_train_target = df_transmission_train['transmission']
    df_transmission_test = df_transmission_test.drop(['transmission'], axis=1)
    df_transmission_train_target = pd.get_dummies(df_transmission_train_target, drop_first=True)
    return (df_transmission_train_features, df_transmission_train_target, 
            df_transmission_train, df_transmission_test, name, ind, 
            df_transmission_test_new, df_transmission_train_new)

In [216]:
# Функция для создания и разделения выборок в test для заполения transmission
# На входе датафрейм, на выходе выборки для обучения и конструирования итоговой таблицы

def sample_transmission_test(df):
    name = df.columns
    df_transmission_test = df.loc[df['transmission'].isnull()]
    ind = df_transmission_test.index
    df_transmission_train = df.loc[df['transmission'].notna()]
    df_transmission_train_features = df_transmission_train.drop(['transmission'], axis=1)
    df_transmission_train_target = df_transmission_train['transmission']
    df_transmission_test = df_transmission_test.drop(['transmission'], axis=1)
    df_transmission_train_target = pd.get_dummies(df_transmission_train_target, drop_first=True)
    return (df_transmission_train_features, df_transmission_train_target, 
            df_transmission_train, df_transmission_test, name, ind)

In [217]:
# Функция для обучения и сохранение моедли в файл для train transmission
# На входе выборки для обучения и конструирования итоговой таблицы, на выходе итоговый датафрейм

def train_transmission(df_transmission_train_features, df_transmission_train_target, 
                       df_transmission_train, df_transmission_test, name, ind,
                       df_transmission_test_new, df_transmission_train_new):
    model = CatBoostClassifier(learning_rate=0.2,
                          random_state=1234,
                          verbose=False,
                          cat_features=["make","model","trim","body","state","color", "interior", "seller"])
    model.fit(df_transmission_train_features, df_transmission_train_target)
    filename = 'model_transmission.sav'
    pickle.dump(model, open(filename, 'wb'))
    target_predict = model.predict(df_transmission_test)
    target_predict = pd.Series(data=target_predict, index=ind)
    target_predict = target_predict.replace([0, 1],['automatic', 'manual'])
    df_transmission_test_new['transmission'] = target_predict
    df_transmission_test_new = df_transmission_test_new[name]
    df = pd.concat([df_transmission_train_new, df_transmission_test_new], axis=0)
    return df

In [218]:
# Функция для извлечение моедли из файла и заполнение transmission в test
# На входе выборки для обучения и конструирования итоговой таблицы, на выходе итоговый датафрейм

def test_transmission(df_transmission_train_features, df_transmission_train_target,
                       df_transmission_test, name, ind):
    filename = 'model_transmission.sav'
    model = pickle.load(open(filename, 'rb'))
    target_predict = model.predict(df_transmission_test)
    target_predict = pd.Series(data=target_predict, index=ind)
    target_predict = target_predict.replace([0, 1],['automatic', 'manual'])
    df_transmission_test['transmission'] = target_predict
    df_transmission_test = df_transmission_test[name]
    df = pd.concat([df_transmission_train, df_transmission_test], axis=0)
    return df

In [219]:
# Функция для заполнения make с использованием vininfo

def vin(df):
    df.loc[df['make'].isnull(), 'make'] = df.loc[df['make'].isnull()].apply(
        lambda row: Vin(row['vin']).manufacturer, axis=1)
    return (df)

In [220]:
# Функция для создания и разделения выборок для заполения 'model','trim','body'.
# На вход поступает сам датафрейм, название столбца, который будем заполнять, 
# название столбцов в которых до сих пор присутствуют пропуски и они не пройдут в модель (их мы удаляем перед обучением), 
# и колонки которым требуется encoder, на выходе выборки для обучения и конструирования итоговой таблицы

def sample_filling(df, column, columns, encoder_columns):
    name = df.columns
    df_test = df.loc[df[column].isnull()]
    ind = df_test.index
    df_train = df.loc[df[column].notna()]
    df_target = df_train[column]
    df_features = df_train.drop(columns, axis=1)
    df_features = df_features.drop(column, axis=1)
    df_test_new = df_test.drop(column, axis=1)
    df_test = df_test.drop(column, axis=1)
    df_test = df_test.drop(columns, axis=1)
    label_encoder = LabelEncoder()
    for i in encoder_columns:
        kek = df_features[i].append(df_test[i])
        kek = kek.drop_duplicates()
        label_encoder = label_encoder.fit(kek)
        df_features[i] = label_encoder.transform(df_features[i])
        df_test[i] = label_encoder.transform(df_test[i])
    return (df_features, df_target, df_train, df_test, df_test_new, name, ind)

In [221]:
# Функция для обучения и заполнения для train для заполения 'model','trim','body'.
# На вход поступает выборки для обучения и конструирования итоговой таблицы 
# на выходе модель для записи в файл и итоговый датафрейм 

def train_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, column):
    model = LogisticRegression(random_state=12345, class_weight='balanced',
                               multi_class='multinomial', n_jobs=-1,  solver='sag')
    model = model.fit(df_features, df_target)
    target_predict = model.predict(df_test)
    target_predict = pd.Series(data=target_predict, index=ind)
    df_test_new[column] = target_predict
    df_test_new = df_test_new[name]
    df_new = pd.concat([df_train, df_test_new], axis=0)
    return (df_new, model)

In [222]:
# Функция для обучения и заполнения для train для заполения 'model','trim','body'.
# На вход поступает выборки для обучения и конструирования итоговой таблицы 
# на выходе итоговый датафрейм 

def test_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, filename, column):
    model = pickle.load(open(filename, 'rb'))
    target_predict = model.predict(df_test)
    target_predict = pd.Series(data=target_predict, index=ind)
    df_test_new[column] = target_predict
    df_test_new = df_test_new[name]
    df_new = pd.concat([df_train, df_test_new], axis=0)
    return df_new

In [223]:
# Функция для обучения и заполнения 'condition'
# На вход поступает датафрейм и название столбца для заполнения
# на выходе итоговый датафрейм 

def filling_regr(df, column):
    # подготовка выборок
    name = df.columns
    df_test = df.loc[df[column].isnull()]
    ind = df_test.index
    df_train = df.loc[df[column].notna()]
    df_target = df_train[column]
    df_features = df_train.drop(column, axis=1)
    df_test_new = df_test.drop(column, axis=1)
    df_test = df_test.drop(column, axis=1)
    df_features = df_features.drop('transmission', axis=1)
    df_test = df_test.drop('transmission', axis=1)
    

    model = CatBoostRegressor(random_state=12345, cat_features=['make', 'model', 'trim', 'body',
                                 'state','color',
                                 'interior', 'seller'],
                          learning_rate=0.2, verbose= False).fit(df_features, df_target)
    target_predict = model.predict(df_test)
    target_predict = pd.Series(data=target_predict, index=ind)
    df_test_new[column] = target_predict
    df_test_new = df_test_new[name]
    df_new = pd.concat([df_train, df_test_new], axis=0)
    return df_new


### Основная часть

In [224]:
# Заполнение пропусков в 'color', 'interior'

train['color'] = train['color'].fillna('-')
train['interior'] = train['interior'].fillna('-')

test['color'] = test['color'].fillna('-')
test['interior'] = test['interior'].fillna('-')

In [225]:
# Заполнение пропусков в 'make'

train = vin(train)

test = vin(test)

In [226]:
# Удаление вин

train = train.drop(['vin'], axis=1)

# Сохранение индексов и вина из теста для итогового файла
ind = test.index
vin_test = test['vin']
vin_test = pd.DataFrame(data=vin_test, index=ind)
test = test.drop(['vin'], axis=1)

In [178]:
# Заполнение 'model' train с обучением

column = 'model'
columns = ['trim','body','transmission','condition' ,
           'odometer', 'saleyaer', 'weekday', 'sellingprice']
encoder_columns = ['make', 'state','color', 'interior', 'seller']

(df_features, df_target, df_train, df_test,
 df_test_new, name, ind) = sample_filling(train, column, columns, encoder_columns)

train, model_model = train_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, column)
filename = 'model_model.sav'
pickle.dump(model_model, open(filename, 'wb'))

In [228]:
# Заполнение 'model' test

columns = ['trim','body','transmission','condition' ,
           'odometer', 'saleyaer', 'weekday']

(df_features, df_target, df_train, df_test, 
 df_test_new, name, ind) = sample_filling (test, column, columns, encoder_columns)

test = test_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, filename, column)

In [None]:
# Заполнение 'trim' train с обучением

column = 'trim'
columns = ['body','transmission','condition' ,
           'odometer', 'saleyaer', 'weekday', 'sellingprice']
encoder_columns = ['make', 'model', 'state','color', 'interior', 'seller']

(df_features, df_target, df_train, df_test,
 df_test_new, name, ind) = sample_filling(train, column, columns, encoder_columns)

train, model_model = train_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, column)
filename = 'model_trim.sav'
pickle.dump(model_model, open(filename, 'wb'))

In [230]:
# Заполнение 'trim' test

columns = ['body','transmission','condition' ,
           'odometer', 'saleyaer', 'weekday']

(df_features, df_target, df_train, df_test, 
 df_test_new, name, ind) = sample_filling (test, column, columns, encoder_columns)

test = test_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, filename, column)

In [None]:
# Заполнение 'body' train с обучением
column = 'body'
columns = ['transmission','condition' ,
           'odometer', 'saleyaer', 'weekday', 'sellingprice']
encoder_columns = ['make', 'model', 'trim', 'state','color', 'interior', 'seller']

(df_features, df_target, df_train, df_test,
 df_test_new, name, ind) = sample_filling(train, column, columns, encoder_columns)

train, model_model = train_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, column)
filename = 'model_body.sav'
pickle.dump(model_model, open(filename, 'wb'))

In [232]:
# Заполнение 'body' test

columns = ['transmission','condition' ,
           'odometer', 'saleyaer', 'weekday']

(df_features, df_target, df_train, df_test, 
 df_test_new, name, ind) = sample_filling (test, column, columns, encoder_columns)

test = test_filling(df_features, df_target, df_train, df_test, df_test_new, name, ind, filename, column)

In [234]:
# Удаление пропусков в train в odometer (их мало)

train = train.dropna(subset=['odometer'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440166 entries, 0 to 440144
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          440166 non-null  int64  
 1   make          440166 non-null  object 
 2   model         440166 non-null  object 
 3   trim          440166 non-null  object 
 4   body          440166 non-null  object 
 5   transmission  388731 non-null  object 
 6   state         440166 non-null  object 
 7   condition     430810 non-null  float64
 8   odometer      440166 non-null  float64
 9   color         440166 non-null  object 
 10  interior      440166 non-null  object 
 11  seller        440166 non-null  object 
 12  sellingprice  440166 non-null  int64  
 13  saleyaer      440166 non-null  int64  
 14  weekday       440166 non-null  int64  
 15  machineage    440166 non-null  int64  
dtypes: float64(2), int64(5), object(9)
memory usage: 57.1+ MB


In [235]:
# Заполение пропусков в condition train

column = 'condition'
train = filling_regr(train, column)

In [236]:
# Заполение пропусков в 'condition' test

column = 'condition'
test = filling_regr(test, column)

In [237]:
# Заполение пропусков в transmission train

(df_transmission_train_features, df_transmission_train_target, 
            df_transmission_train, df_transmission_test, name, ind, 
            df_transmission_test_new, df_transmission_train_new) = sample_transmission_train(train)

train = train_transmission(df_transmission_train_features, df_transmission_train_target, 
            df_transmission_train, df_transmission_test, name, ind, 
            df_transmission_test_new, df_transmission_train_new)

In [238]:
# Заполение пропусков в transmission test

(df_transmission_train_features, df_transmission_train_target, 
            df_transmission_train, df_transmission_test, name, ind) = sample_transmission_test(test)

test = test_transmission(df_transmission_train_features, df_transmission_train_target,
                       df_transmission_test, name, ind)

## Первичный анадиз данных

### train

In [None]:
# Проверка распрделения стоимости
train['sellingprice'].hist(bins=200, figsize=(12,9))

In [None]:
# Проверка корреляции
train.corr()

### test

In [None]:
# Проверка корреляции
test.corr()

## Работа с выборками

### Вспомогательные функции

In [239]:
# Функция для LabelEncoder()
def LE(features, test, cat_columns):
    label_encoder = LabelEncoder()
    for i in cat_columns:
        features[i]= label_encoder.fit_transform(features[i])
        test[i]= label_encoder.fit_transform(test[i])
    return features, test

In [240]:
# Функция для StandardScaler()

def scaler(features_train,features_test, columns):
    scaler = StandardScaler()

    scaler.fit(features_train[columns])

    features_train = features_train.copy()
    features_train[columns] = scaler.transform(features_train[columns])

    features_test = features_test.copy()
    features_test[columns] = scaler.transform(features_test[columns])
    return features_train, features_test

In [241]:
# Функция для get_dummies (пробовала OHE(), но мой копмьютер совершенно не тянет эту кодировку, да и смысла не вижу)

def ohe(features_train,features_test):
    features_train = pd.get_dummies(features_train, drop_first=True)
    features_test = pd.get_dummies(features_test, drop_first=True)
    return features_train, features_test

Готовлю три разных выборки. Оригинальная, закодированная LabelEncoder() и закодированная get_dummies, чтобы посмотреть какиая лучше сработает

In [242]:
features_original = train.drop('sellingprice', axis=1)
test_original = test
features_le = train.drop('sellingprice', axis=1)
test_le = test
features_ohe = train.drop('sellingprice', axis=1)
test_ohe = test
target = train['sellingprice']

In [243]:
cat_columns = features_original.select_dtypes(include='object').columns.to_list()
numeric = features_original.select_dtypes(exclude='object').columns.to_list()

In [244]:
features_original, test_original = scaler(features_original, test_original, numeric)

In [245]:
features_le, test_le = scaler(features_le, test_le, numeric)
features_le, test_le = LE(features_le, test_le, cat_columns)

In [246]:
features_ohe, test_ohe = scaler(features_ohe, test_ohe, numeric)
features_ohe, test_ohe = ohe(features_ohe, test_ohe)

In [247]:
features_original_train, features_original_valid, target_original_train, target_original_valid = train_test_split(
    features_original, target, test_size=0.25, random_state=1234)

In [248]:
features_le_train, features_le_valid, target_le_train, target_le_valid = train_test_split(
    features_le, target, test_size=0.25, random_state=1234)

In [249]:
features_ohe_train, features_ohe_valid, target_ohe_train, target_ohe_valid = train_test_split(
    features_ohe, target, test_size=0.25, random_state=1234)

## Обучение моделей

In [250]:
# Функция для вычисления mape

def mape(test, pred):
    test, pred = np.array(test), np.array(pred)
    mape = np.mean(np.abs((test - pred) / test))
    return mape

### DecisionTreeRegressor

#### Закодированная LE выборка

In [None]:
%%time

model = DecisionTreeRegressor(random_state=12345)

max_depth = [x for x in range(2, 30)]
params = [{'max_depth':max_depth}]

clf = GridSearchCV(estimator = model, param_grid = params, cv = 2, n_jobs = -1 ,
                   scoring='neg_mean_absolute_percentage_error')

clf.fit(features_le_train, target_le_train)
pred = clf.predict(features_le_valid)
MAPE_DTR = mape(target_le_valid, pred)
print("Best parameters:")
print(clf.best_params_)
print("MAPE:", MAPE_DTR)

#### Закодированная OHE выборка

In [None]:
%%time

model = DecisionTreeRegressor(random_state=12345)

max_depth = [x for x in range(2, 20)]
params = [{'max_depth':max_depth}]

clf = GridSearchCV(estimator = model, param_grid = params, cv = 2, n_jobs = -1 ,
                   scoring='neg_mean_absolute_percentage_error')

clf.fit(features_ohe_train, target_ohe_train)
pred = clf.predict(features_ohe_valid)
MAPE_DTR = mape(target_ohe_valid, pred)
print("Best parameters:")
print(clf.best_params_)
print("MAPE:", MAPE_DTR)

### CatBoost

#### Оригинальная выборка

In [251]:
%%time

model = CatBoostRegressor(random_state=1234, cat_features=['make', 'model', 'trim', 'body',
                                 'transmission', 'state','color',
                                 'interior', 'seller'])

params = [{'learning_rate':[0.01, 0.03],
           'iterations':[1000, 3000],
                'verbose':[False]}]

clf = GridSearchCV(estimator = model, param_grid = params, cv = 2,
                   n_jobs = -1, scoring='neg_mean_absolute_percentage_error')

clf.fit(features_original_train, target_original_train)

estimator = clf.best_estimator_
filename = "model_CBR_original.cbm"
pickle.dump(estimator, open(filename, 'wb'))

pred = clf.predict(features_original_valid)
MAPE_CBR_original = mape(target_original_valid, pred)
print("Best parameters:")
print(clf.best_params_)
print("MAPE:", MAPE_CBR_original)

Best parameters:
{'iterations': 3000, 'learning_rate': 0.03, 'verbose': False}
MAPE: 0.17620170650673161
CPU times: user 21min 56s, sys: 29.1 s, total: 22min 25s
Wall time: 14min 27s


#### Закодированная LE выборка

In [None]:
%%time

model = CatBoostRegressor(random_state=1234)

params = [{'learning_rate':[0.01, 0.03],
           'iterations':[1000, 3000],
                'verbose':[False]}]

clf = GridSearchCV(estimator = model, param_grid = params, cv = 2,
                   n_jobs = -1, scoring='neg_mean_absolute_percentage_error')

clf.fit(features_le_train, target_le_train)

estimator = clf.best_estimator_
filename = "model_CBR_LE.cbm"
pickle.dump(estimator, open(filename, 'wb'))

pred = clf.predict(features_le_valid)
MAPE_CBR_LE = mape(target_le_valid, pred)
print("Best parameters:")
print(clf.best_params_)
print("MAPE:", MAPE_CBR_LE)

### LGBMRegressor

#### Закодированная OHE выборка

In [None]:
%%time


model = lgb.LGBMRegressor(random_state=1234, force_col_wise=True)

params = [{'learning_rate':[0.01, 0.03, 0.1],
           'iterations':[1000, 2000, 3000]}]

clf = GridSearchCV(estimator = model, param_grid = params, cv = 2,
                   n_jobs = -1, scoring='neg_mean_absolute_percentage_error')

clf.fit(features_le_train, target_le_train)

estimator = clf.best_estimator_
filename = "model_LGBM_le.cbm"
pickle.dump(estimator, open(filename, 'wb'))

pred = clf.predict(features_le_valid)
MAPE_LGBM_le = mape(target_le_valid, pred)
print("Best parameters:")
print(clf.best_params_)
print("MAPE:", MAPE_LGBM_le)

#### Закодированная LE выборка

In [None]:
%%time


model = lgb.LGBMRegressor(random_state=1234, force_col_wise=True)

params = [{'learning_rate':[0.01, 0.03, 0.1],
           'iterations':[1000, 2000, 3000],
                'verbose':[1, False]}]

clf = GridSearchCV(estimator = model, param_grid = params, cv = 2,
                   n_jobs = -1, scoring='neg_mean_absolute_percentage_error')

clf.fit(features_ohe_train, target_ohe_train)

estimator = clf.best_estimator_
filename = "model_LGBM_ohe.cbm"
pickle.dump(estimator, open(filename, 'wb'))

pred = clf.predict(features_ohe_valid)
MAPE_LGBM_ohe = mape(target_ohe_valid, pred)
print("Best parameters:")
print(clf.best_params_)
print("MAPE:", MAPE_LGBM_ohe)

## Финальная модель

In [252]:
filename = "model_CBR_original.cbm"
model = pickle.load(open(filename, 'rb'))
pred_test = model.predict(test_original)
pred = model.predict(features_original_valid)
MAPE_CBR_original = mape(target_original_valid, pred)
print("MAPE на валидационной выборке:", MAPE_CBR_original)

MAPE на валидационной выборке: 0.17620170650673161


## Сбор итогового файла

In [254]:
vin_test['sellingprice'] = pred_test
vin_test.to_csv(r'my_data.csv', index=False)
vin_test.head()

Unnamed: 0,vin,sellingprice
0,1g6dp567450124779,5509.94858
1,1gtw7fca7e1902207,20853.852539
2,jn8az1mw6dw303497,8601.685458
3,2g1wf5e34d1160703,21229.189587
4,1n6aa0ec3dn301209,2448.168583
