# Построим простую модель

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Загрузка датасета

In [2]:
# Load the preprocessed training table and keep only the feature columns
# by discarding the `price` target.
X = (
    pd.read_csv('train_preprocessed.csv')
    .drop(columns='price')
)
# Show the first record transposed so that every feature is visible.
X.head(1).T

Unnamed: 0,0
body_type,седан
brand,BMW
color,чёрный
description,авто бодрый ход работать отлично электрик ходо...
engine_displacement,3.0
engine_power,5.609472
fuel_type,бензин
mileage,245000
model_year,2007
n_doors,4


In [3]:
# Target: read only the `price` column from the training file.
# `y` is therefore a single-column DataFrame, not a Series.
y = pd.read_csv(
    'train_preprocessed.csv',
    usecols=['price'],
)
y.head(1)

Unnamed: 0,price
0,599000.0


In [4]:
# Load the preprocessed test set (features only) used for the final predictions.
X_test = pd.read_csv(filepath_or_buffer='test_preprocessed.csv')
# Show the first record transposed so that every feature is visible.
X_test.head(1).T

Unnamed: 0,0
body_type,универсал
brand,MERCEDES
color,чёрный
description,весь привет продать эксклюзивный проект универ...
engine_displacement,3.0
engine_power,5.398163
fuel_type,бензин
mileage,350000
model_year,1984
n_doors,5


# Проверим готовность данных

In [5]:
# Summarise column dtypes and non-null counts to check the features are ready for modelling.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6682 entries, 0 to 6681
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   body_type             6682 non-null   object 
 1   brand                 6682 non-null   object 
 2   color                 6682 non-null   object 
 3   description           6679 non-null   object 
 4   engine_displacement   6682 non-null   float64
 5   engine_power          6682 non-null   float64
 6   fuel_type             6682 non-null   object 
 7   mileage               6682 non-null   int64  
 8   model_year            6682 non-null   int64  
 9   n_doors               6682 non-null   int64  
 10  production_year       6682 non-null   int64  
 11  sell_id               6682 non-null   int64  
 12  vehicle_transmission  6682 non-null   object 
 13  n_owners              6682 non-null   object 
 14  ti_own                6682 non-null   float64
 15  drive_type           

Для простой модели оставим 2 признака:
- бренд
- пробег

In [6]:
# Feature subset used by the baseline model: car brand and mileage.
columns_to_model = ['brand', 'mileage']
# Keep only those columns of the training features.
X = X.loc[:, columns_to_model]

In [7]:
# One-hot encode the categorical features; numeric columns pass through unchanged.
X = pd.get_dummies(data=X)
# Display the resulting feature names.
X.columns

Index(['mileage', 'brand_AUDI', 'brand_BMW', 'brand_MERCEDES'], dtype='object')

# Проверяем модели

## Разделяем на тренировочную и валидационную части

In [8]:
# Split settings: share of rows held out for validation, and the random seed
# that makes the split reproducible.
test_size, seed = 0.2, 73

In [9]:
# Hold out a validation subset; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=test_size, random_state=seed)
X_train, X_valid, y_train, y_valid = split_parts

# Report the (features, target) shapes of the training and validation parts.
for features_part, target_part in ((X_train, y_train), (X_valid, y_valid)):
    print(features_part.shape, target_part.shape)

(5345, 4) (5345, 1)
(1337, 4) (1337, 1)


In [10]:
# Fit the baseline linear regression on the training split.
# `fit` returns the estimator itself, so the call can be chained.
simple_model = LinearRegression().fit(X_train, y_train)
simple_model

LinearRegression()

In [11]:
# Validation: predict on the hold-out split and score it with MAPE,
# printed as a percentage.
y_predict = simple_model.predict(X_valid)
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=y_predict)
print(f'MAPE = {mape*100:0.1f}%')

MAPE = 70.2%


In [12]:
# Inspect the fitted parameters: one coefficient per feature column, then the intercept.
print(simple_model.coef_, simple_model.intercept_)

[[-1.18143585e+01 -4.46930267e+05 -8.09501132e+04  5.27880380e+05]] [3279244.43785409]


# Результат для Kaggle

In [13]:
# Prepare the test set with the same steps as the training features:
# select the model columns, then one-hot encode the categorical ones.
X_test = X_test[columns_to_model]
X_test = pd.get_dummies(X_test)
# Encoding the test set on its own produces dummies only for the categories that
# occur in it, so its columns may differ from the training matrix in set or order.
# Align to the training columns: dummies absent from the test set become all-zero
# columns, and categories never seen in training are dropped.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [14]:
# Refit a fresh linear regression on the full labelled dataset (train + validation)
# before predicting the test set. `fit` returns the estimator, so the call is chained.
simple_model = LinearRegression().fit(X, y)
simple_model

LinearRegression()

In [15]:
# Predict prices for the prepared test features with the model fitted on all data.
y_test_predict = simple_model.predict(X_test)

In [16]:
# Fill the sample submission template with the predictions and save it for Kaggle.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as
# test_preprocessed.csv — confirm (e.g. by comparing an id column) before submitting.
submission = pd.read_csv('sample_submission.csv')
# The model was fitted on a one-column DataFrame target, so predict() returns an
# (n_rows, 1) array; flatten it so exactly one scalar per row is assigned.
submission['price'] = np.ravel(y_test_predict)
submission.to_csv('submission_simple_model.csv', index=False)