# Baseline

Построим первую модель, которая будет предсказывать среднее значение целевой переменной.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

RANDOM_STATE = 42

In [4]:
df = pd.read_csv('../../../full-data/full_processed.csv')

In [5]:
from sklearn.model_selection import train_test_split

X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, shuffle=True)

In [6]:
y_mean = y_train.mean()

In [7]:
y_train_pred = pd.DataFrame(np.full((y_train.shape[0], 1), y_mean), columns=['price'])
y_test_pred = pd.DataFrame(np.full((y_test.shape[0], 1), y_mean), columns=['price'])

In [8]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

In [9]:
print('MSE на трейне:',MSE(y_train, y_train_pred))
print('r2_score на трейне', r2_score(y_train, y_train_pred))

print('MSE на тесте:',MSE(y_test, y_test_pred))
print('r2_score на тесте', r2_score(y_test, y_test_pred))

MSE на трейне: 5108167056090.841
r2_score на трейне 0.0
MSE на тесте: 5143299958041.462
r2_score на тесте -1.0104791440701533e-05


Результаты на тесте очень плохие.

## Linear model

Обучим простейшую линейную регрессию

In [10]:
numerical_cols = ['year', 'mileage', 'engine_capacity', 'engine_power', 'travel_distance']
categorical_cols = ['title', 'transmission', 'body_type', 'drive_type', 'color', 'fuel_type']

In [11]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin

In [12]:
class TitleExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if len(X[self.column].str.split())>=2:
          X[self.column] = X[self.column].str.split().str[0] + X[self.column].str.split().str[1]
        if len(X[self.column].str.split())==1:
          X[self.column] = X[self.column].str.split().str[0]
        return X



numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


categorical_transformer = Pipeline(steps=[
    ('title_extractor', TitleExtractor(column='title')),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

column_trans = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_cols),
        ('num', numeric_transformer, numerical_cols)
    ],
    remainder='drop'
)

linear_clf = Pipeline(steps=[('preprocessor', column_trans),
                      ('classifier', LinearRegression())])

In [13]:
linear_clf.fit(X_train, y_train)

In [14]:
y_train_pred = linear_clf.predict(X_train)
y_test_pred = linear_clf.predict(X_test)

In [15]:
print('MSE на трейне:',MSE(y_train, y_train_pred))
print('r2_score на трейне', r2_score(y_train, y_train_pred))

print('MSE на тесте:',MSE(y_test, y_test_pred))
print('r2_score на тесте', r2_score(y_test, y_test_pred))

MSE на трейне: 1165852805434.9211
r2_score на трейне 0.7717668994312178
MSE на тесте: 1163551816903.3071
r2_score на тесте 0.7737710061936918


Похоже на сильное переобучение. Посмотрим на веса молели:

In [18]:
linear_model = linear_clf.named_steps['classifier']

for i,j in zip(range(0,len(linear_model.coef_)), linear_model.coef_):
  print(i,j)

0 -2229538.245520963
1 -1025832.4969185559
2 -889351.7274097129
3 217090.58815452494
4 -563417.1679494868
5 -366058.8317738749
6 57009.7499001358
7 -469919.62016224535
8 -438501.6631466422
9 299665.7020080501
10 -190215.8864367628
11 -269812.24584390357
12 -292177.51568967843
13 -639236.5701236629
14 -2602305.104063767
15 299190.96048980404
16 -638351.4226896067
17 123733.27322934159
18 26540.082970046165
19 6558557.320490545
20 551605.0052595639
21 8028759.0201951135
22 1360517.418557434
23 1301507.5841122987
24 2125040.9261798165
25 1372319.7645169864
26 993699.5741528211
27 1444922.4834292503
28 1389132.6455016932
29 -590671.2757238881
30 769800.3094538744
31 -292790.18803095934
32 46209.77501002528
33 -112183.2234564545
34 123717.06028561514
35 743493.0669366855
36 399067.65599969425
37 2057705.258765576
38 -408640.9420073773
39 3688.87611654258
40 -364149.1040240335
41 4037092.3199218265
42 833940.9840203591
43 3610425.7712768987
44 1272185.2412882517
45 515399.2792995548
46 10113

In [19]:
Lasso_clf = Pipeline(steps=[('preprocessor', column_trans),
                      ('classifier', Lasso())])

In [None]:
Lasso_clf.fit(X_train, y_train)

In [None]:
y_train_pred = Lasso_clf.predict(X_train)
y_test_pred = Lasso_clf.predict(X_test)

In [None]:
print('MSE на трейне:',MSE(y_train, y_train_pred))
print('r2_score на трейне', r2_score(y_train, y_train_pred))

print('MSE на тесте:',MSE(y_test, y_test_pred))
print('r2_score на тесте', r2_score(y_test, y_test_pred))

MSE на трейне: 4099586003536.425
r2_score на трейне 0.8881174456240711
MSE на тесте: 11730031250475.412
r2_score на тесте 0.4702455667346822


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__alpha': [0.001, 0.01, 0.1, 1, 10],
    'classifier__max_iter': [1000],
    'classifier__tol': [1e-3],
    'classifier__selection': ['random']
}

grid_search = GridSearchCV(
    estimator=Lasso_clf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

best_lasso = grid_search.best_estimator_

print("Test score:", best_lasso.score(X_test, y_test))


Fitting 5 folds for each of 5 candidates, totalling 25 fits


  model = cd_fast.sparse_enet_coordinate_descent(


Лучшие параметры: {'classifier__alpha': 0.01, 'classifier__max_iter': 1000, 'classifier__selection': 'random', 'classifier__tol': 0.001}
Test score: 0.47055404292034053
