<a href="https://colab.research.google.com/github/sk27110/AutoPriceForecast/blob/main/Lasso_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Seed for randomized steps (passed to train_test_split below) so reruns give the same split.
RANDOM_STATE = 42

In [None]:
# Load the dataset; it must contain a 'price' column, which is used as the regression target below.
df = pd.read_csv('processed_data.csv')

In [None]:
from sklearn.model_selection import train_test_split

# The target is the 'price' column; every other column is a candidate feature.
y = df['price']
X = df.drop(columns='price')

# Shuffled hold-out split with scikit-learn's default test fraction;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Baseline

Построим первую модель, которая будет предсказывать среднее значение целевой переменной.

In [None]:
# Constant baseline: the mean is taken over the training split only, so the test split does not leak into it.
y_mean = y_train.mean()

In [None]:
# Constant predictions: a single 'price' column filled with the training mean,
# one row per sample of the corresponding split.
y_train_pred = pd.DataFrame({'price': np.full(len(y_train), y_mean)})
y_test_pred = pd.DataFrame({'price': np.full(len(y_test), y_mean)})

In [None]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

In [None]:
# Compute the metrics for both splits first, then report them in train/test order.
train_mse = MSE(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_mse = MSE(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('MSE на трейне:', train_mse)
print('r2_score на трейне', train_r2)
print('MSE на тесте:', test_mse)
print('r2_score на тесте', test_r2)

MSE на трейне: 36641869918000.69
r2_score на трейне 0.0
MSE на тесте: 22143126002102.887
r2_score на тесте -3.307028624544017e-05


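Результаты ожидаемо плохие: константное предсказание даёт $R^2 \approx 0$ и на трейне, и на тесте. Это нижняя граница качества, с которой будем сравнивать следующие модели.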

# Linear model

Обучим простейшую линейную регрессию

In [None]:
# Columns routed to the numeric branch of the preprocessing pipeline.
numerical_cols = [
    'year',
    'mileage',
    'engine_capacity',
    'engine_power',
    'travel_distance',
]

# Columns routed to the categorical branch of the preprocessing pipeline.
categorical_cols = [
    'title',
    'transmission',
    'body_type',
    'drive_type',
    'color',
    'fuel_type',
]

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class TitleExtractor(BaseEstimator, TransformerMixin):
    """Shorten a free-text title column to its first two whitespace-separated words.

    The two words are concatenated without a separator (``'Toyota Camry 2.5'``
    becomes ``'ToyotaCamry'``); a one-word title is kept as that single word.
    Missing or empty titles come out as NaN so that a later imputation step
    can fill them.

    Parameters
    ----------
    column : str
        Name of the DataFrame column to shorten.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.column`` shortened.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``self.column`` with string values.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input frame is left untouched.
        """
        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()

        words = X[self.column].str.split()
        first = words.str[0]
        second = words.str[1]  # NaN wherever a title has fewer than two words

        # Decide per row: the previous code compared the length of the whole
        # Series (its row count) instead of each title's word count, which
        # turned every one-word title into NaN.
        X[self.column] = first.where(second.isna(), first + second)
        return X



# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: shorten 'title', replace gaps with the literal 'missing',
# then one-hot encode. Categories not seen during fit are encoded as all zeros.
categorical_transformer = Pipeline([
    ('title_extractor', TitleExtractor(column='title')),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch; columns in neither list are dropped.
column_trans = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numeric_transformer, numerical_cols),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by ordinary least squares.
# The last step is a regressor despite being named 'classifier'; the name is
# referenced elsewhere in the notebook, so it is kept as is.
linear_clf = Pipeline([
    ('preprocessor', column_trans),
    ('classifier', LinearRegression()),
])

In [None]:
# Fit the preprocessing steps and the linear regression on the training split.
linear_clf.fit(X_train, y_train)

In [None]:
# Predict on both splits so that train and test errors can be compared.
y_train_pred = linear_clf.predict(X_train)
y_test_pred = linear_clf.predict(X_test)

In [None]:
# Compute the metrics for both splits first, then report them in train/test order.
train_mse = MSE(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_mse = MSE(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('MSE на трейне:', train_mse)
print('r2_score на трейне', train_r2)
print('MSE на тесте:', test_mse)
print('r2_score на тесте', test_r2)

MSE на трейне: 4099498269576.759
r2_score на трейне 0.8881198399876737
MSE на тесте: 11642678226127.543
r2_score на тесте 0.47419062458826144


Похоже на сильное переобучение: $R^2 \approx 0.89$ на трейне против $\approx 0.47$ на тесте. Посмотрим на веса модели:

In [None]:
# Pull the fitted regression step out of the pipeline so its coefficients can be inspected.
linear_model = linear_clf.named_steps['classifier']

# Disabled: uncomment to print every coefficient next to its positional index.
# for i,j in zip(range(0,len(linear_model.coef_)), linear_model.coef_):
#   print(i,j)

In [None]:
# Same preprocessing as the linear model, followed by an L1-regularized
# regressor with its default alpha.
# random_state is fixed so that coordinate descent with selection='random'
# (used by the grid search over this pipeline) gives the same result on every
# run; with the default cyclic selection the seed has no effect.
Lasso_clf = Pipeline(steps=[
    ('preprocessor', column_trans),
    ('classifier', Lasso(random_state=RANDOM_STATE)),
])

In [None]:
# Fit the Lasso pipeline on the training split.
# NOTE(review): the saved output shows a warning raised from the coordinate-descent
# solver, presumably a ConvergenceWarning; confirm and consider a larger max_iter.
Lasso_clf.fit(X_train, y_train)

  model = cd_fast.sparse_enet_coordinate_descent(


In [None]:
# Predict on both splits with the fitted Lasso pipeline.
y_train_pred = Lasso_clf.predict(X_train)
y_test_pred = Lasso_clf.predict(X_test)

In [None]:
# Compute the metrics for both splits first, then report them in train/test order.
train_mse = MSE(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_mse = MSE(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('MSE на трейне:', train_mse)
print('r2_score на трейне', train_r2)
print('MSE на тесте:', test_mse)
print('r2_score на тесте', test_r2)

MSE на трейне: 4099586003536.425
r2_score на трейне 0.8881174456240711
MSE на тесте: 11730031250475.412
r2_score на тесте 0.4702455667346822


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: only alpha varies; each solver setting is pinned to one value.
param_grid = dict(
    classifier__alpha=[0.001, 0.01, 0.1, 1, 10],
    classifier__max_iter=[1000],
    classifier__tol=[1e-3],
    classifier__selection=['random'],
)

# 5-fold cross-validation on the training split; candidates are ranked by
# negated MSE (higher is better) and all available CPU cores are used.
grid_search = GridSearchCV(
    Lasso_clf,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

# The winning pipeline, refit on the whole training split by the search.
best_lasso = grid_search.best_estimator_

# Pipeline.score delegates to the final regressor, so this value is R^2, not MSE.
test_score = best_lasso.score(X_test, y_test)
print("Test score:", test_score)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


  model = cd_fast.sparse_enet_coordinate_descent(


Лучшие параметры: {'classifier__alpha': 0.01, 'classifier__max_iter': 1000, 'classifier__selection': 'random', 'classifier__tol': 0.001}
Test score: 0.47055404292034053
