# Выделение целевого признака

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv('../data/beautiful_df.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [3]:
data

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,0,0,1,0,190000,2010,0,0,0,2.5,...,1,1,1,0,1,0,1,1,1,16
1,0,0,1,1,290000,2002,0,0,0,3.0,...,1,0,0,1,1,0,0,0,1,83
2,0,1,1,2,402000,2001,0,0,0,2.5,...,1,0,0,0,0,0,0,1,1,151
3,0,2,0,1,10000,1999,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,86
4,0,3,1,3,280000,2001,0,0,0,2.5,...,1,0,1,1,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,54,1108,1,0,290000,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,301
38527,54,1111,0,1,321000,2004,2,0,1,2.2,...,1,0,0,1,1,0,0,1,1,317
38528,54,1108,1,1,777957,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,369
38529,54,1111,0,3,20000,2001,0,0,0,2.0,...,1,0,0,0,0,0,0,0,1,490


In [4]:
data['price_usd'].value_counts()

1500.00    637
3500.00    568
2000.00    561
1000.00    552
2500.00    546
          ... 
6053.23      1
9130.00      1
8661.20      1
4097.51      1
5666.00      1
Name: price_usd, Length: 2677, dtype: int64

In [5]:
y = data["price_usd"]
X = data.drop(["price_usd"], axis=1)

In [6]:
y

0        10900.00
1         5000.00
2         2800.00
3         9999.00
4         2134.11
           ...   
38526     2750.00
38527     4800.00
38528     4300.00
38529     4000.00
38530     3200.00
Name: price_usd, Length: 38531, dtype: float64

In [7]:
X

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,0,0,1,0,190000,2010,0,0,0,2.5,...,1,1,1,0,1,0,1,1,1,16
1,0,0,1,1,290000,2002,0,0,0,3.0,...,1,0,0,1,1,0,0,0,1,83
2,0,1,1,2,402000,2001,0,0,0,2.5,...,1,0,0,0,0,0,0,1,1,151
3,0,2,0,1,10000,1999,0,0,0,3.0,...,0,0,0,0,0,0,0,0,0,86
4,0,3,1,3,280000,2001,0,0,0,2.5,...,1,0,1,1,0,0,0,0,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,54,1108,1,0,290000,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,301
38527,54,1111,0,1,321000,2004,2,0,1,2.2,...,1,0,0,1,1,0,0,1,1,317
38528,54,1108,1,1,777957,2000,0,0,0,3.5,...,1,0,0,1,1,0,0,1,1,369
38529,54,1111,0,3,20000,2001,0,0,0,2.0,...,1,0,0,0,0,0,0,0,1,490


## Разделение данных

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30824, 29), (30824,), (7707, 29), (7707,))

# Линейная регрессия

In [11]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt

In [12]:
lr = LinearRegression().fit(X_train, y_train)

In [13]:
lr.predict(X_test)

array([ 7366.31872144, 17188.70752523,  3405.20865022, ...,
       11871.63328973,  5542.88534125,  1487.1284297 ])

In [14]:
y_pred = lr.predict(X_test)

In [15]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr.score(X_test, y_test)}')

MAE: 2332.264383217121
MSE: 13728555.489239968
RMSE: 3705.2065380002728
MAPE: 1.4543785747008366
R^2: 0.687584693181398


In [16]:
len(lr.coef_)
lr.coef_

array([-1.78843262e+01,  2.15157926e+00,  8.54497672e+02,  1.06782254e+02,
       -5.30255037e-03,  3.69208469e+02,  3.05538402e+02, -6.54479912e+02,
        6.17487852e+02,  1.65849802e+03,  1.46154922e+02,  6.38142048e+03,
        2.20423858e+03, -9.55161073e+02, -2.18465787e+02, -1.16487735e+02,
        8.72265797e+01, -1.08865774e+00,  6.02147757e+02, -9.03847708e+02,
        5.73968018e+02,  7.36481356e+02,  3.11542198e+02,  2.53401706e+02,
        1.00218521e+03,  1.34272836e+03,  3.21010853e+02,  2.43002337e+02,
        1.96567742e+00])

In [17]:
# L1
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 2332.304942498558
MSE: 13728306.677960154
RMSE: 3705.1729619493008
MAPE: 2.116136543216806
R^2: 0.6875903552812732


array([-1.78453403e+01,  2.14985359e+00,  8.54525009e+02,  1.06787756e+02,
       -5.30306244e-03,  3.69253845e+02,  3.05553058e+02, -6.54166465e+02,
        6.17401606e+02,  1.65847515e+03,  1.46128401e+02,  6.36160474e+03,
        2.20984503e+03, -9.55120541e+02, -2.18202487e+02, -1.16498212e+02,
        8.72353495e+01, -1.08965009e+00,  6.02182166e+02, -9.04071416e+02,
        5.73983085e+02,  7.36346361e+02,  3.11625350e+02,  2.53286438e+02,
        1.00212675e+03,  1.34249876e+03,  3.20875055e+02,  2.42973326e+02,
        1.96543942e+00])

In [18]:
# L2
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 2332.144573619198
MSE: 13728434.134448102
RMSE: 3705.1901617120952
MAPE: 2.117658990996311
R^2: 0.6875874548043935


array([-1.76393936e+01,  2.14163188e+00,  8.53395182e+02,  1.06788938e+02,
       -5.30731570e-03,  3.69552848e+02,  3.05618890e+02, -6.40364157e+02,
        6.15473242e+02,  1.65900729e+03,  1.45665282e+02,  6.30705300e+03,
        2.21664967e+03, -9.53362428e+02, -2.14746576e+02, -1.16488480e+02,
        8.72894435e+01, -1.09008561e+00,  5.96033329e+02, -9.03743522e+02,
        5.73038971e+02,  7.35348137e+02,  3.11230860e+02,  2.51264014e+02,
        1.00117264e+03,  1.34099161e+03,  3.18598688e+02,  2.38620345e+02,
        1.96372926e+00])

In [19]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

In [20]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [21]:
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
# выводим оптимальные значения параметров
#GridSearchCV
ridge_optimal.best_params_

{'alpha': 0.9}

In [22]:
# L1
ridge = Ridge(alpha=0.9).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 2332.337442295239
MSE: 13728111.72459027
RMSE: 3705.146653587449
MAPE: 2.1168663360018205
R^2: 0.6875947917579962


array([-1.78143074e+01,  2.14848011e+00,  8.54546623e+02,  1.06792136e+02,
       -5.30347058e-03,  3.69289967e+02,  3.05565119e+02, -6.53916251e+02,
        6.17332087e+02,  1.65845670e+03,  1.46107263e+02,  6.34586083e+03,
        2.21429212e+03, -9.55088091e+02, -2.17992824e+02, -1.16506575e+02,
        8.72423272e+01, -1.09043752e+00,  6.02209775e+02, -9.04249023e+02,
        5.73995060e+02,  7.36238824e+02,  3.11691811e+02,  2.53194696e+02,
        1.00208002e+03,  1.34231539e+03,  3.20766990e+02,  2.42950100e+02,
        1.96525000e+00])

In [23]:
y_pred

array([ 7367.55732214, 17189.5048655 ,  3404.63249615, ...,
       11871.33229638,  5543.08866727,  1486.91730948])

In [24]:
y_test

38184     8400.00
5006     45293.56
1314      2750.00
6334       700.00
15859     4350.00
           ...   
10051      474.25
15342    16900.00
19665    10300.00
6994      2714.60
6114      3500.00
Name: price_usd, Length: 7707, dtype: float64

In [25]:
# L2
lasso = Lasso(alpha=0.9).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 2332.0540286767687
MSE: 13728410.36110907
RMSE: 3705.186953597493
MAPE: 2.1196134984530532
R^2: 0.6875879958048661


array([-1.74434474e+01,  2.13367397e+00,  8.52513191e+02,  1.06794284e+02,
       -5.31112797e-03,  3.69828350e+02,  3.05683249e+02, -6.29071520e+02,
        6.13861617e+02,  1.65941470e+03,  1.45273571e+02,  6.24755901e+03,
        2.22657855e+03, -9.51923513e+02, -2.11771208e+02, -1.16489077e+02,
        8.73397346e+01, -1.09122791e+00,  5.91141787e+02, -9.03660174e+02,
        5.72295733e+02,  7.34441561e+02,  3.10981790e+02,  2.49553860e+02,
        1.00036259e+03,  1.33960220e+03,  3.16668956e+02,  2.35114751e+02,
        1.96217073e+00])

# Полиномиальная

In [26]:
yy = data["price_usd"]
XX = data.drop(["price_usd"], axis=1)

In [27]:
lr = LinearRegression().fit(XX, yy)
lr.coef_

array([-1.81220468e+01,  2.18858726e+00,  8.35720469e+02,  1.09494063e+02,
       -5.47687928e-03,  3.68676615e+02,  2.56272528e+02, -6.13347402e+02,
        7.61107216e+02,  1.69254034e+03,  1.45185053e+02,  6.04636359e+03,
        2.37403743e+03, -9.73205705e+02, -1.97772307e+02, -1.12778004e+02,
        8.41602979e+01, -3.61912858e-01,  5.92447383e+02, -9.19761533e+02,
        5.91326303e+02,  7.41186828e+02,  3.09695279e+02,  2.60535493e+02,
        1.06118213e+03,  1.33028038e+03,  3.21568285e+02,  2.26687831e+02,
        1.86936657e+00])

In [28]:
from sklearn.preprocessing import PolynomialFeatures

In [29]:
# создаем объект, который позволит расширить множество предикторов
pf = PolynomialFeatures(2)  
# добавляем новые предикторы
X_p=pf.fit_transform(XX) 
X_p

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.60000e+01, 2.56000e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        8.30000e+01, 6.88900e+03],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        1.51000e+02, 2.28010e+04],
       ...,
       [1.00000e+00, 5.40000e+01, 1.10800e+03, ..., 1.00000e+00,
        3.69000e+02, 1.36161e+05],
       [1.00000e+00, 5.40000e+01, 1.11100e+03, ..., 1.00000e+00,
        4.90000e+02, 2.40100e+05],
       [1.00000e+00, 5.40000e+01, 1.10600e+03, ..., 1.00000e+00,
        6.32000e+02, 3.99424e+05]])

In [30]:
lr2 = LinearRegression().fit(X_p, yy)
lr2.coef_

array([ 6.64651692e+04,  2.53066635e+03, -1.76051382e+02, -1.79993458e+05,
       -1.67894414e+03,  6.58124852e-01, -4.95092044e+04, -1.24079325e+05,
        5.42514260e+04, -9.48506671e+04, -3.18185122e+05, -5.86313681e+03,
       -4.50912856e+05,  3.61715683e+05,  7.59905779e+04, -2.34170365e+03,
        6.26410193e+03, -5.19060131e+03, -1.13080215e+02, -6.27047856e+04,
        2.41816441e+04, -8.04494461e+04, -6.24444504e+04, -2.37995656e+04,
       -1.62265090e+04, -1.61729617e+05, -2.97294697e+04, -9.10193300e+04,
        5.17730164e+04, -2.21020288e+02,  3.22611677e+00, -7.32864357e-02,
        3.51137900e+00,  8.65977016e-01, -2.34624313e-05, -1.40193656e+00,
       -2.60872345e+02,  2.60608297e+02,  5.72149790e+02,  3.90749414e+01,
       -1.94818890e+00, -7.96384028e+01, -4.09732114e+01, -2.96789444e+00,
       -4.51385705e+00, -9.15289964e-01,  3.74090756e-01,  1.67603668e-01,
       -1.54732554e+01,  3.27374447e+00,  1.50468318e-01,  2.10750763e+00,
       -6.02183761e+00, -

In [32]:
polynomial_regression_classic = LinearRegression().fit(PolynomialFeatures(2).fit_transform(X_train), y_train)
y_predicted = polynomial_regression_classic.predict(PolynomialFeatures(2).fit_transform(X_test))
print(f"MAE: {mean_absolute_error(y_pred, y_test)}",
      f'RMSE: {mean_squared_error(y_pred, y_test)}',
      f'MSE: {mean_squared_error(y_pred, y_test)**0.5}',
      f'MAPE: {mean_absolute_percentage_error(y_pred, y_test)}',
      f'R^2: {polynomial_regression_classic.score(PolynomialFeatures(2).fit_transform(X_test), y_test)}',
      sep='\n')

MAE: 2332.0540286767687
RMSE: 13728410.36110907
MSE: 3705.186953597493
MAPE: 1.036170554879724
R^2: 0.8458302973808837
