# Regressão Linear

* from https://www.kaggle.com/datasets/danielkyrka/bmw-pricing-challenge


__Fatures:__
* maker_key: Brand car
* model_key: Model car
* mileage: Milage of motor
* engine_power: Power of motor
* registration_date: Date of registration 
* fuel: type of fuel (petrol, diesel...)
* paint_color: color of the car
* car_type: type of car
* feature_1: sistema de navegação
* feature_2: motor a gasolina
* feature_3: motor diesel
* feature_4: teto solar
* feature_5: transmissão automática
* feature_6: tração nas quatro rodas
* feature_7: assentos de couro
* feature_8: sistema de som premium
* price: sale price
* sold_at: date of sale

In [127]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, \
    mean_absolute_error, mean_absolute_percentage_error

In [128]:
# blw pricing chalenge
data = pd.read_csv('./../../Datasets/bmw_pricing_challenge.csv')

In [129]:
data.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at
0,BMW,118,140411,100,2012-02-01,diesel,black,convertible,True,True,False,False,True,True,True,False,11300,2018-01-01
1,BMW,M4,13929,317,2016-04-01,petrol,grey,convertible,True,True,False,False,False,True,True,True,69700,2018-02-01
2,BMW,320,183297,120,2012-04-01,diesel,white,convertible,False,False,False,False,True,False,True,False,10200,2018-02-01
3,BMW,420,128035,135,2014-07-01,diesel,red,convertible,True,True,False,False,True,True,True,True,25100,2018-02-01
4,BMW,425,97097,160,2014-12-01,diesel,silver,convertible,True,True,False,False,False,True,True,True,33400,2018-04-01


In [130]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   maker_key          4843 non-null   object
 1   model_key          4843 non-null   object
 2   mileage            4843 non-null   int64 
 3   engine_power       4843 non-null   int64 
 4   registration_date  4843 non-null   object
 5   fuel               4843 non-null   object
 6   paint_color        4843 non-null   object
 7   car_type           4843 non-null   object
 8   feature_1          4843 non-null   bool  
 9   feature_2          4843 non-null   bool  
 10  feature_3          4843 non-null   bool  
 11  feature_4          4843 non-null   bool  
 12  feature_5          4843 non-null   bool  
 13  feature_6          4843 non-null   bool  
 14  feature_7          4843 non-null   bool  
 15  feature_8          4843 non-null   bool  
 16  price              4843 non-null   int64 


In [131]:
# Remove first col (just one value)
print("Brands in dataset: ", data.maker_key.unique())
data.drop('maker_key', axis=1, inplace=True)


Brands in dataset:  ['BMW']


In [132]:
# Duplicates
data.duplicated().sum()

0

In [133]:
# Missing values
data.isnull().sum().sum()

0

In [134]:
# Add days between registration date and sold_at
data['sold_at'] = pd.to_datetime(data['sold_at'])
data['registration_date'] = pd.to_datetime(data['registration_date'])
data['time_to_sale'] = data['sold_at'] - data['registration_date']

# Add registration year
data['year'] = data['registration_date'].dt.year

In [135]:
encoder = LabelEncoder()
for col in data.columns:
    if data[col].dtypes != 'float64':
        data[col] = encoder.fit_transform(data[col])

data.dtypes

model_key            int64
mileage              int64
engine_power         int64
registration_date    int64
fuel                 int64
paint_color          int64
car_type             int64
feature_1            int64
feature_2            int64
feature_3            int64
feature_4            int64
feature_5            int64
feature_6            int64
feature_7            int64
feature_8            int64
price                int64
sold_at              int64
time_to_sale         int64
year                 int64
dtype: object

In [136]:
matrix_correlations = data.corr()
matrix_correlations.style.background_gradient(cmap='viridis')

Unnamed: 0,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at,time_to_sale,year
model_key,1.0,-0.027289,0.563784,0.006305,-0.024856,0.018869,0.588436,0.320416,0.145957,0.2191,0.508473,0.225805,0.044677,-0.05805,0.379913,0.519635,0.066709,0.000318,0.010196
mileage,-0.027289,1.0,-0.04079,-0.445387,-0.043721,-0.037516,-0.143297,0.058805,0.049026,-0.009208,-0.052712,0.060215,-0.021406,0.171495,-0.027143,-0.420566,-0.007821,0.453224,-0.431971
engine_power,0.563784,-0.04079,1.0,0.080089,0.100917,0.000365,0.134563,0.333504,0.205722,0.313609,0.463591,0.376028,0.231198,-0.005449,0.523129,0.668643,-0.041794,-0.080261,0.082729
registration_date,0.006305,-0.445387,0.080089,1.0,-0.201235,0.036807,0.035108,-0.078514,0.230845,-0.025752,0.056386,0.087387,0.074,-0.084439,0.18293,0.505678,-0.05518,-0.990376,0.988895
fuel,-0.024856,-0.043721,0.100917,-0.201235,1.0,-0.001952,-0.074873,-0.008028,-0.09532,0.012614,-0.073418,0.004505,0.027107,-0.047046,-0.001592,-0.034651,-0.014554,0.184279,-0.211797
paint_color,0.018869,-0.037516,0.000365,0.036807,-0.001952,1.0,0.033395,-0.078086,-0.079484,-0.00112,0.013036,-0.031725,0.020209,-0.112449,0.002493,0.01643,-0.000344,-0.042689,0.033747
car_type,0.588436,-0.143297,0.134563,0.035108,-0.074873,0.033395,1.0,0.048608,-0.036548,0.085205,0.379293,-0.037593,-0.0373,-0.092251,0.087504,0.281076,0.099682,-0.024013,0.035041
feature_1,0.320416,0.058805,0.333504,-0.078514,-0.008028,-0.078086,0.048608,1.0,0.305965,0.254764,0.230125,0.27832,0.134274,0.243831,0.225611,0.294571,0.080243,0.091565,-0.067658
feature_2,0.145957,0.049026,0.205722,0.230845,-0.09532,-0.079484,-0.036548,0.305965,1.0,0.150669,0.149922,0.285422,0.136106,0.370019,0.224007,0.283215,0.011428,-0.217449,0.234515
feature_3,0.2191,-0.009208,0.313609,-0.025752,0.012614,-0.00112,0.085205,0.254764,0.150669,1.0,0.199477,0.198823,0.144153,0.062218,0.202217,0.266335,0.018045,0.032093,-0.025864


In [137]:
price_corr = data.corrwith(data['price']).reset_index().rename(columns={'index': 'features', 0: 'values'})
price_corr['values'] = price_corr['values'].apply(lambda x : abs(x))
price_corr.sort_values(by='values', inplace=True)
price_corr.style.background_gradient(cmap='viridis')

Unnamed: 0,features,values
16,sold_at,0.003507
13,feature_7,0.008889
5,paint_color,0.01643
4,fuel,0.034651
12,feature_6,0.238059
9,feature_3,0.266335
6,car_type,0.281076
8,feature_2,0.283215
7,feature_1,0.294571
11,feature_5,0.318721


In [138]:
def results_regression(y_test_, y_pred_, print_ = False):
    y_pred_ = y_pred_.astype(np.int64)
    mse = mean_squared_error(y_test_, y_pred_)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y_test_, y_pred_)
    mape = mean_absolute_percentage_error(y_test_, y_pred_)
    r2 = r2_score(y_test_, y_pred_)

    if print_:
        print(f"MSE: {mse}")
        print(f"Erro médio quadrático (RMSE): {rmse}")    
        print(f"Erro médio absoluto (MAE): {mae}")
        print(f"Erro de porcentagem absoluta média (MAPE): {mape}")
        print(f"R2 Score: {r2}")
        
    return r2
    
def compute_regression(X, y, method_):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    linear_regression = method_
    linear_regression.fit(X_train, y_train)

    y_pred = linear_regression.predict(X_test)

    results_regression(y_test, y_pred, True)


In [139]:
# Test all features x price
X = data.drop('price', axis=1).copy()
y = data.price


In [148]:
# Using linear regression
print("=="*30, "\nUsing linear regression \n")
compute_regression(X, y, LinearRegression())

# Using knn regression
print("=="*30, "\nUsing kNN regression \n")
compute_regression(X, y, KNeighborsRegressor())


Using linear regression 

MSE: 1236.6676986584107
Erro médio quadrático (RMSE): 35.16628639277128
Erro médio absoluto (MAE): 25.53250773993808
Erro de porcentagem absoluta média (MAPE): 2086807257677350.8
R2 Score: 0.7721966765469803
Using kNN regression 

MSE: 2277.8204334365323
Erro médio quadrático (RMSE): 47.72651708889443
Erro médio absoluto (MAE): 33.8328173374613
Erro de porcentagem absoluta média (MAPE): 2370315593352893.0
R2 Score: 0.5804086534087054


In [147]:
# when normalizing ... same score
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# with linear regression
print("=="*30, "\nNormalized and using linear regression \n")
compute_regression(X_scaled, y, LinearRegression())

# with knn
print("=="*30, "\nNormalized and using kNN regression \n")
compute_regression(X_scaled, y, KNeighborsRegressor())

Normalized and using linear regression 

MSE: 1236.6676986584107
Erro médio quadrático (RMSE): 35.16628639277128
Erro médio absoluto (MAE): 25.53250773993808
Erro de porcentagem absoluta média (MAPE): 2086807257677350.8
R2 Score: 0.7721966765469803
Normalized and using kNN regression 

MSE: 1290.5376676986584
Erro médio quadrático (RMSE): 35.92405416567927
Erro médio absoluto (MAE): 24.78534571723426
Erro de porcentagem absoluta média (MAPE): 1910195507584390.2
R2 Score: 0.7622734303952512
