# Regressão Linear

* from https://www.kaggle.com/datasets/danielkyrka/bmw-pricing-challenge


__Fatures:__
* maker_key: Brand car
* model_key: Model car
* mileage: Milage of motor
* engine_power: Power of motor
* registration_date: Date of registration 
* fuel: type of fuel (petrol, diesel...)
* paint_color: color of the car
* car_type: type of car
* feature_1: sistema de navegação
* feature_2: motor a gasolina
* feature_3: motor diesel
* feature_4: teto solar
* feature_5: transmissão automática
* feature_6: tração nas quatro rodas
* feature_7: assentos de couro
* feature_8: sistema de som premium
* price: sale price
* sold_at: date of sale

In [8]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error, mean_absolute_percentage_error

In [9]:
# blw pricing chalenge
data = pd.read_csv('./../../Datasets/bmw_pricing_challenge.csv')

In [10]:
data.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at
0,BMW,118,140411,100,2012-02-01,diesel,black,convertible,True,True,False,False,True,True,True,False,11300,2018-01-01
1,BMW,M4,13929,317,2016-04-01,petrol,grey,convertible,True,True,False,False,False,True,True,True,69700,2018-02-01
2,BMW,320,183297,120,2012-04-01,diesel,white,convertible,False,False,False,False,True,False,True,False,10200,2018-02-01
3,BMW,420,128035,135,2014-07-01,diesel,red,convertible,True,True,False,False,True,True,True,True,25100,2018-02-01
4,BMW,425,97097,160,2014-12-01,diesel,silver,convertible,True,True,False,False,False,True,True,True,33400,2018-04-01


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   maker_key          4843 non-null   object
 1   model_key          4843 non-null   object
 2   mileage            4843 non-null   int64 
 3   engine_power       4843 non-null   int64 
 4   registration_date  4843 non-null   object
 5   fuel               4843 non-null   object
 6   paint_color        4843 non-null   object
 7   car_type           4843 non-null   object
 8   feature_1          4843 non-null   bool  
 9   feature_2          4843 non-null   bool  
 10  feature_3          4843 non-null   bool  
 11  feature_4          4843 non-null   bool  
 12  feature_5          4843 non-null   bool  
 13  feature_6          4843 non-null   bool  
 14  feature_7          4843 non-null   bool  
 15  feature_8          4843 non-null   bool  
 16  price              4843 non-null   int64 


In [12]:
# Remove first col (just one value)
print("Brands in dataset: ", data.maker_key.unique())
data.drop('maker_key', axis=1, inplace=True)


Brands in dataset:  ['BMW']


In [13]:
# Duplicates
data.duplicated().sum()

0

In [14]:
# Missing values
data.isnull().sum().sum()

0

In [15]:
# Add days between registration date and sold_at
data['sold_at'] = pd.to_datetime(data['sold_at'])
data['registration_date'] = pd.to_datetime(data['registration_date'])
data['time_to_sale'] = data['sold_at'] - data['registration_date']

# Add registration year
data['year'] = data['registration_date'].dt.year

In [16]:
encoder = LabelEncoder()
for col in data.columns:
    if data[col].dtypes != 'int64':
        data[col] = encoder.fit_transform(data[col])

data

Unnamed: 0,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at,time_to_sale,year
0,2,140411,100,134,0,1,0,1,1,0,0,1,1,1,0,11300,0,169,2012
1,61,13929,317,184,3,5,0,1,1,0,0,0,1,1,1,69700,1,20,2016
2,21,183297,120,136,0,9,0,0,0,0,0,1,0,1,0,10200,1,167,2012
3,31,128035,135,163,0,7,0,1,1,0,0,1,1,1,1,25100,1,83,2014
4,33,97097,160,168,0,8,0,1,1,0,0,0,1,1,1,33400,3,72,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,13,39743,110,178,0,1,7,0,1,0,0,0,0,1,0,14600,7,55,2015
4839,12,49832,100,174,0,5,7,0,1,0,0,0,0,1,1,17500,7,65,2015
4840,13,19633,110,178,0,5,7,0,1,0,0,0,0,1,1,17000,8,58,2015
4841,12,27920,110,184,0,3,7,1,1,0,0,0,0,1,1,22700,8,40,2016


In [17]:
matrix_correlations = data.corr()
matrix_correlations.style.background_gradient(cmap='viridis')

Unnamed: 0,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at,time_to_sale,year
model_key,1.0,-0.021504,0.541018,0.006305,-0.024856,0.018869,0.588436,0.320416,0.145957,0.2191,0.508473,0.225805,0.044677,-0.05805,0.379913,0.484555,0.066709,0.000318,0.010359
mileage,-0.021504,1.0,-0.050116,-0.5202,-0.04286,-0.039669,-0.131402,0.065258,0.009695,0.003621,-0.052857,0.046706,-0.029626,0.154827,-0.040854,-0.409564,-0.000154,0.522942,-0.507261
engine_power,0.541018,-0.050116,1.0,0.080138,0.121625,0.002108,0.134099,0.327213,0.201202,0.312789,0.447769,0.341004,0.232058,0.008905,0.488579,0.638989,-0.028196,-0.07951,0.082577
registration_date,0.006305,-0.5202,0.080138,1.0,-0.201235,0.036807,0.035108,-0.078514,0.230845,-0.025752,0.056386,0.087387,0.074,-0.084439,0.18293,0.457325,-0.05518,-0.990376,0.988115
fuel,-0.024856,-0.04286,0.121625,-0.201235,1.0,-0.001952,-0.074873,-0.008028,-0.09532,0.012614,-0.073418,0.004505,0.027107,-0.047046,-0.001592,-0.017325,-0.014554,0.184279,-0.212563
paint_color,0.018869,-0.039669,0.002108,0.036807,-0.001952,1.0,0.033395,-0.078086,-0.079484,-0.00112,0.013036,-0.031725,0.020209,-0.112449,0.002493,0.016292,-0.000344,-0.042689,0.033216
car_type,0.588436,-0.131402,0.134099,0.035108,-0.074873,0.033395,1.0,0.048608,-0.036548,0.085205,0.379293,-0.037593,-0.0373,-0.092251,0.087504,0.267482,0.099682,-0.024013,0.034905
feature_1,0.320416,0.065258,0.327213,-0.078514,-0.008028,-0.078086,0.048608,1.0,0.305965,0.254764,0.230125,0.27832,0.134274,0.243831,0.225611,0.26932,0.080243,0.091565,-0.067249
feature_2,0.145957,0.009695,0.201202,0.230845,-0.09532,-0.079484,-0.036548,0.305965,1.0,0.150669,0.149922,0.285422,0.136106,0.370019,0.224007,0.246808,0.011428,-0.217449,0.234517
feature_3,0.2191,0.003621,0.312789,-0.025752,0.012614,-0.00112,0.085205,0.254764,0.150669,1.0,0.199477,0.198823,0.144153,0.062218,0.202217,0.251275,0.018045,0.032093,-0.025688


In [18]:
price_corr = data.corrwith(data['price']).reset_index().rename(columns={'index': 'features', 0: 'values'})
price_corr['values'] = price_corr['values'].apply(lambda x : abs(x))
price_corr.sort_values(by='values', inplace=True)
price_corr.style.background_gradient(cmap='viridis')

Unnamed: 0,features,values
13,feature_7,0.005337
16,sold_at,0.013613
5,paint_color,0.016292
4,fuel,0.017325
12,feature_6,0.210966
8,feature_2,0.246808
9,feature_3,0.251275
11,feature_5,0.261768
6,car_type,0.267482
7,feature_1,0.26932


In [19]:
def results_regression(y_test_, y_pred_, print_ = False):
    mse = mean_squared_error(y_test_, y_pred_)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y_test_, y_pred_)
    mape = mean_absolute_percentage_error(y_test_, y_pred_)
    r2 = r2_score(y_test_, y_pred_)

    if print_:
        print(f"mse: {mse}")
        print(f"rmse: {rmse}")    
        print(f"mae: {mae}")
        print(f"mape: {mape}")
        print(f"r2_score {r2}")

    return r2
    
def compute_linear_regression(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)

    y_pred = linear_regression.predict(X_test)

    return results_regression(y_test, y_pred)


In [20]:
# Test all features x price
X = data.drop('price', axis=1).copy()
y = data.price
compute_linear_regression(X, y)

0.711405926058922

In [21]:
# when normalizing ... same score
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

compute_linear_regression(X_scaled, y)

0.7114059260590464