## Regression model evaluation metrics

Model evaluation metrics for regression:

* R² (R-squared, the coefficient of determination)
* Mean Absolute Error (MAE)
* Mean Squared Error (MSE)

Find more in the scikit-learn model evaluation guide: https://scikit-learn.org/1.4/modules/model_evaluation.html

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [7]:
# Read the extended car-sales CSV into a DataFrame (the path is relative to
# the notebook's working directory), then display it as the cell output.
car_sales = pd.read_csv(
    filepath_or_buffer="sample_data/car-sales-extended.csv",
)
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [8]:
# One-hot encode the frame. No `columns` argument is given, so pandas picks
# the columns to expand itself: in the output below Make and Colour become
# indicator columns while Odometer (KM), Doors and Price are left as-is.
encoded_df = pd.get_dummies(data=car_sales)
encoded_df

Unnamed: 0,Odometer (KM),Doors,Price,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,35431,4,15323,False,True,False,False,False,False,False,False,True
1,192714,5,19943,True,False,False,False,False,True,False,False,False
2,84714,4,28343,False,True,False,False,False,False,False,False,True
3,154365,4,13434,False,False,False,True,False,False,False,False,True
4,181577,3,14043,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
995,35820,4,32042,False,False,False,True,True,False,False,False,False
996,155144,3,5716,False,False,True,False,False,False,False,False,True
997,66604,4,31570,False,False,True,False,False,True,False,False,False
998,215883,4,4001,False,True,False,False,False,False,False,False,True


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Target is the Price column; features are every other column.
y = encoded_df['Price']
X = encoded_df.drop(columns='Price')

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

# Random forest with default hyperparameters, seeded so results are repeatable.
model = RandomForestRegressor(random_state=5)
model.fit(X_train, y_train)

In [10]:
# For a scikit-learn regressor, .score() returns R^2 (the coefficient of
# determination) of the model's predictions on the data passed in.
model.score(X_test, y_test)

# The score is low, but that is fine here: the goal of this section is to
# practise using the evaluation metrics, not to tune the model.

0.2768270358797007

In [11]:
# Mean of the true prices in the test split.
y_test.mean()

np.float64(16568.11)

In [12]:
from sklearn.metrics import r2_score

# Build a constant "prediction" array: one entry per test sample, each equal
# to the mean of the true test values.
y_test_mean = np.full(shape=y_test.shape, fill_value=y_test.mean())

In [13]:
# Show the first 10 entries of the constant array.
y_test_mean[:10]

array([16568.11, 16568.11, 16568.11, 16568.11, 16568.11, 16568.11,
       16568.11, 16568.11, 16568.11, 16568.11])

In [14]:
# Baseline: predicting the test-set mean for every sample gives R^2 = 0.
r2_score(y_true=y_test, y_pred=y_test_mean)

0.0

In [15]:
# Upper bound: predictions identical to the true values give R^2 = 1.
r2_score(y_true=y_test, y_pred=y_test)

1.0