# 4.2. Regression model's Metrics

* R-Squared score or Coefficient of Determination
* MAE (Mean Absolute Error)
* MSE (Mean Square Error)

In [1]:
# Importing libraries 
import numpy as np
import pandas as pd

In [2]:
# Load the California housing dataset that ships with scikit-learn
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Display the returned object: a dict-like container with 'data', 'target',
# 'frame', 'target_names', 'feature_names' and 'DESCR' entries
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [3]:
# Wrap the feature matrix in a DataFrame, labelling each column with its feature name
housing_df = pd.DataFrame(
    data=housing["data"],
    columns=housing["feature_names"],
)
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [4]:
# Attach the dataset's target values (`MedHouseVal`) as a "target" column alongside the features
housing_df["target"] = housing["target"]
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [10]:
# Imports used by this cell. `train_test_split` is not imported by any earlier
# cell, so import it here to keep the notebook runnable from a fresh kernel.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG; scikit-learn falls back to it when `random_state`
# is left unset, so both the split and the forest below are reproducible
np.random.seed(42)

# Separate the features (X) from the target (y)
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the estimator (chosen after consulting the scikit-learn estimator-choosing map)
model = RandomForestRegressor()

# Fit the model on the training data
model.fit(X_train, y_train)

# Evaluate on the test data; for regressors `.score()` reports R-squared
model.score(X_test, y_test)

0.8065734772187598

# R-Squared metrics

Compares the model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all a model does is predict the mean of the targets, its R-squared value would be 0. If the model perfectly predicts a range of numbers, its R-squared value would be 1. 

In [11]:
# Mean of the test targets: the constant baseline prediction that R-squared compares against
y_test.mean()

2.0550030959302275

In [12]:
from sklearn.metrics import r2_score

# Baseline "predictions": one copy of the test-target mean per test sample
y_test_mean = np.full(shape=len(y_test), fill_value=y_test.mean())
y_test_mean

array([2.0550031, 2.0550031, 2.0550031, ..., 2.0550031, 2.0550031,
       2.0550031])

In [13]:
# Predicting the mean for every sample scores exactly 0
r2_score(y_true=y_test, y_pred=y_test_mean)

0.0

In [14]:
# Perfect predictions (the targets themselves) score exactly 1
r2_score(y_true=y_test, y_pred=y_test)

1.0

# MAE (Mean Absolute Error)

MAE is the average of the absolute differences between the predictions and the actual values.
It gives an idea of how wrong the model's predictions are on average.

In [16]:
# Import the MAE metric
from sklearn.metrics import mean_absolute_error
# Predict on the held-out test features; these predictions are reused by the metric cells below
y_preds = model.predict(X_test)
y_preds

array([0.49384  , 0.75494  , 4.9285964, ..., 4.8363785, 0.71782  ,
       1.67901  ])

In [17]:
# Average absolute gap between the actual and predicted test targets
mean_absolute_error(y_true=y_test, y_pred=y_preds)

0.32659871732073664

In [20]:
# Put actual and predicted values side by side, then add the signed error
# (predicted minus actual) for each test sample
df = pd.DataFrame(
    {
        "Actual values": y_test,
        "Predicted values": y_preds,
    }
)
df["Differences"] = df["Predicted values"].sub(df["Actual values"])
df

Unnamed: 0,Actual values,Predicted values,Differences
20046,0.47700,0.493840,0.016840
3024,0.45800,0.754940,0.296940
15663,5.00001,4.928596,-0.071414
20484,2.18600,2.543160,0.357160
9814,2.78000,2.331760,-0.448240
...,...,...,...
15362,2.63300,2.220380,-0.412620
16623,2.66800,1.947760,-0.720240
18086,5.00001,4.836378,-0.163632
2144,0.72300,0.717820,-0.005180


In [21]:
# Naive attempt at reproducing the MAE: the plain mean of the signed differences.
# Positive and negative errors cancel out, so this does NOT match the MAE computed above.
df["Differences"].mean()

0.012048392126937535

In [23]:
# MAE averages absolute errors, so take the absolute value of each difference first
df["Differences"].abs().mean()

0.32659871732073803

# MSE (Mean Squared Error)
MSE is the mean of the square of the errors between actual and predicted values

In [24]:
# Import the MSE metric and evaluate it on the test-set predictions
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y_test, y_pred=y_preds)

0.2534678520824551

In [27]:
# Square each signed error so it can be averaged into the MSE by hand
df["Square diff"] = df["Differences"].pow(2)
df.head()

Unnamed: 0,Actual values,Predicted values,Differences,Square diff
20046,0.477,0.49384,0.01684,0.000284
3024,0.458,0.75494,0.29694,0.088173
15663,5.00001,4.928596,-0.071414,0.0051
20484,2.186,2.54316,0.35716,0.127563
9814,2.78,2.33176,-0.44824,0.200919


In [29]:
# Verify the result: the mean of the squared differences should match
# the value returned by mean_squared_error (up to floating-point rounding)
df["Square diff"].mean()

0.25346785208245565

***NOTE***
Which regression metric should we use?
* R-squared is similar to accuracy. It gives a quick indication of how well the model might be doing. Generally, the closer the R-squared value is to 1.0, the better the model. However, it does not tell you exactly how wrong the model is in terms of how far off each prediction is.
* MAE gives a better indication of how far off the model's predictions are on average.
* As for MAE or MSE: because of the way MSE is calculated (squaring the differences between predicted and actual values), it amplifies larger differences. For example:

 ***Pay more attention to MAE:*** When being \$10,000 off is ***twice*** as bad as being \$5,000 off.
 
 ***Pay more attention to MSE:*** When being \$10,000 off is ***more than twice*** as bad as being \$5,000 off.