# Regression metrics -- how well did we do?

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn import datasets 

# Fetch the California housing dataset through scikit-learn's dataset helpers.
data = datasets.fetch_california_housing()

# Wrap the raw feature matrix in a DataFrame labelled with the dataset's own
# feature names; keep the regression target exactly as the loader returns it.
features = pd.DataFrame(data["data"], columns=data["feature_names"])
target = data["target"]

# Preview the first rows of the feature table.
features.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Do our train-test-split

In [2]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. Fixing random_state makes the
# shuffle deterministic, so the split -- and every metric computed from it
# below -- is the same on each Restart & Run All instead of changing per run.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42
)

# Fit the model

In [3]:
# Build a linear regression with default settings and train it on the
# training split only; the fit call is the last expression so the fitted
# estimator is displayed as the cell output.
linreg = LinearRegression()
linreg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Test the model *on the test data* (we don't care much if it does well on the *training data*)

In [4]:
# Predictions for the held-out rows; every metric below compares these
# against y_test.
ypred = linreg.predict(X=X_test)

# Mean Squared Error

- Penalizes us more if we are 'more wrong'
- Nice and easy to implement
- Sensitive to outliers
- Difficult to interpret: the result is in the *square* of the target's units (e.g. "dollars squared")

In [5]:
# Square each prediction error, then average over the test set.
squared_errors = np.square(ypred - y_test)
mse = np.mean(squared_errors)
mse

0.53224166094384

We can also use the `metrics` module in sklearn:

In [6]:
from sklearn import metrics

In [7]:
# Same quantity via scikit-learn; naming the arguments makes it explicit
# which array is the ground truth and which is the prediction.
metrics.mean_squared_error(y_true=y_test, y_pred=ypred)

0.53224166094384

# Mean absolute error

- Less sensitive to outliers
- Easier to interpret
- The penalty grows only linearly with the error, so being off by a lot is not disproportionately worse than being off by a little

In [8]:
# Magnitude of each prediction error, averaged over the test set.
abs_errors = np.abs(ypred - y_test)
mae = np.mean(abs_errors)
mae

0.5397722524083323

In [9]:
# Cross-check the hand-computed MAE against scikit-learn's implementation.
metrics.mean_absolute_error(y_true=y_test, y_pred=ypred)

0.5397722524083323

# Mean absolute percentage error

- "Puts things into context"
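- Only defined when every true value is nonzero (we divide by `y_test`), and it becomes very large when true values are close to zero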

In [10]:
# Error relative to the true value for each test row (divides by y_test,
# so a zero true value would produce inf/nan here).
relative_errors = (ypred - y_test) / y_test

# Average the magnitudes and scale to a percentage.
mape = np.mean(np.abs(relative_errors)) * 100
mape

31.985656584977058

# Root mean squared error (RMSE)

"Mean squared error with better units": taking the square root of the MSE puts the error back in the same units as the target.

In [11]:
# Square root of the MSE computed in the Mean Squared Error section, so the
# error is expressed in the target's own units rather than squared units.
rmse = np.sqrt(mse)
rmse

0.729548943487577

# Coefficient of determination or R2

- More often used to determine whether we actually *fit* the training data
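- At most 1 (a perfect fit); 0 means the model is no better than always predicting the mean of the targets, and the score can be *negative* on held-out data when the model does worse than that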

In [12]:
# Coefficient of determination of the predictions on the held-out test set.
metrics.r2_score(y_true=y_test, y_pred=ypred)

0.6043891089865341