In [2]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
boston = load_boston()
X = boston.data
Y = boston.target

print(X.shape)
print(Y.shape)

(506, 13)
(506,)


In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape)
print(X_test.shape)

# train_test_split gives a random split => so different split produced each time, so accuracy varies each time it 
# is run
# Cross validation can be used 

(404, 13)
(102, 13)


### Training our model

In [5]:
# 1. Create a Linear Regression object
lr = LinearRegression(normalize=True)
print(lr)

# 2. Training
lr.fit(X_train, Y_train)

# 3. Display calculated parameters
print(lr.coef_)         # Theta1 to Theta13
print(lr.intercept_)    # Theta0

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
[-1.18104550e-01  4.23622647e-02  3.95656915e-03  4.10979178e+00
 -1.58537548e+01  3.91320428e+00 -1.05255037e-02 -1.29374150e+00
  2.99406701e-01 -1.34124943e-02 -8.46926994e-01  8.19889993e-03
 -4.37501040e-01]
32.6632811723582


### Accuracy of model
##### Coefficient of Determination:

$R^2 = 1 - \frac{(y_A - y_P)^2}{(y_A - \overline{y})^2}$ <br>

The coefficient $R^2$ is defined as <br> $R^2$ = 1 - u/v <br>
where: <br>   u is the residual sum of squares ((y_true - y_pred) ** 2).sum() 
<br>    v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum().
- The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
- A constant model that always predicts the expected value of y, disregarding the input features, would get a $R^2$ score of 0.0.

In [6]:
print("Training accuracy score:", lr.score(X_train, Y_train))
print("Testing accuracy score:", lr.score(X_test, Y_test))

Training accuracy score: 0.7486843456600893
Testing accuracy score: 0.6833021705054052


### Cross Validation
- When we randomly divide dataset into training and testing sets, we get different accuracy scores
- We want to calculate average case accuracy score

#### k-fold Cross Validation
1. Divide dataset into k parts/folds
2. Repeat k times (for each fold):
    - Train on k-1 folds
    - Test/validate on other 1 fold
    - Compute score $s_i$
3. Take average of all these scores <br>
Average score = $\frac{s_1 + s_2 + ... + s_k}{k}$

In [7]:
from sklearn.model_selection import cross_val_score

In [19]:
scores = cross_val_score(lr, X_train, Y_train, cv=10, scoring='r2')  # For model = LR and R^2 accuracy score
scores2 = cross_val_score(lr, X_train, Y_train, cv=10, scoring='neg_mean_squared_error')  # For model = LR and mean squared error accuracy score

print(scores)
print()
print(scores.shape)
print(type(scores))

[0.8231021  0.7159263  0.6518783  0.76577106 0.53283634 0.534747
 0.44944689 0.80927103 0.84794208 0.83824311]

(10,)
<class 'numpy.ndarray'>


In [18]:
print(scores.mean())
print(scores.std())
# Accuracy fluctuates between 0.69 ± 0.13

print()
print(scores2.mean())
print(scores2.std())

0.6969164225429704
0.13889618242672833

-22.83110316982713
12.31189582662986
