# Cross Validation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('../Data/Advertising.csv')

In [4]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


----
----
----
## Train | Test Split Procedure 

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Test Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Test Data (by creating predictions and comparing to Y_test)
7. Adjust Parameters as Necessary and repeat steps 5 and 6

In [24]:
# 0)
X = df.drop('sales', axis=1)
y = df['sales']

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [27]:
# 2) scaling because we don't know whether the unit are same or not
from sklearn.preprocessing import StandardScaler

In [28]:
scaler = StandardScaler()

In [29]:
scaler.fit(X_train) # only fit the train set to avoid data leakage

StandardScaler()

In [30]:
X_train = scaler.transform(X_train)

In [31]:
# 3)
X_test = scaler.transform(X_test)

In [32]:
# 4)
from sklearn.linear_model import Ridge

In [33]:
r_model = Ridge(alpha=100)

In [34]:
# 5)
r_model.fit(X_train, y_train)

Ridge(alpha=100)

In [35]:
y_predictions = r_model.predict(X_test)

In [36]:
# 6)
from sklearn.metrics import mean_squared_error

In [37]:
mean_squared_error(y_test, y_predictions)

7.34177578903413

In [41]:
# 7) Another Model
another_model = Ridge(alpha=1)
another_model.fit(X_train, y_train)
y_predictions = another_model.predict(X_test)
mean_squared_error(y_test, y_predictions)

2.319021579428752

-------

----
## Train | Validation | Test Split Procedure 

This is often also called a "hold-out" set, since you should not adjust parameters based on the final test set, but instead use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Eval Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to Y_eval)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [42]:
# 0)
X = df.drop('sales', axis=1)
y = df['sales']

In [43]:
# 1)
#### SPLIT TWICE! Here we create TRAIN | VALIDATION | TEST  #########
from sklearn.model_selection import train_test_split

X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.3, random_state=101)

# split again (50% , 50%)
X_val, X_test, y_val, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=101)

In [44]:
len(df)

200

In [45]:
len(X_train)

140

In [46]:
len(X_val)

30

In [47]:
len(X_test)

30

In [48]:
# 2)
from sklearn.preprocessing import StandardScaler

In [49]:
scaler = StandardScaler()

In [50]:
scaler.fit(X_train)

StandardScaler()

In [51]:
# 3)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

In [52]:
# 4)
from sklearn.linear_model import Ridge

In [53]:
r_model = Ridge(alpha=100)

In [54]:
# 5) 
r_model.fit(X_train, y_train)

Ridge(alpha=100)

In [55]:
y_val_predictions = r_model.predict(X_val)

In [57]:
# 6)
from sklearn.metrics import mean_squared_error

In [59]:
mean_squared_error(y_val, y_val_predictions)

7.320101458823872

In [61]:
# 7) new model
model_two = Ridge(alpha=1)

In [62]:
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [63]:
y_val_predictions = model_two.predict(X_val)

In [64]:
mean_squared_error(y_val, y_val_predictions)

2.383783075056986

In [65]:
# 8) Make Predictions on FInal Test Set
y_final_test_predictions = model_two.predict(X_test)

In [67]:
mean_squared_error(y_test, y_final_test_predictions)

2.2542600838005176

---------

# Cross Validation with cross_val_score function

### SCORING OPTIONS:
- https://scikit-learn.org/stable/modules/model_evaluation.html
- All scorer objects follow the convention that **higher return values are better than lower return values.** Thus metrics which measure the distance between the model and the data, like metrics.mean_squared_error, are available as neg_mean_squared_error which return the negated value of the metric.
----

<img src="grid_search_cross_validation.png">

----

In [100]:
X = df.drop('sales', axis=1)
y = df['sales']

In [101]:
from sklearn.model_selection import train_test_split

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [103]:
from sklearn.preprocessing import StandardScaler

In [104]:
scaler = StandardScaler()

In [105]:
scaler.fit(X_train)

StandardScaler()

In [106]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [107]:
r_model = Ridge(alpha=100)

In [108]:
from sklearn.model_selection import cross_val_score

In [109]:
scores = cross_val_score(r_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5) 

In [110]:
scores # scores for each fold , in this case we are using 5 folds so that 5 scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [111]:
abs(scores.mean())

8.215396464543606

#### Adjust model based on metrics

In [115]:
model2 = Ridge(alpha=1)

In [116]:
scores = cross_val_score(model2, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

In [117]:
abs(scores.mean())

3.344839296530695

#### Final Evaluation (Can no longer edit parameters after this!)

In [118]:
model2.fit(X_train, y_train)

Ridge(alpha=1)

In [119]:
y_final_test_predictions = model2.predict(X_test)

In [120]:
mean_squared_error(y_test, y_final_test_predictions)

2.319021579428752

----------

# Cross Validation with cross_validate function

- https://scikit-learn.org/stable/modules/model_evaluation.html

The cross_validate function differs from cross_val_score in two ways:

** It allows specifying multiple metrics for evaluation. **

** It returns a dict containing fit-times, score-times (and optionally training scores as well as fitted estimators) in addition to the test score. **

For single metric evaluation, where the scoring parameter is a string, callable or None, the keys will be:
        
        - ['test_score', 'fit_time', 'score_time']

And for multiple metric evaluation, the return value is a dict with the following keys:

    ['test_<scorer1_name>', 'test_<scorer2_name>', 'test_<scorer...>', 'fit_time', 'score_time']

return_train_score is set to False by default to save computation time. To evaluate the scores on the training set as well you need to be set to True.

In [121]:
## CREATE X and y
X = df.drop('sales',axis=1)
y = df['sales']

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# SCALE DATA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [123]:
from sklearn.model_selection import cross_validate

In [124]:
model = Ridge(alpha=100)

In [128]:
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [129]:
scores

{'fit_time': array([0.00200176, 0.00200272, 0.0010004 , 0.00100136, 0.00100112,
        0.00099969, 0.00200105, 0.00200295, 0.00100136, 0.00100064]),
 'score_time': array([0.00200129, 0.00099969, 0.00100088, 0.00100017, 0.00100017,
        0.00100255, 0.00099897, 0.0010004 , 0.00099921, 0.00100064]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_mean_absolute_error': array([-1.8102116 , -2.54195751, -1.46959386, -1.86276886, -2.52069737,
        -2.45999491, -1.45197069, -2.37739501, -2.44334397, -1.89979708])}

In [130]:
scores = pd.DataFrame(scores)
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.002002,0.002001,-6.060671,-1.810212
1,0.002003,0.001,-10.627031,-2.541958
2,0.001,0.001001,-3.993426,-1.469594
3,0.001001,0.001,-5.009494,-1.862769
4,0.001001,0.001,-9.1418,-2.520697
5,0.001,0.001003,-13.086256,-2.459995
6,0.002001,0.000999,-3.839405,-1.451971
7,0.002003,0.001,-9.058786,-2.377395
8,0.001001,0.000999,-9.055457,-2.443344
9,0.001001,0.001001,-5.778882,-1.899797


In [131]:
scores.mean()

fit_time                        0.001401
score_time                      0.001100
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

### Adjust model based on metrics

In [132]:
model = Ridge(alpha=1)

In [136]:
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [137]:
scores = pd.DataFrame(scores)

In [138]:
scores.mean()

fit_time                        0.001401
score_time                      0.001301
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [139]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [140]:
y_final_predictions = model.predict(X_test)

In [141]:
mean_squared_error(y_test, y_final_predictions)

2.319021579428752