# Cross Validation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the advertising data; the path is relative to the working directory.
df = pd.read_csv('../Data/Advertising.csv')

In [4]:
# Peek at the first few rows to confirm the data loaded as expected.
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


----
----
----
## Train | Test Split Procedure 

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Test Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Test Data (by creating predictions and comparing to y_test)
7. Adjust Parameters as Necessary and repeat steps 5 and 6

In [24]:
# 0) The target is the 'sales' column; every remaining column is a feature.
y = df['sales']
X = df.drop(columns='sales')

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# 1) Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [27]:
# 2) Scale the features, because we don't know whether their units are the same
from sklearn.preprocessing import StandardScaler

In [28]:
# Create an unfitted scaler; it is fit on the training data in the next cell.
scaler = StandardScaler()

In [29]:
# Fit on the training split only, so that nothing from the test split leaks
# into the scaling (data leakage).
scaler.fit(X_train)

StandardScaler()

In [30]:
# Replace X_train with its scaled version. This overwrites the unscaled split,
# so re-running this cell on its own would scale the data a second time.
X_train = scaler.transform(X_train)

In [31]:
# 3) Scale the test features with the scaler that was fit on the training data
#    (transform only; the scaler is not re-fit here).
X_test = scaler.transform(X_test)

In [32]:
# 4)
from sklearn.linear_model import Ridge

In [33]:
# First candidate model: Ridge regression with alpha=100.
r_model = Ridge(alpha=100)

In [34]:
# 5) Train the model on the scaled training features and the training targets.
r_model.fit(X_train, y_train)

Ridge(alpha=100)

In [35]:
# Predict the target for the scaled test features.
y_predictions = r_model.predict(X_test)

In [36]:
# 6)
from sklearn.metrics import mean_squared_error

In [37]:
# Mean squared error between the true test targets and the model's predictions.
mean_squared_error(y_test, y_predictions)

7.34177578903413

In [41]:
# 7) Another model: same pipeline, but with alpha=1 instead of alpha=100.
#    Note that alpha is being compared using test-set scores here; the
#    Train | Validation | Test procedure exists to avoid exactly that.
another_model = Ridge(alpha=1).fit(X_train, y_train)  # fit() returns the estimator itself
y_predictions = another_model.predict(X_test)  # overwrites the first model's predictions
mean_squared_error(y_test, y_predictions)

2.319021579428752

-------

----
## Train | Validation | Test Split Procedure 

The final test set is often also called a "hold-out" set: you should not adjust parameters based on it, but instead use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Train, X Validation, and X Test Data (using the scaler fit in step 2)
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Validation Data (by creating predictions and comparing to y_val)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [42]:
# 0) Start again from the raw frame: 'sales' is the target, all other columns are features.
y = df['sales']
X = df.drop(columns='sales')

In [43]:
# 1) Split twice to obtain TRAIN | VALIDATION | TEST.
from sklearn.model_selection import train_test_split

# First split: keep 70% of the rows for training and set the other 30% aside.
X_train, X_other, y_train, y_other = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Second split: divide the set-aside rows 50% / 50% into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_other, y_other, test_size=0.5, random_state=101
)

In [44]:
# Total number of rows in the full dataset.
len(df)

200

In [45]:
# Number of rows in the training split.
len(X_train)

140

In [46]:
# Number of rows in the validation split.
len(X_val)

30

In [47]:
# Number of rows in the final test split.
len(X_test)

30

In [48]:
# 2)
from sklearn.preprocessing import StandardScaler

In [49]:
# Create a fresh, unfitted scaler for this procedure.
scaler = StandardScaler()

In [50]:
# Fit on the training split only; the validation and test splits are
# transformed later with this same fitted scaler.
scaler.fit(X_train)

StandardScaler()

In [51]:
# 3) Scale all three splits with the scaler that was fit on the training split.
#    The right-hand side is evaluated in full before any name is rebound.
X_train, X_test, X_val = [
    scaler.transform(part) for part in (X_train, X_test, X_val)
]

In [52]:
# 4)
from sklearn.linear_model import Ridge

In [53]:
# First candidate model: Ridge regression with alpha=100.
r_model = Ridge(alpha=100)

In [54]:
# 5) Train the first candidate on the scaled training split.
r_model.fit(X_train, y_train)

Ridge(alpha=100)

In [55]:
# Predict on the validation split (not the test split) while comparing models.
y_val_predictions = r_model.predict(X_val)

In [57]:
# 6)
from sklearn.metrics import mean_squared_error

In [59]:
# Validation MSE for the alpha=100 model.
mean_squared_error(y_val, y_val_predictions)

7.320101458823872

In [61]:
# 7) New model: second candidate, Ridge regression with alpha=1.
model_two = Ridge(alpha=1)

In [62]:
# Train the second candidate on the same scaled training split.
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [63]:
# Validation predictions for the second candidate; this overwrites the
# y_val_predictions produced by the first model.
y_val_predictions = model_two.predict(X_val)

In [64]:
# Validation MSE for the alpha=1 model, to compare against the alpha=100 model.
mean_squared_error(y_val, y_val_predictions)

2.383783075056986

In [65]:
# 8) Make predictions on the final test set with model_two.
y_final_test_predictions = model_two.predict(X_test)

In [67]:
# Final reported metric: MSE on the held-out test set. Per step 8 of the
# procedure, parameters should not be adjusted after looking at this number.
mean_squared_error(y_test, y_final_test_predictions)

2.2542600838005176

---------