In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the advertising dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Advertising.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


----
----
----
## Train | Validation | Test Split Procedure 

The final test set is often also called a "hold-out" set, since you should not adjust parameters based on it; instead, use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data into Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Train, Eval, and Test Data (using the scaler fit on the training data)
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to y_eval)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [4]:
## CREATE X and y
# Features: every column except the target 'sales'.
# The df.head() preview also shows an 'Unnamed: 0' column holding 0, 1, 2, ...,
# which appears to be a row index saved with the CSV rather than an advertising
# channel, so it is excluded from the predictors.
# errors='ignore' keeps this cell working if the CSV is loaded without that column;
# a missing 'sales' column still raises KeyError, as before.
X = df.drop(columns='sales').drop(columns='Unnamed: 0', errors='ignore')

# Target: the value the model will learn to predict.
y = df['sales']

In [5]:
# Two-stage split: calling train_test_split twice yields
# TRAIN | VALIDATION | TEST partitions.
from sklearn.model_selection import train_test_split

# Stage 1: hold back 30% of the rows; the remaining 70% is the training set.
X_train, X_OTHER, y_train, y_OTHER = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# Stage 2: halve the held-back rows into the evaluation and test sets,
# so each is about 15% of the original data.
X_eval, X_test, y_eval, y_test = train_test_split(
    X_OTHER,
    y_OTHER,
    test_size=0.5,
    random_state=101,
)

In [6]:
# Standardize the features.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, so no statistics from the
# evaluation or test partitions influence the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to all three partitions.
X_train, X_eval, X_test = (
    scaler.transform(partition) for partition in (X_train, X_eval, X_test)
)

**Create Model**

In [7]:
from sklearn.linear_model import Ridge

In [8]:
# Deliberately poor regularization strength (alpha=100): gives a weak first
# model so that a later cell can adjust alpha based on the evaluation-set error.
model = Ridge(alpha=100)

In [9]:
# Train the ridge model on the scaled training features and training targets.
model.fit(X=X_train, y=y_train)

Ridge(alpha=100)

In [10]:
# Predict on the evaluation (validation) features; the test set is not touched here.
y_eval_pred = model.predict(X=X_eval)

**Evaluation**

In [11]:
from sklearn.metrics import mean_squared_error

In [12]:
# Mean squared error of the evaluation-set predictions (lower is better).
mean_squared_error(y_true=y_eval, y_pred=y_eval_pred)

7.320101458823872

**Adjust Parameters and Re-evaluate**

In [13]:
# Second attempt: replace the model with one using a much smaller alpha (100 -> 1).
model = Ridge(alpha=1)

In [14]:
# Train the re-created model on the same scaled training data.
model.fit(X=X_train, y=y_train)

Ridge(alpha=1)

In [15]:
# Recompute the evaluation-set predictions with the newly fitted model
# (overwrites the earlier predictions).
y_eval_pred = model.predict(X=X_eval)

**Another Evaluation**

In [16]:
# Evaluation-set mean squared error for the adjusted model; compare with the previous value.
mean_squared_error(y_true=y_eval, y_pred=y_eval_pred)

2.3837830750569866

**Final Evaluation (Can no longer edit parameters after this!)**

In [17]:
# Predict on the held-out test features. This is the final step, so no
# parameters should be changed after looking at these results.
y_final_test_pred = model.predict(X=X_test)

In [18]:
# Final reported metric: mean squared error on the held-out test set.
mean_squared_error(y_true=y_test, y_pred=y_final_test_pred)

2.254260083800517