## Train | Test Split

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("../DATA/Advertising.csv")

In [4]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## Train | Test Split Procedure 

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Test Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Test Data (by creating predictions and comparing to Y_test)
7. Adjust Parameters as Necessary and repeat steps 5 and 6

In [5]:
X = df.drop("sales", axis=1)

In [6]:
y = df["sales"]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
#help(train_test_split)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
scaler = StandardScaler()

In [12]:
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
from sklearn.linear_model import Ridge

In [14]:
model = Ridge(alpha=100)

In [15]:
model.fit(X_train, y_train)

Ridge(alpha=100)

In [16]:
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import mean_squared_error

In [18]:
MSE = mean_squared_error(y_pred, y_test)

In [19]:
MSE

7.341775789034129

In [20]:
model_2 = Ridge(alpha=1)

In [21]:
model_2.fit(X_train, y_train)

Ridge(alpha=1)

In [22]:
y_pred_2 = model_2.predict(X_test)

In [23]:
MSE = mean_squared_error(y_pred_2, y_test)

In [24]:
MSE

2.3190215794287514

## Train | Validation | Test Split Procedure 

This is often also called a "hold-out" set, since you should not adjust parameters based on the final test set, but instead use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data in Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Eval Data
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to Y_eval)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [25]:
X = df.drop('sales',axis=1)
y = df['sales']

In [26]:
# First Split, 70% training, rest is other data
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.3, random_state=101)

In [27]:
# 50% split between each
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=101)

In [28]:
len(df)

200

In [29]:
len(X_train)

140

In [30]:
len(X_test)

30

In [31]:
len(X_eval)

30

In [32]:
scaler = StandardScaler()

In [33]:
scaler.fit(X_train)

StandardScaler()

In [34]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_eval = scaler.transform(X_eval)

In [35]:
model_1 = Ridge(alpha=100)

In [36]:
model_1.fit(X_train, y_train)

Ridge(alpha=100)

In [37]:
y_pred_eval = model_1.predict(X_eval)

In [38]:
mean_squared_error(y_pred_eval, y_eval)

7.320101458823872

In [39]:
model_2 = Ridge(alpha=1)

In [40]:
model_2.fit(X_train, y_train)

Ridge(alpha=1)

In [41]:
y_pred_eval = model_2.predict(X_eval)

In [42]:
mean_squared_error(y_pred_eval, y_eval)

2.3837830750569866

In [43]:
y_final_test_pred = model_2.predict(X_test)

In [44]:
mean_squared_error(y_final_test_pred, y_test)

2.254260083800517

### cross_val_score

In [45]:
X = df.drop("sales", axis=1)

In [46]:
y = df["sales"]

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [48]:
from sklearn.preprocessing import StandardScaler

In [49]:
scaler = StandardScaler()

In [50]:
scaler.fit(X_train)

StandardScaler()

In [51]:
X_train = scaler.transform(X_train)

In [52]:
X_test = scaler.transform(X_test)

In [53]:
model = Ridge(alpha=100)

In [54]:
from sklearn.model_selection import cross_val_score

In [56]:
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring="neg_mean_squared_error", cv=5)

In [59]:
abs(scores.mean())

8.215396464543607

In [60]:
model = Ridge(alpha=1)

In [61]:
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring="neg_mean_squared_error", cv=5)

In [62]:
abs(scores.mean())

3.344839296530695

In [63]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [64]:
y_final_test_pred = model.predict(X_test)

In [65]:
mean_squared_error(y_final_test_pred, y_test)

2.3190215794287514

### cross_validate

In [66]:
## CREATE X and y
X = df.drop('sales',axis=1)
y = df['sales']

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# SCALE DATA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [67]:
from sklearn.model_selection import cross_validate

In [68]:
model = Ridge(alpha=100)

In [69]:
scores = cross_validate(estimator=model, X=X_test, y=y_test,
                        scoring=["neg_mean_squared_error", "neg_mean_absolute_error"], cv=10) 

In [70]:
scores

{'fit_time': array([0.00166583, 0.0009439 , 0.00060391, 0.00064588, 0.00064373,
        0.00110412, 0.00108957, 0.00112295, 0.00102592, 0.00064778]),
 'score_time': array([0.00194025, 0.00055504, 0.0009141 , 0.000494  , 0.00049829,
        0.00094509, 0.00079632, 0.00103593, 0.00051713, 0.0004952 ]),
 'test_neg_mean_squared_error': array([ -7.09927168,  -8.74648466, -14.44004396, -41.55988543,
         -7.71037806, -10.62976235,  -6.71037379, -10.47242201,
         -7.4010742 , -23.45072882]),
 'test_neg_mean_absolute_error': array([-2.2211634 , -2.56774745, -3.18993129, -5.84175213, -2.35686313,
        -2.87820168, -2.22992141, -2.26995313, -1.75986084, -3.8666902 ])}

In [71]:
scores = pd.DataFrame(scores)

In [72]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001666,0.00194,-7.099272,-2.221163
1,0.000944,0.000555,-8.746485,-2.567747
2,0.000604,0.000914,-14.440044,-3.189931
3,0.000646,0.000494,-41.559885,-5.841752
4,0.000644,0.000498,-7.710378,-2.356863
5,0.001104,0.000945,-10.629762,-2.878202
6,0.00109,0.000796,-6.710374,-2.229921
7,0.001123,0.001036,-10.472422,-2.269953
8,0.001026,0.000517,-7.401074,-1.759861
9,0.000648,0.000495,-23.450729,-3.86669


In [73]:
scores.mean()

fit_time                         0.000949
score_time                       0.000819
test_neg_mean_squared_error    -13.822042
test_neg_mean_absolute_error    -2.918208
dtype: float64

In [74]:
model = Ridge(alpha=1)

In [75]:
scores = cross_validate(estimator=model, X=X_test, y=y_test,
                        scoring=["neg_mean_squared_error", "neg_mean_absolute_error"], cv=10) 

In [76]:
scores = pd.DataFrame(scores)

In [77]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001375,0.000931,-0.280557,-0.414609
1,0.001329,0.000931,-0.495824,-0.601341
2,0.001159,0.000843,-6.032891,-2.160291
3,0.000933,0.000564,-2.648814,-1.435727
4,0.000739,0.000489,-3.088623,-1.552444
5,0.000653,0.001103,-1.520116,-0.998847
6,0.001038,0.000983,-2.727232,-1.350901
7,0.001202,0.00072,-3.030771,-1.467638
8,0.00087,0.000495,-3.015139,-1.518798
9,0.000599,0.000491,-2.991273,-1.604029


In [78]:
scores.mean()

fit_time                        0.000990
score_time                      0.000755
test_neg_mean_squared_error    -2.583124
test_neg_mean_absolute_error   -1.310462
dtype: float64

In [79]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [80]:
y_final_pred = model.predict(X_test)

In [81]:
mean_squared_error(y_final_pred, y_test)

2.3190215794287514