In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("../08-Linear-Regression-Models/Advertising.csv")

In [4]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [5]:
X = df.drop("sales", axis=1)
y = df["sales"]

Steps:
- Convert to polynomial features
- Split into training and testing dataset (In our case training dataset consists of training + evaluation data)
- Fit/train scaler to the training dataset
- Scale both the training and testing dataset
- Perform cross-validation on the training + evaluation dataset (testing dataset would be the never-seen-before data or hold-out-test dataset)
- Train the model
- Check the performance metrics, if not satisfied with the metric, retrain the model on the training + evaluation dataset

In [6]:
from sklearn.preprocessing import PolynomialFeatures

In [7]:
poly_converter = PolynomialFeatures(degree=3, include_bias=False)

In [13]:
poly_features = poly_converter.fit_transform(X)

In [14]:
type(poly_features)

numpy.ndarray

In [15]:
poly_features.shape

(200, 19)

In [9]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.3, random_state=101)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [36]:
X_train.shape

(140, 3)

In [37]:
X_test.shape

(60, 3)

In [38]:
from sklearn.linear_model import Ridge

In [39]:
model = Ridge(alpha=100) #Purposefully using a model, that might perform poorly and then we will improve on that

In [40]:
from sklearn.model_selection import cross_val_score

cross_val_score() helps perform cross validation automatically, giving the test results according to a particular metric.
Useful for cases when the model doesn't have a CV variant (like Ridge-RidgeCV)

In [41]:
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error')

In [42]:
scores

array([-3.1574411 , -1.61190525, -5.37588672, -2.23984591, -4.3264032 ])

In [43]:
abs(scores.mean())

3.3422964358412406

In [44]:
#We are not very satisfied with the error, so we will update our model with a different alpha 
#and check its performance metrics
model_two = Ridge(alpha=1)

In [45]:
scores = cross_val_score(model_two, X_train, y_train, scoring= 'neg_mean_squared_error')

In [46]:
scores

array([-3.13950859, -1.62235574, -5.37385624, -2.24222086, -4.34151629])

In [47]:
abs(scores.mean())

3.3438915436537493