In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the advertising dataset; the path is relative to this notebook's folder.
df = pd.read_csv(
    filepath_or_buffer='../08-Linear-Regression-Models/Advertising.csv',
)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
# Separate the target column from the predictors: `y` is the "sales" series,
# `X` is every remaining column.
y = df["sales"]
X = df.drop(labels="sales", axis='columns')

In [5]:
# Confirm that the target column is no longer among the features.
X.head(n=5)

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [6]:
#Scaling the features
from sklearn.preprocessing import StandardScaler

In [7]:
# Unfitted scaler; it is fitted on the training split only, after the
# train/test split has been made.
scaler = StandardScaler()

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [10]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to the test split so the test rows never influence
# the scaler.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
#Rough random model
from sklearn.linear_model import Ridge

In [12]:
# First attempt: ridge regression with a large, arbitrarily chosen alpha.
model = Ridge(alpha=100)

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
# Cross-validate on the training split only (the test split stays untouched).
# The scorer is the negated MSE, so every value is <= 0.
# Note: X_train was scaled before this call, so each validation fold also
# contributed to the scaling statistics.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
)

In [15]:
# Per-fold negated MSE values returned by cross_val_score.
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [16]:
# Average the per-fold scores and drop the sign to read the result as a plain MSE.
abs(np.mean(scores))

8.215396464543606

As we can see above, the score is not great, so the model needs to be improved.

In [17]:
# Second attempt: same estimator with a much smaller alpha.
model_two = Ridge(alpha=1)

In [18]:
# Same cross-validation as before, now for the alpha=1 model; still run on
# the training split only.
scores = cross_val_score(
    estimator=model_two,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
)

In [19]:
# Per-fold negated MSE values for the alpha=1 model.
scores

array([-3.15513238, -1.58086982, -5.40455562, -2.21654481, -4.36709384])

In [20]:
# Mean cross-validated MSE for the alpha=1 model (sign dropped).
abs(np.mean(scores))

3.344839296530695

As we can see, the score has improved considerably after _updating_ our alpha value <br>
Now we will use the test dataset for reporting purposes

In [21]:
# Refit the chosen model on the entire training split before scoring it on
# the held-out test data.
model_two.fit(X=X_train, y=y_train)

Ridge(alpha=1)

In [22]:
# Predict on the scaled test features, which took no part in fitting.
test_predictions = model_two.predict(X=X_test)

In [23]:
from sklearn.metrics import mean_squared_error

In [24]:
# Mean squared error between the held-out targets and the model's predictions.
MSE = mean_squared_error(y_true=y_test, y_pred=test_predictions)

In [25]:
# Test-set MSE of the alpha=1 model.
MSE

2.3190215794287514

2.32 is our final reported error (the test-set MSE), since it was computed on data that the model has never seen before

In [26]:
from sklearn.model_selection import cross_validate

In [31]:
# Start again from the alpha=100 ridge model, this time to evaluate it with
# cross_validate.
model = Ridge(alpha=100)

In [35]:
# Ten-fold cross-validation on the training split, scoring two metrics in a
# single call. The returned dict is displayed as the cell output.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring=["neg_mean_squared_error", "neg_median_absolute_error"],
    cv=10,
)
scores

{'fit_time': array([0.00228786, 0.00254273, 0.00149202, 0.00195503, 0.00123501,
        0.00060105, 0.00047183, 0.00048614, 0.00049615, 0.00049877]),
 'score_time': array([0.00149512, 0.00127792, 0.00086021, 0.00089812, 0.00044703,
        0.00037193, 0.00037694, 0.00039291, 0.00040483, 0.00038624]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_median_absolute_error': array([-0.95405044, -1.43720847, -0.98266339, -1.57156859, -2.27809665,
        -1.78619462, -0.98052783, -2.03771141, -1.95608215, -1.53149549])}

In [36]:
# Tabulate the cross_validate result: one row per fold, one column per key.
scores_df = pd.DataFrame(data=scores)

In [37]:
# Per-fold timings and scores for the alpha=100 model.
scores_df

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_median_absolute_error
0,0.002288,0.001495,-6.060671,-0.95405
1,0.002543,0.001278,-10.627031,-1.437208
2,0.001492,0.00086,-3.993426,-0.982663
3,0.001955,0.000898,-5.009494,-1.571569
4,0.001235,0.000447,-9.1418,-2.278097
5,0.000601,0.000372,-13.086256,-1.786195
6,0.000472,0.000377,-3.839405,-0.980528
7,0.000486,0.000393,-9.058786,-2.037711
8,0.000496,0.000405,-9.055457,-1.956082
9,0.000499,0.000386,-5.778882,-1.531495


Here we can see the benefits of using `cross_validate()`. It allows us to:
- Use two error metrics at once
- Get the results for both metrics for every fold (i.e. each time the model is trained and tested) <br>
Also, we can see that the _error metrics_ are not very good

In [38]:
# Average each column over the folds.
scores_df.mean(axis="index")

fit_time                          0.001207
score_time                        0.000691
test_neg_mean_squared_error      -7.565121
test_neg_median_absolute_error   -1.551560
dtype: float64

In [40]:
# Revise the regularisation strength (lambda/alpha) of our model: drop it
# from 100 to 1.
model_two = Ridge(alpha=1)

In [41]:
# Repeat the ten-fold, two-metric cross-validation for the alpha=1 model.
scores = cross_validate(
    estimator=model_two,
    X=X_train,
    y=y_train,
    cv=10,
    scoring=["neg_mean_squared_error", "neg_median_absolute_error"],
)

In [43]:
# Turn the cross_validate result into a table. This rebinds `scores` from
# the raw dict to a DataFrame.
scores = pd.DataFrame(data=scores)

In [44]:
# Per-fold timings and scores for the alpha=1 model.
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_median_absolute_error
0,0.00235,0.001488,-2.962508,-1.384107
1,0.002168,0.001255,-3.057378,-1.544264
2,0.001126,0.000823,-2.17374,-1.195506
3,0.00131,0.000787,-0.833034,-0.813064
4,0.000937,0.0008,-3.464018,-1.131223
5,0.0011,0.000597,-8.232647,-0.438475
6,0.000631,0.000397,-1.905864,-0.962423
7,0.000479,0.000375,-2.765048,-0.943862
8,0.000468,0.000368,-4.989505,-1.177499
9,0.00046,0.000357,-2.846438,-0.616302


In [45]:
# Average each column over the folds for the alpha=1 model.
scores.mean(axis="index")

fit_time                          0.001103
score_time                        0.000725
test_neg_mean_squared_error      -3.323018
test_neg_median_absolute_error   -1.020673
dtype: float64

In [47]:
# Fit the alpha=1 model on the whole training split for the final evaluation.
model_two.fit(X=X_train, y=y_train)

Ridge(alpha=1)

In [48]:
# Predictions for the held-out test features.
test_predictions = model_two.predict(X=X_test)

In [49]:
# Mean squared error of the final model on the held-out test split.
MSE = mean_squared_error(
    y_true=y_test,
    y_pred=test_predictions,
)

In [51]:
MSE  # Test-set MSE: the error we report, computed on data held out from training

2.3190215794287514

In [52]:
# NOTE(review): lists every name in the interactive namespace; this looks
# like leftover debugging. Consider removing it before sharing the notebook.
dir()

['In',
 'MSE',
 'Out',
 'Ridge',
 'StandardScaler',
 'X',
 'X_test',
 'X_train',
 '_',
 '_15',
 '_16',
 '_19',
 '_20',
 '_21',
 '_25',
 '_28',
 '_3',
 '_30',
 '_32',
 '_34',
 '_35',
 '_37',
 '_38',
 '_42',
 '_44',
 '_45',
 '_47',
 '_5',
 '_50',
 '_51',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i18',
 '_i19',
 '_i2',
 '_i20',
 '_i21',
 '_i22',
 '_i23',
 '_i24',
 '_i25',
 '_i26',
 '_i27',
 '_i28',
 '_i29',
 '_i3',
 '_i30',
 '_i31',
 '_i32',
 '_i33',
 '_i34',
 '_i35',
 '_i36',
 '_i37',
 '_i38',
 '_i39',
 '_i4',
 '_i40',
 '_i41',
 '_i42',
 '_i43',
 '_i44',
 '_i45',
 '_i46',
 '_i47',
 '_i48',
 '_i49',
 '_i5',
 '_i50',
 '_i51',
 '_i52',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'cross_val_score',
 'cross_validate',
 'df',
 'exit',
 'get_ipython',
 'mean_squared_error',
 'model',
 'model_two',
 'np'