In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Bias vs Variance 
- The literal meaning of bias can be understood as the systematic deviation or error between the predictions made by a model and the true values of the target variable. A model with high bias tends to oversimplify the underlying patterns in the data, leading to underfitting. It fails to capture the complexity of the relationship between the features and the target variable, resulting in consistently inaccurate predictions across different datasets.

- On the other hand, variance refers to the variability or inconsistency in the model's predictions when trained on different subsets of the data. A model with high variance is sensitive to small changes in the training data, leading to overfitting. Such a model may capture noise or random fluctuations in the training set, resulting in poor performance when applied to unseen data.

In [2]:
df = pd.read_csv("Advertising.csv")

In [3]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
X = df.drop('sales', axis = 1)

In [5]:
y = df['sales']

In [6]:
from sklearn.preprocessing import PolynomialFeatures

In [7]:
polynomial_converter = PolynomialFeatures(degree = 3, include_bias = False )

In [8]:
poly_features = polynomial_converter.fit_transform(X)

In [9]:
X.shape

(200, 3)

In [11]:
poly_features.shape

(200, 19)

### Split the data into Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size = 0.3, random_state = 101)

### Scaling the Features using a StandardScaler

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()

In [19]:
scaler.fit(X_train)

In [20]:
X_train = scaler.transform(X_train)

- Note
    - We even scale the test data on the parameters(mean, standard deviation) obtained from the train data to maintain consistency in range of the parameters

In [21]:
X_test = scaler.transform(X_test)

In [22]:
X_train[0]

array([ 0.49300171, -0.33994238,  1.61586707,  0.28407363, -0.02568776,
        1.49677566, -0.59023161,  0.41659155,  1.6137853 ,  0.08057172,
       -0.05392229,  1.01524393, -0.36986163,  0.52457967,  1.48737034,
       -0.66096022, -0.16360242,  0.54694754,  1.37075536])

## Regularization

### Ridge Regression

### Important Note!
- Sklearn refers to lambda as alpha withing the class call
- For cross validation metrics, sklearn uses scorer object
- All scorer objects follow the convention that higher return values are better than lower return values.
- Higher accuracy is better but higher RMSE is actually worse so Scikit Learn fixes this by using a negative RMSE as its scorer metric


In [25]:
from sklearn.linear_model import Ridge

In [27]:
ridge_model = Ridge(alpha = 10)

In [28]:
ridge_model.fit(X_train, y_train)

In [29]:
test_predictions = ridge_model.predict(X_test)

In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [31]:
MAE = mean_absolute_error(y_test, test_predictions)

In [32]:
MAE

0.5774404204714176

In [33]:
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))

In [34]:
RMSE

0.8946386461319678

In [36]:
from sklearn.linear_model import RidgeCV
# Cross Validation

In [43]:
ridge_cv_model = RidgeCV(alphas = (0.1, 1.0, 10.0), scoring = 'neg_mean_absolute_error')
# You can mention the cv term here, if you dont mention it by default it performs LOOCV

### Holdout Method

In [44]:
ridge_cv_model.fit(X_train, y_train)
# Here we only using X_train for hyperparameter tuning so this is "Holdout Method"

In [45]:
ridge_cv_model.alpha_

0.1

### Choose a scorer

In [41]:
from sklearn.metrics import SCORERS

In [42]:
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'positive_likelihood_ratio', 'neg_negative_likelihood_ratio', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weig

### Metrics

In [46]:
test_predictions = ridge_cv_model.predict(X_test)

In [47]:
MAE = mean_absolute_error(y_test, test_predictions)

In [48]:
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))

In [49]:
MAE

0.42737748843648854

In [50]:
RMSE

0.618071992695613

In [52]:
ridge_cv_model.coef_

array([ 5.40769392,  0.5885865 ,  0.40390395, -6.18263924,  4.59607939,
       -1.18789654, -1.15200458,  0.57837796, -0.1261586 ,  2.5569777 ,
       -1.38900471,  0.86059434,  0.72219553, -0.26129256,  0.17870787,
        0.44353612, -0.21362436, -0.04622473, -0.06441449])

In [53]:
ridge_cv_model.best_score_
# Negative root mean error 
# Higher the better
# -8000 error model is good compared to -10000 error model

-0.3749223340292953

### Lasso Regression or L1 Regularization

In [54]:
from sklearn.linear_model import LassoCV

In [61]:
lasso_cv_model = LassoCV(eps = 0.001, n_alphas = 100, max_iter = 1000000)
# eps = ration of alpha_min to alpha max, n_alphas = number of alphas
# Use the above if dont mentiong alphas = (x, y, x) explicity then sklearn sets them automatically
# cv defautl is 5 fold cross
# max_iter for stochatic gradient descent
# if max_iters is low we get a convergence warning so increase max_iters or eps value

In [62]:
lasso_cv_model.fit(X_train, y_train)

In [63]:
lasso_cv_model.alpha_

0.004943070909225827

In [64]:
test_predictions = lasso_cv_model.predict(X_test)

### Metrics

In [65]:
MAE = mean_absolute_error(y_test, test_predictions)

In [66]:
RMSE = np.sqrt(mean_absolute_error(y_test, test_predictions))

In [67]:
MAE

0.4335034618590074

In [68]:
RMSE

0.6584097978151657

In [69]:
lasso_cv_model.coef_

array([ 4.86023329,  0.12544598,  0.20746872, -4.99250395,  4.38026519,
       -0.22977201, -0.        ,  0.07267717, -0.        ,  1.77780246,
       -0.69614918, -0.        ,  0.12044132, -0.        , -0.        ,
       -0.        ,  0.        ,  0.        , -0.        ])

## Elastic Net

In [70]:
from sklearn.linear_model import ElasticNetCV
# Read documentation

In [74]:
elastic_model = ElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1 ], eps = 0.001, n_alphas = 100, max_iter = 1000000)
# l1_ratio -> a float between 0 and 1 0 means complete L2 regularization and 1 means L1
# Go more towrards lasso 
# First take l1 ratio of 0.1 try eps = 0.001 with 100 different alphas for 1 million iteration

In [75]:
elastic_model.fit(X_train, y_train)

In [76]:
elastic_model.l1_ratio

[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]

In [78]:
elastic_model.l1_ratio_ # best ratio here as 1 disregarded ridge completely

1.0

In [79]:
elastic_model.alpha_

0.004943070909225827

In [80]:
lasso_cv_model.alpha_

0.004943070909225827

In [81]:
test_predictions = elastic_model.predict(X_test)

In [82]:
MAE = mean_absolute_error(y_test, test_predictions)

In [83]:
MAE

0.4335034618590074