In [2]:
from IPython.display import display, Math, Latex
# This is imported for proper rendering of Latex in Notebook.

In [3]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Seed NumPy's global random number generator.
np.random.seed(306)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# later releases dropped the old names, so plt.style.use('seaborn') fails
# there. Use the renamed style when it exists, else fall back to the legacy name.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Graded Assignment

In [7]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California housing data as pandas objects (feature frame, target series).
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)

# Hold out 20% of the rows for testing. shuffle=False splits the rows in their
# stored order; NOTE(review): random_state presumably has no effect without
# shuffling -- confirm against the train_test_split documentation.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    shuffle=False,
    random_state=0,
)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Standardise the features, then fit a linear regression on the scaled data.
lin_reg_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('lin_reg', LinearRegression()),
])
lin_reg_pipe.fit(X_train, y_train)

# score() on the held-out split; this is the same value as the R2-score.
lin_reg_pipe.score(X_test, y_test)

0.660514059153199

In [17]:
from sklearn.metrics import r2_score

# R2 computed explicitly from the pipeline's predictions on the test split.
r2_score(y_true=y_test, y_pred=lin_reg_pipe.predict(X_test))

0.660514059153199

In [16]:
from sklearn.metrics import mean_squared_error

# Root mean squared error on the test split: sqrt of the mean squared error.
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=lin_reg_pipe.predict(X_test))
)
rmse

0.7033383507521879

In [19]:
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error

# Predict once on the test split, then print each metric on its own line in
# this order: explained variance, max error, mean absolute error, mean squared error.
y_pred = lin_reg_pipe.predict(X_test)
for metric in (explained_variance_score, max_error, mean_absolute_error, mean_squared_error):
    print(metric(y_test, y_pred))

0.6605500501742702
7.260453292958445
0.5168526993787042
0.4946848356388077


In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# Feature scaling followed by SGD regression with default parameters and random_state=0.
sgd_reg_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('sgd', SGDRegressor(random_state=0)),
])

# fit() stays the final expression of the cell so the fitted pipeline is displayed.
sgd_reg_pipe.fit(X_train, y_train)

Pipeline(steps=[('feature_scaler', StandardScaler()),
                ('sgd', SGDRegressor(random_state=0))])

In [27]:
# Intercept learned by the pipeline's final step, looked up by its name 'sgd'.
sgd_reg_pipe.named_steps['sgd'].intercept_

array([2.01123921])

In [28]:
# Coefficients learned by the pipeline's final step, looked up by its name 'sgd'.
sgd_reg_pipe.named_steps['sgd'].coef_

array([ 0.84046697,  0.112331  , -0.41213039,  0.21595971, -0.01781887,
       -0.01480892, -0.87394103, -0.83913104])

# Solve with Instructors

## Task 1

In [29]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Reload California housing and make a 70/30 train/test split seeded with 42.
# Note: this rebinds X_train / X_test / y_train / y_test used by the cells above.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Task 2

In [30]:
from sklearn.linear_model import LinearRegression

# Linear regression on the features as loaded (no scaling step);
# fit() returns the estimator, so construction and fitting are chained.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg.score(X_test, y_test)  # R2-score on the test split

0.5957702326061665

## Task 3

In [31]:
from sklearn.datasets import load_diabetes

# Switch to the diabetes dataset and make a 60/40 train/test split seeded with 42.
# Note: this rebinds X, y and the train/test variables used by later cells.
X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

## Task 4

In [34]:
# Linear regression with an intercept term (fit_intercept=True is the default;
# it is spelled out here to contrast with the no-intercept model).
lin_reg = LinearRegression(fit_intercept=True).fit(X_train, y_train)
lin_reg.score(X_test, y_test)

0.5157444756897698

## Task 5

In [35]:
# Same model with the intercept term disabled, scored on the same test split.
lin_reg_without_int = LinearRegression(fit_intercept=False).fit(X_train, y_train)
lin_reg_without_int.score(X_test, y_test)

-3.7861093338014173

## Task 6

In [37]:
# Print the fitted intercept, then the coefficient vector, one per line.
print(lin_reg.intercept_, lin_reg.coef_, sep='\n')

148.92850899668235
[  18.08383103 -227.04654841  592.2754776   361.54657801 -655.89624143
  353.71022539   14.40233952  142.86622578  594.01401521   31.67348554]


## Task 7

In [39]:
from sklearn.linear_model import SGDRegressor

# SGD regressor with all default parameters, trained without a scaling step.
# No random_state is passed here.
sgd = SGDRegressor().fit(X_train, y_train)
sgd.score(X_test, y_test)



0.405686919691702

## Task 8

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# Default SGD regressor, this time preceded by feature standardisation.
sgd_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('sgd', SGDRegressor()),
])
sgd_pipe.fit(X_train, y_train)

# Score on the held-out split.
sgd_pipe.score(X_test, y_test)

0.5073462227456428

## Task 9

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# Scaled SGD regression with the iteration cap set to max_iter=10000.
# Rebinds sgd_pipe from the previous task.
sgd_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('sgd', SGDRegressor(max_iter=10000)),
])
sgd_pipe.fit(X_train, y_train)

# Score on the held-out split.
sgd_pipe.score(X_test, y_test)

0.5096235269665215

## Task 10

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# Scaled SGD regression using the 'l1' penalty.
# Rebinds sgd_pipe from the previous task.
sgd_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('sgd', SGDRegressor(penalty='l1')),
])
sgd_pipe.fit(X_train, y_train)

# Score on the held-out split.
sgd_pipe.score(X_test, y_test)

0.5097342543524186

## Task 11

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# Scaled SGD regression using the 'adaptive' learning-rate schedule.
# Rebinds sgd_pipe from the previous task.
sgd_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('sgd', SGDRegressor(learning_rate='adaptive')),
])
sgd_pipe.fit(X_train, y_train)

# Score on the held-out split.
sgd_pipe.score(X_test, y_test)

0.5113770407400162

## Task 12

### Ridge Regression

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Feature standardisation followed by Ridge regression (random_state=42,
# all other parameters left at their defaults).
ridge_reg_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('ridge', Ridge(random_state=42)),
])
ridge_reg_pipe.fit(X_train, y_train)

# Score on the held-out split.
ridge_reg_pipe.score(X_test, y_test)

0.5141871981733375

## Task 13

### Lasso Regression

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

# Feature standardisation followed by Lasso regression (random_state=42,
# all other parameters left at their defaults).
lasso_reg_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('lasso', Lasso(random_state=42)),
])
lasso_reg_pipe.fit(X_train, y_train)

# Score on the held-out split.
lasso_reg_pipe.score(X_test, y_test)

0.5084595713084391

## Task 14

### Polynomial Regression

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Degree-5 polynomial feature expansion followed by linear regression.
# NOTE(review): fit_intercept=False is presumably chosen because
# PolynomialFeatures already emits a constant column by default -- confirm.
poly_reg_pipe = Pipeline(steps=[
    ('poly_feature', PolynomialFeatures(degree=5)),
    ('poly_reg', LinearRegression(fit_intercept=False)),
])
poly_reg_pipe.fit(X_train, y_train)

# Test-set score first, then the fitted coefficients of the final step.
print(poly_reg_pipe.score(X_test, y_test))
print(poly_reg_pipe.named_steps['poly_reg'].coef_)

-20886.845250992857
[ 1.36183666e-08 -1.16767715e-10 -9.68600385e-11 ...  3.21440937e-09
 -7.12510981e-09 -2.55390471e-09]


## Task 15

### Ridge CV

`RidgeCV` performs cross-validation internally to choose `alpha`; by default (`cv=None`) it uses an efficient Leave-One-Out cross-validation.

In [54]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Back to California housing with an 80/20 train/test split seeded with 42.
# Note: this rebinds X, y and the train/test variables for the CV tasks below.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [65]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler

# Candidate regularisation strengths for RidgeCV to choose between.
reg_rates = [0.002, 0.02, 0.2, 0.5, 1, 10]

# Standardise the features, then let RidgeCV select alpha from reg_rates.
ridge_cv_pipe = Pipeline(steps=[
    ('feature_scaler', StandardScaler()),
    ('ridge_cv', RidgeCV(alphas=reg_rates)),
])
ridge_cv_pipe.fit(X_train, y_train)

# Test-set score, followed by the alpha that was selected during fitting.
print(ridge_cv_pipe.score(X_test, y_test))
print("Best alpha: ", ridge_cv_pipe.named_steps['ridge_cv'].alpha_)

0.5758157428902873
Best alpha:  1.0


## Task 16

###  LassoCV

`LassoCV` performs cross-validation internally to choose `alpha`; by default (`cv=None`) it uses 5-fold cross-validation (not Leave-One-Out).

In [66]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Candidate regularisation strengths for LassoCV to choose between.
reg_rates = [0.002, 0.02, 0.2, 0.5, 1, 10]

# Standardise the features, then let LassoCV select alpha from reg_rates.
# Only `alphas` is passed: when an explicit grid is supplied, `n_alphas` is
# not used to build the grid, and recent scikit-learn releases deprecate the
# parameter, so it has been dropped. The fitted model is unchanged.
lasso_cv_pipe = Pipeline([
    ('feature_scaler', StandardScaler()),
    ('lasso_cv', LassoCV(alphas=reg_rates)),
])

lasso_cv_pipe.fit(X_train, y_train)

# Test-set score, followed by the alpha that was selected during fitting.
print(lasso_cv_pipe.score(X_test, y_test))
print("Best alpha: ", lasso_cv_pipe[-1].alpha_)

0.5778171692208816
Best alpha:  0.002


## Task 17

In [70]:
from sklearn.metrics import mean_squared_error

# Hand-checkable example: the squared errors are 2.25, 4, 9 and 9, so the mean is 6.0625.
# Note: this rebinds y_pred, which earlier held the pipeline's test predictions.
y_true = [1, 2, -1, 5]
y_pred = [2.5, 0.0, 2, 8]

mean_squared_error(y_true=y_true, y_pred=y_pred)

6.0625