In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/housing.csv')

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression

In [4]:
lreg = LinearRegression()
X = df.drop('PRICE', axis=1)
y = df['PRICE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

In [5]:
cross_val_score(estimator=lreg, X=X_train, y=y_train, cv=10)

array([0.49945536, 0.62090992, 0.68073943, 0.66394911, 0.3934969 ,
       0.72625023, 0.77370567, 0.79414459, 0.83785026, 0.78381453])

In [6]:
lreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
alpha = 10000

In [8]:
np.abs(lreg.coef_).sum()

26.298529417494624

In [9]:
np.sum((y_train - lreg.predict(X_train))**2)

9089.097809228135

In [10]:
alpha*(lreg.coef_**2).sum()

3073893.8992837956

In [11]:
lreg.coef_

array([-1.06957244e-01,  4.45558546e-02,  2.37053116e-02,  2.23742166e+00,
       -1.68583938e+01,  3.87986730e+00,  5.46111202e-05, -1.37954394e+00,
        3.04718275e-01, -1.21659387e-02, -9.14169249e-01,  9.14771590e-03,
       -5.27828497e-01])

In [12]:
100*(3.5**2 - 2.5**2)

600.0

In [13]:
from sklearn.linear_model import Ridge, Lasso

In [14]:
ridge, lasso = Ridge(), Lasso()

In [15]:
ridge

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [16]:
ridge.alpha = 10000

In [17]:
ridge.fit(X_train, y_train)

Ridge(alpha=10000, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [18]:
ridge.coef_

array([-7.05534222e-02,  5.74457939e-02, -5.59530041e-02,  1.09558337e-02,
       -4.21703719e-04,  8.85516842e-02,  2.06349886e-02, -8.96007975e-02,
        1.19815324e-01, -1.37115961e-02, -1.43617695e-01,  8.35050952e-03,
       -4.74676136e-01])

In [19]:
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [20]:
lasso.coef_

array([-0.06092354,  0.04545304, -0.        ,  0.        , -0.        ,
        1.05001025,  0.02287044, -0.56125924,  0.25591408, -0.01475951,
       -0.69776051,  0.00842163, -0.76068136])

In [21]:
alphas = np.logspace(-3, 3, 7)

In [22]:
cv_scores = []

for alpha in alphas:
    ridge.set_params(alpha=alpha)
    scores = cross_val_score(estimator=ridge, X=X_train, y=y_train, cv=10)
    cv_scores.append((min(scores), alpha))

In [23]:
max(cv_scores)

(0.39347446425022914, 0.001)

In [24]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit_transform(X_train)

array([[-0.41174958, -0.48041482, -0.07436025, ...,  0.08173983,
         0.37599244, -0.44009701],
       [-0.42850904,  1.46175584, -1.10728653, ..., -1.47767444,
         0.32970292, -1.11493712],
       [-0.42044068, -0.48041482, -0.36762464, ...,  1.13663771,
         0.30584965,  0.21215431],
       ...,
       [-0.40058181, -0.48041482, -0.53809359, ...,  0.54039108,
         0.34647107, -0.11961855],
       [ 1.06773675, -0.48041482,  1.01057351, ...,  0.81558184,
         0.41460641,  1.84842981],
       [-0.42807611,  2.10914606, -1.36010066, ..., -0.05585555,
         0.36064133, -0.67727931]])

In [25]:
sc.transform(X_test)

array([[ 1.43961437, -0.48041482,  1.01057351, ...,  0.81558184,
        -2.15422575,  1.01546817],
       [-0.41193809,  0.46909083, -0.75768071, ...,  0.31106546,
         0.16946089, -0.34409465],
       [ 0.85174637, -0.48041482,  1.01057351, ...,  0.81558184,
        -0.27902787,  2.45409175],
       ...,
       [-0.41070171,  0.46909083, -0.75768071, ...,  0.31106546,
         0.12635968, -0.01232179],
       [ 3.28801671, -0.48041482,  1.01057351, ...,  0.81558184,
        -1.78095746,  1.05782215],
       [-0.43156555, -0.48041482,  0.40237496, ..., -0.92729293,
         0.36619135,  0.13026995]])

In [26]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(sc, ridge)

In [27]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge',
                 Ridge(alpha=1000.0, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=None,
                       solver='auto', tol=0.001))],
         verbose=False)

In [28]:
pipe.predict(X_test)

array([18.61264065, 23.89790735, 15.82072225, 28.75573419, 24.58108404,
       21.64373018, 25.77068315, 20.98322869, 17.58047018, 26.54314078,
       26.44676788, 18.5774517 , 24.03534795, 29.14314292, 17.03786231,
       22.40026083, 16.51113464, 24.27901034, 24.33089874, 20.40845662,
       20.0796294 , 27.94456949, 21.51710373, 26.56935474, 13.47772384,
       13.55908443, 24.76264396, 17.52142434, 27.26478773, 16.04421386,
       24.35386863, 26.65069337,  8.93148912, 23.99402339, 21.56455087,
       21.27218598, 22.69843435, 26.01496891, 20.6671905 , 19.93064201,
       24.21619577, 22.95094889, 18.48129859, 28.73518767, 23.5413658 ,
       20.55739241, 25.69356749, 17.02217011, 24.79941937, 28.51961884,
       27.91167077, 26.80403418, 21.51148124, 17.70720884, 21.04360456,
       20.38147824, 26.75038983, 16.77632644, 14.98816647, 26.99365064,
       26.16070614, 22.6521418 , 24.98754039, 31.35443201, 29.0090383 ,
       25.23111745, 20.87503055, 13.22944049, 16.59776031, 22.50

In [29]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [30]:
sc.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [31]:
sc.transform(X_test)

array([[ 1.43961437, -0.48041482,  1.01057351, ...,  0.81558184,
        -2.15422575,  1.01546817],
       [-0.41193809,  0.46909083, -0.75768071, ...,  0.31106546,
         0.16946089, -0.34409465],
       [ 0.85174637, -0.48041482,  1.01057351, ...,  0.81558184,
        -0.27902787,  2.45409175],
       ...,
       [-0.41070171,  0.46909083, -0.75768071, ...,  0.31106546,
         0.12635968, -0.01232179],
       [ 3.28801671, -0.48041482,  1.01057351, ...,  0.81558184,
        -1.78095746,  1.05782215],
       [-0.43156555, -0.48041482,  0.40237496, ..., -0.92729293,
         0.36619135,  0.13026995]])

In [32]:
sc.var_

array([5.91634448e+01, 5.36846651e+02, 4.79152135e+01, 6.45034801e-02,
       1.32392481e-02, 5.06456506e-01, 7.79255225e+02, 4.40503552e+00,
       7.60880551e+01, 2.88664528e+04, 4.75373346e+00, 7.17144695e+03,
       5.01711456e+01])

In [33]:
sc.mean_

array([3.36510626e+00, 1.11311881e+01, 1.11047277e+01, 6.93069307e-02,
       5.54256683e-01, 6.28837376e+00, 6.87851485e+01, 3.80309876e+00,
       9.55940594e+00, 4.06806931e+02, 1.84217822e+01, 3.61789307e+02,
       1.25872772e+01])

In [34]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(sc, LinearRegression())

In [37]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [38]:
pipe.predict(X_test)

array([20.00101515, 20.28791996, 11.01110241, 30.96363775, 25.10775846,
       18.29968819, 22.34584907, 25.53119837, 16.96133139, 27.41721052,
       31.32722599, 14.1892731 , 23.17566495, 40.16731102, 14.49409236,
       20.78246771, 13.28406375, 25.62698695, 20.16197915, 21.36932737,
       17.45976865, 31.45637476, 17.72944877, 27.24873378,  6.64787399,
        6.50259753, 25.69001089, 16.27186452, 32.9098371 , 14.64485892,
       26.59272781, 27.89402778, -4.19547078, 23.85639883, 19.20707293,
       19.71945987, 25.34683481, 20.98826507, 23.04293937, 13.90991899,
       26.31233834, 22.40006257,  9.0153142 , 34.52580383, 25.7370247 ,
       19.11759034, 20.45014258, 13.68798012, 24.80753665, 30.9571743 ,
       32.3927216 , 27.14775914, 19.97213546, 11.97152171, 16.39737432,
       18.36206545, 28.61296585, 13.62799241, 10.1407046 , 27.68208063,
       25.48464464, 22.4867408 , 27.68541058, 43.26026554, 35.70072891,
       25.47447112, 22.84682975,  8.01771253, 13.01416473, 18.80