<a href="https://colab.research.google.com/github/seanigens/IntroDataViz/blob/main/PracticeCh14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 14 practice: Ridge, LASSO, and Elastic Net regression on the Ames housing data

In [18]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.linear_model import ElasticNet

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so files
# stored on Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Read the Ames housing CSV from the mounted Drive folder into a DataFrame.
# NOTE(review): the path is specific to one Drive layout; anyone else running
# this notebook has to adjust `file_path`.
file_path = "/content/drive/My Drive/S544/AmesHousing.csv"
ames = pd.read_csv(file_path)

In [7]:

# Keep only the columns with fewer than 100 missing entries, then discard
# every remaining row that still contains a missing value.
good_cols = ames.isna().sum().lt(100)
ames = ames.loc[:, good_cols].dropna()

In [8]:
# Target is SalePrice; predictors are every other column except Order and PID.
y = ames["SalePrice"]
X = ames.drop(columns=["SalePrice", "Order", "PID"])

# Shared preprocessing: one-hot encode object-typed columns (categories not
# seen during fit are ignored at transform time) and standardize numeric
# columns. Any column matched by neither selector is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Baseline model: preprocessing followed by ordinary least squares.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

In [9]:
# Baseline: 5-fold cross-validated R^2 of the unregularized linear pipeline.
# NOTE(review): the recorded output shows R^2 on the order of -1e20 for every
# fold, so this model is unusable out of sample. Presumably the one-hot
# columns make the least-squares fit ill-conditioned -- confirm.
cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')

array([-4.59217770e+20, -1.21723181e+20, -8.37868327e+19, -9.71584476e+20,
       -3.78389822e+20])

In [10]:
# NOTE(review): this import would normally live in the notebook's import cell;
# it is kept here so this cell still runs if executed on its own.
from sklearn.model_selection import GridSearchCV

# Ridge regression on top of the shared preprocessing step. The pipeline is
# built once and reused for both the baseline CV score and the grid search
# (both clone the estimator internally, so sharing the object is safe).
ridge_pipeline = Pipeline(
    [("preprocessing", ct),
     ("ridge_regression", Ridge())]
)

# 5-fold cross-validated R^2 with Ridge's default settings.
cross_val_score_ridge = cross_val_score(ridge_pipeline, X, y, cv=5, scoring='r2')
print("Cross-validation scores for Ridge Regression:", cross_val_score_ridge)

# Tune the penalty strength alpha over a log-spaced grid, scored by 5-fold R^2.
param_grid = {
    'ridge_regression__alpha': [0.001, 0.01, 0.1, 1, 10]
}

search = GridSearchCV(ridge_pipeline, param_grid, n_jobs=-1, cv=5, scoring='r2')

search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# pipeline already refit on all of X, y.
best_ridge_pipeline = search.best_estimator_

Cross-validation scores for Ridge Regression: [0.8983804  0.91749817 0.79251623 0.78248152 0.91492025]
Best parameter (CV score=0.863):
{'ridge_regression__alpha': 10}


In [16]:
# LASSO regression on top of the shared preprocessing step. max_iter is raised
# to 10000 to give coordinate descent more iterations to converge.
lassopipe = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(max_iter=10000))
])

# 5-fold cross-validated R^2 with Lasso's default penalty.
cross_val_score_lasso = cross_val_score(lassopipe, X, y, cv=5, scoring='r2')
print("Cross-validation scores for LASSO Regression:", cross_val_score_lasso)

# Tune the penalty strength alpha over a log-spaced grid, scored by 5-fold R^2.
param_grid_lasso = {
    'lasso_regression__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

# Search over the pipeline defined in this cell. The previous code passed
# `lasso_pipeline`, a name that is never defined in this notebook, so the cell
# raised NameError on a fresh kernel.
search_lasso = GridSearchCV(lassopipe, param_grid_lasso, n_jobs=-1, cv=5, scoring='r2')
search_lasso.fit(X, y)
print("Best parameter for LASSO (CV score=%0.3f):" % search_lasso.best_score_)
print(search_lasso.best_params_)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# pipeline already refit on all of X, y.
best_lasso_pipe = search_lasso.best_estimator_

Cross-validation scores for LASSO Regression: [0.89890206 0.91021249 0.79360837 0.7703388  0.90647349]
Best parameter for LASSO (CV score=0.859):
{'lasso_regression__alpha': 10}


In [17]:
# Pull the fitted coefficient vectors out of the tuned pipelines.
# Both pipelines are GridSearchCV.best_estimator_ objects, which (with the
# default refit=True) are already fit on the full X, y -- calling .fit again
# would only repeat the same work, so no refit is done here.
ridge_coefs = best_ridge_pipeline.named_steps['ridge_regression'].coef_
lasso_coefs = best_lasso_pipe.named_steps['lasso_regression'].coef_

print("Ridge coefficients:", ridge_coefs)
print("LASSO coefficients:", lasso_coefs)

Ridge coefficients: [-4.85094592e+03  2.06638926e+03 -1.25886291e+03  4.44722578e+03
  2.34871909e+03 -2.75252531e+03 -5.83247363e+03  5.83247363e+03
  2.31233042e+03  7.32962737e+03 -1.24531787e+04  2.81122089e+03
 -1.11224443e+04  8.88328411e+03 -1.26747050e+03  3.50663067e+03
  3.01750926e+03 -2.59848635e+03 -4.19022913e+02 -8.41260699e+01
  7.59338485e+03 -5.73685935e+03 -2.31625871e+03  5.43859276e+02
 -8.82362352e+01  5.66136141e+03 -5.57312518e+03 -1.82924573e+03
  1.87016999e+03  5.99520029e+03 -7.67705746e+02 -4.10486548e+03
 -8.32881350e+03  9.09552956e+03 -1.56963025e+04 -1.13576156e+04
  1.15711997e+03  1.25635693e+04 -6.68088456e+03 -4.62573873e+02
  1.24778131e+03 -1.03711746e+04 -1.06702684e+04  3.52269485e+03
 -1.25655836e+04  2.72650861e+04  2.17555850e+04 -9.90799673e+03
 -7.00430668e+03 -6.27472964e+03 -8.75404865e+03  8.99835289e+03
  3.04140260e+04 -5.32982751e+03 -3.77917238e+03 -2.24802951e+03
 -4.10812621e+03  5.97024291e+03  7.99403362e+03  5.49655499e+03
 -5.8

In [21]:
# Elastic Net regression on top of the shared preprocessing step. max_iter is
# raised to 10000 to give coordinate descent more iterations to converge.
elastic_net_pipeline = Pipeline([
    ("preprocessing", ct),
    ("elastic_net", ElasticNet(max_iter=10000))
])

# Tune both the overall penalty strength (alpha) and the L1/L2 mix (l1_ratio).
param_grid_elastic_net = {
    'elastic_net__alpha': [0.001, 0.01, 0.1, 1, 10],
    'elastic_net__l1_ratio': [0.1, 0.5, 0.9]
}

search_elastic_net = GridSearchCV(
    elastic_net_pipeline, param_grid_elastic_net, n_jobs=-1, cv=5, scoring='r2'
)

search_elastic_net.fit(X, y)
print("Best parameters for Elastic Net (CV score=%0.3f):" % search_elastic_net.best_score_)
print(search_elastic_net.best_params_)

# With GridSearchCV's default refit=True, best_estimator_ is already refit on
# all of X, y, so the coefficients can be read directly without another .fit.
best_elastic_net_pipeline = search_elastic_net.best_estimator_
elastic_net_coefs = best_elastic_net_pipeline.named_steps['elastic_net'].coef_

# The tuned Ridge pipeline is likewise already fit; read its coefficients for
# a side-by-side comparison.
ridge_coefs = best_ridge_pipeline.named_steps['ridge_regression'].coef_

print("Elastic Net coefficients:", elastic_net_coefs)
print("Ridge coefficients:", ridge_coefs)


Best parameters for Elastic Net (CV score=0.863):
{'elastic_net__alpha': 0.01, 'elastic_net__l1_ratio': 0.5}
Elastic Net coefficients: [ -4297.52849666   2260.65344629   -937.59641866   3697.22499627
   2363.94559476  -3086.6991799   -5131.93180002   5131.9317928
   2045.53423427   6766.81936494 -11220.75460148   2406.40126694
 -11081.55222868   8782.06867369  -1051.63195755   3351.11553642
   2326.31282522  -1952.96165399   -372.35117077   -232.60691963
   7346.82081072  -5527.4619454   -2051.69507912    465.94318867
   -370.61900886   5209.13636769  -4837.51741914  -2115.91684916
   1384.16211148   4991.78356503    158.23040476  -3386.49274973
  -8023.47193689   9256.99765417 -14584.08898029 -10940.86882424
    649.46704936   9157.68872746  -5532.54156524   -357.49559918
   1118.02296466  -9453.12347675  -9737.58006139   2790.08361499
 -11637.85442529  25792.98547238  20645.16063874  -8632.41457012
  -5866.66494515  -5344.10382169  -8231.73338216   8207.96686361
  27848.62078689  -50