In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the raw Ames housing table from the working directory.
ames = pd.read_csv("AmesHousing.csv")

# Flag the columns that have fewer than 100 missing entries; the rest are
# discarded wholesale rather than imputed.
good_cols = ames.isna().sum().lt(100)

# Keep the flagged columns, then drop every row that still contains a
# missing value so the modelling cells see a fully populated frame.
ames = ames.loc[:, good_cols].dropna()

FileNotFoundError: [Errno 2] No such file or directory: 'AmesHousing.csv'

In [None]:
# Predictors are every column except the target (`SalePrice`) and the
# `Order` / `PID` columns (presumably row identifiers - confirm).
X = ames.drop(columns=["SalePrice", "Order", "PID"])
y = ames["SalePrice"]

# Preprocessing: one-hot encode object-dtype columns and standardize numeric
# columns; anything matched by neither selector is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Baseline model: ridge regression with a fixed penalty strength of 1.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", Ridge(alpha=1)),
    ]
)

In [3]:
# 5-fold cross-validated R^2 of the current pipeline; the array of per-fold
# scores is the cell's displayed output.
cross_val_score(estimator=lr_pipeline_1, X=X, y=y, scoring="r2", cv=5)

NameError: name 'lr_pipeline_1' is not defined

In [None]:
# Preprocessing: one-hot encode object-dtype columns and standardize numeric
# columns; anything matched by neither selector is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Ridge model whose penalty strength is left at its default here and tuned
# by the grid search below.
lr_pipeline_1 = Pipeline(steps=[("preprocessing", ct), ("ridge", Ridge())])

# Candidate penalty strengths, addressed through the "ridge" pipeline step.
alphas = {"ridge__alpha": [0.001, 0.01, 0.1, 1, 10]}

# Exhaustive search over the candidates, scored by 5-fold cross-validated R^2.
gscv = GridSearchCV(estimator=lr_pipeline_1, param_grid=alphas, scoring="r2", cv=5)

gscv_fitted = gscv.fit(X, y)

In [4]:
# Summarize the most recently fitted grid search: one row per parameter
# combination, with its mean test score across the CV folds.
cv_results = gscv_fitted.cv_results_
params_df = pd.DataFrame(cv_results["params"])

results_df = params_df.assign(scores=cv_results["mean_test_score"])

# Best-scoring combination first; this sorted frame is the cell's output.
results_df.sort_values("scores", ascending=False)

NameError: name 'gscv_fitted' is not defined

In [None]:
# Finding coefficient of the best ridge regression

# Preprocessing for the coefficient fit: the encoder drops the first level of
# every categorical column; numeric columns are standardized and unmatched
# columns are passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# NOTE(review): alpha=10 is hard-coded - presumably the winner of the ridge
# grid search; confirm against the search results before relying on it.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge", Ridge(alpha=10)),
    ]
)

# Fit on the full data, then read the coefficients and the matching
# post-transformation feature names from the fitted steps.
pipeline_fit = lr_pipeline_1.fit(X, y)
fitted_steps = pipeline_fit.named_steps
ridge_coefficients = fitted_steps["ridge"].coef_
feature_names_ridge = fitted_steps["preprocessing"].get_feature_names_out()

# Long-format table (one row per feature) tagged with the model type.
ridge_df = pd.DataFrame(
    {
        "Feature": feature_names_ridge,
        "Coefficient": ridge_coefficients,
        "Type": "Ridge",
    }
)

In [5]:
# Preprocessing: one-hot encode object-dtype columns and standardize numeric
# columns; anything matched by neither selector is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Lasso model whose penalty strength is tuned by the grid search below.
lr_pipeline_1 = Pipeline(steps=[("preprocessing", ct), ("lasso", Lasso())])

# Candidate penalty strengths, addressed through the "lasso" pipeline step.
alphas = {"lasso__alpha": [0.001, 0.01, 0.1, 1, 10]}

# Exhaustive search over the candidates, scored by 5-fold cross-validated R^2.
gscv = GridSearchCV(estimator=lr_pipeline_1, param_grid=alphas, scoring="r2", cv=5)

gscv_fitted = gscv.fit(X, y)

NameError: name 'X' is not defined

In [None]:
# Tabulate the most recently fitted grid search: the tried parameter
# combinations alongside their mean test score across the CV folds.
cv_results = gscv_fitted.cv_results_
params_df = pd.DataFrame(cv_results["params"])

results_df = params_df.assign(scores=cv_results["mean_test_score"])

# Highest score on top; the sorted frame is what the cell displays.
results_df.sort_values("scores", ascending=False)

In [6]:
# Finding coefficient of the best lasso regression

# Preprocessing for the coefficient fit: the encoder drops the first level of
# every categorical column; numeric columns are standardized and unmatched
# columns are passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# NOTE(review): alpha=10 is hard-coded - presumably the winner of the lasso
# grid search; confirm against the search results before relying on it.
lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("lasso", Lasso(alpha=10)),
    ]
)

# Fit on the full data, then read the coefficients and the matching
# post-transformation feature names from the fitted steps.
pipeline_fit = lr_pipeline_1.fit(X, y)
fitted_steps = pipeline_fit.named_steps
lasso_coefficients = fitted_steps["lasso"].coef_
feature_names_lasso = fitted_steps["preprocessing"].get_feature_names_out()

# Long-format table (one row per feature) tagged with the model type.
lasso_df = pd.DataFrame(
    {
        "Feature": feature_names_lasso,
        "Coefficient": lasso_coefficients,
        "Type": "Lasso",
    }
)



NameError: name 'X' is not defined

In [None]:
# Preprocessing: one-hot encode object-dtype columns and standardize numeric
# columns; anything matched by neither selector is passed through unchanged.
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

# Elastic-net model; both the penalty strength and the L1/L2 mix are tuned
# by the grid search below.
lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("elastic_net", ElasticNet())]
)

# Search grid over the "elastic_net" pipeline step.
# l1_ratio is the L1 share of the penalty and must lie in [0, 1]; the last
# candidate was previously 10, which is outside that range and made every
# combination using it invalid. 1.0 (a pure L1 penalty) completes the
# 0.2-step sequence.
alphas = {
  'elastic_net__l1_ratio': [0.2, 0.4, 0.6, 0.8, 1.0],
  'elastic_net__alpha': [0.001, 0.01, 0.1, 1, 10]
}

# Exhaustive search over all 25 combinations, scored by 5-fold CV R^2.
gscv = GridSearchCV(lr_pipeline_1, alphas, cv=5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

In [7]:
# Collect the most recently fitted grid search into a table: each tried
# parameter combination paired with its mean test score across the CV folds.
cv_results = gscv_fitted.cv_results_
params_df = pd.DataFrame(cv_results["params"])

results_df = params_df.assign(scores=cv_results["mean_test_score"])

# Rank from best to worst; the ranked frame is the cell's displayed output.
results_df.sort_values("scores", ascending=False)

NameError: name 'gscv_fitted' is not defined

In [None]:
# Stack the lasso and ridge coefficient tables into one long frame for
# plotting. Both inputs carry their own default 0..n-1 index, so rebuild the
# index to avoid duplicated row labels in the combined frame.
df_coef = pd.concat([lasso_df, ridge_df], ignore_index=True)

In [8]:
from plotnine import ggplot, geom_bar, aes, labs, theme_classic, position_dodge

# Grouped bar chart: one bar per (feature, model type), with the bars of the
# two model types placed side by side for each feature.
coef_mapping = aes(x="Feature", y="Coefficient", fill="Type")
coef_bars = geom_bar(stat="identity", position=position_dodge(width=0.8))
coef_labels = labs(
    x="Feature",
    y="Coefficient",
    title="Comparison of Ridge and Lasso Coefficients",
)

# The composed plot is the last expression so the notebook renders it.
ggplot(df_coef, coef_mapping) + coef_bars + coef_labels + theme_classic()

NameError: name 'df_coef' is not defined