In [32]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

In [33]:
# Read the Ames housing CSV from the working directory and list its columns.
df_ames = pd.read_csv(filepath_or_buffer="AmesHousing.csv")
df_ames.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [34]:
# Pipeline 1: standardize living area and room count, then fit a linear regression.

ct1 = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",  # every other column is discarded
)

lr_pipeline1 = Pipeline(
    steps=[
        ("preprocessing", ct1),
        ("linear_regression", LinearRegression()),
    ]
)
# set_output returns the same pipeline object, so configuring it separately is equivalent.
lr_pipeline1.set_output(transform="pandas")

lr_pipeline1

In [35]:
# Pipeline 2: Pipeline 1's numeric features plus one-hot encoded building type.

ct_2 = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",  # every other column is discarded
)

lr_pipeline2 = Pipeline(
    steps=[
        ("preprocessing", ct_2),
        ("linear_regression", LinearRegression()),
    ]
)
# set_output returns the same pipeline object, so configuring it separately is equivalent.
lr_pipeline2.set_output(transform="pandas")

lr_pipeline2

In [53]:
# Pipeline 3: building-type dummies, standardized living area, and
# polynomial/interaction terms between each dummy and living area.

# Stage 1: one-hot encode "Bldg Type" and standardize "Gr Liv Area".
# drop="first" removes the first category ("1Fam"), which becomes the baseline,
# so only the 2fmCon / Duplex / Twnhs / TwnhsE dummy columns are produced.
# pandas output is required so stage 2 can select the generated column names.
ct_pre = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop="first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Stage 2: pair each remaining dummy with the standardized living area.
# There is deliberately no transformer for "dummify__Bldg Type_1Fam": that column
# is dropped in stage 1, and selecting it here makes the pipeline fail at fit time
# with a missing-column error.
# NOTE(review): interaction_only=False also emits a bias column and squared terms
# for every pair, so several output columns are duplicates — confirm whether
# interaction_only=True / include_bias=False was intended.
ct_inter = ColumnTransformer(
    [
      ("interaction2", PolynomialFeatures(interaction_only = False), ["dummify__Bldg Type_Twnhs", "standardize__Gr Liv Area"]),
      ("interaction3", PolynomialFeatures(interaction_only = False), ["dummify__Bldg Type_Duplex", "standardize__Gr Liv Area"]),
      ("interaction4", PolynomialFeatures(interaction_only = False), ["dummify__Bldg Type_2fmCon", "standardize__Gr Liv Area"]),
      ("interaction5", PolynomialFeatures(interaction_only = False), ["dummify__Bldg Type_TwnhsE", "standardize__Gr Liv Area"])
    ]
)

lr_pipeline3 = Pipeline(
  [("standardize", ct_pre),
   ("interaction", ct_inter),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

lr_pipeline3

In [52]:
# Preview the stage-1 output of Pipeline 3 (dummy columns + standardized living area).
# NOTE(review): X_train is only created by the train/test split cell further down,
# so this cell fails on a fresh top-to-bottom run — confirm the intended cell order.
ct_pre.fit_transform(X_train)

Unnamed: 0,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area
2552,0.0,0.0,0.0,0.0,-0.968329
903,0.0,0.0,0.0,0.0,-0.392246
1965,0.0,0.0,0.0,0.0,-0.503490
2032,0.0,0.0,0.0,0.0,1.586302
1821,0.0,0.0,0.0,0.0,-0.986208
...,...,...,...,...,...
1808,0.0,0.0,0.0,0.0,-0.797491
1822,0.0,0.0,0.0,0.0,-0.745842
729,0.0,0.0,0.0,0.0,-0.126056
781,0.0,0.0,0.0,0.0,0.581136


In [37]:
# Pipeline 4: degree-5 polynomial expansions of living area and room count,
# plus one-hot encoded building type, feeding a linear regression.

ct_4 = ColumnTransformer(
    transformers=[
        ("degree1", PolynomialFeatures(degree=5), ["Gr Liv Area"]),
        ("degree2", PolynomialFeatures(degree=5), ["TotRms AbvGrd"]),
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ],
    remainder="drop",  # every other column is discarded
)

lr_pipeline4 = Pipeline(
    steps=[
        ("preprocessing", ct_4),
        ("linear_regression", LinearRegression()),
    ]
)
# set_output returns the same pipeline object, so configuring it separately is equivalent.
lr_pipeline4.set_output(transform="pandas")

In [38]:
# Separate the predictors from the target column.
X = df_ames.drop("SalePrice", axis = 1)
y = df_ames["SalePrice"]

# Hold out a test set. The seed is fixed so the split — and every RMSE computed
# from it — is reproducible across kernel restarts; without it each run draws a
# different random split and the models cannot be compared run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit 1: train Pipeline 1 on the training split and report its test RMSE.

lr_fitted_1 = lr_pipeline1.fit(X_train, y_train)
y_pred_1 = lr_fitted_1.predict(X_test)

root_mean_squared_error(y_test, y_pred_1)

53357.7045507084

In [39]:
# Fit 2: train Pipeline 2 on the training split and report its test RMSE.
lr_fitted_2 = lr_pipeline2.fit(X=X_train, y=y_train)
y_pred_2 = lr_fitted_2.predict(X=X_test)

root_mean_squared_error(y_true=y_test, y_pred=y_pred_2)

51698.64966645307

In [40]:
# Fit 3: train Pipeline 3 on the training split and report its test RMSE.
lr_fitted_3 = lr_pipeline3.fit(X=X_train, y=y_train)
y_pred_3 = lr_fitted_3.predict(X=X_test)

root_mean_squared_error(y_true=y_test, y_pred=y_pred_3)

52811.03188543954

In [41]:
# Fit 4: train Pipeline 4 on the training split and report its test RMSE.
lr_fitted_4 = lr_pipeline4.fit(X=X_train, y=y_train)
y_pred_4 = lr_fitted_4.predict(X=X_test)

root_mean_squared_error(y_true=y_test, y_pred=y_pred_4)

53996.43375127621

In [42]:
# 5-fold cross-validated R^2 for Pipeline 1 on the full data set.
scores_1 = cross_val_score(estimator=lr_pipeline1, X=X, y=y, scoring="r2", cv=5)
np.mean(scores_1)

0.504208752508862

In [43]:
# 5-fold cross-validated R^2 for Pipeline 2 on the full data set.
scores_2 = cross_val_score(estimator=lr_pipeline2, X=X, y=y, scoring="r2", cv=5)
np.mean(scores_2)

0.5334561732637108

In [44]:
# 5-fold cross-validated R^2 for Pipeline 3 on the full data set.
scores_3 = cross_val_score(estimator=lr_pipeline3, X=X, y=y, scoring="r2", cv=5)
np.mean(scores_3)

0.5458181828368385

In [45]:
# 5-fold cross-validated R^2 for Pipeline 4 on the full data set.
scores_4 = cross_val_score(estimator=lr_pipeline4, X=X, y=y, scoring="r2", cv=5)
np.mean(scores_4)

0.49270140429529236

In [46]:
# Grid search over the polynomial degree of living area and room count.
# GridSearchCV is imported in the notebook's top import cell so that all
# dependencies are declared in one place.

# One-hot building type plus polynomial expansions whose degrees are tuned below.
ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ("poly_size", PolynomialFeatures(), ["Gr Liv Area"]),
    ("poly_rooms", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder="drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
   ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

# Parameter keys follow <pipeline step>__<transformer name>__<parameter>;
# np.arange(1, 10) tries degrees 1 through 9 for each feature (81 combinations).
degrees = {
  'preprocessing__poly_size__degree': np.arange(1, 10),
  'preprocessing__poly_rooms__degree': np.arange(1, 10)
}

# Score every combination with 5-fold cross-validated R^2.
gscv = GridSearchCV(lr_pipeline_poly, degrees, cv=5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

In [47]:
# Tabulate each degree combination with its mean cross-validated score, best first.
cv_results = gscv_fitted.cv_results_

params_df = pd.DataFrame(cv_results["params"])
results_df = params_df.assign(scores=cv_results["mean_test_score"])

results_df.sort_values(by="scores", ascending=False)

Unnamed: 0,preprocessing__poly_rooms__degree,preprocessing__poly_size__degree,scores
2,1,3,0.557641
30,4,4,0.556932
11,2,3,0.556857
39,5,4,0.556414
20,3,3,0.554039
...,...,...,...
53,6,9,-4.545604
44,5,9,-4.545604
35,4,9,-4.545604
26,3,9,-4.545604
