In [77]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

In [67]:
# Read the data
ames = pd.read_csv("AmesHousing.csv")

Consider four possible models for predicting house prices:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [68]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [69]:
# Model 1: Size and Number of Rooms
ct1 = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct1),
  ("linear_regression", LinearRegression())]
)

lr_fit = lr_pipeline.fit(X_train, y_train)

# Predictions for train and test sets
y_train_pred = lr_fit.predict(X_train)
y_test_pred = lr_fit.predict(X_test)

# R-squared
r2_1 = r2_score(y_test, y_test_pred)

# Model Coefficients
int1 = lr_fit.named_steps['linear_regression'].intercept_
coeff1 = lr_fit.named_steps['linear_regression'].coef_

# Cross-Validation Score
crossval1 = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring="neg_mean_squared_error")

# Root Mean Squared Error
rmse1 = np.sqrt(-crossval1) # RMSE for the cross-validation results
mse1 = mean_squared_error(y_test, y_test_pred) # Mean squared error (MSE) for the test predictions

print("R_Squared:", r2_1)
print("Cross Validation RMSE:", rmse1.mean())
print("MSE:", mse1)
print("Intercept:", int1)
print("Coefficients:", coeff1)

R_Squared: 0.5462656115392046
Cross Validation RMSE: 57127.553127324696
MSE: 2601545432.045199
Intercept: 182376.85116067363
Coefficients: [ 70978.10104756 -17646.21136933]


In [70]:
# Model 2: Size, Number of Rooms, and Building Type
ct2 = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd",]),
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"])
  ],
  remainder = "drop"
)

lr_pipeline = Pipeline(
  [("preprocessing", ct2),
  ("linear_regression", LinearRegression())]
)

lr_fit = lr_pipeline.fit(X_train, y_train)

# Predictions for train and test sets
y_train_pred = lr_fit.predict(X_train)
y_test_pred = lr_fit.predict(X_test)

# R-squared
r2_2 = r2_score(y_test, y_test_pred)

# Model Coefficients
int2 = lr_fit.named_steps['linear_regression'].intercept_
coeff2 = lr_fit.named_steps['linear_regression'].coef_

# Cross-Validation Score
crossval2 = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring="neg_mean_squared_error")

# Root Mean Squared Error
rmse2 = np.sqrt(-crossval2) # RMSE for the cross-validation results
mse2 = mean_squared_error(y_test, y_test_pred) # Mean squared error (MSE) for the test predictions

print("R_Squared:", r2_2)
print("Cross Validation RMSE:", rmse2.mean())
print("MSE:", mse2)
print("Intercept:", int2)
print("Coefficients:", coeff2)

R_Squared: 0.5653807332423344
Cross Validation RMSE: 55118.33275053743
MSE: 2491946382.8779964
Intercept: 185036.17639149373
Coefficients: [ 65162.44810526  -9225.93940763 -59132.28077904 -57584.40038573
 -23458.39787201  20735.88964407]


In [71]:
# Model 3: Size, Building Type, and their interaction
ct3 = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area"]),
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"])
  ],
  remainder = "drop"
).set_output(transform="pandas")

X_train_dummified = ct3.fit_transform(X_train)
X_train_dummified

ct_inter = ColumnTransformer(
  [
    ("interaction1", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_TwnhsE"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
  [("preprocessing", ct3),
   ("interactions", ct_inter),
  ("linear_regression", LinearRegression())]
)

lr_fit = lr_pipeline.fit(X_train, y_train)

# Predictions for train and test sets
y_train_pred = lr_fit.predict(X_train)
y_test_pred = lr_fit.predict(X_test)

# R-squared
r2_3 = r2_score(y_test, y_test_pred)

# Model Coefficients
int3 = lr_fit.named_steps['linear_regression'].intercept_
coeff3 = lr_fit.named_steps['linear_regression'].coef_

# Cross-Validation Score
crossval3 = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring="neg_mean_squared_error")

# Root Mean Squared Error
rmse3 = np.sqrt(-crossval3) # RMSE for the cross-validation results
mse3 = mean_squared_error(y_test, y_test_pred) # Mean squared error (MSE) for the test predictions

print("R_Squared:", r2_3)
print("Cross Validation RMSE:", rmse3.mean())
print("MSE:", mse3)
print("Intercept:", int3)
print("Coefficients:", coeff3)

R_Squared: 0.5741377594565966
Cross Validation RMSE: 54427.9649756646
MSE: 2441736828.2896905
Intercept: 184841.08687937906
Coefficients: [ 0.00000000e+00  1.50919608e+04 -6.13481054e+04 -4.94532565e+04
  3.63797881e-12  1.50919608e+04 -5.19142094e+04 -3.94021986e+04
  1.45519152e-11  1.50919608e+04 -2.73567176e+04 -1.54682842e+04
 -9.40395481e-38  1.50919608e+04  3.07319705e+04  1.45617200e+04]


In [72]:
# Model 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type
ct4 = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ('polynomial_features', PolynomialFeatures(degree=5, include_bias=False), ["Gr Liv Area", "TotRms AbvGrd"]),
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"])
  ],
  remainder = "drop"
)

lr_pipeline = Pipeline(
  [("preprocessing", ct4),
  ("linear_regression", LinearRegression())]
)

lr_fit = lr_pipeline.fit(X_train, y_train)

# Predictions for train and test sets
y_train_pred = lr_fit.predict(X_train)
y_test_pred = lr_fit.predict(X_test)

# R-squared
r2_4 = r2_score(y_test, y_test_pred)

# Model Coefficients
int4 = lr_fit.named_steps['linear_regression'].intercept_
coeff4 = lr_fit.named_steps['linear_regression'].coef_

# Cross-Validation Score
crossval4 = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring="neg_mean_squared_error")

# Root Mean Squared Error
rmse4 = np.sqrt(-crossval4) # RMSE for the cross-validation results
mse4 = mean_squared_error(y_test, y_test_pred) # Mean squared error (MSE) for the test predictions

print("R_Squared:", r2_4)
print("Cross Validation RMSE:", rmse4.mean())
print("MSE:", mse4)
print("Intercept:", int4)
print("Coefficients:", coeff4)

R_Squared: 0.5351398532643997
Cross Validation RMSE: 97778.28583319395
MSE: 2665336421.5623155
Intercept: 40326.725745282194
Coefficients: [-6.17214385e-02  3.01820755e-02 -3.24740043e-01 -9.65218713e-03
  2.49708881e-01 -8.13475273e+00 -1.03420760e-02 -1.56725694e-04
  1.12112109e-02 -3.82087914e+00  3.75390318e-01  2.47314392e-08
  1.21193134e-05 -4.65807894e-03  9.28799547e-01  3.64300372e+00
 -1.18941456e-11  1.84101404e-08 -1.36727458e-05  4.58344883e-03
 -7.07811324e-01  3.68393427e+01  6.19741582e-04 -4.58871667e-03
  1.00438138e-03  3.84696452e-03]


In [73]:
table = {"Model": ["Model 1", "Model 2","Model 3","Model 4"],
         "RMSE": [np.sqrt(mse1), np.sqrt(mse2), np.sqrt(mse3), np.sqrt(mse4)]}

pd.DataFrame(table)

Unnamed: 0,Model,RMSE
0,Model 1,51005.347093
1,Model 2,49919.398863
2,Model 3,49413.933544
3,Model 4,51626.8963


Model 3 performed the best because it has the lowest RMSE of 49413.933544. Lower RMSE values indicate better model performance, as they suggest that the model’s predictions are closer to the actual values.

Once again consider four modeling options for house price:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [75]:
table = {"Model": ["Model 1", "Model 2","Model 3","Model 4"],
         "Cross Validation RMSE": [rmse1.mean(), rmse2.mean(), rmse3.mean(), rmse4.mean()]}

pd.DataFrame(table)

Unnamed: 0,Model,Cross Validation RMSE
0,Model 1,57127.553127
1,Model 2,55118.332751
2,Model 3,54427.964976
3,Model 4,97778.285833



Looking at the cross-validated RMSE values in this table, Model 3 once again performs the best, with the lowest cross-validated RMSE of 54427.964976. This result is consistent with the previous conclusion, where Model 3 had the lowest RMSE, indicating that it is the preferred model among the four options.

Consider one hundred modeling options for house price:

- House size, trying degrees 1 through 10
- Number of rooms, trying degrees 1 through 10
- Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [84]:
ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"]),
    ("polynomial_area", PolynomialFeatures(), ["Gr Liv Area"]),
    ("polynomial_rooms", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial_area__degree': np.arange(1, 11),
           'preprocessing__polynomial_rooms__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [87]:
gscv_fitted = gscv.fit(X, y)
# gscv_fitted.cv_results_

results_df = pd.DataFrame({
    "degree_area": gscv_fitted.cv_results_['param_preprocessing__polynomial_area__degree'],
    "degree_rooms": gscv_fitted.cv_results_['param_preprocessing__polynomial_rooms__degree'],
    "scores": gscv_fitted.cv_results_['mean_test_score']
})

results_df.sort_values(by="scores", ascending=False)

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,degree_area,degree_rooms,scores
20,3,1,0.557641
21,3,2,0.556857
33,4,4,0.556690
34,4,5,0.556407
36,4,7,0.554103
...,...,...,...
90,10,1,-16.188147
99,10,10,-16.190863
9,1,10,-184.221196
19,2,10,-189.473606


Q1: The model with the highest score has an area degree of 3 and room degree of 1, with a score of approximately 0.556741. This suggests that the best-performing model used a low polynomial degree for both Gr Liv Area and TotRms AbvGrd. Higher or lower degrees may not have captured the relationship as effectively for this dataset.

Q2: Testing every possible combination of polynomial degrees for both features is computationally demanding and time-consuming. This approach can also increase the chance of overfitting, where the model fits the training data too closely but doesn’t generalize well to new data. To simplify, we could choose a smaller set of degrees to test.