# Practice Activity part1:

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [5]:
ames = pd.read_csv("https://www.dropbox.com/scl/fi/g0n5le5p6fr136ggetfsf/AmesHousing.csv?rlkey=jlr9xtz1o6u5rghfo29a5c02f&dl=1")
ames.head(1)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000


In [11]:
X = ames.drop("SalePrice", axis=1)
y = ames["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Model 1: Using only the size and number of rooms.

In [None]:
lr = LinearRegression()
preprocessor = ColumnTransformer(
    transformers=[
        ('standardize', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])
    ],
    remainder='drop'  # Dropping all other features
)

pipeline1 = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('linear_regression', LinearRegression())
])

In [None]:
fitted1= pipeline1.fit(X_train, y_train)

In [None]:
y_preds1 = fitted1.predict(X_test)

In [None]:
scores = cross_val_score(pipeline1, X, y, cv=5, scoring='neg_mean_squared_error')
scores

array([-3.79554999e+09, -2.93047119e+09, -3.47893144e+09, -3.17877892e+09,
       -2.29696300e+09])

In [None]:
abs(scores.mean())

3136138908.1709027

In [None]:
mean_squared_error(y_test, y_preds1)

3511950654.7013855

### Model 2: Using size, number of rooms, and building type.

In [None]:
lr = LinearRegression()

# Creating the ColumnTransformer with both transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(sparse_output = False), ['Bldg Type']),  # summies for categorical 'Bldg Type' feature
        # also always incldue space_ouput
        ('standardize', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])  # This will standardize the two numerical features
    ],
    remainder='drop'  # Dropping all other features not specified
)

pipeline2 = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('linear_regression', LinearRegression())
])

In [None]:
fitted2= pipeline2.fit(X_train, y_train)

In [None]:
y_preds2 = fitted2.predict(X_test)

In [None]:
scores2 = cross_val_score(pipeline2, X, y, cv=5, scoring='neg_mean_squared_error')
scores2

array([-3.53401650e+09, -2.67051679e+09, -3.31383955e+09, -2.95894166e+09,
       -2.26733719e+09])

In [None]:
abs(scores2.mean())

2948930339.0934515

In [None]:
mean_squared_error(y_test, y_preds2)

3261697263.9427013

### Model 3: Using size and building type, and their interaction.

method 2

In [None]:
ames["Bldg Type"].unique() # find the unique dummy variables names

array(['1Fam', 'TwnhsE', 'Twnhs', 'Duplex', '2fmCon'], dtype=object)

In [None]:
lr = LinearRegression()
# step 1
ct_dummies = ColumnTransformer(
    [("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])],
    remainder="passthrough"
).set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction1", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_1Fam"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction5", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_TwnhsE"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

In [None]:
# step 2 see which the column names and choose from here to do the interaction
ct_dummies.fit_transform(X_train)

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,remainder__Lot Frontage,...,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
844,1.0,0.0,0.0,0.0,0.0,845,907181100,20,RL,68.0,...,0,0,,,,0,9,2009,WD,Normal
2730,0.0,0.0,0.0,1.0,0.0,2731,905351045,150,RL,,...,200,0,,,,0,4,2006,WD,Normal
2793,1.0,0.0,0.0,0.0,0.0,2794,907255050,20,RL,,...,0,0,,,,0,8,2006,WD,Normal
1187,1.0,0.0,0.0,0.0,0.0,1188,534127230,20,RL,85.0,...,192,0,,GdWo,,0,10,2008,WD,Family
2770,1.0,0.0,0.0,0.0,0.0,2771,907130110,60,RL,65.0,...,0,0,,,,0,8,2006,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,0.0,1.0,0.0,0.0,0.0,1639,527226040,190,RL,195.0,...,0,0,,,,0,11,2007,WD,Normal
1095,1.0,0.0,0.0,0.0,0.0,1096,528290090,60,RL,,...,0,0,,,,0,4,2008,WD,Normal
1130,1.0,0.0,0.0,0.0,0.0,1131,528490080,60,FV,64.0,...,0,0,,,,0,5,2008,WD,Normal
1294,1.0,0.0,0.0,0.0,0.0,1295,902109160,50,RM,81.0,...,0,0,,,,0,6,2008,WD,Normal


In [None]:
pipeline3 = Pipeline([
    ('dummies', ct_dummies),
    ('interactions', ct_inter),
    ('linear_regression', LinearRegression())
])


In [None]:
fitted3= pipeline3.fit(X_train, y_train)

In [None]:
y_preds3 = fitted3.predict(X_test)

In [None]:
scores3 = cross_val_score(pipeline3, X, y, cv=5, scoring='neg_mean_squared_error')
scores3

array([-3.36072867e+09, -2.61379539e+09, -3.30925635e+09, -2.84668355e+09,
       -2.22567590e+09])

In [None]:
abs(scores3.mean())

2871227973.467162

In [None]:
fitted3.named_steps['linear_regression'].coef_

array([ 0.00000000e+00,  1.65218105e+01, -2.34340345e+04,  3.03618785e+01,
        1.82465456e-08,  1.65218105e+01,  6.18442598e+04, -6.20561601e+01,
       -1.45519152e-11,  1.65218106e+01,  3.49156268e+04, -4.01294236e+01,
       -4.74147401e-34,  1.65218106e+01, -2.63594541e+04,  1.90663253e+01,
        0.00000000e+00,  1.65218106e+01, -4.69663980e+04,  6.92791906e+01])

In [None]:
mean_squared_error(y_test, y_preds3)

3129905916.432637

### Model 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(sparse_output=False), ['Bldg Type']),
        ('poly_size', PolynomialFeatures(degree=5, include_bias=False), ['Gr Liv Area']),
        ('poly_rooms', PolynomialFeatures(degree=5, include_bias=False), ['TotRms AbvGrd'])
    ],
    remainder='drop'  # Dropping all other features not specified
)

# Creating the pipeline
pipeline4 = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('linear_regression', LinearRegression())
])

In [None]:
fitted4= pipeline4.fit(X_train, y_train)

In [None]:
y_preds4 = fitted4.predict(X_test)

In [None]:
scores4 = cross_val_score(pipeline4, X, y, cv=5, scoring='neg_mean_squared_error')
scores4

array([-3.77153906e+09, -2.86095538e+09, -3.96611639e+09, -3.06340817e+09,
       -2.33192306e+09])

In [None]:
abs(scores4.mean())

3198788413.3832846

In [None]:
mean_squared_error(y_test, y_preds4)

3661439409.8980007

Model 3 emerged as the top performer with the lowest mean squared error in our cross-validation, making it a solid choice. Since cross-validation evaluates the model on different data slices, we can be more confident about its consistency and real-world performance.

# Practice Activity part2:

In [35]:
ct_poly = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(sparse_output=False), ['Bldg Type']),
        # Placeholder for polynomial features, with degree=1 as a default
        ('poly_size', PolynomialFeatures(), ['Gr Liv Area']),
        ('poly_rooms', PolynomialFeatures(), ['TotRms AbvGrd'])
    ],
    remainder='drop'  # Dropping all other features not specified
)


In [36]:
# Create a pipeline that creates interaction terms after preprocessing
pipeline4 = Pipeline([
    ('preprocess', ct_poly),
    ('linear_regression', LinearRegression())
])

In [37]:
degrees = {
    'preprocess__poly_size__degree': np.arange(1, 11),
    'preprocess__poly_rooms__degree': np.arange(1, 11)
}

In [42]:
gscv = GridSearchCV(pipeline4, degrees, cv=5, scoring='r2')
gscv.fit(X, y)

In [None]:
gscv_fitted.cv_results_['mean_test_score']

In [48]:
pd.DataFrame(degrees)

Unnamed: 0,preprocess__poly_size__degree,preprocess__poly_rooms__degree
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7
7,8,8
8,9,9
9,10,10


In [64]:
results_df = pd.DataFrame(gscv.cv_results_)
results_df = results_df[['param_preprocess__poly_size__degree', 'param_preprocess__poly_rooms__degree', 'mean_test_score']]

results_df.rename(columns={
    'param_preprocess__poly_size__degree': 'poly_size_degree',
    'param_preprocess__poly_rooms__degree': 'poly_rooms_degree',
    'mean_test_score': 'r2'
}, inplace=True)
results_df

# best model is one with these degrees

Unnamed: 0,poly_size_degree,poly_rooms_degree,r2
0,1,1,0.532882
1,2,1,0.537472
2,3,1,0.557641
3,4,1,0.549247
4,5,1,0.451860
...,...,...,...
95,6,10,0.054013
96,7,10,0.400104
97,8,10,-0.968095
98,9,10,-4.545593


In [None]:
results_df = pd.DataFrame(gscv.cv_results_)
results_df = results_df['param_preprocess__poly_size__degree', 'param_preprocess__poly_rooms__degree', 'mean_test_score']]

results_df.rename(columns={
    'param_preprocess__poly_size__degree': 'poly_size_degree',
    'param_preprocess__poly_rooms__degree': 'poly_rooms_degree',
    'mean_test_score': 'r2'
}, inplace=True)
results_df

# best model is one with these degrees

In [55]:
pd.DataFrame(results_df.sort_values('r2', ascending=False))

Unnamed: 0,poly_size_degree,poly_rooms_degree,r2
2,3,1,0.557641
12,3,2,0.556857
33,4,4,0.556835
43,4,5,0.556443
22,3,3,0.554039
...,...,...,...
89,10,9,-16.188834
99,10,10,-16.188835
90,1,10,-184.221206
91,2,10,-189.473646


1 method

In [29]:
gscv = GridSearchCV(pipeline4, degrees, cv = 5, scoring='r2')

In [30]:
gscv_fitted = gscv.fit(X, y)

In [24]:
cv_results = gscv_fitted.cv_results_

In [25]:
# Retrieve the best degree for 'poly_size'
best_degree_size = gscv_fitted.best_params_['preprocess__poly_size__degree']

# Retrieve the best degree for 'poly_rooms'
best_degree_rooms = gscv_fitted.best_params_['preprocess__poly_rooms__degree']

# Retrieve the best R-squared score achieved
best_score = gscv_fitted.best_score_

In [26]:
best_degree_size

3

In [27]:
best_degree_rooms

1

In [28]:
best_score

0.5576406065380459

Q1: Which model performed the best?

Model 3 performs the best with degrees size of 3 for size and 1 for total rooms

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

Each model configuration requires a separate training and validation process. If you're testing 100 different configurations, this can become computationally expensive and time-consuming, especially with larger datasets and more complex models. There is a risk of overfitting so we need to make sure the data is predicted on new unseen data. It also may be difficult to interpret more complex models.