In [135]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector, TransformedTargetRegressor
from sklearn.metrics import mean_squared_error


### Load Data

In [162]:
train = pd.read_csv('datasets/train_cleaned.csv', index_col='Id')

In [163]:
X = train.drop(columns = 'SalePrice')
y = train['SalePrice']

In [164]:
pd.DataFrame(X.isna().sum())

Unnamed: 0,0
MS Zoning,0
Lot Frontage,0
Lot Area,0
Street,0
Alley,0
...,...
Cond Pos,0
Cond RR,0
Porch SF,0
is Remodeled,0


In [165]:
X.head()

Unnamed: 0_level_0,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Garage Ratio,is Shed,is PUD,Cond Feeder,Cond Artery,Cond Pos,Cond RR,Porch SF,is Remodeled,Last Remodel
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,RL,69.0552,13517,1,,1,Lvl,AllPub,CulDSac,0,...,237.5,0,0,0,0,0,1,44,1,5
544,RL,43.0,11492,1,,1,Lvl,AllPub,CulDSac,0,...,279.5,0,0,0,0,0,0,74,1,12
153,RL,68.0,7922,1,,0,Lvl,AllPub,Inside,0,...,246.0,0,0,0,0,0,0,52,1,3
318,RL,73.0,9802,1,,0,Lvl,AllPub,Inside,0,...,200.0,0,0,0,0,0,0,100,1,3
255,RL,82.0,14235,1,,1,Lvl,AllPub,Inside,0,...,242.0,0,0,0,0,0,0,59,1,17


In [88]:
X.dtypes

Id                int64
MS Zoning        object
Lot Frontage    float64
Lot Area          int64
Street            int64
                 ...   
Cond Pos          int64
Cond RR           int64
Porch SF          int64
is Remodeled      int64
Last Remodel      int64
Length: 65, dtype: object

### Helper Functions

#### Fit model, score on train, predict, and save outputs

In [225]:
def run_model(model, out, X = X, X_test = X_test):
    model.fit(X, y)
    print(model.score(X, y))
    preds = model.predict(X_test)
    out_df = pd.DataFrame(preds, columns = ['SalePrice'], index = X_test.index)
    out_df.to_csv('submission/' + out + '.csv')

### Linear Regression model

In [167]:
encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    (OneHotEncoder(), ['Mo Sold', 'Yr Sold', 'MoYr Sold']),
    (StandardScaler(), make_column_selector(dtype_include = [int, float])),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [168]:
pipe = make_pipeline(
    encoder,
    LinearRegression()
)

In [169]:
pipe.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder-1',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F2B27F0>),
                                                 ('onehotencoder-2',
                                                  OneHotEncoder(),
                                                  ['Mo Sold', 'Yr Sold',
                                                   'MoYr Sold']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F2B3520>)],
                                   verbose_feature_

In [170]:
pipe.score(X, y)

0.9049148098459069

In [171]:
pipe.named_steps['linearregression'].coef_

array([ 5.88558335e+03, -1.37619760e+03, -2.28987069e+03, -2.21951507e+03,
        8.24889197e+02,  8.10937067e+02, -1.63582626e+03, -1.01244546e+04,
        1.05922754e+04,  4.69389301e+02, -9.37210113e+02,  6.01173525e+03,
       -4.18783857e+04,  3.58666505e+04,  2.73571348e+03,  6.84109024e+03,
       -1.47476482e+03, -1.06280071e+04,  2.52596819e+03,  3.87896944e+03,
        8.70654673e+03, -1.06656664e+04, -7.40160200e+03, -9.88865354e+03,
        5.17934037e+03, -2.25313267e+04, -1.32843609e+04, -1.67008946e+04,
        3.81355755e+03, -1.08466895e+04, -1.18610898e+04,  1.39977973e+04,
       -1.61396023e+04,  2.65798652e+04,  3.71799788e+04, -1.87490600e+04,
       -1.77647442e+04, -1.19209927e+04, -1.14892512e+04,  6.99968064e+03,
        4.77778487e+04, -6.43448076e+03,  6.72676022e+03,  2.48380700e+04,
        5.80390566e+03, -4.56151362e+01, -5.50019744e+03, -6.21546856e+03,
        5.95737549e+03,  3.53777582e+03,  2.91845808e+03,  1.03210973e+04,
       -1.59861232e+04,  

In [177]:
X_test = pd.read_csv('datasets/test_cleaned.csv', index_col = 'Id')

In [178]:
X_test.head()

Unnamed: 0_level_0,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,is Shed,is PUD,Cond Feeder,Cond Artery,Cond Pos,Cond RR,Porch SF,MoYr Sold,is Remodeled,Last Remodel
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,RM,69.0,9142,1,Grvl,0,Lvl,AllPub,Inside,0,...,0,0,0,0,0,0,172,2006.333333,1,56
2718,RL,69.0552,9662,1,,1,Lvl,AllPub,Inside,0,...,0,0,0,0,0,0,170,2006.666667,0,29
2414,RL,58.0,17104,1,,1,Lvl,AllPub,Inside,0,...,0,0,0,0,0,0,124,2006.75,0,0
1989,RM,60.0,8520,1,,0,Lvl,AllPub,Inside,0,...,0,0,0,0,0,0,184,2007.583333,1,1
625,RL,69.0552,9500,1,,1,Lvl,AllPub,Inside,0,...,0,0,0,0,0,0,261,2009.583333,0,46


In [179]:
X_test['SalePrice'] = pipe.predict(X_test)

In [180]:
X_test[['SalePrice']].to_csv('submission/LinearRegression.csv') 

Overfitting by about 2000

In [181]:
# Kaggle RMSE 26406
mean_squared_error(y, pipe.predict(X), squared=False)

24434.151484886108

### Log Transformation

In [182]:
log_lr_model = TransformedTargetRegressor(regressor = pipe, func = np.log, inverse_func = np.exp)

In [185]:
run_model(log_lr_model, out = 'lr_logTransform') 

0.904611294274912


*Under*-fitting by about 700.

In [186]:
mean_squared_error(y, log_lr_model.predict(X), squared = False)
# Kaggle score 23733

24473.117788152013

### Ridge Regression

In [187]:
pipe_ridge = make_pipeline(
    encoder,
    Ridge()
)

In [188]:
params_ridge = {'ridge__alpha': np.logspace(-4, 4)}

In [189]:
gs_ridge = GridSearchCV(pipe_ridge, params_ridge)

In [190]:
# gs_ridge.fit(X, y)

In [220]:
run_model(gs_ridge, 'ridge')

0.8998374870481313


In [131]:
pd.DataFrame(gs_ridge.cv_results_).query('rank_test_score < 10')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,0.060398,0.007618,0.016214,0.002229,2.559548,{'ridge__alpha': 2.559547922699533},0.857861,0.890371,0.801383,0.886305,0.824904,0.852165,0.034582,9
28,0.057594,0.009563,0.016009,0.001895,3.727594,{'ridge__alpha': 3.727593720314938},0.858455,0.891536,0.800326,0.888916,0.826139,0.853075,0.035499,7
29,0.059804,0.008975,0.017194,0.001468,5.428675,{'ridge__alpha': 5.428675439323859},0.859428,0.892833,0.800914,0.890715,0.825834,0.853945,0.036047,5
30,0.061802,0.006555,0.0172,0.000755,7.906043,{'ridge__alpha': 7.9060432109076855},0.860044,0.89417,0.799945,0.892461,0.825645,0.854453,0.037025,3
31,0.068203,0.00659,0.018799,0.001173,11.513954,{'ridge__alpha': 11.513953993264458},0.860495,0.895415,0.797338,0.894614,0.825222,0.854617,0.038584,2
32,0.057998,0.005375,0.016398,0.001355,16.768329,{'ridge__alpha': 16.768329368110066},0.860381,0.896407,0.796255,0.895967,0.824909,0.854784,0.039441,1
33,0.056614,0.005209,0.017,0.001095,24.420531,{'ridge__alpha': 24.420530945486497},0.860045,0.896983,0.794641,0.897298,0.823051,0.854404,0.040594,4
34,0.055197,0.008418,0.015603,0.001627,35.564803,{'ridge__alpha': 35.564803062231285},0.859174,0.897009,0.793779,0.897768,0.821187,0.853783,0.04122,6
35,0.051974,0.005199,0.015598,0.002063,51.794747,{'ridge__alpha': 51.79474679231202},0.857963,0.896394,0.790926,0.897728,0.818167,0.852236,0.04236,8


In [207]:
# gs_ridge.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

Overfitting about 300

In [151]:
# Kaggle score 25364
mean_squared_error(y, gs_ridge.predict(X), squared=False)

25067.16291470853

#### Ridge w/ log transform

In [153]:
log_ridge = TransformedTargetRegressor(regressor=gs_ridge, func=np.log, inverse_func=np.exp)

In [155]:
run_model(log_ridge, 'ridge_logT')

0.8868039167547876


In [212]:
log_ridge.regressor_.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder-1',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F39B940>),
                                                 ('onehotencoder-2',
                                                  OneHotEncoder(),
                                                  ['Mo Sold', 'Yr Sold',
                                                   'MoYr Sold']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F39BA30>)],
                                   verbose_feature_

In [156]:
mean_squared_error(y, log_ridge.predict(X), squared = False)

26659.777610783283

### Feature squares

In [194]:
X_num_cols = [c for c in X.columns if X.dtypes[c] != object and X[c].max() > 1]

In [196]:
len(X_num_cols)

34

In [200]:
X_squared = X[X_num_cols]**2
X_test_squared = X_test[X_num_cols]**2

In [204]:
X_all = X.join(X_squared, rsuffix = '**2')
X_test_all = X_test.join(X_test_squared, rsuffix = '**2')

In [205]:
X_test_all.head()

Unnamed: 0_level_0,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Fence**2,Misc Val**2,Mo Sold**2,Yr Sold**2,MoYr Sold**2,Lot Ratio**2,Sum Bath**2,Garage Ratio**2,Porch SF**2,Last Remodel**2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,RM,69.0,9142,1,Grvl,0,Lvl,AllPub,Inside,0,...,0,0,16,4024036,4025373.0,3.68711,4.0,193600.0,29584,3136
2718,RL,69.0552,9662,1,,1,Lvl,AllPub,Inside,0,...,0,0,64,4024036,4026711.0,5.331275,4.0,84100.0,28900,841
2414,RL,58.0,17104,1,,1,Lvl,AllPub,Inside,0,...,0,0,81,4024036,4027046.0,25.85136,12.25,45369.0,15376,0
1989,RM,60.0,8520,1,,0,Lvl,AllPub,Inside,0,...,0,0,49,4028049,4030391.0,5.601111,1.0,57600.0,33856,1
625,RL,69.0552,9500,1,,1,Lvl,AllPub,Inside,0,...,0,0,49,4036081,4038425.0,5.331275,6.25,66049.0,68121,2116


In [218]:
pipe_squares_ridge = make_pipeline(
    encoder,
    Ridge()
)

In [219]:
gs_squares = GridSearchCV(pipe_squares_ridge, params_ridge)

In [226]:
run_model(gs_squares, 'squares_ridge', X = X_all, X_test = X_test_all)

0.927189921490802


In [277]:
gs_squares.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder-1',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F540190>),
                                                 ('onehotencoder-2',
                                                  OneHotEncoder(),
                                                  ['Mo Sold', 'Yr Sold',
                                                   'MoYr Sold']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F540250>)],
                                   verbose_feature_

About 3500 overfitting.

In [231]:
# Kaggle score 24899
mean_squared_error(y, gs_squares.predict(X_all), squared = False)

21381.421354297152

#### Squares ridge + log

In [227]:
squares_ridge_log = TransformedTargetRegressor(
    regressor = gs_squares, func = np.log, inverse_func = np.exp)

In [232]:
run_model(squares_ridge_log, 'squares_ridge_log', X_all, X_test_all)

0.9437144528539231


In [236]:
squares_ridge_log.regressor_.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder-1',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F3C9AF0>),
                                                 ('onehotencoder-2',
                                                  OneHotEncoder(),
                                                  ['Mo Sold', 'Yr Sold',
                                                   'MoYr Sold']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F3C9A30>)],
                                   verbose_feature_

Overfitting about 2800

In [233]:
# Kaggle score 21616
mean_squared_error(y, squares_ridge_log.predict(X_all), squared = False)

18799.195680701443

### Elastic net

In [267]:
pipe_elastic_net = make_pipeline(
    encoder,
    ElasticNet(max_iter=10_000, selection='random')
)

In [272]:
params_elastic_net = {
    'elasticnet__alpha': np.logspace(-2, 3),
    'elasticnet__l1_ratio': np.linspace(0.05, 0.5, 10)
}

In [273]:
gs_elastic_net = GridSearchCV(pipe_elastic_net, params_elastic_net, n_jobs = 1, verbose=2)

In [274]:
squares_elastic_net_log = TransformedTargetRegressor(
    regressor = gs_elastic_net, func = np.log, inverse_func = np.exp
)

In [275]:
run_model(squares_elastic_net_log, 'squares_elastic_net_log', X_all, X_test_all)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END ..elasticnet__alpha=0.01, elasticnet__l1_ratio=0.05; total time=   0.1s
[CV] END ..elasticnet__alpha=0.01, elasticnet__l1_ratio=0.05; total time=   0.1s
[CV] END ..elasticnet__alpha=0.01, elasticnet__l1_ratio=0.05; total time=   0.1s
[CV] END ..elasticnet__alpha=0.01, elasticnet__l1_ratio=0.05; total time=   0.1s
[CV] END ..elasticnet__alpha=0.01, elasticnet__l1_ratio=0.05; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.2s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END elasticnet__alpha=0.01, elasticnet__l1_ratio=0.15000000000000002; total time=   0.1s
[CV] END elasticnet__alpha=0.01,

Elastic net is converging to ridge (low l1_ratio) and linear regression (low alpha)

In [276]:
squares_elastic_net_log.regressor_.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder-1',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F980280>),
                                                 ('onehotencoder-2',
                                                  OneHotEncoder(),
                                                  ['Mo Sold', 'Yr Sold',
                                                   'MoYr Sold']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002CF8F980850>)],
                                   verbose_feature_

Overfitting about 1400

In [278]:
# Kaggle score 21432
mean_squared_error(y, squares_elastic_net_log.predict(X_all), squared = False)

20075.49532231719