In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
%matplotlib inline

In [13]:
df = pd.read_csv('home_data.csv')

# Polynomial Regression
- Split the original data into train-validate set (90%) and test set (10%) using random state = 0.
- Build a polynomial regression model with the degree = 15 using just ‘sqft_living’ on the train-
validation set. Remember to run standard scaler to normalize the features before building your
model.
- Report the model’s performance on the train-validate set and the test set

In [14]:
train_data, test_data = train_test_split(df, test_size=0.1, random_state=0)

In [15]:
X = train_data[["sqft_living"]]

In [16]:
y = train_data['price']

In [17]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly15 = PolynomialFeatures(degree=15, include_bias=False)
scaler = StandardScaler()
linreg = LinearRegression()
pipeline_reg = Pipeline([('poly', poly15), ('scal', scaler), ('lin', linreg)])
pipeline_reg.fit(X, y)

##### train-validate set score

In [18]:
print('R-squared score: {:.3f}'
     .format(pipeline_reg.score(X, y)))

R-squared score: 0.557


##### test set score

In [19]:
X2 = test_data[["sqft_living"]]
y2 = test_data['price']

In [20]:
print('R-squared score: {:.3f}'
     .format(pipeline_reg.score(X2, y2)))

R-squared score: 0.515


# Ridge Regression
- Split the original data into train-validate set (90%) and test set (10%) using random state = 0.
- For each L2_penalty λ in [10^3, 10^3.5, 10^4, 10^4.5, ..., 10^9], use GridSearchCV and 10-fold
cross validation to compare the performance of the ridge regression with polynomial degree = 15
using just ‘sqft_living’ on the train-validation set. Remember to run standard scaler to normalize
the features before building your model.
- Report which L2 penalty λ produced the lowest average validation error. Report the best model’s
performance on the test set

In [21]:
train_data2, test_data2 = train_test_split(df, test_size=0.1, random_state=0)

In [22]:
X3 = train_data2[["sqft_living"]]
y3 = train_data2['price']

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

In [24]:
pipe = make_pipeline(PolynomialFeatures(degree=15),StandardScaler(),Ridge())
param_grid = {'ridge__alpha': np.logspace(3, 9, 13)}

grid = GridSearchCV(pipe, param_grid, cv=10, return_train_score=True)
grid.fit(X3, y3)

In [25]:
results = pd.DataFrame(grid.cv_results_)

In [26]:
results['params'][results['mean_test_score'].idxmax()]

{'ridge__alpha': 1000.0}

In [27]:
print(grid.best_params_)
poly = grid.best_estimator_.named_steps['polynomialfeatures'].get_feature_names_out()
print(poly)
print(pd.DataFrame({'Features':poly,'Coefficients':grid.best_estimator_.named_steps['ridge'].coef_}).sort_values(by='Coefficients', ascending=False))

{'ridge__alpha': 1000.0}
['1' 'sqft_living' 'sqft_living^2' 'sqft_living^3' 'sqft_living^4'
 'sqft_living^5' 'sqft_living^6' 'sqft_living^7' 'sqft_living^8'
 'sqft_living^9' 'sqft_living^10' 'sqft_living^11' 'sqft_living^12'
 'sqft_living^13' 'sqft_living^14' 'sqft_living^15']
          Features   Coefficients
2    sqft_living^2  109179.051390
1      sqft_living   98811.703978
3    sqft_living^3   68490.832263
4    sqft_living^4   24715.682250
5    sqft_living^5    7469.574950
6    sqft_living^6    3721.002132
7    sqft_living^7    2355.985425
8    sqft_living^8     208.732870
0                1       0.000000
9    sqft_living^9   -2815.812719
10  sqft_living^10   -6203.095035
11  sqft_living^11   -9533.419421
12  sqft_living^12  -12578.037879
13  sqft_living^13  -15248.622136
14  sqft_living^14  -17537.027587
15  sqft_living^15  -19473.606516


All polynomial degrees are chosen except for degree of zero

In [28]:
X4 = test_data[["sqft_living"]]
y4 = test_data['price']

In [29]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_pred= grid.predict(X4)
print('Ridge best model RMSE on test data {}'.format(np.sqrt(mean_squared_error(y_pred,y4))))
print('Ridge best model R-squared on test data {}'.format(r2_score(y4, y_pred)))


Ridge best model RMSE on test data 238585.1484575821
Ridge best model R-squared on test data 0.5108856832219169


# Lasso Regression
### Part 1
- Create new features by performing following transformation on inputs: (assume you have named
your data frame “sales”)

from math import log, sqrt

sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)

sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)

sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']

sales['floors_square'] = sales['floors']*sales['floors']

 Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and
lots of bedrooms (e.g. 4) since 1^2 = 1 but 4^2 = 16. Consequently this variable will mostly
affect houses with many bedrooms.

 On the other hand, taking square root of sqft_living will decrease the separation between big
house and small house. The owner may not be exactly twice as happy for getting a house
that is twice as big.
- Split the data into train-validate set (90%) and test set (10%) using random state = 0.
- Run Lasso regression with λ = 100 using the following features on the train-validate set.
Remember to run standard scaler to normalize the features before building your model.

['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living', 'sqft_living_sqrt', 'sqft_lot',
'sqft_lot_sqrt', 'floors', 'floors_square', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
'sqft_basement', 'yr_built', 'yr_renovated']

- Which features have been chosen by LASSO, i.e. which features were assigned nonzero
weights? Report the model’s performance on the test set.

In [30]:
from sklearn.linear_model import Lasso
from math import log, sqrt
df['sqft_living_sqrt'] = df['sqft_living'].apply(sqrt)
df['sqft_lot_sqrt'] = df['sqft_lot'].apply(sqrt)
df['bedrooms_square'] = df['bedrooms']**2
df['floors_square'] = df['floors']**2

In [31]:
train_data3, test_data3 = train_test_split(df, test_size=0.1, random_state=0)

In [32]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sqft_living_sqrt',
       'sqft_lot_sqrt', 'bedrooms_square', 'floors_square'],
      dtype='object')

In [33]:
X3 = train_data3[['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living', 'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt', 'floors', 'floors_square', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']]
y3 = train_data3['price']
X4=test_data3[['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living', 'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt', 'floors', 'floors_square', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']]
y4=test_data3['price']

In [34]:
pipe2 = make_pipeline(StandardScaler(), Lasso(alpha=100, max_iter=int(1e6)))
lasso_model=pipe2.fit(X3, y3)
pipe2.score(X3, y3)

0.6870570175342237

In [35]:
from sklearn.linear_model import lars_path
# lars_path computes the exact regularization path which is piecewise linear.
X_train, X_test, y_train, y_test = train_test_split(X3.values, y3.values, random_state=0)
alphas, active, coefs = lars_path(X_train, y_train, eps=0.00001, method="lasso")



In [36]:
X3.columns[np.where(pipe2['lasso'].coef_!=0)]

Index(['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living',
       'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt', 'floors',
       'floors_square', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'yr_built', 'yr_renovated'],
      dtype='object')

In [37]:
X3.columns[np.where(pipe2['lasso'].coef_==0)]

Index(['sqft_basement'], dtype='object')

In [38]:
y4_pred = pipe2.predict(X4)
# RMSE
rmse = np.sqrt(mean_squared_error(y4, y4_pred))
print(f"Best Lasso model RMSE on test data: {rmse}")
print(r2_score(y4, y4_pred))


Best Lasso model RMSE on test data: 196690.99000057345
0.6675759046117227


Part 2 (continue from Part 1)
- For each L1_penalty λ in [10^1, 10^1.5, 10^2, 10^2.5, ..., 10^7], use GridSearchCV and 10-fold
cross validation to compare the performance of the lasso regression using all the features used in
Part 1 on the train-validation set. Remember to run standard scaler to normalize the features.


In [39]:
param_grid = {'lasso__alpha': np.logspace(1, 7, 13)}
pipe3 = make_pipeline(StandardScaler(), Lasso())
grid2=GridSearchCV(cv=10, param_grid=param_grid, estimator=pipe3, return_train_score=True)
grid2.fit(X3, y3)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [40]:
grid2_df=pd.DataFrame(grid2.cv_results_)
grid2_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.556186,0.143959,0.00711,0.002079,10.0,{'lasso__alpha': 10.0},0.67867,0.687395,0.730026,0.634527,...,0.679759,0.692209,0.692425,0.686497,0.686851,0.684996,0.687084,0.68809,0.687217,0.003387
1,0.761437,0.626492,0.007592,0.004566,31.62278,{'lasso__alpha': 31.622776601683793},0.678772,0.687234,0.729825,0.63456,...,0.679757,0.692207,0.692423,0.686495,0.686849,0.684993,0.687081,0.688088,0.687215,0.003387
2,0.601366,0.207382,0.007394,0.002434,100.0,{'lasso__alpha': 100.0},0.679078,0.68671,0.729172,0.634652,...,0.679734,0.692186,0.692403,0.686473,0.686828,0.684972,0.687046,0.688067,0.687192,0.003388
3,1.027878,0.558428,0.015874,0.005295,316.2278,{'lasso__alpha': 316.22776601683796},0.6799,0.684931,0.726953,0.634794,...,0.679508,0.691979,0.692202,0.686254,0.686614,0.684762,0.686743,0.687862,0.686971,0.003394
4,0.286345,0.017253,0.00655,0.001939,1000.0,{'lasso__alpha': 1000.0},0.681286,0.678821,0.718975,0.634874,...,0.677692,0.690458,0.690647,0.684521,0.68491,0.683211,0.685033,0.686244,0.685323,0.003462
5,0.338883,0.153962,0.011501,0.004539,3162.278,{'lasso__alpha': 3162.2776601683795},0.677541,0.652537,0.683508,0.630221,...,0.663691,0.678024,0.678652,0.670785,0.671849,0.670266,0.67238,0.673609,0.672367,0.003974
6,0.121795,0.202432,0.008364,0.003481,10000.0,{'lasso__alpha': 10000.0},0.659519,0.617632,0.646208,0.614812,...,0.644132,0.650648,0.651644,0.643509,0.646493,0.643826,0.645831,0.647304,0.646839,0.002663
7,0.053603,0.026325,0.007208,0.002561,31622.78,{'lasso__alpha': 31622.776601683792},0.62968,0.577243,0.600773,0.592229,...,0.608246,0.615386,0.617086,0.607352,0.610714,0.60816,0.609933,0.612491,0.611449,0.003076
8,0.073507,0.037176,0.008099,0.002026,100000.0,{'lasso__alpha': 100000.0},0.479916,0.419294,0.432964,0.449339,...,0.444754,0.454977,0.45926,0.450002,0.450328,0.44931,0.452343,0.454346,0.451858,0.00376
9,0.051372,0.032088,0.007673,0.003116,316227.8,{'lasso__alpha': 316227.7660168379},-0.002185,-0.002577,-0.000974,-0.000402,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Report which L1 penalty λ produced the lowest average validation error. Which features have
been chosen by the best model, i.e. which features were assigned nonzero weights? Report the
best model’s performance on the test set.


In [41]:
grid2_df['params'][grid2_df['mean_test_score'].idxmax()] #core

{'lasso__alpha': 316.22776601683796}

In [42]:
grid2.best_params_

{'lasso__alpha': 316.22776601683796}

In [43]:
from sklearn.metrics import r2_score
print(pd.DataFrame({'Features':grid2.feature_names_in_,'Coefficients':grid2.best_estimator_.named_steps['lasso'].coef_},index=grid2.feature_names_in_).sort_values(by='Coefficients', ascending=False))



                          Features   Coefficients
sqft_living            sqft_living  379217.892998
grade                        grade  152153.624660
sqft_above              sqft_above  139441.579353
sqft_basement        sqft_basement   75576.668667
waterfront              waterfront   50819.323800
floors_square        floors_square   39016.355713
bathrooms                bathrooms   38747.418655
view                          view   30564.350175
sqft_lot                  sqft_lot   25565.784291
condition                condition   17542.267941
yr_renovated          yr_renovated    7058.373629
bedrooms_square    bedrooms_square    3740.430080
bedrooms                  bedrooms  -16776.570355
floors                      floors  -29034.991245
sqft_lot_sqrt        sqft_lot_sqrt  -42954.312417
yr_built                  yr_built  -94948.222365
sqft_living_sqrt  sqft_living_sqrt -395121.298555


In [44]:
X3.columns[grid2.best_estimator_.named_steps['lasso'].coef_!=0] #chosen

Index(['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living',
       'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt', 'floors',
       'floors_square', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated'],
      dtype='object')

In [45]:
X3.columns[grid2.best_estimator_.named_steps['lasso'].coef_==0] #not chosen

Index([], dtype='object')

In [46]:
y4_pred = grid2.predict(X4)
# RMSE
rmse = np.sqrt(mean_squared_error(y4, y4_pred))
print(f"Best Lasso model RMSE on test data: {rmse}")
print(r2_score(y4, y4_pred))


Best Lasso model RMSE on test data: 196799.80980243604
0.6672079738695584
