In [23]:
import pandas as pd
import numpy as np

# Explicit per-column dtypes passed to every read_csv call below, so that all
# four files are parsed the same way ('id', 'zipcode' and 'date' stay strings).
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full dataset, followed by the provided train / test / validation splits.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

# Quick look at the first rows of the full dataset.
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


Create new features by applying the following transformations to the inputs:

 *   Squaring bedrooms will increase the separation between few bedrooms (e.g. 1) and many bedrooms (e.g. 4), since 1^2 = 1 but 4^2 = 16. Consequently, this variable will mostly affect houses with many bedrooms.
 *   On the other hand, taking the square root of sqft_living will decrease the separation between big and small houses. The owner may not be exactly twice as happy for getting a house that is twice as big.

In [24]:
from math import log, sqrt

# Square-root features: element-wise sqrt of the two area columns.
sales['sqft_living_sqrt'] = sales['sqft_living'].map(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].map(sqrt)

# Squared features: each count column multiplied by itself.
sales['bedrooms_square'] = sales['bedrooms'] ** 2
sales['floors_square'] = sales['floors'] ** 2

Using the entire house dataset, learn regression weights using an L1 penalty of 5e2. Make sure to add "normalize=True" when creating the Lasso object. Refer to the following code snippet for the list of features.

Note. From here on, the list 'all_features' refers to the list defined in this snippet.

In [25]:
from sklearn import linear_model  # using scikit-learn

# Feature list used by every model in this notebook: raw inputs plus the
# squared / square-root transformations created above.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built', 'yr_renovated',
]

# Lasso on the entire dataset with an L1 penalty of 5e2.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; this cell needs an older scikit-learn to run as written.
model_all = linear_model.Lasso(alpha=5e2, normalize=True)
# Kept as the last expression so the fitted estimator's repr is displayed.
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

# Question 1
We learn weights on the entire house dataset, using an L1 penalty of 1e10 (or 5e2, if using scikit-learn). Some features are transformations of inputs; see the reading.

Which of the following features have been chosen by LASSO, i.e. which features were assigned nonzero weights? (Choose all that apply)

In [26]:
weights_all = model_all.coef_

# Collect the *positions* of the nonzero coefficients directly. Looking each
# weight up by value (np.where(weights == weight)[0][0]) would map two features
# that share the same nonzero coefficient to the first one's index, and it
# rescans the whole array for every weight.
non_zero_index = [index for index, weight in enumerate(weights_all) if weight != 0.0]

# Coefficients are in the same order as the columns in `all_features`.
selected_features = [all_features[index] for index in non_zero_index]
selected_features

['sqft_living', 'view', 'grade']

To find a good L1 penalty, we will explore multiple values using a validation set. Let us do a three-way split into train, validation, and test sets. Download the provided csv files containing the training, validation, and test sets.

In [27]:
def add_transformed_features(frame):
    """Add the square-root and squared feature columns to `frame` in place.

    Creates 'sqft_living_sqrt', 'sqft_lot_sqrt', 'bedrooms_square' and
    'floors_square' from the matching raw columns, which must already exist.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data split to extend; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `frame` object, returned for convenience.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# Apply the identical transformations to every split so that all of them
# expose the columns listed in `all_features`.
for split in (testing, training, validation):
    add_transformed_features(split)

Now for each l1_penalty in [10^1, 10^1.5, 10^2, 10^2.5, ..., 10^7] (to get this in Python, type np.logspace(1, 7, num=13).)

  *  Learn a model on TRAINING data using the specified l1_penalty. Make sure to specify normalize=True in the constructor:
  *  Compute the RSS on VALIDATION for the current model (print or save the RSS)

Report which L1 penalty produced the lowest RSS on VALIDATION.

In [28]:
from sklearn.metrics import mean_squared_error

# Build the penalty grid once and reuse it for both the sweep and the final
# lookup, so the reported penalty always comes from the grid that was fitted.
l1_penalties = np.logspace(1, 7, num=13)

RSS_VALIDATION = []
for l1_penalty in l1_penalties:
    # Fit on TRAINING only; VALIDATION is used purely for scoring.
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    prediction = model.predict(validation[all_features])
    # mean_squared_error is RSS divided by the number of validation rows; that
    # constant factor does not change which penalty scores lowest.
    RSS_VALIDATION.append(mean_squared_error(validation['price'], prediction))

# np.argmin returns the first index of the smallest validation error.
best_l1_penalty = l1_penalties[int(np.argmin(RSS_VALIDATION))]
print("the L1 penalty which produced the lower RSS on VALIDATION: ", best_l1_penalty)

the L1 penalty which produced the lower RSS on VALIDATION:  10.0


# Question 2
We split the house sales dataset into training set, test set, and validation set and choose the l1_penalty that minimizes the error on the validation set.

In which of the following ranges does the best l1_penalty fall?

* Between 0 and 100

# Question 3
Using the best value of l1_penalty as mentioned in the previous question, how many nonzero weights do you have?

In [29]:
# Penalty reported as best on the validation set by the sweep above.
l1_penalty_q3 = 10.0

model_q3 = linear_model.Lasso(normalize=True, alpha=l1_penalty_q3)
model_q3.fit(training[all_features], training['price'])

# Count the nonzero coefficients, plus one more if the intercept is nonzero.
nonzero_coefficients = (model_q3.coef_ != 0).sum()
nonzero_intercept = (model_q3.intercept_ != 0).sum()
print("number of nonzero weights: ", nonzero_coefficients + nonzero_intercept)

number of nonzero weights:  15


To obtain a sparse model with only a limited number of nonzero weights, you are going to implement a simple, two-phase procedure:

 *   Explore a large range of ‘l1_penalty’ values to find a narrow region of ‘l1_penalty’ values where models are likely to have the desired number of non-zero weights.
 *   Further explore the narrow region you found to find a good value for ‘l1_penalty’ that achieves the desired sparsity. Here, we will again use a validation set to choose the best value for ‘l1_penalty’.
 
Assign 7 to the variable ‘max_nonzeros’.

Exploring large range of l1_penalty

For l1_penalty in np.logspace(1, 4, num=20):

 *   Fit a regression model with a given l1_penalty on TRAIN data. Add "alpha=l1_penalty" and "normalize=True" to the parameter list.
 *   Extract the weights of the model and count the number of nonzeros. Take account of the intercept as we did in #8, adding 1 whenever the intercept is nonzero. Save the number of nonzeros to a list.

In [30]:
# Target sparsity: desired number of nonzero weights (intercept included).
max_nonzeros = 7

def find_nonzero_weights(model):
    """Count the nonzero weights of a fitted linear model, intercept included.

    Parameters
    ----------
    model : object
        Any object exposing `coef_` (array-like of coefficients) and
        `intercept_` (scalar or array-like).

    Returns
    -------
    int
        Number of nonzero entries in `coef_` plus the number of nonzero
        entries in `intercept_`.
    """
    # np.count_nonzero accepts numpy arrays, numpy scalars and plain Python
    # numbers alike, whereas `(x != 0).sum()` fails when `intercept_` is a
    # plain Python float (the comparison then yields a bool with no .sum()).
    return int(np.count_nonzero(model.coef_)) + int(np.count_nonzero(model.intercept_))

In [31]:
# Coarse sweep: 20 log-spaced penalties from 10^1 to 10^4, reporting how many
# nonzero weights (intercept included) each fitted model keeps.
for l1_penalty in np.logspace(start=1, stop=4, num=20):
    model_q4 = linear_model.Lasso(normalize=True, alpha=l1_penalty)
    model_q4.fit(training[all_features], training['price'])
    nonzero_count = find_nonzero_weights(model_q4)
    print(l1_penalty, ":", nonzero_count)

10.0 : 15
14.3844988829 : 15
20.6913808111 : 15
29.7635144163 : 15
42.8133239872 : 13
61.5848211066 : 12
88.586679041 : 11
127.42749857 : 10
183.298071083 : 7
263.665089873 : 6
379.269019073 : 6
545.559478117 : 6
784.759970351 : 5
1128.83789168 : 3
1623.77673919 : 3
2335.72146909 : 2
3359.81828628 : 1
4832.93023857 : 1
6951.92796178 : 1
10000.0 : 1


# Question 4
We explore a wide range of l1_penalty values to find a narrow region of l1_penalty values where models are likely to have the desired number of non-zero weights (max_nonzeros=7).

What value did you find for l1_penalty_max?

* 263

Exploring narrower range of l1_penalty

We now explore the region of l1_penalty we found: between ‘l1_penalty_min’ and ‘l1_penalty_max’. We look for the L1 penalty in this range that produces exactly the right number of nonzeros and also minimizes RSS on the VALIDATION set.

For l1_penalty in np.linspace(l1_penalty_min,l1_penalty_max,20):

  *  Fit a regression model with a given l1_penalty on TRAIN data. As before, use "alpha=l1_penalty" and "normalize=True".
  *  Measure the RSS of the learned model on the VALIDATION set

Find the model that has the lowest RSS on the VALIDATION set and has sparsity equal to ‘max_nonzeros’. (Again, take account of the intercept when counting the number of nonzeros.)

In [32]:
# Bounds copied from the coarse sweep's printed output:
#   l1_penalty_min - largest penalty that still gave more than 7 nonzeros
#   l1_penalty_max - smallest penalty that already gave fewer than 7 nonzeros
l1_penalty_min = 127.42749857
l1_penalty_max = 263.665089873

# Fine sweep: 20 evenly spaced penalties between the two bounds.
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model_q5 = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model_q5.fit(training[all_features], training['price'])
    prediction_q5 = model_q5.predict(validation[all_features])
    # Arguments follow the documented (y_true, y_pred) order. The value is the
    # mean squared error, i.e. RSS divided by the number of validation rows,
    # so it ranks the penalties exactly as RSS would.
    RSS_q5 = mean_squared_error(validation['price'], prediction_q5)
    print(l1_penalty, RSS_q5, find_nonzero_weights(model_q5))

127.42749857 45186785376.5 10
134.597898112 45356432706.2 10
141.768297655 45483770460.5 8
148.938697197 45579547254.8 8
156.109096739 45670717723.2 7
163.279496281 45747533953.5 7
170.449895824 45829444534.5 7
177.620295366 45916597113.5 7
184.790694908 46009000194.5 7
191.96109445 46106879141.3 7
199.131493993 46209729096.3 7
206.301893535 46317477619.6 6
213.472293077 46405077263.6 6
220.642692619 46496957742.8 6
227.813092162 46593119530.2 6
234.983491704 46693562625.8 6
242.153891246 46798287366.7 6
249.324290788 46907361355.0 6
256.494690331 47020646016.4 6
263.665089873 47138211693.1 6


# Question 5
We then explore the narrow range of l1_penalty values between l1_penalty_min and l1_penalty_max.

What value of l1_penalty in our narrow range has the lowest RSS on the VALIDATION set and has sparsity equal to max_nonzeros?



* 156.109096739

# Question 6
Consider the model learned with the l1_penalty found in the previous question. Which of the following features has non-zero coefficients? (Choose all that apply)

In [33]:
# Refit with the penalty chosen in the narrow sweep (lowest validation error
# among the models with exactly `max_nonzeros` nonzero weights).
model_q6 = linear_model.Lasso(alpha=156.109096739, normalize=True)
model_q6.fit(training[all_features], training['price'])

weights_q6 = model_q6.coef_
# Index the nonzero coefficients by position. A by-value lookup
# (np.where(weights == weight)[0][0]) would report the wrong feature whenever
# two coefficients happen to be equal.
nonzero_index_q6 = [index for index, weight in enumerate(weights_q6) if weight != 0.0]
[all_features[index] for index in nonzero_index_q6]


['bathrooms', 'sqft_living', 'waterfront', 'view', 'grade', 'yr_built']