In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [2]:
# Explicit column dtypes passed to pd.read_csv for the house-sales CSV files.
# 'id', 'date' and 'zipcode' are read as strings; everything else is numeric.
dtype_dict = {
    # string columns
    'id': str,
    'date': str,
    'zipcode': str,
    # float columns
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

# Full data set, used below to fit the first LASSO model.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [3]:
from math import log, sqrt
# Add the engineered columns to `sales` in place: square roots of the two
# area columns, then squares of the bedroom and floor counts.
for new_column, source_column in (('sqft_living_sqrt', 'sqft_living'),
                                  ('sqft_lot_sqrt', 'sqft_lot')):
    sales[new_column] = sales[source_column].map(sqrt)

for new_column, source_column in (('bedrooms_square', 'bedrooms'),
                                  ('floors_square', 'floors')):
    sales[new_column] = sales[source_column] * sales[source_column]

In [4]:
from sklearn import linear_model  # using scikit-learn

# Candidate inputs for LASSO: raw columns plus the engineered
# square / square-root columns created above.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built', 'yr_renovated',
]

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this call needs an older scikit-learn -- confirm the
# version this notebook is pinned to.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
# Fit on the full data set; leaving the call as the last expression shows the
# estimator repr as the cell output.
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [5]:
# Fitted LASSO weights, one per entry of all_features (same order as the
# columns passed to fit); zeros mark features that LASSO dropped.
model_all.coef_

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

 ***Quiz Question: Which features have been chosen by LASSO, i.e. which features were assigned nonzero weights?***
 

**Answer:** `sqft_living`, `view`, and `grade`.

In [42]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [43]:
def add_engineered_features(frame):
    """Add the square-root and squared feature columns to `frame` in place.

    Creates 'sqft_living_sqrt', 'sqft_lot_sqrt', 'bedrooms_square' and
    'floors_square' from the 'sqft_living', 'sqft_lot', 'bedrooms' and
    'floors' columns, mirroring the transformation applied to `sales`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the four source columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same `frame` object, returned for convenience.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# One helper applied to every split replaces three copy-pasted blocks, so the
# test / training / validation transforms cannot drift apart.
for split_frame in (testing, training, validation):
    add_engineered_features(split_frame)

In [44]:
# 13 log-spaced L1 penalties from 10**1 to 10**7 (exponent step of 0.5).
l1_penalties = np.logspace(start=1, stop=7, num=13)

In [45]:
# Show the candidate penalty grid that the validation sweep iterates over.
l1_penalties

array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07])

In [46]:
# Validation RSS for each candidate penalty; rssList[i] belongs to l1_penalties[i].
rssList = []

for l1_penalty in l1_penalties:
    # Use a dedicated name so the full-data `model_all` fitted earlier is not
    # silently overwritten by this sweep.
    candidate_model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    candidate_model.fit(training[all_features], training['price'])  # learn weights on TRAINING data

    # Residual sum of squares on the VALIDATION data. The vectorised .sum()
    # avoids iterating over the Series element by element with builtin sum().
    validation_residuals = candidate_model.predict(validation[all_features]) - validation['price']
    rssList.append(float((validation_residuals ** 2).sum()))

In [47]:
# Validation RSS per penalty, in the same order as l1_penalties.
rssList

[398213327300134.4,
 399041900253348.5,
 429791604072558.44,
 463739831045119.44,
 645898733633803.2,
 1222506859427156.8,
 1222506859427156.8,
 1222506859427156.8,
 1222506859427156.8,
 1222506859427156.8,
 1222506859427156.8,
 1222506859427156.8,
 1222506859427156.8]

***Quiz Question: Which was the best value for the l1_penalty, i.e. which value of l1_penalty produced the lowest RSS on VALIDATION data?***

In [48]:
# Penalty with the lowest validation RSS. It is looked up from rssList rather
# than hard-coded as index 0, so the answer stays right if the data or the
# penalty grid changes (rssList[i] corresponds to l1_penalties[i]).
l1_penalties[np.argmin(rssList)]

10.0

Now that you have selected an L1 penalty, compute the RSS on TEST data for the model with the best L1 penalty.

In [49]:
# Refit on the TRAINING data with the penalty that minimised validation RSS.
# The penalty is selected from rssList instead of the hard-coded index 0.
best_l1_penalty = l1_penalties[np.argmin(rssList)]
best_model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)
best_model.fit(training[all_features], training['price'])  # learn weights

# RSS on the TEST data; the vectorised .sum() replaces builtin sum(), which
# iterates over the Series one element at a time.
test_residuals = best_model.predict(testing[all_features]) - testing['price']
(test_residuals ** 2).sum()

98467402552698.86

***Quiz Question: Using the best L1 penalty, how many nonzero weights do you have? Count the number of nonzero coefficients first, and add 1 if the intercept is also nonzero. A succinct way to do this is `np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)`.***
    

In [50]:
# Weights of the selected model, one per entry of all_features (same order).
best_model.coef_

array([-1.61445628e+04,  3.73245384e+02,  5.08412433e+04,  6.17853560e+02,
       -4.44113549e+04,  7.85623065e-01, -7.01194765e+02, -0.00000000e+00,
        5.01420046e+03,  6.19488752e+05,  3.80418557e+04,  2.49987718e+04,
        1.28716235e+05,  0.00000000e+00,  0.00000000e+00, -3.29383118e+03,
        1.00573209e+01])

In [40]:
# Intercept of the selected model; it counts as one more weight if nonzero.
best_model.intercept_

14038593.227685915

**Answer: 15 nonzero weights** (14 nonzero coefficients plus the nonzero intercept).

In [51]:
# Total nonzero weights of the selected model: nonzero coefficients plus one
# if the intercept is nonzero.
sum(np.count_nonzero(weights) for weights in (best_model.coef_, best_model.intercept_))

15

### Limit the number of nonzero weights


In [52]:
# Target number of nonzero weights (coefficients plus intercept) for the
# sparse model searched for in the cells below.
max_nonzeros = 7

In [76]:
# Coarse grid: 20 log-spaced L1 penalties between 10**1 and 10**4.
l1_penalty_values = np.logspace(start=1, stop=4, num=20)
print(l1_penalty_values)

[   10.            14.38449888    20.69138081    29.76351442
    42.81332399    61.58482111    88.58667904   127.42749857
   183.29807108   263.66508987   379.26901907   545.55947812
   784.75997035  1128.83789168  1623.77673919  2335.72146909
  3359.81828628  4832.93023857  6951.92796178 10000.        ]


In [77]:
# Map each coarse-grid penalty to the number of nonzero weights (coefficients
# plus intercept) of the LASSO model fitted with it on the training data.
dict_p_nz = {}
for l1_penalty in l1_penalty_values:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])  # learn weights
    nonzero_coefficients = np.count_nonzero(model.coef_)
    nonzero_intercept = np.count_nonzero(model.intercept_)
    dict_p_nz[l1_penalty] = nonzero_coefficients + nonzero_intercept

In [78]:
# Penalty -> number of nonzero weights, in the order the penalties were tried.
dict_p_nz

{10.0: 15,
 14.38449888287663: 15,
 20.6913808111479: 15,
 29.76351441631318: 15,
 42.81332398719393: 13,
 61.58482110660264: 12,
 88.58667904100822: 11,
 127.42749857031335: 10,
 183.29807108324357: 7,
 263.6650898730358: 6,
 379.26901907322497: 6,
 545.5594781168514: 6,
 784.7599703514607: 5,
 1128.8378916846884: 3,
 1623.776739188721: 3,
 2335.7214690901214: 2,
 3359.818286283781: 1,
 4832.930238571752: 1,
 6951.927961775606: 1,
 10000.0: 1}

More formally, find:

- The largest l1_penalty that has more non-zeros than `max_nonzeros` (if we pick a penalty smaller than this value, we will definitely have too many non-zero weights).
  Store this value in the variable `l1_penalty_min` (we will use it later).
- The smallest l1_penalty that has fewer non-zeros than `max_nonzeros` (if we pick a penalty larger than this value, we will definitely have too few non-zero weights).
  Store this value in the variable `l1_penalty_max` (we will use it later).


In [93]:
# Split the coarse-grid penalties by how sparse their model was. Separate list
# names are used so l1_penalty_min / l1_penalty_max are only ever scalars,
# never a list that is later rebound to a float.
too_dense_penalties = [
    penalty for penalty, nonzero_count in dict_p_nz.items()
    if nonzero_count > max_nonzeros
]
too_sparse_penalties = [
    penalty for penalty, nonzero_count in dict_p_nz.items()
    if nonzero_count < max_nonzeros
]

# Fail with an explicit message instead of the opaque
# "max()/min() arg is an empty sequence" when the grid does not bracket the target.
if not too_dense_penalties:
    raise ValueError("no penalty in dict_p_nz yields more than max_nonzeros nonzero weights")
if not too_sparse_penalties:
    raise ValueError("no penalty in dict_p_nz yields fewer than max_nonzeros nonzero weights")

# Largest penalty that is still too dense, and smallest that is already too sparse.
l1_penalty_min = max(too_dense_penalties)
l1_penalty_max = min(too_sparse_penalties)

In [94]:
# Show the bracketing penalties, lower bound first, one per line.
print(l1_penalty_min, l1_penalty_max, sep='\n')

127.42749857031335
263.6650898730358


### Exploring the narrow range of values to find the solution with the right number of non-zeros that has lowest RSS on the validation set

In [95]:
# Fine grid: 20 evenly spaced penalties across the bracket found above
# (this replaces the coarse grid stored under the same name).
l1_penalty_values = np.linspace(start=l1_penalty_min, stop=l1_penalty_max, num=20)

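The rest is a repetition of the steps above: for each value in the narrowed `l1_penalty_values`, fit a LASSO model on the training data, keep only the models with exactly `max_nonzeros` nonzero weights, and choose the one with the lowest RSS on the validation set.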