In [1]:
import pandas as pd
import numpy as np
from math import log, sqrt
from sklearn import linear_model

In [2]:
# Column name -> Python type mapping, passed as `dtype=` to pd.read_csv so
# every CSV in this notebook is parsed with the same column types.
dtype_dict = {
    # Columns read as strings.
    'id': str,
    'date': str,
    'zipcode': str,
    # Columns read as floats.
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # Columns read as integers.
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [3]:
# Load the full house-sales table, forcing the column types in dtype_dict.
sales = pd.read_csv(
    'kc_house_data.csv',
    dtype=dtype_dict,
)

In [4]:
# Add the four engineered columns used by the lasso models below.
# np.sqrt operates on the whole column at once instead of calling math.sqrt
# once per row through .apply. The values are the same for non-negative
# input; a negative value would now give NaN (with a RuntimeWarning) rather
# than raising ValueError.
# Assignment order is kept so the new columns land in the same positions.
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = sales['bedrooms'] ** 2
sales['floors_square'] = sales['floors'] ** 2

In [5]:
# Display the column labels of `sales`, including any columns added so far.
sales.columns

Index([u'id', u'date', u'price', u'bedrooms', u'bathrooms', u'sqft_living',
       u'sqft_lot', u'floors', u'waterfront', u'view', u'condition', u'grade',
       u'sqft_above', u'sqft_basement', u'yr_built', u'yr_renovated',
       u'zipcode', u'lat', u'long', u'sqft_living15', u'sqft_lot15',
       u'sqft_living_sqrt', u'sqft_lot_sqrt', u'bedrooms_square',
       u'floors_square'],
      dtype='object')

In [6]:
# Feature columns fed to every lasso fit in this notebook. The order matters:
# fitted coefficient arrays are read back positionally against this list.
all_features = [
    'bedrooms',
    'bedrooms_square',    # bedrooms * bedrooms
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',   # sqrt(sqft_living)
    'sqft_lot',
    'sqft_lot_sqrt',      # sqrt(sqft_lot)
    'floors',
    'floors_square',      # floors * floors
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [7]:
# Fit one lasso model (L1 penalty 5e2) on every row of `sales`, using all
# columns listed in all_features to predict 'price'.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm which scikit-learn version this notebook pins.
reg = linear_model.Lasso(normalize=True, alpha=500.0)
model_all = reg.fit(
    sales[all_features],
    sales['price'],
)

In [8]:
# Fitted coefficients of model_all; a zero entry means the lasso dropped
# the feature at that position.
model_all.coef_

array([     0.        ,      0.        ,      0.        ,    134.43931396,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,  24750.00458561,      0.        ,
        61749.10309071,      0.        ,      0.        ,     -0.        ,
            0.        ])

In [9]:
# Fitted intercept term of model_all.
model_all.intercept_

-218136.21403515921

In [10]:
# Load the three pre-split data files with the same column types as above.
# The three reads are independent of one another.
training = pd.read_csv(
    'wk3_kc_house_train_data.csv',
    dtype=dtype_dict,
)
validation = pd.read_csv(
    'wk3_kc_house_valid_data.csv',
    dtype=dtype_dict,
)
testing = pd.read_csv(
    'wk3_kc_house_test_data.csv',
    dtype=dtype_dict,
)

In [11]:
# Add the engineered columns to the training split.
# np.sqrt is applied to the whole column at once instead of calling
# math.sqrt per row via .apply. The values are identical for non-negative
# input; a negative value would now yield NaN rather than raise ValueError.
training['bedrooms_square'] = training['bedrooms'] ** 2
training['floors_square'] = training['floors'] ** 2
training['sqft_living_sqrt'] = np.sqrt(training['sqft_living'])
training['sqft_lot_sqrt'] = np.sqrt(training['sqft_lot'])

In [12]:
# Add the engineered columns to the test split.
# np.sqrt is applied to the whole column at once instead of calling
# math.sqrt per row via .apply. The values are identical for non-negative
# input; a negative value would now yield NaN rather than raise ValueError.
testing['bedrooms_square'] = testing['bedrooms'] ** 2
testing['floors_square'] = testing['floors'] ** 2
testing['sqft_living_sqrt'] = np.sqrt(testing['sqft_living'])
testing['sqft_lot_sqrt'] = np.sqrt(testing['sqft_lot'])

In [13]:
# Add the engineered columns to the validation split.
# np.sqrt is applied to the whole column at once instead of calling
# math.sqrt per row via .apply. The values are identical for non-negative
# input; a negative value would now yield NaN rather than raise ValueError.
validation['bedrooms_square'] = validation['bedrooms'] ** 2
validation['floors_square'] = validation['floors'] ** 2
validation['sqft_living_sqrt'] = np.sqrt(validation['sqft_living'])
validation['sqft_lot_sqrt'] = np.sqrt(validation['sqft_lot'])

In [14]:
# Coarse sweep: fit a lasso on the training split for each of 13 log-spaced
# L1 penalties (10**1 .. 10**7) and record the residual sum of squares on the
# validation split. RSS[i] corresponds to l1_penalties_coarse[i].
l1_penalties_coarse = np.logspace(1, 7, 13)
# Size the result array from the grid so the two cannot drift apart.
RSS = np.zeros(len(l1_penalties_coarse))
for i, l1_penalty in enumerate(l1_penalties_coarse):
    reg = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model = reg.fit(training[all_features], training['price'])

    pred = model.predict(validation[all_features])
    # Residual is computed once; .values keeps plain ndarray arithmetic so
    # the sum behaves like the ndarray sum it replaces.
    error = np.square(pred - validation['price'].values)
    RSS[i] = error.sum()

In [15]:
# Show the validation RSS for each penalty of the coarse sweep.
# Written as a call so the cell parses on Python 3; with a single argument
# it prints the same text on Python 2.
print(RSS)

[  3.98213327e+14   3.99041900e+14   4.29791604e+14   4.63739831e+14
   6.45898734e+14   1.22250686e+15   1.22250686e+15   1.22250686e+15
   1.22250686e+15   1.22250686e+15   1.22250686e+15   1.22250686e+15
   1.22250686e+15]


In [16]:
# Refit with the penalty that gave the lowest validation RSS in the recorded
# coarse sweep (the first grid point, 10) and measure RSS on the test split.
l1_penalty = 10
reg = linear_model.Lasso(alpha=l1_penalty, normalize=True)
model = reg.fit(training[all_features], training['price'])

pred = model.predict(testing[all_features])
# Residual is computed once; .values keeps plain ndarray arithmetic.
error = np.square(pred - testing['price'].values)
# Note: this rebinds RSS from the array of validation values to one scalar.
RSS = error.sum()

# Call form parses on Python 3 and prints the same text on Python 2.
print(RSS)

9.84674025527e+13


In [17]:
# Number of non-zero fitted parameters of `model`: the non-zero coefficients
# plus one more if the intercept is non-zero.
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

15

In [18]:
# Sparsity sweep: for each of 20 log-spaced L1 penalties (10**1 .. 10**4),
# fit a lasso on the training split and record how many fitted parameters
# (coefficients plus intercept) are non-zero.
# num_nz[i] corresponds to l1_penalties_sparsity[i].
l1_penalties_sparsity = np.logspace(1, 4, 20)
# Size the result array from the grid so the two cannot drift apart.
num_nz = np.zeros(len(l1_penalties_sparsity))
for i, l1_penalty in enumerate(l1_penalties_sparsity):
    reg = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model = reg.fit(training[all_features], training['price'])

    num_nz[i] = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

In [19]:
# Display the non-zero parameter counts, one per penalty, in grid order.
num_nz

array([ 15.,  15.,  15.,  15.,  13.,  12.,  11.,  10.,   7.,   6.,   6.,
         6.,   5.,   3.,   3.,   2.,   1.,   1.,   1.,   1.])

In [20]:
# 20 log-spaced L1 penalties from 10**1 to 10**4, used to look up penalty
# values by grid position.
k = np.logspace(start=1, stop=4, num=20)

In [21]:
# Bounds for the narrow penalty search. In the recorded sparsity sweep,
# grid position 7 gave 10 non-zero parameters and position 9 gave 6, so
# these two penalties bracket the position that gave 7.
l1_penalty_min, l1_penalty_max = k[7], k[9]

In [22]:
# Display the lower bound of the narrow penalty search.
l1_penalty_min

127.42749857031335

In [23]:
# Display the upper bound of the narrow penalty search.
l1_penalty_max

263.66508987303581

In [24]:
# Narrow sweep: 20 evenly spaced L1 penalties between l1_penalty_min and
# l1_penalty_max. For each, fit a lasso on the training split and record
# both the validation RSS and the number of non-zero fitted parameters.
# RSS[i] and num_nz[i] correspond to l1_penalties_narrow[i].
l1_penalties_narrow = np.linspace(l1_penalty_min, l1_penalty_max, 20)
# Size both result arrays from the grid so they cannot drift apart.
RSS = np.zeros(len(l1_penalties_narrow))
num_nz = np.zeros(len(l1_penalties_narrow))
for i, l1_penalty in enumerate(l1_penalties_narrow):
    reg = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model = reg.fit(training[all_features], training['price'])

    pred = model.predict(validation[all_features])
    # Residual is computed once; .values keeps plain ndarray arithmetic.
    error = np.square(pred - validation['price'].values)
    RSS[i] = error.sum()
    num_nz[i] = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

In [25]:
# Non-zero parameter counts at grid positions 4 through 10 of the narrow sweep.
num_nz[4:11]

array([ 7.,  7.,  7.,  7.,  7.,  7.,  7.])

In [26]:
# Validation RSS at grid positions 4 through 10 of the narrow sweep.
RSS[4:11]

array([  4.40037365e+14,   4.40777490e+14,   4.41566698e+14,
         4.42406413e+14,   4.43296717e+14,   4.44239781e+14,
         4.45230740e+14])

In [27]:
# 20 evenly spaced penalties between l1_penalty_min and l1_penalty_max,
# used to look up penalty values by grid position.
t = np.linspace(l1_penalty_min, l1_penalty_max, num=20)

In [28]:
# Penalty value at grid position 4 of `t`.
t[4]

156.10909673930755

In [29]:
# Refit on the training split with the penalty at grid position 4 of `t`.
# In the recorded narrow sweep, positions 4..10 all gave 7 non-zero
# parameters and position 4 had the lowest validation RSS among them.
reg = linear_model.Lasso(normalize=True, alpha=t[4])
model = reg.fit(
    training[all_features],
    training['price'],
)

In [30]:
# Fitted coefficients of the refit model; non-zero entries mark the features
# the lasso kept.
model.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   1.06108903e+04,
         1.63380252e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.06451687e+05,   4.19600436e+04,   0.00000000e+00,
         1.16253554e+05,   0.00000000e+00,   0.00000000e+00,
        -2.61223488e+03,   0.00000000e+00])