In [1]:
import graphlab

In [3]:
# Load the house-sales data into an SFrame from the relative directory
# 'course-2/kc_house_data.gl/' (resolved against the notebook's working directory).
sales = graphlab.SFrame('course-2/kc_house_data.gl/')

In [4]:
from math import log, sqrt
# Square-root transforms of the two area columns, stored as '<name>_sqrt'.
for area_column in ('sqft_living', 'sqft_lot'):
    sales[area_column + '_sqrt'] = sales[area_column].apply(sqrt)

# Squared bedroom count.
sales['bedrooms_square'] = sales['bedrooms'] * sales['bedrooms']

# 'floors' is stored as a string in this dataset, so cast the column to float
# first and only then derive the squared term from it.
sales['floors'] = sales['floors'].astype(float)
sales['floors_square'] = sales['floors'] * sales['floors']

In [5]:
# Feature columns passed to every regression in this notebook: the raw columns
# plus the derived '*_square' / '*_sqrt' columns. Order is kept stable because
# it determines the row order of the printed coefficient tables.
all_features = [
    # rooms
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    # areas
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    # floors
    'floors',
    'floors_square',
    # remaining raw columns
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [6]:
# Fit a linear regression on the full dataset with only the L1 penalty active
# (l2_penalty is zero) and no validation set held out by the library.
model_all = graphlab.linear_regression.create(
    sales,
    target='price',
    features=all_features,
    validation_set=None,
    l2_penalty=0.,
    l1_penalty=1e10)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 21613
PROGRESS: Number of features          : 17
PROGRESS: Number of unpacked features : 17
PROGRESS: Number of coefficients    : 18
PROGRESS: Starting Accelerated Gradient (FISTA)
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+-----------+--------------+--------------------+---------------+
PROGRESS: Tuning step size. First iteration could take longer than subsequent iterations.
PROGRESS: | 1         | 2        | 0.000002  | 1.296752     | 6962915.603493     | 426631.749026 |
PROGRESS: | 2         | 3        | 0.000002  | 1.330751     | 6843144.200219     | 392488.929838 |
PROGRESS: | 3         | 4      

In [31]:
# Print up to 20 rows of the fitted coefficient table so that all 18
# coefficients (17 features plus the intercept) are shown without truncation.
model_all.get('coefficients').print_rows(20)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None |  274873.05595 |  None  |
|     bedrooms     |  None |      0.0      |  None  |
| bedrooms_square  |  None |      0.0      |  None  |
|    bathrooms     |  None | 8468.53108691 |  None  |
|   sqft_living    |  None | 24.4207209824 |  None  |
| sqft_living_sqrt |  None | 350.060553386 |  None  |
|     sqft_lot     |  None |      0.0      |  None  |
|  sqft_lot_sqrt   |  None |      0.0      |  None  |
|      floors      |  None |      0.0      |  None  |
|  floors_square   |  None |      0.0      |  None  |
|    waterfront    |  None |      0.0      |  None  |
|       view       |  None |      0.0      |  None  |
|    condition     |  None |      0.0      |  None  |
|      grade       |  None | 842.068034898 |  None  |
|    sqft_above    |  None | 20.0247224171 |  None  |
|  sqft_basement   |  None |

In [8]:
# Two-stage split, both with seed=1 so the partitions are repeatable:
#   1. hold out a test set (split fraction 0.9 goes to training_and_validation);
#   2. split the remainder in half into training and validation sets.
training_and_validation, testing = sales.random_split(0.9, seed=1)
training, validation = training_and_validation.random_split(0.5, seed=1)

In [9]:
import numpy as np
# Candidate L1 penalties: 13 log-spaced values from 10**1 to 10**7 inclusive.
l1_penalty = np.logspace(start=1, stop=7, num=13)

In [10]:
# Echo the candidate L1 penalties to confirm the grid.
l1_penalty

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [26]:

for penalty in l1_penalty:
    rss=0
    model = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=penalty,verbose =False)
    predicted = model.predict(validation)
    rss = sum((validation['price']-predicted)*(validation['price']-predicted))
    
    print "For l1_penalty=",penalty," rss= ",rss

For l1_penalty= 10.0  rss=  6.25766285142e+14
For l1_penalty= 31.6227766017  rss=  6.25766285362e+14
For l1_penalty= 100.0  rss=  6.25766286058e+14
For l1_penalty= 316.227766017  rss=  6.25766288257e+14
For l1_penalty= 1000.0  rss=  6.25766295212e+14
For l1_penalty= 3162.27766017  rss=  6.25766317206e+14
For l1_penalty= 10000.0  rss=  6.25766386761e+14
For l1_penalty= 31622.7766017  rss=  6.25766606749e+14
For l1_penalty= 100000.0  rss=  6.25767302792e+14
For l1_penalty= 316227.766017  rss=  6.25769507644e+14
For l1_penalty= 1000000.0  rss=  6.25776517727e+14
For l1_penalty= 3162277.66017  rss=  6.25799062845e+14
For l1_penalty= 10000000.0  rss=  6.25883719085e+14


In [22]:
# Refit on the training set with l1_penalty=10, the grid value that produced
# the lowest validation RSS in the sweep output above.
update_model = graphlab.linear_regression.create(
    training,
    target='price',
    features=all_features,
    validation_set=None,
    l2_penalty=0.,
    l1_penalty=10,
    verbose=False)

In [23]:
# Residual sum of squares of `update_model` on the held-out test set.
# The prediction is computed once and reused; the original expression
# called update_model.predict(testing) twice.
test_residuals = update_model.predict(testing) - testing['price']
rss = sum(test_residuals * test_residuals)

In [24]:
# Display the current value of `rss`.
rss

156983602381664.38

In [28]:
# Print up to 20 rows of update_model's coefficient table (name, index, value, stderr).
update_model.get('coefficients').print_rows(20)

+------------------+-------+------------------+--------+
|       name       | index |      value       | stderr |
+------------------+-------+------------------+--------+
|   (intercept)    |  None |  18993.4272128   |  None  |
|     bedrooms     |  None |  7936.96767903   |  None  |
| bedrooms_square  |  None |  936.993368193   |  None  |
|    bathrooms     |  None |  25409.5889341   |  None  |
|   sqft_living    |  None |  39.1151363797   |  None  |
| sqft_living_sqrt |  None |  1124.65021281   |  None  |
|     sqft_lot     |  None | 0.00348361822299 |  None  |
|  sqft_lot_sqrt   |  None |  148.258391011   |  None  |
|      floors      |  None |   21204.335467   |  None  |
|  floors_square   |  None |  12915.5243361   |  None  |
|    waterfront    |  None |  601905.594545   |  None  |
|       view       |  None |  93312.8573119   |  None  |
|    condition     |  None |  6609.03571245   |  None  |
|      grade       |  None |  6206.93999188   |  None  |
|    sqft_above    |  None |  4

In [32]:
# NOTE(review): presumably the target number of non-zero coefficients for the
# final sparse model; no visible cell reads this name, so confirm how it is used.
max_nonzeros = 7

In [33]:
# Coarse penalty grid for the sparsity search: 20 log-spaced values
# from 10**8 to 10**10 inclusive.
l1_penalty_values = np.logspace(start=8, stop=10, num=20)

In [34]:
for penalty in l1_penalty_values:
    rss=0
    model = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=penalty,verbose =False)
    print "For l1_penalty=",penalty,"    No of NonZero Variables",model['coefficients']['value'].nnz()

For l1_penalty= 100000000.0     No of NonZero Variables 18
For l1_penalty= 127427498.57     No of NonZero Variables 18
For l1_penalty= 162377673.919     No of NonZero Variables 18
For l1_penalty= 206913808.111     No of NonZero Variables 18
For l1_penalty= 263665089.873     No of NonZero Variables 17
For l1_penalty= 335981828.628     No of NonZero Variables 17
For l1_penalty= 428133239.872     No of NonZero Variables 17
For l1_penalty= 545559478.117     No of NonZero Variables 17
For l1_penalty= 695192796.178     No of NonZero Variables 17
For l1_penalty= 885866790.41     No of NonZero Variables 16
For l1_penalty= 1128837891.68     No of NonZero Variables 15
For l1_penalty= 1438449888.29     No of NonZero Variables 15
For l1_penalty= 1832980710.83     No of NonZero Variables 13
For l1_penalty= 2335721469.09     No of NonZero Variables 12
For l1_penalty= 2976351441.63     No of NonZero Variables 10
For l1_penalty= 3792690190.73     No of NonZero Variables 6
For l1_penalty= 4832930238.57

In [35]:
# Lower bound of the fine penalty search, hand-copied from the coarse sweep
# output: at this penalty the model still had 10 non-zero coefficients.
# NOTE(review): pasted, rounded value; it goes stale if the data or split changes.
l1_penalty_min = 2976351441.63

In [36]:
# Upper bound of the fine penalty search, hand-copied from the coarse sweep
# output: at this penalty the model was down to 6 non-zero coefficients.
# NOTE(review): pasted, rounded value; it goes stale if the data or split changes.
l1_penalty_max = 3792690190.73

In [37]:
# Fine grid: 20 evenly spaced penalties between the two bounds (inclusive).
# This rebinds l1_penalty_values, replacing the coarse log-spaced grid.
l1_penalty_values = np.linspace(l1_penalty_min, l1_penalty_max, num=20)

In [38]:
for penalty in l1_penalty_values:
    rss=0
    model = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=penalty,verbose =False)
    predicted = model.predict(validation)
    rss = sum((validation['price']-predicted)*(validation['price']-predicted))
    
    print "For l1_penalty=",penalty,"       rss= ",rss, "    No of NonZero Variables",model['coefficients']['value'].nnz()

For l1_penalty= 2976351441.63        rss=  9.66925692362e+14     No of NonZero Variables 10
For l1_penalty= 3019316638.95        rss=  9.74019450085e+14     No of NonZero Variables 10
For l1_penalty= 3062281836.27        rss=  9.81188367942e+14     No of NonZero Variables 10
For l1_penalty= 3105247033.59        rss=  9.89328342459e+14     No of NonZero Variables 10
For l1_penalty= 3148212230.91        rss=  9.98783211266e+14     No of NonZero Variables 10
For l1_penalty= 3191177428.24        rss=  1.00847716702e+15     No of NonZero Variables 10
For l1_penalty= 3234142625.56        rss=  1.01829878055e+15     No of NonZero Variables 10
For l1_penalty= 3277107822.88        rss=  1.02824799221e+15     No of NonZero Variables 10
For l1_penalty= 3320073020.2        rss=  1.03461690923e+15     No of NonZero Variables 8
For l1_penalty= 3363038217.52        rss=  1.03855473594e+15     No of NonZero Variables 8
For l1_penalty= 3406003414.84        rss=  1.04323723787e+15     No of NonZero Vari

In [40]:
# Fit the final sparse model on the training set and print its coefficients.
# NOTE(review): the penalty is a hand-entered value that presumably comes from
# the fine sweep (its row is cut off in the captured output) — confirm.
# NOTE(review): this rebinds `l1_penalty`, which earlier held the array of
# candidate penalties, to a single float.
l1_penalty = 3448968612.16
final = graphlab.linear_regression.create(
    training,
    target='price',
    features=all_features,
    validation_set=None,
    l2_penalty=0.,
    l1_penalty=l1_penalty,
    verbose=False)
final.get('coefficients').print_rows(20)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None | 222253.192544 |  None  |
|     bedrooms     |  None | 661.722717782 |  None  |
| bedrooms_square  |  None |      0.0      |  None  |
|    bathrooms     |  None | 15873.9572593 |  None  |
|   sqft_living    |  None | 32.4102214513 |  None  |
| sqft_living_sqrt |  None | 690.114773313 |  None  |
|     sqft_lot     |  None |      0.0      |  None  |
|  sqft_lot_sqrt   |  None |      0.0      |  None  |
|      floors      |  None |      0.0      |  None  |
|  floors_square   |  None |      0.0      |  None  |
|    waterfront    |  None |      0.0      |  None  |
|       view       |  None |      0.0      |  None  |
|    condition     |  None |      0.0      |  None  |
|      grade       |  None | 2899.42026975 |  None  |
|    sqft_above    |  None | 30.0115753022 |  None  |
|  sqft_basement   |  None |