In [24]:
import graphlab
from math import log, sqrt
import numpy as np

In [3]:
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1500021909.log


This non-commercial license of GraphLab Create for academic use is assigned to mukesh.mithrakumar@jacks.sdstate.edu and will expire on June 17, 2018.


In [6]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']

# In the dataset, 'floors' was defined with type string, 
# so we'll convert them to float, before creating a new feature.
sales['floors'] = sales['floors'].astype(float) 
sales['floors_square'] = sales['floors']*sales['floors']

In [8]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

In [21]:
model_all = graphlab.linear_regression.create(sales, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=1e10,
                                             verbose = False)

In [19]:
names= model_all.get('coefficients')
names.print_rows(18,4)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None |  274873.05595 |  None  |
|     bedrooms     |  None |      0.0      |  None  |
| bedrooms_square  |  None |      0.0      |  None  |
|    bathrooms     |  None | 8468.53108691 |  None  |
|   sqft_living    |  None | 24.4207209824 |  None  |
| sqft_living_sqrt |  None | 350.060553386 |  None  |
|     sqft_lot     |  None |      0.0      |  None  |
|  sqft_lot_sqrt   |  None |      0.0      |  None  |
|      floors      |  None |      0.0      |  None  |
|  floors_square   |  None |      0.0      |  None  |
|    waterfront    |  None |      0.0      |  None  |
|       view       |  None |      0.0      |  None  |
|    condition     |  None |      0.0      |  None  |
|      grade       |  None | 842.068034898 |  None  |
|    sqft_above    |  None | 20.0247224171 |  None  |
|  sqft_basement   |  None |

In [60]:
#zero weights
non_zero_weight = model_all["coefficients"][model_all["coefficients"]["value"] == 0]
non_zero_weight.print_rows(num_rows=20)

+-----------------+-------+-------+--------+
|       name      | index | value | stderr |
+-----------------+-------+-------+--------+
|     bedrooms    |  None |  0.0  |  None  |
| bedrooms_square |  None |  0.0  |  None  |
|     sqft_lot    |  None |  0.0  |  None  |
|  sqft_lot_sqrt  |  None |  0.0  |  None  |
|      floors     |  None |  0.0  |  None  |
|  floors_square  |  None |  0.0  |  None  |
|    waterfront   |  None |  0.0  |  None  |
|       view      |  None |  0.0  |  None  |
|    condition    |  None |  0.0  |  None  |
|  sqft_basement  |  None |  0.0  |  None  |
|     yr_built    |  None |  0.0  |  None  |
|   yr_renovated  |  None |  0.0  |  None  |
+-----------------+-------+-------+--------+
[12 rows x 4 columns]



In [64]:
# non zero weights
zero_weight = model_all["coefficients"][model_all["coefficients"]["value"] > 0]
zero_weight.print_rows(num_rows=20)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None |  274873.05595 |  None  |
|    bathrooms     |  None | 8468.53108691 |  None  |
|   sqft_living    |  None | 24.4207209824 |  None  |
| sqft_living_sqrt |  None | 350.060553386 |  None  |
|      grade       |  None | 842.068034898 |  None  |
|    sqft_above    |  None | 20.0247224171 |  None  |
+------------------+-------+---------------+--------+
[6 rows x 4 columns]



In [20]:
(training_and_validation, testing) = sales.random_split(.9,seed=1) # initial train/test split
(training, validation) = training_and_validation.random_split(0.5, seed=1) # split training into train and validate

In [182]:
penalties = np.logspace(1, 7, num=13)
RSS_list = []
for l1_penalty in penalties:
    #print l1_penalty
    model_penalty = graphlab.linear_regression.create(training,
                                                     target = 'price',
                                                     features =all_features,
                                                     validation_set = None,
                                                     l1_penalty = l1_penalty,
                                                     l2_penalty=0.,
                                                     verbose = False)
    error =  model_penalty.predict(validation) - validation['price']
    RSS = sum(error*error)
    RSS_list.append(RSS)
print "RSS index: ", RSS_list.index(min(RSS_list))
print "minimum RSS: ", min(RSS_list)
print "best penalty: ", penalties[RSS_list.index(min(RSS_list))]

RSS index:  0
minimum RSS:  6.25766285142e+14
best penalty:  10.0


In [57]:
model1 = graphlab.linear_regression.create(sales, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=10,
                                             verbose = False)

In [59]:
names2= model1.get('coefficients')
names2.print_rows(18,4)

+------------------+-------+------------------+--------+
|       name       | index |      value       | stderr |
+------------------+-------+------------------+--------+
|   (intercept)    |  None |  20459.2475219   |  None  |
|     bedrooms     |  None |  8155.38098737   |  None  |
| bedrooms_square  |  None |  1479.73787423   |  None  |
|    bathrooms     |  None |  24576.2383172   |  None  |
|   sqft_living    |  None |  37.4911504798   |  None  |
| sqft_living_sqrt |  None |  1109.39597073   |  None  |
|     sqft_lot     |  None | -0.0168499198461 |  None  |
|  sqft_lot_sqrt   |  None |  149.569423985   |  None  |
|      floors      |  None |  20983.5137368   |  None  |
|  floors_square   |  None |  12278.1023451   |  None  |
|    waterfront    |  None |  581971.306649   |  None  |
|       view       |  None |  92988.9899686   |  None  |
|    condition     |  None |  6924.28719657   |  None  |
|      grade       |  None |  6205.64105779   |  None  |
|    sqft_above    |  None |  4

In [156]:
max_nonzeros = 7
l1_penalty_values = np.logspace(8, 10, num=20)

nonzero_dict = {}

for i in l1_penalty_values:
    model_narrow = graphlab.linear_regression.create(training,
                                                     target = 'price',
                                                     features =all_features,
                                                     validation_set = None,
                                                     l1_penalty = i,
                                                     l2_penalty=0.,
                                                     verbose = False)
    nonzero_dict[i] = model_narrow['coefficients']['value'].nnz()

In [157]:
import pprint
pprint.pprint(nonzero_dict)

{100000000.0: 18,
 127427498.57031322: 18,
 162377673.91887242: 18,
 206913808.11147901: 18,
 263665089.87303555: 17,
 335981828.62837881: 17,
 428133239.8719396: 17,
 545559478.11685145: 17,
 695192796.17755914: 17,
 885866790.41008317: 16,
 1128837891.6846883: 15,
 1438449888.2876658: 15,
 1832980710.8324375: 13,
 2335721469.0901213: 12,
 2976351441.6313133: 10,
 3792690190.7322536: 6,
 4832930238.5717525: 5,
 6158482110.6602545: 3,
 7847599703.5146227: 1,
 10000000000.0: 1}


In [158]:
l1_penalty_min = 2976351441.6313133
l1_penalty_max = 3792690190.7322536

In [172]:
newl1_penalty_values = np.linspace(l1_penalty_min,l1_penalty_max,20)
validation_rss = {}
max_nonzeros = 7

for x in newl1_penalty_values:
    model_final = graphlab.linear_regression.create(training,
                                                     target = 'price',
                                                     features =all_features,
                                                     validation_set = None,
                                                     l1_penalty = x,
                                                     l2_penalty=0.,
                                                     verbose = False)
    error =  model_final.predict(validation) - validation['price']
    RSS = sum(error*error)
    validation_rss[x] = RSS, model_final['coefficients']['value'].nnz()

In [178]:
for k,v in validation_rss.iteritems():
    if (v[1] == max_nonzeros) and (v[0] < RSS):
        bestRSS = v[0]
        print k, v

3577864204.13 (1060799531763289.9, 7)
3448968612.16 (1046937488751713.5, 7)
3491933809.48 (1051147625612863.0, 7)
3534899006.81 (1055992735342998.9, 7)


In [179]:
model_quiz = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, verbose = False,
                                              l2_penalty=0., l1_penalty=3448968612.16)

In [180]:
non_zero_weight_test = model_quiz["coefficients"][model_quiz["coefficients"]["value"] > 0]
non_zero_weight_test.print_rows(num_rows=8)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None | 222253.192544 |  None  |
|     bedrooms     |  None | 661.722717782 |  None  |
|    bathrooms     |  None | 15873.9572593 |  None  |
|   sqft_living    |  None | 32.4102214513 |  None  |
| sqft_living_sqrt |  None | 690.114773313 |  None  |
|      grade       |  None | 2899.42026975 |  None  |
|    sqft_above    |  None | 30.0115753022 |  None  |
+------------------+-------+---------------+--------+
[7 rows x 4 columns]

