In [6]:
import graphlab
import numpy as np
import matplotlib.pyplot as plt
from math import log,sqrt

In [10]:
sales = graphlab.SFrame('kc_house_data.gl/')
sales.head()

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [17]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors'] = sales['floors'].astype(float)
sales['floors_square'] = sales['floors'] * sales['floors']

In [18]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

In [27]:
model_all = graphlab.linear_regression.create(sales , target='price',features=all_features,l1_penalty=1e10 ,l2_penalty=0.,
                                             validation_set=None , verbose = False)

In [28]:
 (model_all.coefficients)

name,index,value,stderr
(intercept),,274873.05595,
bedrooms,,0.0,
bedrooms_square,,0.0,
bathrooms,,8468.53108691,
sqft_living,,24.4207209824,
sqft_living_sqrt,,350.060553386,
sqft_lot,,0.0,
sqft_lot_sqrt,,0.0,
floors,,0.0,
floors_square,,0.0,


In [30]:
model_all.coefficients.print_rows(num_rows=18)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None |  274873.05595 |  None  |
|     bedrooms     |  None |      0.0      |  None  |
| bedrooms_square  |  None |      0.0      |  None  |
|    bathrooms     |  None | 8468.53108691 |  None  |
|   sqft_living    |  None | 24.4207209824 |  None  |
| sqft_living_sqrt |  None | 350.060553386 |  None  |
|     sqft_lot     |  None |      0.0      |  None  |
|  sqft_lot_sqrt   |  None |      0.0      |  None  |
|      floors      |  None |      0.0      |  None  |
|  floors_square   |  None |      0.0      |  None  |
|    waterfront    |  None |      0.0      |  None  |
|       view       |  None |      0.0      |  None  |
|    condition     |  None |      0.0      |  None  |
|      grade       |  None | 842.068034898 |  None  |
|    sqft_above    |  None | 20.0247224171 |  None  |
|  sqft_basement   |  None |

In [31]:
(training_and_validation, testing) = sales.random_split(.9,seed=1) # initial train/test split
(training, validation) = training_and_validation.random_split(0.5, seed=1)

In [36]:
l1_penalty = np.logspace(1,7,num=13)
l1_penalty

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [44]:
rss = {}
for i in l1_penalty:
    model = graphlab.linear_regression.create(training, target='price', features = all_features,verbose=False,
                                             validation_set=None,l1_penalty=i,l2_penalty=0.)
    predictions = model.predict(validation)
    error = validation['price']-predictions
    r = error * error 
    rss[i]=r.sum()

In [49]:
rss

{10.0: 625766285142459.9,
 31.622776601683793: 625766285362394.1,
 100.0: 625766286057885.0,
 316.2277660168379: 625766288257224.6,
 1000.0: 625766295212186.8,
 3162.2776601683795: 625766317206080.5,
 10000.0: 625766386760658.1,
 31622.776601683792: 625766606749278.5,
 100000.0: 625767302791634.1,
 316227.76601683791: 625769507643886.2,
 1000000.0: 625776517727024.0,
 3162277.6601683791: 625799062845467.0,
 10000000.0: 625883719085425.2}

In [57]:
model1 = graphlab.linear_regression.create(testing, target='price', features = all_features,verbose=False,
                                             validation_set=None,l1_penalty=10,l2_penalty=0.)
error1 = testing['price'] - model.predict(testing)
res = error1**2
res.sum()


155205417245177.44

In [123]:
model1['coefficients']['value'].nnz()  #17 non zero rows

18

In [71]:
max_nonzeros=7
nz={}
for l1_penalty in np.logspace(8, 10, num=20):
    model2 = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None,
                                              l1_penalty=l1_penalty, l2_penalty=0.,verbose=False)
    nz[l1_penalty] = model2['coefficients']['value'].nnz()


In [72]:

nz

{100000000.0: 18,
 127427498.57031322: 18,
 162377673.91887242: 18,
 206913808.11147901: 18,
 263665089.87303555: 17,
 335981828.62837881: 17,
 428133239.8719396: 17,
 545559478.11685145: 17,
 695192796.17755914: 17,
 885866790.41008317: 16,
 1128837891.6846883: 15,
 1438449888.2876658: 15,
 1832980710.8324375: 13,
 2335721469.0901213: 12,
 2976351441.6313128: 10,
 3792690190.7322536: 6,
 4832930238.5717525: 5,
 6158482110.6602545: 3,
 7847599703.5146227: 1,
 10000000000.0: 1}

In [108]:
minval = max(nz.values())
maxval = min(nz.values())
for i,j in nz.iteritems():
    if j > max_nonzeros and j < minval:
        minval = j
        l1_penalty_max = i
    if j < max_nonzeros and j > maxval:
        maxval = j
        l1_penalty_min = i

In [127]:
l1_penalty_min

3792690190.7322536

In [128]:
l1_penalty_max

2976351441.6313128

In [111]:
l1_penalty_values = np.linspace(l1_penalty_max,l1_penalty_min,20)
residual ={}
sparsity ={}
for x in l1_penalty_values:
    model4 = graphlab.linear_regression.create(training , target='price' , features=all_features , validation_set=None ,
                                              verbose=False , l1_penalty=x , l2_penalty=0.)
    
    predicn = model4.predict(validation)
    err = validation['price'] - predicn
    r = err ** 2
    residual[x] = r.sum()
    sparsity[x] = model4['coefficients']['value'].nnz()
print residual
print "printing sparsity\n"
print sparsity

{3749724993.4111514: 1077632775581416.0, 3148212230.9157214: 998783211265891.2, 3363038217.5212321: 1038554735941040.8, 3577864204.1267428: 1060799531763287.8, 3792690190.7322536: 1081867592324110.6, 2976351441.6313128: 966925692362084.5, 3191177428.2368236: 1008477167020094.0, 3406003414.8423343: 1043237237871703.0, 3620829401.447845: 1065707689498230.1, 3019316638.952415: 974019450084556.1, 3234142625.5579257: 1018298780553819.8, 3448968612.1634364: 1046937488751711.1, 3663794598.7689471: 1069464335425586.5, 3062281836.2735171: 981188367942452.8, 3277107822.8790278: 1028247992205977.2, 3491933809.4845386: 1051147625612860.9, 3706759796.0900493: 1073504549585599.6, 3105247033.5946193: 989328342459474.0, 3320073020.20013: 1034616909232828.1, 3534899006.8056407: 1055992735342999.1}
printing sparsity

{3749724993.4111514: 6, 3148212230.9157214: 10, 3363038217.5212321: 8, 3577864204.1267428: 7, 3792690190.7322536: 6, 2976351441.6313128: 10, 3191177428.2368236: 10, 3406003414.8423343: 8, 3

In [118]:
l = []
for i, j in sparsity.iteritems():
    if j == 7:
        l.append(i)
l


[3577864204.1267428,
 3448968612.1634364,
 3491933809.4845386,
 3534899006.8056407]

In [117]:
y=[]
for i in l:
    print residual[i], i

1.06079953176e+15 3577864204.13
1.04693748875e+15 3448968612.16
1.05114762561e+15 3491933809.48
1.05599273534e+15 3534899006.81


In [120]:
model41 = graphlab.linear_regression.create(training , target='price' , features=all_features , validation_set=None ,
                                              verbose=False , l1_penalty='3491933809.4845386' , l2_penalty=0.)
model41['coefficients'].print_rows(num_rows=20)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None | 224545.136501 |  None  |
|     bedrooms     |  None | 496.983429977 |  None  |
| bedrooms_square  |  None |      0.0      |  None  |
|    bathrooms     |  None | 15640.8229131 |  None  |
|   sqft_living    |  None | 32.2039341994 |  None  |
| sqft_living_sqrt |  None | 678.904419357 |  None  |
|     sqft_lot     |  None |      0.0      |  None  |
|  sqft_lot_sqrt   |  None |      0.0      |  None  |
|      floors      |  None |      0.0      |  None  |
|  floors_square   |  None |      0.0      |  None  |
|    waterfront    |  None |      0.0      |  None  |
|       view       |  None |      0.0      |  None  |
|    condition     |  None |      0.0      |  None  |
|      grade       |  None |  2825.4694254 |  None  |
|    sqft_above    |  None |  29.715599776 |  None  |
|  sqft_basement   |  None |

In [107]:
b

{1: 2, 3: 4}

In [None]:
c=4
