In [69]:
import numpy as np
import pandas as pd
from sklearn import linear_model

In [70]:
# Initial load with pandas' inferred column types; the same file is re-read
# with explicit dtypes in a later cell.
data = pd.read_csv('kc_house_data.csv')

In [71]:
data.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [72]:
# Explicit column types for pd.read_csv, grouped by target type.
dtype_dict = {
    # read as strings
    'id': str,
    'date': str,
    'zipcode': str,
    # read as floats
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # read as integers
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [73]:
# Re-read the CSV using the explicit column types in dtype_dict
# (for example, id and zipcode are read as strings rather than integers).
data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [74]:
data.dtypes

id                object
date              object
price            float64
bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot           int32
floors           float64
waterfront         int32
view               int32
condition          int32
grade              int32
sqft_above         int32
sqft_basement      int32
yr_built           int32
yr_renovated       int32
zipcode           object
lat              float64
long             float64
sqft_living15    float64
sqft_lot15       float64
dtype: object

In [75]:
data.shape

(21613, 21)

In [76]:
from math import log, sqrt

# Derived columns, added in this order:
# square roots of the two area columns, then squares of bedrooms and floors.
for source, derived in (('sqft_living', 'sqft_living_sqrt'),
                        ('sqft_lot', 'sqft_lot_sqrt')):
    data[derived] = data[source].apply(sqrt)
for source, derived in (('bedrooms', 'bedrooms_square'),
                        ('floors', 'floors_square')):
    data[derived] = data[source] * data[source]

In [77]:
data.shape

(21613, 25)

In [99]:
# Columns used as model inputs. The order here is the column order of every
# feature matrix built from this list.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [100]:
# Restrict the full data set to the columns listed in all_features.
data_feature=data[all_features]

In [101]:
data_feature.shape

(21613, 17)

In [102]:
# Configure a Lasso model with L1 penalty alpha=500 and normalize=True.
# The model is only constructed here; it is fitted in the next cell.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running.
model_all = linear_model.Lasso(alpha=5e2, normalize=True)

In [121]:
# Fit on the full data set: the all_features columns as inputs, price as target.
model_all.fit(data_feature, data['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [122]:
model_all.coef_

array([     0.        ,      0.        ,      0.        ,    134.43931396,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,  24750.00458561,      0.        ,
        61749.10309071,      0.        ,      0.        ,     -0.        ,
            0.        ])

# ---------------------------------------------------------------------------------------------------

In [123]:
# Load the test, train and validation CSVs (in that order) with the shared dtypes.
testing, training, validation = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('wk3_kc_house_test_data.csv',
                     'wk3_kc_house_train_data.csv',
                     'wk3_kc_house_valid_data.csv')
]

In [124]:
# Add the same four engineered columns to every split, in the same column order
# as before. One loop replaces three copy-pasted stanzas, so the splits cannot
# drift apart. np.sqrt is vectorised and comes from the notebook's top-level
# numpy import, so this cell does not rely on `sqrt` having been imported from
# `math` in an earlier cell.
# Note: unlike math.sqrt, np.sqrt returns NaN (with a warning) for a negative
# input instead of raising ValueError.
for split in (testing, training, validation):
    split['sqft_living_sqrt'] = np.sqrt(split['sqft_living'])
    split['sqft_lot_sqrt'] = np.sqrt(split['sqft_lot'])
    split['bedrooms_square'] = split['bedrooms'] * split['bedrooms']
    split['floors_square'] = split['floors'] * split['floors']

In [125]:
# Project each split onto the shared feature list, so all three feature
# matrices have the same columns in the same order.
train_feature, valid_feature, test_feature = [
    split[all_features] for split in (training, validation, testing)
]

In [126]:
train_feature.head()

Unnamed: 0,bedrooms,bedrooms_square,bathrooms,sqft_living,sqft_living_sqrt,sqft_lot,sqft_lot_sqrt,floors,floors_square,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated
0,4,16,3.0,1960,44.271887,5000,70.710678,1.0,1.0,0,0,5,7,1050,910,1965,0
1,4,16,4.5,5420,73.620649,101930,319.26478,1.0,1.0,0,0,3,11,3890,1530,2001,0
2,2,4,1.0,1160,34.058773,6000,77.459667,1.0,1.0,0,0,4,7,860,300,1942,0
3,3,9,1.0,1430,37.815341,19901,141.070904,1.5,2.25,0,0,4,7,1430,0,1927,0
4,3,9,1.75,1370,37.013511,9680,98.386991,1.0,1.0,0,0,4,7,1370,0,1977,0


In [170]:
# Candidate L1 penalties: 13 log-spaced values from 1e1 to 1e7.
l1_penalty = np.logspace(start=1, stop=7, num=13)
l1_penalty

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [171]:
# Sweep the candidate L1 penalties: fit a Lasso on the training split and
# record its residual sum of squares (RSS) on the validation split.
# Results are keyed as 'rss_<penalty>' because the next cell looks up the
# minimum by key.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
rss_dict = {}
for penalty in l1_penalty:
    print(penalty)
    model = linear_model.Lasso(alpha=penalty, normalize=True)
    model.fit(train_feature, training['price'])
    residuals = validation['price'] - model.predict(valid_feature)
    rss = np.sum(residuals ** 2)
    print(rss)
    rss_dict['rss_' + str(penalty)] = rss
print(rss_dict)

10.0
3.982133273e+14
31.6227766017
3.99041900253e+14
100.0
4.29791604073e+14
316.227766017
4.63739831045e+14
1000.0
6.45898733634e+14
3162.27766017
1.22250685943e+15
10000.0
1.22250685943e+15
31622.7766017
1.22250685943e+15
100000.0
1.22250685943e+15
316227.766017
1.22250685943e+15
1000000.0
1.22250685943e+15
3162277.66017
1.22250685943e+15
10000000.0
1.22250685943e+15
{'rss_10000.0': 1222506859427156.8, 'rss_100.0': 429791604072558.25, 'rss_31.6227766017': 399041900253348.1, 'rss_316.227766017': 463739831045119.56, 'rss_1000000.0': 1222506859427156.8, 'rss_3162277.66017': 1222506859427156.8, 'rss_1000.0': 645898733633810.4, 'rss_10.0': 398213327300134.44, 'rss_10000000.0': 1222506859427156.8, 'rss_100000.0': 1222506859427156.8, 'rss_316227.766017': 1222506859427156.8, 'rss_31622.7766017': 1222506859427156.8, 'rss_3162.27766017': 1222506859427156.8}


In [172]:
# Key of the entry with the smallest validation RSS in rss_dict.
min(rss_dict,key=rss_dict.get)

'rss_10.0'

In [173]:
    # Refit at alpha=10 on the training split.
    model = linear_model.Lasso(alpha=10.0**1, normalize=True)
    model.fit(train_feature, training['price'])
    # Count the non-zero weights; the intercept adds one when it is non-zero.
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))

15


In [174]:
model.coef_

array([ -1.61445628e+04,   3.73245384e+02,   5.08412433e+04,
         6.17853560e+02,  -4.44113549e+04,   7.85623065e-01,
        -7.01194765e+02,  -0.00000000e+00,   5.01420046e+03,
         6.19488752e+05,   3.80418557e+04,   2.49987718e+04,
         1.28716235e+05,   0.00000000e+00,   0.00000000e+00,
        -3.29383118e+03,   1.00573209e+01])

In [187]:
# Finer grid: 20 log-spaced L1 penalties from 1e1 to 1e4.
l1_penalty = np.logspace(start=1, stop=4, num=20)

In [190]:
# For each candidate penalty, fit on the training split and keep the penalty
# only if the model has at most `max_nonzeroes` non-zero coefficients
# (the intercept is not counted here).
max_nonzeroes = 7
# NOTE(review): despite its name, rss_dict here maps 'rss_<penalty>' to the
# non-zero coefficient count, not to an RSS. The name and key format are kept
# because the following cells call min()/max() on this dict.
rss_dict = {}
for penalty in l1_penalty:
    model = linear_model.Lasso(alpha=penalty, normalize=True)
    model.fit(train_feature, training['price'])
    non_zeroes = np.count_nonzero(model.coef_)
    if non_zeroes <= max_nonzeroes:
        rss_dict['rss_' + str(penalty)] = non_zeroes
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(rss_dict)

{'rss_3359.81828628': 0, 'rss_263.665089873': 5, 'rss_183.298071083': 6, 'rss_1623.77673919': 2, 'rss_10000.0': 0, 'rss_545.559478117': 5, 'rss_1128.83789168': 2, 'rss_2335.72146909': 1, 'rss_4832.93023857': 0, 'rss_6951.92796178': 0, 'rss_379.269019073': 5, 'rss_784.759970351': 4}


In [191]:
# Key whose stored value (the non-zero coefficient count) is smallest.
# NOTE(review): several entries share the minimum count, so which key is
# returned depends on dict iteration order -- confirm this is the intended one.
min(rss_dict,key=rss_dict.get)

'rss_3359.81828628'

In [192]:
# Key whose stored value (the non-zero coefficient count) is largest
# among the retained penalties.
max(rss_dict,key=rss_dict.get)

'rss_183.298071083'

In [197]:
# Narrow sweep: 20 evenly spaced penalties between the two values reported by
# the max()/min() lookups in the preceding cells.
l1_penalty = np.linspace(start=183.298071083, stop=3359.81828628, num=20)

In [198]:
# Repeat the validation sweep over the current l1_penalty grid: fit a Lasso on
# the training split and record its residual sum of squares (RSS) on the
# validation split, keyed as 'rss_<penalty>'.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
rss_dict = {}
for penalty in l1_penalty:
    print(penalty)
    model = linear_model.Lasso(alpha=penalty, normalize=True)
    model.fit(train_feature, training['price'])
    residuals = validation['price'] - model.predict(valid_feature)
    rss = np.sum(residuals ** 2)
    print(rss)
    rss_dict['rss_' + str(penalty)] = rss
print(rss_dict)

183.298071083
4.43107216261e+14
350.483345567
4.71165276546e+14
517.668620051
5.20915057333e+14
684.853894535
5.70043280448e+14
852.039169019
6.09042030746e+14
1019.2244435
6.5115759263e+14
1186.40971799
6.86395309366e+14
1353.59499247
7.26127771423e+14
1520.78026696
7.71665466379e+14
1687.96554144
8.23005987215e+14
1855.15081592
8.80152768001e+14
2022.33609041
9.43105271514e+14
2189.52136489
1.00491768322e+15
2356.70663938
1.06966278949e+15
2523.89191386
1.13952249162e+15
2691.07718834
1.21449678962e+15
2858.26246283
1.22250685943e+15
3025.44773731
1.22250685943e+15
3192.6330118
1.22250685943e+15
3359.81828628
1.22250685943e+15
{'rss_684.853894535': 570043280447779.6, 'rss_3025.44773731': 1222506859427156.8, 'rss_183.298071083': 443107216261366.25, 'rss_350.483345567': 471165276545646.44, 'rss_1855.15081592': 880152768001002.6, 'rss_517.668620051': 520915057332820.56, 'rss_3359.81828628': 1222506859427156.8, 'rss_1687.96554144': 823005987215079.8, 'rss_2189.52136489': 1004917683215689

In [199]:
# Key of the entry with the smallest validation RSS in rss_dict.
min(rss_dict,key=rss_dict.get)

'rss_183.298071083'

In [200]:
# Refit on the training split at the penalty with the lowest validation RSS
# in the sweep above. The fit call stays last so the cell displays the
# fitted estimator.
best_l1_penalty = 183.298071083
model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)
model.fit(train_feature, training['price'])

Lasso(alpha=183.298071083, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [201]:
model.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   4.84964317e+03,
         1.65210126e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         4.84780808e+05,   4.14997727e+04,   0.00000000e+00,
         1.13406888e+05,   0.00000000e+00,   0.00000000e+00,
        -2.41386679e+03,   0.00000000e+00])

In [202]:
valid_feature.head()

Unnamed: 0,bedrooms,bedrooms_square,bathrooms,sqft_living,sqft_living_sqrt,sqft_lot,sqft_lot_sqrt,floors,floors_square,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated
0,3,9,1.0,1180,34.351128,5650,75.166482,1,1,0,0,3,7,1180,0,1955,0
1,3,9,2.25,2570,50.695167,7242,85.099941,2,4,0,0,3,7,2170,400,1951,1991
2,2,4,1.0,770,27.748874,10000,100.0,1,1,0,0,3,6,770,0,1933,0
3,3,9,2.0,1680,40.987803,8080,89.88882,1,1,0,0,3,8,1680,0,1987,0
4,3,9,2.25,1715,41.412558,6819,82.577237,2,4,0,0,3,7,1715,0,1995,0
