In [2]:
import pandas as pd
from math import log, sqrt
import numpy as np

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('../data/kc_house_data.csv', dtype=dtype_dict)

In [3]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [4]:
def polynomial_sframe(feature, degree):
    # assume that degree >= 1
    # initialize the SFrame:
    poly_frame = pd.DataFrame()
    # and set poly_sframe['power_1'] equal to the passed feature
    poly_frame['power_1']=feature
    # first check if degree > 1
    if degree > 1:
        # then loop over the remaining degrees
        # range usually starts at 0 and stops at the endpoint-1. We want it to start at 2 and stop at degree
        for power in range(2, degree+1): 
            # first we'll give the column a name:
            name = 'power_' + str(power)
            # then assign poly_sframe[name] to the appropriate power of feature
            poly_frame[name]=np.power(feature,power)
#             print(power,(feature)**power)

    return poly_frame

def get_rss(predictions,output):
    RSS=predictions-output
    RSS=RSS*RSS
    RSS=RSS.sum()
    return RSS


In [5]:
from sklearn import linear_model  # using scikit-learn

all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights
for i in range(len(model_all.coef_)):
    print(all_features[i],model_all.coef_[i])

bedrooms 0.0
bedrooms_square 0.0
bathrooms 0.0
sqft_living 134.43931395541438
sqft_living_sqrt 0.0
sqft_lot 0.0
sqft_lot_sqrt 0.0
floors 0.0
floors_square 0.0
waterfront 0.0
view 24750.004585609488
condition 0.0
grade 61749.10309070811
sqft_above 0.0
sqft_basement 0.0
yr_built -0.0
yr_renovated 0.0


In [6]:
testing = pd.read_csv('../data/wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('../data/wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('../data/wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [7]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

feature_4=["sqft_living_sqrt","sqft_lot_sqrt","bedrooms_square","floors_square"]

In [8]:
 l1_penalty=np.logspace(1, 7, num=13)
l1_penalty

array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07])

In [9]:
for i in range(len(l1_penalty)):
    model = linear_model.Lasso(alpha=l1_penalty[i], normalize=True)
    model.fit(training[all_features], training['price'])
    print(model.coef_)
    predictions=model.predict(validation[all_features])
    
    print("RSS",get_rss(predictions,validation["price"]))
    print("\n")

[-1.61445628e+04  3.73245384e+02  5.08412433e+04  6.17853560e+02
 -4.44113549e+04  7.85623065e-01 -7.01194765e+02 -0.00000000e+00
  5.01420046e+03  6.19488752e+05  3.80418557e+04  2.49987718e+04
  1.28716235e+05  0.00000000e+00  0.00000000e+00 -3.29383118e+03
  1.00573209e+01]
RSS 398213327300135.0


[-1.73819665e+04  1.19275717e+02  4.26517309e+04  4.19222928e+02
 -2.46029253e+04  2.39763524e-01 -3.55793616e+02  0.00000000e+00
  5.36190642e+03  5.98660591e+05  4.05138886e+04  1.74113669e+04
  1.25151941e+05  0.00000000e+00  0.00000000e+00 -3.33566856e+03
  4.30857032e+00]
RSS 399041900253346.9


[-1.00629243e+04  0.00000000e+00  2.21030560e+04  1.69564623e+02
 -0.00000000e+00 -0.00000000e+00 -9.88978280e+01  0.00000000e+00
  2.01034593e+03  5.48282396e+05  4.25386798e+04  2.85578129e+03
  1.19373956e+05  0.00000000e+00  0.00000000e+00 -3.03956199e+03
  0.00000000e+00]
RSS 429791604072559.6


[-0.00000000e+00  0.00000000e+00  0.00000000e+00  1.62053761e+02
  0.00000000e+00 -0.00000000e

In [10]:
# best_l1_penalty=l1_penalty[0]
# print(best_l1_penalty)
model = linear_model.Lasso(alpha=10.0, normalize=True)
model.fit(training[all_features], training['price'])
predictions=model.predict(testing[all_features])
print(model.coef_)
print("RSS",get_rss(predictions,testing["price"]))
print(np.count_nonzero(model.coef_))
print(np.count_nonzero(model.intercept_))
from decimal import Decimal

'%.2E' % Decimal(get_rss(predictions,testing["price"]))

[-1.61445628e+04  3.73245384e+02  5.08412433e+04  6.17853560e+02
 -4.44113549e+04  7.85623065e-01 -7.01194765e+02 -0.00000000e+00
  5.01420046e+03  6.19488752e+05  3.80418557e+04  2.49987718e+04
  1.28716235e+05  0.00000000e+00  0.00000000e+00 -3.29383118e+03
  1.00573209e+01]
RSS 98467402552698.81
14
1


'9.85E+13'

In [11]:
l1_penalty=np.logspace(1, 4, num=20)
max_nonzeros=7
l=[]
l_min=[]
l_max=[]
for i in range(len(l1_penalty)):
    model = linear_model.Lasso(alpha=l1_penalty[i], normalize=True)
    model.fit(training[all_features], training['price'])
    
#     predictions=model.predict(validation[all_features])
    
#     print("RSS",get_rss(predictions,validation["price"]))
    print(np.count_nonzero(model.coef_))
    print(np.count_nonzero(model.intercept_))
    l.append(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))
    print(l1_penalty[i])
    
    print("\n")
    if np.count_nonzero(model.coef_)+np.count_nonzero(model.intercept_)>max_nonzeros:
        l_min.append(l1_penalty[i])
    elif np.count_nonzero(model.coef_)+np.count_nonzero(model.intercept_)<max_nonzeros:
        l_max.append(l1_penalty[i])

14
1
10.0


14
1
14.38449888287663


14
1
20.6913808111479


14
1
29.76351441631318


12
1
42.81332398719393


11
1
61.58482110660264


10
1
88.58667904100822


9
1
127.42749857031335


6
1
183.29807108324357


5
1
263.6650898730358


5
1
379.26901907322497


5
1
545.5594781168514


4
1
784.7599703514607


2
1
1128.8378916846884


2
1
1623.776739188721


1
1
2335.7214690901214


0
1
3359.818286283781


0
1
4832.930238571752


0
1
6951.927961775606


0
1
10000.0




In [12]:
l1_penalty_max=min(l_max)
print(min(l_max))
l1_penalty_min=max(l_min)
print(max(l_min))

263.6650898730358
127.42749857031335


In [15]:
for l1_penalty in np.linspace(l1_penalty_min,l1_penalty_max,20):
    print(l1_penalty)
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    
    predictions=model.predict(validation[all_features])
    
    print("RSS",get_rss(predictions,validation["price"]))
    
    print(np.count_nonzero(model.coef_))
    print(np.count_nonzero(model.intercept_))
    print("\n")

127.42749857031335
RSS 435374677102680.7
9
1


134.5978981125619
RSS 437009229124471.3
9
1


141.76829765481045
RSS 438236128386912.25
7
1


148.938697197059
RSS 439158937799660.0
7
1


156.10909673930755
RSS 440037365263316.56
6
1


163.2794962815561
RSS 440777489641605.25
6
1


170.44989582380464
RSS 441566698090139.94
6
1


177.6202953660532
RSS 442406413188666.25
6
1


184.79069490830176
RSS 443296716874315.06
6
1


191.96109445055032
RSS 444239780526141.6
6
1


199.13149399279888
RSS 445230739842614.2
6
1


206.3018935350474
RSS 446268896864706.3
5
1


213.47229307729594
RSS 447112919434640.6
5
1


220.6426926195445
RSS 447998187851564.94
5
1


227.81309216179307
RSS 448924706673255.06
5
1


234.98349170404163
RSS 449892475899711.0
5
1


242.1538912462902
RSS 450901498778123.1
5
1


249.32429078853872
RSS 451952426654987.0
5
1


256.49469033078725
RSS 453043924367599.25
5
1


263.6650898730358
RSS 454176669662635.25
5
1




In [16]:

model = linear_model.Lasso(alpha=np.linspace(l1_penalty_min,l1_penalty_max,20)[2], normalize=True)
model.fit(training[all_features], training['price'])
print(model.coef_)
predictions=model.predict(validation[all_features])

print("RSS",get_rss(predictions,validation["price"]))

print(np.count_nonzero(model.coef_))
print(np.count_nonzero(model.intercept_))
for i in range(len(model.coef_)):
    print(all_features[i],model.coef_[i])
print("\n")

[-0.00000000e+00 -0.00000000e+00  1.31781404e+04  1.63173823e+02
  0.00000000e+00 -0.00000000e+00 -2.18494884e+01  0.00000000e+00
  0.00000000e+00  5.18208433e+05  4.22785669e+04  0.00000000e+00
  1.17693998e+05  0.00000000e+00  0.00000000e+00 -2.71400283e+03
  0.00000000e+00]
RSS 438236128386912.25
7
1
bedrooms -0.0
bedrooms_square -0.0
bathrooms 13178.140395834267
sqft_living 163.17382345822446
sqft_living_sqrt 0.0
sqft_lot -0.0
sqft_lot_sqrt -21.849488354420263
floors 0.0
floors_square 0.0
waterfront 518208.4328703311
view 42278.566912155664
condition 0.0
grade 117693.99776333066
sqft_above 0.0
sqft_basement 0.0
yr_built -2714.0028280975134
yr_renovated 0.0


