In [471]:
import pandas as pd
import numpy as np
import math

In [472]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [473]:
sales=pd.read_csv('data_set/kc_house_data.csv',dtype=dtype_dict)
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [474]:
def get_numpy_data(data_set,features,output):
    data_set['constant']=1
    features=['constant']+features
    feature_matrix=np.array(data_set[features])
    output_array=np.array(data_set[output])
    return feature_matrix,output_array

In [475]:
def predict_output(feature_matrix,weights):
    predictions=np.dot(feature_matrix,weights)
    return predictions

## Note :
 In the house dataset, features vary wildly in their relative magnitude: ‘sqft_living’ is very large overall compared to ‘bedrooms’, for instance. As a result, weight for ‘sqft_living’ would be much smaller than weight for ‘bedrooms’. This is problematic because “small” weights are dropped first as l1_penalty goes up.
<br>
To give equal considerations for all features, we need to normalize features as discussed in the lectures: we divide each feature by its 2-norm so that the transformed feature has norm 1.

## 1.
Write a short function called ‘normalize_features(feature_matrix)’, which normalizes columns of a given feature matrix. The function should return a pair ‘(normalized_features, norms)’, where the second item contains the norms of original features. As discussed in the lectures, we will use these norms to normalize the test data in the same way as we normalized the training data.

In [476]:
def normalize_features(features):
    norms=np.linalg.norm(features,axis=0)
    normalized_features=features/norms
    return normalized_features,norms

In [489]:
feature_matrix,output_data=get_numpy_data(sales,['sqft_living','bedrooms'],'price')

In [490]:
feature_matrix.shape

(21613, 3)

In [491]:
initial_weights=[1,4,1]
predictions=predict_output(feature_matrix,initial_weights)


In [492]:
normalized_features,norms=normalize_features(feature_matrix)
norms[0]
print(feature_matrix[:,1]/norms[1])
normalized_features[:,1]

[0.00353021 0.00768869 0.00230361 ... 0.00305154 0.00478673 0.00305154]


array([0.00353021, 0.00768869, 0.00230361, ..., 0.00305154, 0.00478673,
       0.00305154])

In [493]:
def ro_value(i,feature_matrix,output,predictions,w):
    feature_i=feature_matrix[:,i]
    ro_i=(feature_i*(np.array(output-predictions) + w[i]*feature_i)).sum()
    return ro_i

In [494]:
def lasso_coordinate_descent_step(i,feature_matrix,output,weights,l1_penalty):
    predictions=predict_output(feature_matrix,weights)
    ro_i=ro_value(i,feature_matrix,output,predictions,weights)
    if i==0:
        return ro_i
    elif ro_i<-l1_penalty/2:
        return (ro_i + l1_penalty/2)
    elif ro_i>l1_penalty/2:
        return (ro_i - l1_penalty/2)
    else:
        return 0
        

In [495]:
print(lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1))

0.4255588466910251


In [496]:
lasso_coordinate_descent_step(2,normalized_features,output_data,initial_weights,2.3e8)

0

In [551]:
def lasso_cyclical_coordinate_descent(feature_matrix,output,initial_weights,l1_penalty,tolerance):
    weights=[]
    changed_weight=[]
    for i in range(0,len(initial_weights)):
        weights.append(0.)
        changed_weight.append(0.)
    max_change=9e40
    while max_change>tolerance:
        for i in range(0,len(weights)):
            weights[i]=lasso_coordinate_descent_step(i,feature_matrix,output,weights,l1_penalty)
        print("Initialweight",initial_weights)
        print("Changed : ",weights)
        for i in range(0,len(initial_weights)):
            changed_weight[i]=weights[i] - initial_weights[i]
            initial_weights[i]=weights[i]
        max_change=max(changed_weight)
        print(max_change)
    return weights

In [552]:
feature_matrix,output_data=get_numpy_data(sales,['sqft_living','bedrooms'],'price')
normalized_data,norms=normalize_features(feature_matrix)

In [553]:
initial_weights=[0.,0.,0.]
l1_penalty=1e7
tolerance=1.0
weights=lasso_cyclical_coordinate_descent(normalized_data,output_data,initial_weights,l1_penalty,tolerance)
weights

Initialweight [0.0, 0.0, 0.0]
Changed :  [79400304.63764462, 10305258.704949208, -299724.1696074158]
79400304.63764462
Initialweight [79400304.63764462, 10305258.704949208, -299724.1696074158]
Changed :  [70262136.26121683, 18947595.76476732, 0]
8642337.059818111
Initialweight [70262136.26121683, 18947595.76476732, 0]
Changed :  [62067326.742834166, 26161208.263501395, 0]
7213612.498734076
Initialweight [62067326.742834166, 26161208.263501395, 0]
Changed :  [55468421.66091432, 32197788.172365278, 0]
6036579.908863883
Initialweight [55468421.66091432, 32197788.172365278, 0]
Changed :  [49946248.430094436, 37249389.849402145, 0]
5051601.677036867
Initialweight [49946248.430094436, 37249389.849402145, 0]
Changed :  [45325118.58921603, 41476730.477490425, 0]
4227340.628088281
Initialweight [45325118.58921603, 41476730.477490425, 0]
Changed :  [41458010.45738979, 45014303.29664588, 0]
3537572.8191554546
Initialweight [41458010.45738979, 45014303.29664588, 0]
Changed :  [38221891.62537834, 4

[21624998.819060422, 63157246.42159398, 0]

In [554]:
def rss_value(predictions,output):
    value=(predictions-output)**2
    return (value.sum(axis=0))

### Quiz Question: What is the RSS of the learned model on the normalized dataset?

In [555]:
predictions=predict_output(normalized_data,weights)
rss_normalized=rss_value(predictions,output_data)
rss_normalized

1630492484578343.8

In [556]:
training_data=pd.read_csv('data_set/kc_house_train_data.csv')
testing_data=pd.read_csv('data_set/kc_house_test_data.csv')

In [570]:
features=['bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront','view','condition','grade','sqft_above','sqft_basement','yr_built','yr_renovated']
feature_matrix_training,output_training=get_numpy_data(training_data,features,'price')
normalized_train,norms_train=normalize_features(feature_matrix_training)

In [571]:
l1_penalty=1e7
initial_weights=[]
for i in range(normalized_train.shape[1]):
    initial_weights.append(0.)
initial_weights
tolerance=1
weights1e7=lasso_cyclical_coordinate_descent(normalized_train,output_training,initial_weights,l1_penalty,tolerance)
weights1e7

Initialweight [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Changed :  [71114625.71488702, 0, 3743972.4759632815, 5271064.356821975, 0, 0, 7173100.282256907, 7025131.9562600795, -5530804.703545937, 0, 394565.6901768083, 2242690.309176728, -2160960.4929358056, 0]
71114625.71488702
Initialweight [71114625.71488702, 0, 3743972.4759632815, 5271064.356821975, 0, 0, 7173100.282256907, 7025131.9562600795, -5530804.703545937, 0, 394565.6901768083, 2242690.309176728, -2160960.4929358056, 0]
Changed :  [66090269.344339535, 0, 5640929.989801493, 9576074.111328045, 0, 0, 4372018.711480632, 8681305.937415263, -5916544.490841541, 0, 0, 2344289.547079201, -2427081.439324638, 0]
4305009.75450607
Initialweight [66090269.344339535, 0, 5640929.989801493, 9576074.111328045, 0, 0, 4372018.711480632, 8681305.937415263, -5916544.490841541, 0, 0, 2344289.547079201, -2427081.439324638, 0]
Changed :  [61077944.42835728, 0, 6858126.993043939, 13615612.32414962, 0, 0, 3592460.427319616, 8

Initialweight [24429631.23492065, 0, 0, 48389145.57886011, 0, 0, 3317511.174311176, 7329963.724810248, 0, 0, 0, 0, 0, 0]
Changed :  [24429625.64684917, 0, 0, 48389150.84105716, 0, 0, 3317511.181631513, 7329963.379960611, 0, 0, 0, 0, 0, 0]
5.262197047472
Initialweight [24429625.64684917, 0, 0, 48389150.84105716, 0, 0, 3317511.181631513, 7329963.379960611, 0, 0, 0, 0, 0, 0]
Changed :  [24429620.93643379, 0, 0, 48389155.27677958, 0, 0, 3317511.1878021285, 7329963.089272669, 0, 0, 0, 0, 0, 0]
4.435722418129444
Initialweight [24429620.93643379, 0, 0, 48389155.27677958, 0, 0, 3317511.1878021285, 7329963.089272669, 0, 0, 0, 0, 0, 0]
Changed :  [24429616.965830848, 0, 0, 48389159.01583251, 0, 0, 3317511.193003593, 7329962.844239838, 0, 0, 0, 0, 0, 0]
3.739052936434746
Initialweight [24429616.965830848, 0, 0, 48389159.01583251, 0, 0, 3317511.193003593, 7329962.844239838, 0, 0, 0, 0, 0, 0]
Changed :  [24429613.618846245, 0, 0, 48389162.16763409, 0, 0, 3317511.197388118, 7329962.637691582, 0, 0, 

[24429601.087565593,
 0,
 0,
 48389173.968139574,
 0,
 0,
 3317511.2138040178,
 7329961.8643644005,
 0,
 0,
 0,
 0,
 0,
 0]

In [583]:
def features_selected(features,weights):
    features=['constant']+features
    sparse_feature=[]
    for i in range(0,len(weights)):
        if weights[i]!=0:
            sparse_feature.append(features[i])
            
    return sparse_feature

### Quiz Question: What features had non-zero weight in this case?

In [584]:
features_ele=features_selected(features,weights1e7)
features_ele

['constant', 'sqft_living', 'waterfront', 'view']

In [585]:
l1_penalty=1e8
initial_weights=[]
for i in range(normalized_train.shape[1]):
    initial_weights.append(0.)
initial_weights
tolerance=1
weights1e8=lasso_cyclical_coordinate_descent(normalized_train,output_training,initial_weights,l1_penalty,tolerance)

Initialweight [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Changed :  [71114625.71488702, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
71114625.71488702
Initialweight [71114625.71488702, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Changed :  [71114625.71488702, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0.0


### Quiz Question: What features had non-zero weight in this case?

In [586]:
features_selected_1e8=features_selected(features,weights1e8)
features_selected_1e8

['constant']

In [587]:
l1_penalty=1e4
initial_weights=[]
for i in range(normalized_train.shape[1]):
    initial_weights.append(0.)
initial_weights
tolerance=5e5
weights1e4=lasso_cyclical_coordinate_descent(normalized_train,output_training,initial_weights,l1_penalty,tolerance)

Initialweight [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Changed :  [71114625.71488702, 3956380.1252755085, 4963442.492451265, 5351785.211708793, -1029778.654481313, -8681336.160022503, 12537789.745971119, 10816991.98219515, -8854350.549900116, 3644520.8230692567, 6539708.944943917, 2976689.810737183, -11580046.476377277, 3082552.869888812]
71114625.71488702
Initialweight [71114625.71488702, 3956380.1252755085, 4963442.492451265, 5351785.211708793, -1029778.654481313, -8681336.160022503, 12537789.745971119, 10816991.98219515, -8854350.549900116, 3644520.8230692567, 6539708.944943917, 2976689.810737183, -11580046.476377277, 3082552.869888812]
Changed :  [70455350.84811671, 6432329.798404917, 8491899.278893393, 8936642.640010761, -1426943.8516939548, -13544624.122897785, 7609816.936091678, 12241226.678371388, -12608025.428223873, 6493184.733755886, 11819968.69046741, 3477917.46346643, -20021322.522218987, 3255140.760253438]
5280259.745523493
Initialweight [704

558725.2524971366
Initialweight [77639949.70902942, -21927916.596697286, 14330929.561102446, 88043316.3676707, -2018666.4809046397, -4889012.556020884, 6480070.981708261, 7198681.773760723, 3467001.194869687, 10672583.924453659, -10641281.480160767, -3908992.3764428087, -79517278.12273702, 2799183.158726414]
Changed :  [77724928.12195767, -21955826.56981621, 14174019.528708586, 88595744.01860194, -2018506.1979400006, -4819615.858480925, 6480087.439001661, 7192031.808792083, 3624179.9377311515, 10992600.791146452, -11118857.151938513, -4025437.56550594, -79970042.93271278, 2801697.005311134]
552427.6509312391
Initialweight [77724928.12195767, -21955826.56981621, 14174019.528708586, 88595744.01860194, -2018506.1979400006, -4819615.858480925, 6480087.439001661, 7192031.808792083, 3624179.9377311515, 10992600.791146452, -11118857.151938513, -4025437.56550594, -79970042.93271278, 2801697.005311134]
Changed :  [77809705.77066754, -21980803.17252018, 14020718.494362136, 89142040.41751498, -20

### Quiz Question: What features had non-zero weight in this case?


In [597]:
features_selected_1e4=features_selected(features,weights1e4)
features_selected_1e4

['constant',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

In [589]:
weights_normalized_1e7 = weights1e7 / norms_train
weights_normalized_1e8 = weights1e8 / norms_train
weights_normalized_1e4 = weights1e4/norms_train

In [590]:
weights_normalized_1e7[3]

161.31745496775082

In [594]:
feature_matrix_test,output_test=get_numpy_data(testing_data,features,'price')
predictions=predict_output(feature_matrix_test,weights_normalized_1e7)
rss_1e7=rss_value(predictions,output_test)
rss_1e7

275962077477488.8

In [595]:
predictions=predict_output(feature_matrix_test,weights_normalized_1e8)
rss_1e8=rss_value(predictions,output_test)
rss_1e8

537166151497322.75

In [596]:
predictions=predict_output(feature_matrix_test,weights_normalized_1e4)
rss_1e4=rss_value(predictions,output_test)
rss_1e4

228459958971393.25