In [1]:
import pandas as pd
import numpy as np

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('../data/kc_house_data.csv', dtype=dtype_dict)
train=pd.read_csv('../data/kc_house_train_data.csv',dtype=dtype_dict)
test=pd.read_csv('../data/kc_house_test_data.csv',dtype=dtype_dict)


In [2]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # this is how you add a constant column to an SFrame
    # add the column 'constant' to the front of the features list so that we can extract it along with the others:
    features = ['constant'] + features # this is how you combine two lists
    # select the columns of data_SFrame given by the features list into the SFrame features_sframe (now including constant):
    features_sframe=data_sframe[features]
    # the following line will convert the features_SFrame into a numpy matrix:
    feature_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the output to the SArray output_sarray
    output_array=data_sframe[output].to_numpy()
    # the following will convert the SArray into a numpy array by first converting it to a list
#     output_array = output_sarray.to_numpy()
    return(feature_matrix, output_array)

In [3]:
def predict_output(feature_matrix, weights):
    # assume feature_matrix is a numpy matrix containing the features as columns and weights is a corresponding numpy array
    # create the predictions vector by using np.dot()
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [4]:
def normalize_features(features):
    norms = np.linalg.norm(X, axis=0)
    X_normalized = X / norms
    return (normalized_features, norms)

In [5]:
# def cost(predictions,output,weights,lam):
#     c=[(prediction - output)^2].sum() + lam*(abs(weights).sum())
#     return c

In [6]:
def normalize_features(feature_matrix):
    norms = np.linalg.norm(feature_matrix, axis=0)
    normalized_features = feature_matrix/ norms
    return (normalized_features, norms)

In [8]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    # compute prediction
    prediction = predict_output(feature_matrix,weights)
#     prediction = 
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    ro_i=np.sum([[feature_matrix[:,i]]*(output - prediction + np.dot(weights[i],[feature_matrix[:,i]])) ])

    if i == 0: # intercept -- do not regularize
        new_weight_i = ro_i 
    elif ro_i < -l1_penalty/2.:
        new_weight_i = ro_i+l1_penalty/2
    elif ro_i > l1_penalty/2.:
        new_weight_i = ro_i-l1_penalty/2
    else:
        new_weight_i = 0.
    
    return new_weight_i

In [9]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    converged = False
    weights =np.array(initial_weights) # make sure it's a numpy array
#     print(initial_weights)
    while not converged:
        w_diff=[]
        for i in range(len(weights)):
            old_weights_i = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            w_diff.append(old_weights_i-weights[i])
        if max(w_diff)<tolerance:
            converged=True
    return weights

In [10]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)

In [11]:
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

In [12]:
weights = np.array([1., 4., 1.])


In [13]:
prediction = predict_output(simple_feature_matrix,weights)
print(output)

[221900. 538000. 180000. ... 402101. 400000. 325000.]


In [16]:
train_data=pd.read_csv("../data/kc_house_train_data.csv",dtype=dtype_dict)

In [17]:
all_features = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront', 
                'view', 
                'condition', 
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built', 
                'yr_renovated']
len(all_features)

13

In [18]:
(feature_matrix, output) = get_numpy_data(train_data, all_features, my_output)


In [19]:
feature_matrix=feature_matrix.astype(float)

In [20]:
normalize_features,simple_norms=normalize_features(feature_matrix)

In [21]:
simple_norms

array([1.31848398e+02, 4.60040216e+02, 2.96850552e+02, 2.99962419e+05,
       5.81709718e+06, 2.09458827e+02, 1.15325626e+01, 1.05933942e+02,
       4.57793622e+02, 1.02101959e+03, 2.59726472e+05, 7.01224951e+04,
       2.59922094e+05, 5.36953839e+04])

In [22]:
normalize_features

array([[0.00758447, 0.00652117, 0.0033687 , ..., 0.        , 0.00752148,
        0.        ],
       [0.00758447, 0.00652117, 0.00757957, ..., 0.0057043 , 0.0075061 ,
        0.03707954],
       [0.00758447, 0.00434745, 0.0033687 , ..., 0.        , 0.00743684,
        0.        ],
       ...,
       [0.00758447, 0.00652117, 0.00842175, ..., 0.        , 0.00772924,
        0.        ],
       [0.00758447, 0.00652117, 0.00842175, ..., 0.        , 0.00771   ,
        0.        ],
       [0.00758447, 0.00434745, 0.00252652, ..., 0.        , 0.00772539,
        0.        ]])

In [23]:
l1_penalty = 1e7
tolerance = 1.0
initial_weights=np.zeros(len(all_features)+1)
print(initial_weights)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [24]:
weights1e7 = lasso_cyclical_coordinate_descent(normalize_features, output,
                                            initial_weights, l1_penalty, tolerance)

In [25]:
weights1e7

array([24429600.23440312,        0.        ,        0.        ,
       48389174.77154896,        0.        ,        0.        ,
        3317511.21492165,  7329961.81171425,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ])

In [28]:
pd.Series(weights1e7,index=['intercept']+all_features)

intercept        2.442960e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      4.838917e+07
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       3.317511e+06
view             7.329962e+06
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [29]:
l1_penalty = 1e8
tolerance = 1.0
initial_weights=np.zeros(len(all_features)+1)
weights1e8 = lasso_cyclical_coordinate_descent(normalize_features, output,
                                            initial_weights, l1_penalty, tolerance)
weights1e8

array([71114625.71488702,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ])

In [30]:
pd.Series(weights1e8,index=['intercept']+all_features)

intercept        7.111463e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      0.000000e+00
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       0.000000e+00
view             0.000000e+00
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [31]:
l1_penalty = 1e4
tolerance = 5e5
initial_weights=np.zeros(len(all_features)+1)
weights1e4 = lasso_cyclical_coordinate_descent(normalize_features, output,
                                            initial_weights, l1_penalty, tolerance)
weights1e4

array([ 76952003.53436852, -21577770.46155855,  15724329.40694158,
        83382212.59479392,  -2016684.51627024,  -5518113.00193007,
         6482223.88448927,   7253631.09099746,   2046534.02912275,
         8241759.62585616,  -6736050.93130816,  -2956206.20723013,
       -75906661.13572001,   2777206.4031928 ])

In [32]:
pd.Series(weights1e4,index=['intercept']+all_features)

intercept        7.695200e+07
bedrooms        -2.157777e+07
bathrooms        1.572433e+07
sqft_living      8.338221e+07
sqft_lot        -2.016685e+06
floors          -5.518113e+06
waterfront       6.482224e+06
view             7.253631e+06
condition        2.046534e+06
grade            8.241760e+06
sqft_above      -6.736051e+06
sqft_basement   -2.956206e+06
yr_built        -7.590666e+07
yr_renovated     2.777206e+06
dtype: float64

In [38]:
weights_normalized = weights / norms
weights1e4_normalized=weights1e4/simple_norms
weights1e7_normalized=weights1e7/simple_norms
weights1e8_normalized=weights1e8/simple_norms
weights1e8_normalized

array([539366.62793373,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ])

In [39]:
def get_rss(predictions,output):
    RSS=predictions-output
    RSS=RSS*RSS
    RSS=RSS.sum()
    return RSS

In [40]:
test_data=pd.read_csv("../data/kc_house_test_data.csv",dtype=dtype_dict)

In [41]:
(test_feature_matrix, test_output) = get_numpy_data(test_data, all_features, 'price')

In [42]:
predictions=predict_output(test_feature_matrix.astype(float),weights1e4_normalized)
print(get_rss(predictions,test_output))

232202384262612.44


In [43]:
predictions=predict_output(test_feature_matrix.astype(float),weights1e7_normalized)
print(get_rss(predictions,test_output))

275962075920366.78


In [44]:
predictions=predict_output(test_feature_matrix.astype(float),weights1e8_normalized)
print(get_rss(predictions,test_output))

537166151497322.75
