In [1]:
import pandas as pd
import numpy as np
import graphlab
from math import log, sqrt

In [2]:
sales = graphlab.SFrame('kc_house_data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to santosh.chilkunda@gmail.com and will expire on July 20, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1484032219.log


In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [4]:
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [5]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['one'] = 1
    features = ['one'] + features
    new_sframe = data_sframe[features]
    feature_matrix1 = new_sframe.to_numpy()
    feature_matrix = np.asarray(feature_matrix1, dtype=float)
    
    out_sframe = data_sframe[output]
    output_array = out_sframe.to_numpy()
    
    return (feature_matrix, output_array)

In [6]:
def predict_output(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [7]:
def normalize_features(features):
    #norms = np.sqrt(np.sum(features*features,axis=0))
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features/norms
    return (normalized_features, norms)

In [8]:
simple_features = ['sqft_living', 'bedrooms']

In [9]:
fm, oa = get_numpy_data(sales, simple_features, 'price')

In [10]:
nm, norms = normalize_features(fm)

In [11]:
weights = np.array([1,4,1])

In [12]:
pred = predict_output(nm, weights)

In [13]:
def get_residual_sum_of_squares(target, pred):
    error = (target - pred);
    sq_err = (error*error);
    RSS = np.sum(sq_err);
    return(RSS)  

In [14]:
RSS = get_residual_sum_of_squares(pred, oa)

In [15]:
print RSS

9.2173241093e+15


In [16]:
r01 = np.sum(nm[:,1]*(oa - pred + weights[1]*nm[:,1]))
r02 = np.sum(nm[:,2]*(oa - pred + weights[2]*nm[:,2]))

In [17]:
r01

87939470.772991076

In [18]:
r02

80966698.675965652

In [19]:
q1_lamba = r02*2+1

In [20]:
q1_lamba

161933398.3519313

In [21]:
q2_lamba = r01*2+1

In [22]:
q2_lamba

175878942.54598215

In [23]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    pred = predict_output(feature_matrix, weights)
    
    r0_i = np.sum(feature_matrix[:,i]*(output - pred + weights[i]*feature_matrix[:,i]))
    
    if(0 == i):
        new_weight_i = r0_i
    elif(r0_i < (-l1_penalty/2)):
        new_weight_i = r0_i + (l1_penalty/2)
    elif(r0_i > (l1_penalty/2)):
        new_weight_i = r0_i - (l1_penalty/2)
    else:
        new_weight_i = 0
    
    return new_weight_i

In [24]:
import math
print lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1)

0.425558846691


In [25]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    num_features = np.size(feature_matrix, axis=1)
    
    weights = initial_weights
    
    max_tolerance = 1e10
    while(max_tolerance > tolerance):
        for i in range(0, num_features):
            new_weight_i = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            if(np.abs(new_weight_i - weights[i]) < max_tolerance):
                max_tolerance = np.abs(new_weight_i - weights[i])
            weights[i] = new_weight_i
        print "iter", max_tolerance
    return weights

In [26]:
init_weights = np.array([0,0,0])
l1_penalty = 1e7
tolerance = 1

In [27]:
final_weights = lasso_cyclical_coordinate_descent(nm, oa, init_weights, l1_penalty, tolerance)

iter 299723.82367
iter 299723
iter 0


In [28]:
final_weights

array([62067326, 26161208,        0])

In [29]:
pred = predict_output(nm, final_weights)

In [30]:
RSS = get_residual_sum_of_squares(pred, oa)

In [31]:
print RSS

2.26732797219e+15


In [32]:
all_features = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront', 
                'view', 
                'condition', 
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built', 
                'yr_renovated']

In [33]:
training2, test2 = sales.random_split(0.8, seed=0)

In [34]:
my_feature_matrix, output_array = get_numpy_data(training2, all_features, 'price')

In [35]:
normalized_features, norms = normalize_features(my_feature_matrix)

In [36]:
init_weights = np.zeros(np.size(normalized_features, axis=1))
l1_penalty = 1e7
tolerance = 1
weights1e7 = lasso_cyclical_coordinate_descent(normalized_features, output_array, init_weights, l1_penalty, tolerance)
np.count_nonzero(weights1e7)

iter 0.0


9

In [37]:
weights1e7

array([ 71114625.75280938,         0.        ,   3743972.43191673,
         5271064.34696085,         0.        ,         0.        ,
         7173100.28480826,   7025132.06642577,  -5530804.65691784,
               0.        ,    394565.5843951 ,   2242690.39485069,
        -2160960.47385677,         0.        ])

In [38]:
init_weights = np.zeros(np.size(normalized_features, axis=1))
l1_penalty = 1e8
tolerance = 1
weights1e8 = lasso_cyclical_coordinate_descent(normalized_features, output_array, init_weights, l1_penalty, tolerance)
np.count_nonzero(weights1e8)

iter 0.0


1

In [39]:
weights1e8

array([ 71114625.75280938,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [40]:
init_weights = np.zeros(np.size(normalized_features, axis=1))
l1_penalty = 1e4
tolerance = 5e5
weights1e4 = lasso_cyclical_coordinate_descent(normalized_features, output_array, init_weights, l1_penalty, tolerance)
np.count_nonzero(weights1e4)

iter 1029778.70128
iter 172587.888262


14

In [41]:
weights1e4

array([ 70455350.90750059,   6432329.74961583,   8491899.2358642 ,
         8936642.61604182,  -1426943.90338363, -13544624.06951698,
         7609816.90017398,  12241226.807533  , -12608025.33310922,
         6493184.62045919,  11819968.60902575,   3477917.58789896,
       -20021322.46005002,   3255140.66285117])

In [42]:
my_test_matrix, test_output_array = get_numpy_data(test2, all_features, 'price')

In [43]:
normalized_test, norms = normalize_features(my_test_matrix)

In [44]:
pred = predict_output(normalized_test, weights1e4)
RSS = get_residual_sum_of_squares(pred, test_output_array)
print RSS

1.65871404524e+15


In [45]:
pred = predict_output(normalized_test, weights1e7)
RSS = get_residual_sum_of_squares(pred, test_output_array)
print RSS

2.03834401828e+15


In [46]:
pred = predict_output(normalized_test, weights1e8)
RSS = get_residual_sum_of_squares(pred, test_output_array)
print RSS

1.81870623159e+15
