In [2]:
import graphlab
import numpy as np
from sklearn import linear_model as lm

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1477819119.log


This non-commercial license of GraphLab Create for academic use is assigned to srijitcnair@hotmail.com and will expire on September 03, 2017.




In [3]:
# Column name -> Python type, handed to read_csv below as `column_type_hints`.
# Note that 'id', 'zipcode', 'date' and 'floors' are declared as str, not numeric.
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

# Load the training and test splits with the same type hints.
# NOTE(review): both paths end in '/', which reads like a directory rather than
# a single file -- confirm this matches how the data is laid out on disk.
train_data = graphlab.SFrame.read_csv('kc_house_train_data.csv/',column_type_hints=dtype_dict)
test_data = graphlab.SFrame.read_csv('kc_house_test_data.csv/',column_type_hints=dtype_dict)

In [27]:
def get_numpy_data(data_sframe, features, output):
    """Extract a design matrix and a target vector from a tabular frame.

    Side effect: a column named 'constant' holding the value 1 is written
    into ``data_sframe`` itself, so the caller's frame keeps that column
    after the call.

    Args:
        data_sframe: frame supporting column assignment, column selection by
            name or list of names, and ``.to_numpy()`` on the selection.
        features: list of column names to use as regressors. The list itself
            is not modified.
        output: name of the target column.

    Returns:
        Tuple ``(features_matrix, output_array)``. The matrix columns are
        ordered as ``['constant'] + features``, i.e. the intercept column
        comes first.
    """
    # Intercept column; stored on the caller's frame, not on a copy.
    data_sframe['constant'] = 1
    # Column order determines weight order downstream: intercept, then features.
    selected_columns = ['constant'] + features
    design_matrix = data_sframe[selected_columns].to_numpy()
    target_values = data_sframe[output].to_numpy()
    return design_matrix, target_values

In [20]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions for every row of ``feature_matrix``.

    Computes ``np.dot(feature_matrix, weights)``: one prediction per row when
    ``feature_matrix`` is 2-D and ``weights`` has one entry per column.
    """
    return np.dot(feature_matrix, weights)

In [21]:
def feature_derivative(feature, errors):
    """Return twice the dot product of ``feature`` and ``errors``.

    With ``feature`` a single feature column and ``errors`` the vector
    ``predictions - output``, this is the derivative of the residual sum of
    squares with respect to that feature's weight.
    """
    feature_error_product = np.dot(feature, errors)
    return 2 * feature_error_product

In [56]:
# Scratch check of np.dot on a 2x2 matrix and a length-2 vector:
# row sums are 1*4 + 1*5 = 9 and 2*4 + 2*5 = 18.
t1=[[1,1],[2,2]]
t2=[4,5]
np.dot(t1,t2)

array([ 9, 18])

In [23]:
# Scratch check: feature_derivative returns 2 * np.dot(t1, t2), i.e. [18, 36].
feature_derivative(t1,t2)

array([18, 36])

In [64]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=500):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Each iteration computes the errors ``predictions - output`` once, then
    moves every weight by ``-step_size * derivative``, where the derivative
    comes from ``feature_derivative`` on the matching column of
    ``feature_matrix``. Because the errors are computed before any weight is
    touched, all weights in one iteration are updated from the same gradient.

    The loop stops once the gradient magnitude measured at the start of an
    iteration is below ``tolerance`` (the step for that iteration has already
    been applied), or after ``max_iterations`` iterations, whichever comes
    first.

    Args:
        feature_matrix: 2-D array, one column per weight.
        output: 1-D array of target values, one per row of ``feature_matrix``.
        initial_weights: starting weights, one per column. Not modified.
        step_size: multiplier applied to each derivative.
        tolerance: gradient-magnitude threshold for convergence.
        max_iterations: upper bound on the number of iterations. Defaults to
            500, the limit that used to be hard-coded.

    Returns:
        1-D float numpy array of fitted weights.

    Prints one progress line per iteration. The value labelled "error" in
    that line is the gradient magnitude, truncated to an integer by ``%d``.
    """
    # Always work on a float copy. Without dtype=float, integer initial
    # weights yield an integer array and every update below is silently
    # truncated toward zero when stored.
    weights = np.array(initial_weights, dtype=float)
    converged = False
    iters = 0
    while (not converged) and (iters < max_iterations):
        # Predictions and errors for the weights at the start of this iteration.
        pred_output = predict_outcome(feature_matrix, weights)
        errors = pred_output - output

        gradient_sum_squares = 0  # accumulates the squared partial derivatives
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i].
            d_wt = feature_derivative(feature_matrix[:, i], errors)
            weights[i] = weights[i] - (step_size * d_wt)
            gradient_sum_squares += d_wt ** 2

        iters = iters + 1
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print("%i iteration complete, error is %d" % (iters, gradient_magnitude))
        if gradient_magnitude < tolerance:
            converged = True

    print("Gradient descent completed")
    return(weights)

In [51]:
# Simple model: price as a function of sqft_living plus an intercept.
simple_features = ['sqft_living']
my_output= 'price'
# get_numpy_data prepends the 'constant' column and also writes that column
# into train_data itself.
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
# One starting weight per matrix column, in order [constant, sqft_living].
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
# Gradient-magnitude threshold below which regression_gradient_descent stops.
tolerance = 2.5e7

In [52]:
# Fit the simple model; one progress line is printed per iteration.
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,tolerance)

2 iteration complete, error is 50551530784973
4 iteration complete, error is 13127451026296
6 iteration complete, error is 3408996083241
8 iteration complete, error is 885263580285
10 iteration complete, error is 229889265767
12 iteration complete, error is 59698688272
14 iteration complete, error is 15502826425
16 iteration complete, error is 4025844402
18 iteration complete, error is 1045449748
20 iteration complete, error is 271487891
22 iteration complete, error is 70504114
24 iteration complete, error is 18320017
Gradient descent completed


In [53]:
# Fitted weights, ordered as [constant, sqft_living].
simple_weights

array([-46999.88716555,    281.91211918])

In [66]:
# Build the same [constant, sqft_living] matrix and the price vector for the
# test split. This also writes a 'constant' column into test_data.
(test_simple_feature_matrix, output_test) = get_numpy_data(test_data, simple_features, my_output)

In [67]:
# Inspect the test matrix: column 0 is the constant 1, column 1 is sqft_living.
test_simple_feature_matrix

array([[  1.00000000e+00,   1.43000000e+03],
       [  1.00000000e+00,   2.95000000e+03],
       [  1.00000000e+00,   1.71000000e+03],
       ..., 
       [  1.00000000e+00,   2.52000000e+03],
       [  1.00000000e+00,   2.31000000e+03],
       [  1.00000000e+00,   1.02000000e+03]])

In [68]:
# Predict test-set prices with the fitted simple model. Goes through
# predict_outcome (the helper the training loop uses) instead of a second,
# hand-written np.dot, so prediction logic lives in one place.
test_simple_pred = predict_outcome(test_simple_feature_matrix, simple_weights)

In [69]:
# Predicted prices for the test rows under the simple model.
test_simple_pred

array([ 356134.443255  ,  784640.86440132,  435069.83662406, ...,
        663418.65315598,  604217.10812919,  240550.47439317])

In [70]:
# Residual sum of squares of the simple model on the test split:
# sum over rows of (prediction - actual price) squared.
test_simple_error = test_simple_pred - output_test
test_simple_rss = np.sum(test_simple_error ** 2)
test_simple_rss

275400044902128.31

In [88]:
# Two-feature model: price from sqft_living and sqft_living15 plus an intercept.
# NOTE: this cell rebinds my_output, output, initial_weights, step_size and
# tolerance, replacing the values set up for the simple model.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
# One starting weight per matrix column: [constant, sqft_living, sqft_living15].
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
# Gradient-magnitude threshold below which regression_gradient_descent stops.
tolerance = 1e9

In [89]:
# Fit the two-feature model; one progress line is printed per iteration.
model_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size,tolerance)

1 iteration complete, error is 73072020556001
2 iteration complete, error is 22673220967534
3 iteration complete, error is 7060794583490
4 iteration complete, error is 2275682396587
5 iteration complete, error is 928984110636
6 iteration complete, error is 656307431862
7 iteration complete, error is 610615358663
8 iteration complete, error is 593078772020
9 iteration complete, error is 578705926685
10 iteration complete, error is 564945682565
11 iteration complete, error is 551538687674
12 iteration complete, error is 538452428981
13 iteration complete, error is 525676918665
14 iteration complete, error is 513204549505
15 iteration complete, error is 501028105997
16 iteration complete, error is 489140564644
17 iteration complete, error is 477535070644
18 iteration complete, error is 466204932037
19 iteration complete, error is 455143615657
20 iteration complete, error is 444344743347
21 iteration complete, error is 433802088282
22 iteration complete, error is 423509571375
23 iteration 

In [90]:
# Test-split matrix with columns [constant, sqft_living, sqft_living15] and
# the matching price vector.
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)

In [94]:
# Test-set predictions for the two-feature model, via the shared
# predict_outcome helper rather than a separate np.dot call.
model_pred = predict_outcome(test_feature_matrix, model_weights)
# Shape sanity check: the matrix column count must match the weight count,
# and predictions must line up one-to-one with the test targets.
# Single-argument print(...) prints the same text under Python 2 and 3.
print(test_feature_matrix.shape)
print(model_weights.shape)
print(model_pred.shape)
print(test_output.shape)

(4229, 3)
(3,)
(4229,)
(4229,)


In [97]:
# Spot-check the first test row: predicted price, then actual price.
# Single-argument print(...) prints the same text under Python 2 and 3.
print(model_pred[0])
print(test_output[0])
# Residual sum of squares of the two-feature model on the test split.
model_error = model_pred - test_output
model_rss = np.sum(model_error ** 2)
model_rss

366651.411629
310000.0


270263443629803.56

In [98]:
# True means the two-feature model has the lower RSS on the test split.
test_simple_rss > model_rss

True