In [2]:
import graphlab
import numpy as np
from sklearn import linear_model as lm

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1477819119.log


This non-commercial license of GraphLab Create for academic use is assigned to srijitcnair@hotmail.com and will expire on September 03, 2017.




In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

train_data = graphlab.SFrame.read_csv('kc_house_train_data.csv/',column_type_hints=dtype_dict)
test_data = graphlab.SFrame.read_csv('kc_house_test_data.csv/',column_type_hints=dtype_dict)

In [27]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_sframe = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = output_sarray.to_numpy() # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

In [20]:
def predict_outcome(feature_matrix, weights):
    predictions = np.dot(feature_matrix,weights)
    return(predictions)

In [21]:
def feature_derivative(feature,errors):
    derivative = 2 * np.dot(feature , errors)    
    return(derivative)

In [22]:
t1=[[1,1],[2,2]]
t2=[4,5]
np.dot(t1[0],t2)

9

In [23]:
feature_derivative(t1,t2)

array([18, 36])

In [42]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    iters = 0
    maxiters = 100
    weights = np.array(initial_weights)
    while (not converged) and (iters < maxiters):
        # compute the predictions based on feature_matrix and weights:
        pred_output = predict_outcome(feature_matrix, weights)
        # compute the errors as predictions - output:
        errors = pred_output - output
        
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):            
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]            
            # compute the derivative for weight[i]:
            d_wt = feature_derivative(feature_matrix[:,i],errors)
            # update the weight based on step size and derivative:            
            weights[i] =  weights[i] - (step_size * d_wt)
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares += ( d_wt ** 2)            
            iters = iters + 1
            
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        print "%i iteration complete, error is %d" %(iters,gradient_magnitude)
        if gradient_magnitude < tolerance:
            converged = True
    
    print("Gradient descent completed")
    return(weights)

In [43]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [44]:
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,tolerance)

2 iteration complete, error is 50551530784973
4 iteration complete, error is 114230512596244
6 iteration complete, error is 258124923328322
8 iteration complete, error is 583280898675076
10 iteration complete, error is 1318030829306894
12 iteration complete, error is 2978333888439479
14 iteration complete, error is 6730095043142224
16 iteration complete, error is 15207891722798156
18 iteration complete, error is 34365037814439264
20 iteration complete, error is 77654144671314304
22 iteration complete, error is 175473870193140160
24 iteration complete, error is 396515591677542720
26 iteration complete, error is 896000152446256768
28 iteration complete, error is 2024677692464119808
30 iteration complete, error is 4575132880468639232
32 iteration complete, error is 10338357039174162432
34 iteration complete, error is 23361425572079747072
36 iteration complete, error is 52789452201334611968
38 iteration complete, error is 119287509022888509440
40 iteration complete, error is 26955213997325