In [95]:
import graphlab
import numpy as np
import pandas as pd
from math import sqrt

# Set GraphLab's 'GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS' runtime option to 8.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 8)

# Load the house sales data stored in 'kc_house_data.gl/' as an SFrame.
sales = graphlab.SFrame('kc_house_data.gl/')

# Convert the SFrame into a pandas DataFrame.
sales_panda = sales.to_dataframe()
# Convert the DataFrame into a 2D numpy array. `.values` is used instead of
# DataFrame.as_matrix(), which is deprecated (and removed in pandas 1.0);
# with no arguments both return the same array.
sales_numpyarray = sales_panda.values

Write a function that accepts an SFrame, a list of feature names (e.g. ['sqft_living', 'bedrooms']) and a target feature (e.g. 'price') and returns two things:
* A numpy matrix whose columns are the desired features plus a constant column (this is how we create an 'intercept')
* A numpy array containing the values of the output

In [136]:
def get_numpy_data(data_sframe, features, output):
    """Build the feature matrix and output vector for a regression model.

    Parameters
    ----------
    data_sframe : SFrame
        Source table. Columns are read with ``data_sframe[name]`` and each
        column is converted with ``to_numpy()``. The table is NOT modified.
    features : list of str
        Names of the columns to use as features, in the desired order.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array)
        ``features_matrix`` is a 2D numpy array whose first column is the
        constant 1 (the intercept) followed by one column per requested
        feature; ``output_array`` is a 1D numpy array of the target values.
    """
    # Build the intercept column in numpy rather than writing a 'constant'
    # column into data_sframe, so the caller's table is left untouched.
    # Integer ones mirror the original `data_sframe['constant'] = 1`; numpy
    # promotes the matrix to float when any feature column is float.
    intercept = np.ones(len(data_sframe), dtype=int)
    # One 1D numpy array per requested feature, in the order given.
    feature_columns = [data_sframe[name].to_numpy() for name in features]
    # Stack the 1D arrays side by side as the columns of a 2D matrix.
    features_matrix = np.column_stack([intercept] + feature_columns)
    # The target column as a 1D numpy array.
    output_array = data_sframe[output].to_numpy()
    return (features_matrix, output_array)

In [137]:
# Sanity check for get_numpy_data: a single feature ('sqft_living', wrapped in
# [] to make it a list) with 'price' as the target.
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
print(example_features[0, :])  # first row of the feature matrix; ':' means 'all columns'
print(example_output[0])  # the output value for that same row

[  1.00000000e+00   1.18000000e+03]
221900.0


In [140]:
#function ‘predict_outcome’ which accepts a 2D array ‘feature_matrix’ and a 1D array ‘weights’ and returns a 1D array ‘predictions’
def predict_outcome(feature_matrix, weights):
    """Return the predictions for every row of ``feature_matrix``.

    ``feature_matrix`` is expected to hold the features as columns and
    ``weights`` to be the matching array of coefficients; the result is
    their product computed with ``np.dot``.
    """
    return np.dot(feature_matrix, weights)

In [141]:
# Sanity check for predict_outcome using example weights that are all 1.
my_weights = np.ones(2)
test_predictions = predict_outcome(example_features, my_weights)
print(test_predictions[0])  # should be 1181.0
print(test_predictions[1])  # should be 2571.0

1181.0
2571.0


In [117]:
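#The cost function is the sum, over all data points, of the squared difference between the predicted output and the observed output.
#With errors = (predictions - output), its derivative with respect to one weight is 2 * sum(errors * feature),
#i.e. twice the dot product of the errors with the feature column that belongs to that weight.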

#a function that accepts a ‘feature’ array and ‘error’ array and returns the ‘derivative’ 
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    ``errors`` is the array of prediction errors and ``feature`` the array
    of values of one feature; the result is the derivative of the squared
    error cost with respect to that feature's weight.
    """
    return np.dot(errors, feature) * 2

In [23]:
# Sanity check for feature_derivative, using the 'constant' column as the feature.
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
my_weights = np.zeros(2)  # all-zero weights make every prediction 0
test_predictions = predict_outcome(example_features, my_weights)
# numpy arrays subtract element-wise with '-'; since the predictions are all 0,
# the errors here are simply -example_output
errors = test_predictions - example_output
feature = example_features[:, 0]  # the 'constant' column; ':' means 'all rows'
derivative = feature_derivative(errors, feature)
print(derivative)
print(-np.sum(example_output) * 2)  # should be the same as derivative

-23345850022.0
-23345850022.0


#### Write a gradient descent function that does the following:

* Accepts a numpy feature_matrix 2D array, a 1D output array, an array of initial weights, a step size and a convergence tolerance.
* While not converged updates each feature weight by subtracting the step size times the derivative for that feature given the current weights
* At each step computes the magnitude/length of the gradient (square root of the sum of squared components)
* When the magnitude of the gradient is smaller than the input tolerance returns the final weight vector.

In [118]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear regression weights by gradient descent.

    Parameters
    ----------
    feature_matrix : 2D numpy array
        One row per observation, one column per weight.
    output : 1D numpy array
        Observed output for each row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights, one per column. Copied, never modified in place.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        The loop stops once the gradient magnitude falls below this value.
    max_iterations : int or None, optional
        Upper bound on the number of update passes. ``None`` (the default)
        keeps the original behaviour of iterating until convergence.

    Returns
    -------
    numpy array of float
        The weights after the last update pass.

    Raises
    ------
    ValueError
        If the gradient magnitude becomes NaN or infinite, which would
        otherwise make the loop run forever.
    """
    # Force a float copy: with an integer array every fractional update would
    # be silently truncated when written back into weights[i].
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # Errors are computed once per pass, so every weight in this pass is
        # updated from the same (pre-update) predictions.
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output
        gradient_sum_squares = 0.0

        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i];
            # arguments follow feature_derivative's (errors, feature) order.
            derivative = feature_derivative(errors, feature_matrix[:, i])
            # accumulate the squared derivative for the gradient magnitude
            gradient_sum_squares += derivative ** 2
            # move the weight against its derivative
            weights[i] -= step_size * derivative
        iterations += 1

        # gradient magnitude = square root of the sum of squared derivatives
        gradient_magnitude = sqrt(gradient_sum_squares)
        if not np.isfinite(gradient_magnitude):
            # NaN/inf never compares below the tolerance, so bail out instead
            # of looping forever.
            raise ValueError("gradient descent diverged (non-finite gradient magnitude); "
                             "try a smaller step_size")
        if gradient_magnitude < tolerance:
            break
    return weights

In [119]:
# Randomly split the sales data into a training set (fraction 0.8) and a test
# set; the fixed seed makes the split repeatable.
(train_data, test_data) = sales.random_split(0.8, seed=0)

In [120]:
# Model 1: estimate the intercept and slope for predicting 'price' from
# 'sqft_living' only, using the training data.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient descent settings for this model
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

simple_weights = regression_gradient_descent(
    simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [167]:
# Weight for sqft_living: the second element of simple_weights, rounded to
# 1 decimal place.
print(round(simple_weights[1], 1))

281.9


In [160]:
# Build the feature matrix and output for test_data with the same features as
# model 1, then predict the house prices for all of the test data using
# simple_weights.
test_simple_feature_matrix, test_output = get_numpy_data(
    test_data, simple_features, my_output)
predict_simple_features = predict_outcome(
    test_simple_feature_matrix, simple_weights)

In [171]:
# Model 1's predicted price for the 1st house in the test data, rounded to the
# nearest dollar.
print(round(predict_simple_features[0], 0))

356134.0


In [151]:
# Residual sum of squares (RSS) of model 1 over all of the test data:
# the sum of the squared differences between observed and predicted prices.
residual = test_output - predict_simple_features
RSS = np.sum(residual * residual)
print(RSS)

2.75400047593e+14


In [153]:
# Model 2: run gradient descent with 'sqft_living' and 'sqft_living15' as
# features, plus an intercept, using the parameters below.
# Note: 'sqft_living15' is the average square feet of the nearest 15
# neighbouring houses.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Gradient descent settings for this model
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

multiple_regression_weights = regression_gradient_descent(
    feature_matrix, output, initial_weights, step_size, tolerance)

In [155]:
# Predict the price of every house in the TEST data with the weights of the
# second model (features 'sqft_living' and 'sqft_living15').
test_multiple_feature_matrix, test_output = get_numpy_data(
    test_data, model_features, my_output)
predict_multiple_features = predict_outcome(
    test_multiple_feature_matrix, multiple_regression_weights)

In [173]:
#What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?
#What is the actual price for the 1st house in the Test data set?
#Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2?

print "predicted price model 1:",round(predict_simple_features[0],0)
print "predicted price model 2:", round(predict_multiple_features[0],)
print "actual price:", sales['price'][0]

predicted price model 1: 356134.0
predicted price model 2: 366651.0
actual price: 221900.0


In [174]:
#compute RSS on all test data for the second model.
#Which model (1 or 2) has lowest RSS on all of the TEST data?

residual2 = test_output-predict_multiple_features
RSS2 = (residual2*residual2).sum()
print "RSS for model 1", RSS
print "RSS for model 2", RSS2

RSS for model 1 2.75400047593e+14
RSS for model 2 2.70263446465e+14
