In [1]:
import graphlab

In [115]:
sales = graphlab.SFrame('kc_house_data.gl/')

In [3]:
sales.head(2)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0


In [4]:
import numpy as np

In [5]:
def get_numpy_data(data_sframe,features,output):
    # add a constant column to an SFrame 
    data_sframe['constant'] =1 
    features = ['constant'] + features 
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_sframe =data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray=data_sframe[output]
    # this will convert the SArray into a numpy array
    output_array = output_sarray.to_numpy()
    return(features_matrix,output_array)


In [None]:
#For testing let's use the 'sqft_living' feature and a constant as our features and price as our output:

In [6]:
example_features,example_output = get_numpy_data(sales,['sqft_living'],'price')

In [25]:
example_features,example_output

(array([[  1.00000000e+00,   1.18000000e+03],
        [  1.00000000e+00,   2.57000000e+03],
        [  1.00000000e+00,   7.70000000e+02],
        ..., 
        [  1.00000000e+00,   1.02000000e+03],
        [  1.00000000e+00,   1.60000000e+03],
        [  1.00000000e+00,   1.02000000e+03]]),
 array([ 221900.,  538000.,  180000., ...,  402101.,  400000.,  325000.]))

In [None]:
'''
Predicting output given regression weights

Suppose we had the weights [1.0, 1.0] and the features [1.0, 1180.0] and 
we wanted to compute the predicted output 1.0*1.0 + 1.0*1180.0 = 1181.0 this is the dot product between these two arrays. 
If they're numpy arrayws we can use np.dot() to compute this:
'''

In [29]:
my_weights = np.array([1,1])
my_featurs = example_features[0,]
predicted_value = np.dot(my_weights,my_featurs)


In [30]:
my_weights,my_featurs,predicted_value
 

(array([1, 1]), array([  1.00000000e+00,   1.18000000e+03]), 1181.0)

In [10]:
def predict_outcome(feature_matrix,weights):
    predictions = np.dot(feature_matrix,weights)
    return(predictions)

In [31]:
# test the code 
test_predictions = predict_outcome(example_features, my_weights)


In [32]:
example_features, my_weights

(array([[  1.00000000e+00,   1.18000000e+03],
        [  1.00000000e+00,   2.57000000e+03],
        [  1.00000000e+00,   7.70000000e+02],
        ..., 
        [  1.00000000e+00,   1.02000000e+03],
        [  1.00000000e+00,   1.60000000e+03],
        [  1.00000000e+00,   1.02000000e+03]]), array([1, 1]))

In [12]:
test_predictions

array([ 1181.,  2571.,   771., ...,  1021.,  1601.,  1021.])

In [13]:
def feature_derivative(errors,feature):
    derivative =2*np.dot(feature,errors)
    return(derivative)

In [15]:
#To test your feature derivartive run the following:
(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') 
my_weights = np.array([0,0])
test_predictions =predict_outcome(example_features,my_weights)

#here my_weights is 0 so test_predictions are 0's 
# just like SFrames 2 numpy arrays can be elementwise subtracted with '-': 

errors = test_predictions - example_output
# o/p - array([-221900., -538000., -180000., ..., -402101., -400000., -325000.])

# let's compute the derivative with respect to 'constant', the ":" indicates "all rows"
feature = example_features[:,0]
#feature has array([ 1.,  1.,  1., ...,  1.,  1.,  1.])
derivative = feature_derivative(errors, feature)


In [16]:
#-np.sum(example_output)*2  # should be the same as derivative 
derivative, -np.sum(example_output)*2

(-23345850022.0, -23345850022.0)

In [18]:
# let's compute the derivative with respect to '1st feature', the ":" indicates "all rows" ,
#1st feature w0 is all const. ie 1's

feature = example_features[:,1]
derivative_new = feature_derivative(errors, feature)

In [22]:
example_features

array([[  1.00000000e+00,   1.18000000e+03],
       [  1.00000000e+00,   2.57000000e+03],
       [  1.00000000e+00,   7.70000000e+02],
       ..., 
       [  1.00000000e+00,   1.02000000e+03],
       [  1.00000000e+00,   1.60000000e+03],
       [  1.00000000e+00,   1.02000000e+03]])

In [20]:
derivative_new

-58788815060496.0

In [21]:
#Gradient Descent 
from math import sqrt

In [114]:
def regression_gradient_descent(feature_matrix,output , intial_weights,
                               step_size,tolerance ):
    weights = np.array(initial_weights)
    converged = False
    while not converged:
        # compute the predictions based on feature_matrix and weights using your predict_output() function
        predictions =predict_outcome(feature_matrix,weights)
        
        # compute the erros as predictions - output
        errors = predictions - output
        
        #initialise the gradient
        gradient_sum_squares = 0.0
        # while we haven't reached the tolerance yet, update each feature's weight
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            feature = feature_matrix[:,i]
            derivative =feature_derivative(errors, feature)
            # add the squared value of the derivative to the gradient sum of squares (for assessing convergence)
            gradient_sum_squares= gradient_sum_squares + (derivative*derivative)
            
            # subtract the step size times the derivative from the current weight
            weights[i]=weights[i]- (step_size*derivative)
            
            # compute the square-root of the gradient sum of squares to get the gradient magnitude:
            gradient_magnitude = sqrt(gradient_sum_squares)
            if gradient_magnitude < tolerance: 
                converged = True
    return(weights)
                
                
            

In [116]:
# Running the Gradient Descent as Simple Regression
train_data,test_data = sales.random_split(.8,seed=0)

In [117]:
# let's test out the gradient descent
simple_features = ['sqft_living']
my_output ='price'
(simple_feature_matrix, output) = get_numpy_data(train_data,simple_features,my_output)
initial_weights =np.array([-47000.,1.])
step_size = 7e-12
tolerance = 2.5e7

In [119]:
regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)


array([-46999.88717231,    281.82599715])

In [106]:
step_size,tolerance,initial_weights, simple_feature_matrix , output

(7e-12,
 25000000.0,
 array([ -4.70000000e+04,   1.00000000e+00]),
 array([[  1.00000000e+00,   1.18000000e+03],
        [  1.00000000e+00,   2.57000000e+03],
        [  1.00000000e+00,   7.70000000e+02],
        ..., 
        [  1.00000000e+00,   1.53000000e+03],
        [  1.00000000e+00,   1.60000000e+03],
        [  1.00000000e+00,   1.02000000e+03]]),
 array([ 221900.,  538000.,  180000., ...,  360000.,  400000.,  325000.]))

In [107]:
#Use these parameters to estimate the slope and intercept for predicting prices based only on ‘sqft_living’.

simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,  
                                             tolerance)

In [108]:
simple_weights

array([-46999.88717231,    281.82599715])

In [None]:
#What is the value of the weight for sqft_living -- the second element of ‘simple_weights’ (rounded to 1 decimal place)?
#281.82599715 

In [None]:
'''
Now build a corresponding ‘test_simple_feature_matrix’ and ‘test_output’ using test_data. 
Using ‘test_simple_feature_matrix’ and ‘simple_weights’ compute the predicted house prices on all the test data.

'''

In [96]:
test_simple_feature = ['sqft_living']
test_output ='price'
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data,test_simple_feature,test_output)
initial_weights =np.array([-47000.,1.])
step_size = 7e-12
tolerance = 2.5e7

In [103]:
step_size,tolerance,test_simple_feature_matrix, test_output

(7e-12, 25000000.0, array([[  1.00000000e+00,   1.43000000e+03],
        [  1.00000000e+00,   2.95000000e+03],
        [  1.00000000e+00,   1.71000000e+03],
        ..., 
        [  1.00000000e+00,   2.52000000e+03],
        [  1.00000000e+00,   2.31000000e+03],
        [  1.00000000e+00,   1.02000000e+03]]), array([ 310000.,  650000.,  233000., ...,  610685.,  400000.,  402101.]))

In [109]:
my_predictions = predict_outcome(test_simple_feature_matrix,simple_weights)


In [110]:
#What is the predicted price for the 1st house in the Test data set for model 1 (round to nearest dollar)?
my_predictions
#356011.28875178    #356134.44317605

array([ 356011.28875178,  784386.80441934,  434922.5679537 , ...,
        663201.62564497,  604018.16624353,  240462.6299204 ])

In [75]:
#actual price 
test_data['price'][0]

310000.0

In [53]:
test_weights

array([-46999.88122802,    282.35198883])

In [90]:
'''
Now we will use the gradient descent to fit a model with more than 1 predictor variable (and an intercept). 
Use the following parameters
'''
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [93]:
feature_matrix, output

(array([[  1.00000000e+00,   1.18000000e+03,   1.34000000e+03],
        [  1.00000000e+00,   2.57000000e+03,   1.69000000e+03],
        [  1.00000000e+00,   7.70000000e+02,   2.72000000e+03],
        ..., 
        [  1.00000000e+00,   1.53000000e+03,   1.53000000e+03],
        [  1.00000000e+00,   1.60000000e+03,   1.41000000e+03],
        [  1.00000000e+00,   1.02000000e+03,   1.02000000e+03]]),
 array([ 221900.,  538000.,  180000., ...,  360000.,  400000.,  325000.]))

In [94]:
#Run gradient descent on a model with ‘sqft_living’ and ‘sqft_living_15’ as well as an intercept with the above parameters. 
#Save the resulting regression weights.
new_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size,  
                                             tolerance)


In [95]:
new_weights

array([-99999.93584555,    170.91513236,    142.75832416])

In [58]:
'''
Use the regression weights from this second model (using sqft_living and sqft_living_15) and
predict the outcome of all the house prices on the TEST data.
'''
(feature_matrix_test, output_test) = get_numpy_data(test_data, model_features,my_output)


In [59]:
feature_matrix_test, output_test

(array([[  1.00000000e+00,   1.43000000e+03,   1.78000000e+03],
        [  1.00000000e+00,   2.95000000e+03,   2.14000000e+03],
        [  1.00000000e+00,   1.71000000e+03,   1.03000000e+03],
        ..., 
        [  1.00000000e+00,   2.52000000e+03,   2.52000000e+03],
        [  1.00000000e+00,   2.31000000e+03,   1.83000000e+03],
        [  1.00000000e+00,   1.02000000e+03,   1.02000000e+03]]),
 array([ 310000.,  650000.,  233000., ...,  610685.,  400000.,  402101.]))

In [61]:
test_data['price'][0]

310000.0

In [76]:
my_predictions_new = predict_outcome(feature_matrix_test, new_weights)


In [77]:
#What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?

my_predictions_new
#398518.52043672

array([ 398518.52043672,  709702.51832772,  339306.01438045, ...,
        690457.1745909 ,  556061.75312545,  219946.9898073 ])

In [78]:
#What is the actual price for the 1st house in the Test data set?

test_data['price'][0]


310000.0

In [None]:
#Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2?
# Model 1 - 356011.28875178 , Model 2 - 398518.52043672

In [80]:
#compute RSS  for model 1
features_model_1=['sqft_living']
test_data_model_1=graphlab.linear_regression.create(test_data,target='price',
                                                        features=features_model_1,validation_set=None)

In [81]:
test_data_model_1.show()
'''
Residual sum of squares	275168576560123.78
Training RMSE	255082.4479
'''

Canvas is accessible via web browser at the URL: http://localhost:50746/index.html
Opening Canvas in default web browser.


In [82]:
#compute RSS 
features_model_2=['sqft_living','sqft_living15']
test_data_model_2=graphlab.linear_regression.create(test_data,target='price',
                                                        features=features_model_2,validation_set=None)

In [83]:
test_data_model_2.show()
'''
Residual sum of squares	269847752918107.7
Training RMSE	252604.196
'''

Canvas is accessible via web browser at the URL: http://localhost:50746/index.html
Opening Canvas in default web browser.


In [None]:
'''
Which model (1 or 2) has lowest RSS on all of the TEST data?

Model 2 
'''
