In [47]:
import graphlab
import numpy as np
import math


In [48]:
# Load the house-sales SFrame from local disk.
# NOTE(review): absolute, machine-specific path -- adjust before re-running elsewhere.
sales = graphlab.SFrame('/home/ramak/kc_house_data.gl/')


In [49]:
def get_numpy_data(data_sframe, features, output):
    """Turn selected SFrame columns into NumPy inputs for regression.

    Parameters
    ----------
    data_sframe : SFrame
        Table holding the feature columns and the target column. It is
        left unmodified: the intercept column is added to a copy.
    features : sequence of str
        Names of the feature columns, in the order they should appear in
        the matrix (after the intercept column).
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array)
        ``features_matrix`` has a leading column of ones named 'constant'
        followed by one column per entry of ``features``;
        ``output_array`` holds the values of the ``output`` column.

    Side effects
    ------------
    Prints the selected feature frame (the notebook relies on this preview).
    """
    # Accept any sequence (list, tuple, ...) and put the intercept first.
    feature_names = ['constant'] + list(features)

    # Work on a shallow copy so that adding the intercept column does not
    # leak a 'constant' column into the caller's SFrame.
    working_sframe = data_sframe.copy()
    working_sframe['constant'] = 1

    features_sframe = working_sframe[feature_names]
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(features_sframe)

    # SFrame/SArray.to_numpy() requires GraphLab Create >= 1.7.
    features_matrix = features_sframe.to_numpy()
    output_array = working_sframe[output].to_numpy()
    return (features_matrix, output_array)


In [50]:
# Design matrix (intercept + sqft_living) and price vector for the full data set.
features_matrix, output_array = get_numpy_data(sales, ["sqft_living"], "price")

+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1180.0   |
|    1     |    2570.0   |
|    1     |    770.0    |
|    1     |    1960.0   |
|    1     |    1680.0   |
|    1     |    5420.0   |
|    1     |    1715.0   |
|    1     |    1060.0   |
|    1     |    1780.0   |
|    1     |    1890.0   |
+----------+-------------+
[21613 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [51]:
# Inspect the NumPy design matrix built from the full data set.
print(features_matrix)



[[  1.00000000e+00   1.18000000e+03]
 [  1.00000000e+00   2.57000000e+03]
 [  1.00000000e+00   7.70000000e+02]
 ..., 
 [  1.00000000e+00   1.02000000e+03]
 [  1.00000000e+00   1.60000000e+03]
 [  1.00000000e+00   1.02000000e+03]]


In [52]:
# Inspect the target (price) vector built from the full data set.
print(output_array)


[ 221900.  538000.  180000. ...,  402101.  400000.  325000.]


In [53]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions for ``feature_matrix``.

    The result is ``np.dot(feature_matrix, weights)``: for a 2-D matrix and
    a 1-D weight vector, one prediction per row.
    """
    return np.dot(feature_matrix, weights)



In [54]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``feature`` and ``errors``.

    With ``errors = predictions - output`` this is the partial derivative of
    the residual sum of squares with respect to the weight of ``feature``.
    """
    return 2 * np.dot(feature, errors)

In [72]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Every iteration computes the errors once from the current weights and
    then moves each weight against its own derivative, so all weights are
    updated from the same error vector.

    Parameters
    ----------
    feature_matrix : 2-D numpy array
        One row per observation, one column per weight.
    output : 1-D array
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : sequence of numbers
        Starting point; it is copied (as floats) and never modified.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        Iteration stops once the gradient magnitude drops below this value.
    max_iterations : int or None, optional
        Upper bound on the number of update passes. ``None`` (default)
        keeps iterating until the tolerance is met.

    Returns
    -------
    numpy array of float
        The weights after the last update pass.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN or infinite, which means the
        descent is diverging and would otherwise loop forever.
    """
    # Force a float copy: with integer initial weights the in-place updates
    # below would be truncated to integers and the descent would stall.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # errors = predictions - output, computed once per pass
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0.0
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i]
            der = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += der * der
            weights[i] -= step_size * der

        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if math.isnan(gradient_magnitude) or math.isinf(gradient_magnitude):
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); "
                "try a smaller step_size")
        if gradient_magnitude < tolerance:
            break
        iterations += 1
    return weights

In [56]:
# Random 80/20 split of the sales data; the fixed seed keeps the split reproducible.
train_data, test_data = sales.random_split(.8, seed=0)

In [57]:
# Single-feature model: price explained by sqft_living plus an intercept.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings for this model: starting weights
# (intercept, sqft_living), step size and gradient-magnitude tolerance.
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
print(initial_weights)



+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1180.0   |
|    1     |    2570.0   |
|    1     |    770.0    |
|    1     |    1960.0   |
|    1     |    1680.0   |
|    1     |    5420.0   |
|    1     |    1715.0   |
|    1     |    1060.0   |
|    1     |    1780.0   |
|    1     |    1890.0   |
+----------+-------------+
[17384 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
[ -4.70000000e+04   1.00000000e+00]


In [73]:
# Fit the single-feature model on the training split.
# NOTE(review): this reads the globals initial_weights / step_size / tolerance.
# Those names are reassigned further down for the two-feature model, so
# re-running this cell afterwards fails with a shape mismatch (2 columns vs
# 3 weights). Re-run the single-feature setup cell first.
simple_weights = regression_gradient_descent(
    simple_feature_matrix, output, initial_weights, step_size, tolerance)

ValueError: shapes (17384,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)

In [59]:
# Quiz answer: fitted (intercept, sqft_living) weights of the single-feature model.
print(simple_weights)

[-46999.88716555    281.91211912]


In [61]:
# Design matrix and price vector of the test split for the single-feature model.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)

+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1430.0   |
|    1     |    2950.0   |
|    1     |    1710.0   |
|    1     |    2320.0   |
|    1     |    1090.0   |
|    1     |    2620.0   |
|    1     |    4220.0   |
|    1     |    2250.0   |
|    1     |    1260.0   |
|    1     |    2750.0   |
+----------+-------------+
[4229 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [62]:
# NOTE(review): this fits a second set of weights on the *test* split. The
# result is only printed in the next cell; the predictions further down use
# simple_weights (fitted on the training split). Confirm this is intentional.
test_simple_weights = regression_gradient_descent(
    test_simple_feature_matrix, test_output, initial_weights, step_size, tolerance)

In [63]:
# Weights obtained when fitting on the test split.
print(test_simple_weights)

[-46999.87880043    282.35945337]


In [64]:
# Predict test-split prices with the weights fitted on the training split.
simple_predictions = predict_outcome(test_simple_feature_matrix, simple_weights)

In [65]:
# Quiz answer: predicted price of the first house in the test split.
print(simple_predictions[0])

356134.443171


In [66]:
# Residuals of the single-feature model on the test split (prediction - actual).
test_errors = simple_predictions - test_output

In [67]:
# Residual sum of squares of the single-feature model on the test split.
RSS = sum(test_errors * test_errors)

In [68]:
# Quiz answer: test-split RSS of the single-feature model.
print(RSS)

2.75400047593e+14


In [69]:
# Two-feature model: price explained by sqft_living and sqft_living15 plus an intercept.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings for this model: starting weights
# (intercept, sqft_living, sqft_living15), step size and tolerance.
# These reuse (and overwrite) the names used by the single-feature model.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

+----------+-------------+---------------+
| constant | sqft_living | sqft_living15 |
+----------+-------------+---------------+
|    1     |    1180.0   |     1340.0    |
|    1     |    2570.0   |     1690.0    |
|    1     |    770.0    |     2720.0    |
|    1     |    1960.0   |     1360.0    |
|    1     |    1680.0   |     1800.0    |
|    1     |    5420.0   |     4760.0    |
|    1     |    1715.0   |     2238.0    |
|    1     |    1060.0   |     1650.0    |
|    1     |    1780.0   |     1780.0    |
|    1     |    1890.0   |     2390.0    |
+----------+-------------+---------------+
[17384 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [70]:
# Fit the two-feature model on the training split.
multiple_regression = regression_gradient_descent(
    feature_matrix, output, initial_weights, step_size, tolerance)

In [71]:
# Fitted (intercept, sqft_living, sqft_living15) weights of the two-feature model.
print(multiple_regression)


[ -9.99999688e+04   2.45072603e+02   6.52795277e+01]


In [39]:
# Build the test-split design matrix for the two-feature model and predict
# prices with the weights fitted on the training split.
test_multiple_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)
multiple_predictions = predict_outcome(test_multiple_feature_matrix, multiple_regression)

+----------+-------------+---------------+
| constant | sqft_living | sqft_living15 |
+----------+-------------+---------------+
|    1     |    1430.0   |     1780.0    |
|    1     |    2950.0   |     2140.0    |
|    1     |    1710.0   |     1030.0    |
|    1     |    2320.0   |     2580.0    |
|    1     |    1090.0   |     1570.0    |
|    1     |    2620.0   |     2620.0    |
|    1     |    4220.0   |     2410.0    |
|    1     |    2250.0   |     2250.0    |
|    1     |    1260.0   |     1290.0    |
|    1     |    2750.0   |     1510.0    |
+----------+-------------+---------------+
[4229 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [40]:
# Quiz answer: two-feature prediction for the first house in the test split.
print(multiple_predictions[0])

366651.412037


In [41]:
# Quiz answer: actual price of the first house in the test split, to compare
# against the model predictions for that house.
test_data['price'][0]  ## Answers

310000.0

In [42]:
# Residuals and residual sum of squares of the two-feature model on the test split.
multiple_test_errors = multiple_predictions - test_output
RSSm = sum(multiple_test_errors * multiple_test_errors)
print(RSSm)

2.70263446465e+14
