In [1]:
import numpy as np
import pprint
import pandas as pd
import math

# Load house sales data (train and test databases)

In [2]:
# Column names, kept for reference:
#   id, date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view,
#   condition, grade, sqft_above, sqft_basement, yr_built, yr_renovated, zipcode, lat, long,
#   sqft_living15, sqft_lot15

# Explicit dtype for every column, handed to pandas so it does not infer them while parsing.
# Note that id, zipcode, date and floors are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# File names of the training split, the test split and the full data set.
train_datafile = 'kc_house_train_data.csv'
test_datafile = 'kc_house_test_data.csv'
sales_datafile = 'kc_house_data.csv'

# Read each CSV with the same column types.
train_data = pd.read_csv(train_datafile, dtype=dtype_dict)
test_data = pd.read_csv(test_datafile, dtype=dtype_dict)
sales_data = pd.read_csv(sales_datafile, dtype=dtype_dict)

# Necessary functions

In [3]:
def get_numpy_data(data_df, features, output):
    """Build the feature matrix (with a leading constant column) and the output array for a regression.

    :inputs:
    data_df - a pandas DataFrame of data
    features - a list of feature names, each referring to a column in data_df
               (a single column name given as a string is also accepted)
    output - a single name referring to the column in data_df that holds the target values

    :return:
    features_matrix - 2-D numpy array with one row per row of data_df; column 0 is all ones
                      (the intercept term), followed by one column per feature in the order given
    output_array - numpy array of the values in the output column
    """

    # A bare string would select a 1-D Series below and break the hstack, so wrap it in a list.
    if isinstance(features, str):
        features = [features]

    # Take the feature columns as a numpy array.
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
    features_matrix = data_df[features].to_numpy()

    # Prepend a column of ones so the first weight acts as the intercept.
    const = np.ones((features_matrix.shape[0], 1))
    features_matrix = np.hstack((const, features_matrix))

    # Extract output_array
    output_array = np.array(data_df[output])

    return features_matrix, output_array

In [4]:
def predict_outcome(feature_matrix, weights):
    """Return the predictions obtained by multiplying a feature matrix by a weight vector.

    For an NxD feature matrix and D weights, the result holds N predictions (one per row).
    """
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [5]:
def feature_derivative(errors, feature):
    """Return twice the dot product of one feature column with the prediction errors.

    NOTE: `feature` is a single column of the feature matrix, not the whole matrix.
    """
    feature_dot_errors = np.dot(feature, errors)
    return 2 * feature_dot_errors

In [6]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, iter_max = 10000, debug=False):
    """Uses gradient descent to calculate the weights for the feature_matrix given the specified outputs.

    :inputs:
    feature_matrix - 2-D array with one row per observation and one column per weight
    output - array of observed values, one per row of feature_matrix
    initial_weights - starting weights, one per column of feature_matrix (never modified)
    step_size - multiplier applied to each derivative when updating a weight
    tolerance - iteration stops once the gradient magnitude drops below this value
    iter_max - upper bound on the number of iterations
    debug - when True, print the intermediate values of every iteration

    :return:
    weights - float64 numpy array of fitted weights. The weights are updated before the
              convergence check, so the step taken on the final iteration is included.

    A progress line ("iter: N") is printed on every iteration regardless of `debug`.
    """
    converged = False
    # np.array (unlike np.asarray) always copies, so the in-place updates below can never
    # write through to a float64 array passed in as initial_weights.
    weights = np.array(initial_weights, dtype=np.float64)
    iterations = 0
    print("Starting iterations")

    while not converged and iterations < iter_max:
        print("iter: {}".format(iterations))
        iterations += 1

        # Compute predictions
        predictions = predict_outcome(feature_matrix, weights)
        if debug:
            print("Predictions:")
            print("Feature matrix: ")
            pprint.pprint(feature_matrix)
            print("weights")
            pprint.pprint(weights)
            print("->predictions: {}".format(predictions))

        # Compute errors (predictions - output). They are fixed for the whole iteration,
        # so updating the weights one at a time below is still a simultaneous update.
        errors = predictions - output
        if debug:
            print("errors     : {}".format(errors))

        gradient_sum_squares = 0  # Running sum of squared derivatives
        # Update the weights individually (could do this as a whole faster)
        for i in range(len(weights)):
            if debug:
                print("i: {}".format(i))
            # feature_matrix[:, i] is the feature column associated with weight i
            derivative = feature_derivative(errors, feature_matrix[:, i])
            if debug:
                print("\tderivative: {}".format(derivative))

            # Add the squared derivative to the gradient magnitude
            gradient_sum_squares += derivative * derivative

            # Step against the derivative: it points in the direction of increasing error,
            # so subtracting it moves the weight towards lower error.
            weights[i] -= step_size * derivative
            if debug:
                print("\tweight[i]: {}".format(weights[i]))

        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if debug:
            print(gradient_magnitude)

        if gradient_magnitude < tolerance:
            converged = True
    return weights

# Solve the assignment

Now we will run the regression_gradient_descent function on some actual data. In particular we will
use the gradient descent to estimate the model from Week 1 using just an intercept and slope. Use the
following parameters:
* features: ‘sqft_living’
* output: ‘price’
* initial weights: -47000, 1 (intercept, sqft_living respectively)
* step_size = 7e-12
* tolerance = 2.5e7

In [7]:
# Model 1 settings: a single feature plus an intercept.
simple_features = ['sqft_living']
output = 'price'
initial_weights = [-47000.0, 1.0]  # [intercept, sqft_living]
step_size = 7.0e-12
tolerance = 2.5e7
iter_max = 200  # upper bound on gradient-descent iterations

In [8]:
# Build the TRAIN feature matrix (constant column + sqft_living) and the price array
simple_features_matrix, output_array = get_numpy_data(train_data, simple_features, output)

# Fit the model 1 weights by gradient descent on the training data, using the settings from the cell above
simple_weights = regression_gradient_descent(simple_features_matrix, output_array, initial_weights, step_size, tolerance, iter_max, debug=False)

Starting iterations
iter: 0
iter: 1
iter: 2
iter: 3
iter: 4
iter: 5
iter: 6
iter: 7
iter: 8
iter: 9
iter: 10
iter: 11


** 9. Quiz Question: What is the value of the weight for sqft_living -- the second element of
‘simple_weights’ (rounded to 1 decimal place)? **

In [9]:
# simple_weights[0] is the intercept (constant column); simple_weights[1] is the sqft_living weight
print("Weight for sqft_living using TRAIN data = {:.1f}".format(simple_weights[1]))

Weight for sqft_living using TRAIN data = 281.9


Now build a corresponding ‘test_simple_feature_matrix’ and ‘test_output’ using test_data. Using
‘test_simple_feature_matrix’ and ‘simple_weights’ compute the predicted house prices on all the test
data.

In [10]:
# Build the feature matrix and output array from the TEST data for the single-feature model.
# The weights fitted on the training data are applied to this matrix in a later cell.
test_simple_features_matrix, test_output_array = get_numpy_data(test_data, simple_features, output)

** Quiz Question: What is the predicted price for the 1st house in the Test data set for model 1
(round to nearest dollar)? **

We do not run the regression again because we use the weights from the TRAINING database and apply those to the features_matrix from the TEST database.

In [11]:
# Apply the weights fitted on the TRAIN data to the TEST feature matrix (no refit)
test_simple_predictions = predict_outcome(test_simple_features_matrix, simple_weights)
# Element 0 is the prediction for the first house in the test data
print("The price of the first house in the TEST data set using the simple model = ${:.0f}".format(test_simple_predictions[0]))

The price of the first house in the TEST data set using the simple model = $356134


Now compute RSS on all test data for this model. Record the value and store it for later

In [12]:
def get_RSS(true_outcome, prediction):
    """Return the residual sum of squares between the true outcomes and the predictions."""
    residuals = true_outcome - prediction
    squared_residuals = residuals ** 2
    return squared_residuals.sum()

In [13]:
# RSS of model 1 (sqft_living only) on the TEST data; kept for the comparison with model 2 later
test_simple_RSS = get_RSS(test_output_array, test_simple_predictions)
print("The RSS for the test data using the simple model = {}".format(test_simple_RSS))


The RSS for the test data using the simple model = 275400044902128.3


Now we will use the gradient descent to get a model with more than 1 predictor variable (and an
intercept). Use the following parameters:
* model features = ‘sqft_living’, ‘sqft_living15’
* output = ‘price’
* initial weights = [-100000, 1, 1] (intercept, sqft_living, and sqft_living15 respectively)
* step size = 4e-12
* tolerance = 1e9

In [14]:
# Model 2 settings: two features plus an intercept.
features = ['sqft_living', 'sqft_living15']
output = 'price'
initial_weights = [-100000.0, 1.0, 1.0]  # [intercept, sqft_living, sqft_living15]
step_size = 4.0e-12
tolerance = 1.0e9
iter_max = 2000  # upper bound on gradient-descent iterations

In [15]:
# Build the TRAIN feature matrix for model 2 (constant column, sqft_living, sqft_living15)
features_matrix, output_array = get_numpy_data(train_data, features, output)
# Fit the model 2 weights by gradient descent; debug is left at its default (False)
weights = regression_gradient_descent(features_matrix, output_array, initial_weights, step_size, tolerance, iter_max)

Starting iterations
iter: 0
iter: 1
iter: 2
iter: 3
iter: 4
iter: 5
iter: 6
iter: 7
iter: 8
iter: 9
iter: 10
iter: 11
iter: 12
iter: 13
iter: 14
iter: 15
iter: 16
iter: 17
iter: 18
iter: 19
iter: 20
iter: 21
iter: 22
iter: 23
iter: 24
iter: 25
iter: 26
iter: 27
iter: 28
iter: 29
iter: 30
iter: 31
iter: 32
iter: 33
iter: 34
iter: 35
iter: 36
iter: 37
iter: 38
iter: 39
iter: 40
iter: 41
iter: 42
iter: 43
iter: 44
iter: 45
iter: 46
iter: 47
iter: 48
iter: 49
iter: 50
iter: 51
iter: 52
iter: 53
iter: 54
iter: 55
iter: 56
iter: 57
iter: 58
iter: 59
iter: 60
iter: 61
iter: 62
iter: 63
iter: 64
iter: 65
iter: 66
iter: 67
iter: 68
iter: 69
iter: 70
iter: 71
iter: 72
iter: 73
iter: 74
iter: 75
iter: 76
iter: 77
iter: 78
iter: 79
iter: 80
iter: 81
iter: 82
iter: 83
iter: 84
iter: 85
iter: 86
iter: 87
iter: 88
iter: 89
iter: 90
iter: 91
iter: 92
iter: 93
iter: 94
iter: 95
iter: 96
iter: 97
iter: 98
iter: 99
iter: 100
iter: 101
iter: 102
iter: 103
iter: 104
iter: 105
iter: 106
iter: 107
iter: 108


In [16]:
# Display the fitted model 2 weights: [intercept, sqft_living, sqft_living15]
weights

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

Use the regression weights from this second model (using sqft_living and sqft_living15) and predict
the outcome of all the house prices on the TEST data

** 15. Quiz Question: What is the predicted price for the 1st house in the TEST data set for model 2
(round to nearest dollar)? **

In [17]:
# Build the feature matrix and output array from the TEST data for model 2
# (sqft_living and sqft_living15); the training weights are applied to it in the next cell
test_features_matrix, test_output_array = get_numpy_data(test_data, features, output)

In [18]:
# Apply the model 2 weights fitted on the TRAIN data to the TEST feature matrix (no refit)
test_predictions = predict_outcome(test_features_matrix, weights)
# Element 0 is the prediction for the first house in the test data
print("The price of the first house in the TEST data set using the more advanced model = ${:.0f}".format(test_predictions[0]))

The price of the first house in the TEST data set using the more advanced model = $366651


** 17. Quiz Question: Which estimate was closer to the true price for the 1st house on the TEST data
set, model 1 or model 2? **

In [19]:
# Show the true price of the first TEST house next to both models' predictions for it
print("Expected price = {}, model 1 price = {}, model 2 price = {}".format(test_output_array[0], test_simple_predictions[0], test_predictions[0]))

Expected price = 310000.0, model 1 price = 356134.4432550024, model 2 price = 366651.4116294939


** 19. Quiz Question: Which model (1 or 2) has lowest RSS on all of the TEST data? **

In [20]:
# RSS of model 2 (sqft_living and sqft_living15) on the TEST data
test_RSS = get_RSS(test_output_array, test_predictions)
# These are model 2's predictions, so the label must not say "simple model"
print("The RSS for the test data using the more advanced model = {}".format(test_RSS))

# Derive the verdict from the two stored RSS values instead of hard-coding it
if test_RSS < test_simple_RSS:
    print("RSS for model 2 is less than that for model 1")
elif test_RSS > test_simple_RSS:
    print("RSS for model 1 is less than that for model 2")
else:
    print("RSS for model 1 and model 2 are equal")

The RSS for the test data using the simple model = 270263443629803.56
RSS for model 2 is less than that for model 1


# Notes
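* The intercept estimated by gradient descent is REALLY bad - the constant basically didn't change throughout the iterations (for model 2 it only moved from -100000 to about -99999.97).  I THINK this is a problem with the method as it is applied here, not the implementation, as I compared to another person's implementation and found similar results.  The likely cause is feature scale: the intercept column is all ones while the square-footage columns are in the thousands, so the step size has to be tiny (around 1e-12) to keep the square-footage weights stable, and at that step size the intercept hardly moves.  Rescaling the features should help.  Looking through the future assignments I think there might be some better methods added in later.
* I think the iterative gradient descent could also be replaced with a matrix solver, couldn't it?  (Yes - least-squares regression has a closed-form solution, the normal equations, and a routine such as `np.linalg.lstsq` solves the same problem directly.)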