In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [41]:
dataTrain = pd.read_csv('kc_house_train_data.csv')
dataTest = pd.read_csv('kc_house_test_data.csv')

In [42]:
dataTrain.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [60]:
def get_numpy_data(data, features, output):
    features = ['constant'] + features # this is how you combine two lists
    # create a new col in dataframe named 'constant'
    data['constant'] = 1 
    
    output_array = data[output].to_numpy()

    feature_matrix = data[features].to_numpy()

    return(feature_matrix, output_array)

In [61]:
(trainFeature, trainOutput) = get_numpy_data(dataTrain, ['sqft_living'], 'price')

In [62]:
(testFeature, testOutput) = get_numpy_data(dataTest, ['sqft_living'], 'price')

## Predicting output given regression weights

In [63]:
def predict_output(feature_matrix, weights):
    # create the predictions vector by using np.dot()
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [64]:
def feature_derivative(errors, feature):
    # Assume that errors and feature are both numpy arrays of the same length (number of data points)
    # compute twice the dot product of these vectors as 'derivative' and return the value
    derivative = 2*np.dot(errors,feature)
    return(derivative)

In [67]:
errors = predict_output(trainFeature, [0.0,0.0]) - trainOutput

#derivative with respect to which feature?
feature = trainFeature[:,0]

#estimate the derivative
derivative = feature_derivative(errors, feature)



In [68]:
print(derivative)
print(-np.sum(trainOutput)*2) # should be the same as derivative

-18752698920.0
-18752698920.0


## Gradient Descent

In [97]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False 
    while not converged:
        # compute the predictions based on feature_matrix and weights using your predict_output() function

        # compute the errors as predictions - output
        errors = predict_output(feature_matrix, initial_weights) - output
        
        gradient_sum_squares = 0 # initialize the gradient sum of squares
        
        # while we haven't reached the tolerance yet, update each feature's weight
        for i in range(len(initial_weights)): # loop over each weight
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = feature_derivative(errors, feature_matrix[:,i])

            # add the squared value of the derivative to the gradient sum of squares (for assessing convergence)
            gradient_sum_squares+=derivative**2
            # subtract the step size times the derivative from the current weight
            initial_weights = initial_weights - step_size*derivative
            
        # compute the square-root of the gradient sum of squares to get the gradient magnitude:
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(initial_weights)

In [98]:
dataTrain = pd.read_csv('kc_house_train_data.csv')
dataTest = pd.read_csv('kc_house_test_data.csv')

In [99]:
simple_features = ['sqft_living']
my_output = 'price'

(trainFeature, trainOutput) = get_numpy_data(dataTrain, simple_features, my_output)
(testFeature, testOutput) = get_numpy_data(dataTest, ['sqft_living'], 'price')

initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

weights = regression_gradient_descent(trainFeature, trainOutput, initial_weights, step_size, tolerance)

In [100]:
weights #answer to first quiz question

array([-46719.20069412,    281.79930588])

In [102]:
predict_output(testFeature, weights) #answer to second quiz question

array([356253.8067172 , 784588.75165791, 435157.61236417, ...,
       663415.05012863, 604237.1958934 , 240716.09130556])

## Running Multiple Regression

In [103]:
model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
my_output = 'price'

(trainFeature, trainOutput) = get_numpy_data(dataTrain, model_features, my_output)
(testFeature, testOutput) = get_numpy_data(dataTest, model_features, my_output)

initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

weights = regression_gradient_descent(trainFeature, trainOutput, initial_weights, step_size, tolerance)
weights

  after removing the cwd from sys.path.


KeyboardInterrupt: 

In [None]:
predict_output(testFeature, weights) #answer to third quiz question

In [None]:
testOutput #looking at this you can decide which model of the above predicted the house of first price better

In [None]:
rss = sum((predictedValues-outcome)**2) #edit predictedValues variable to get the best RSS among both