In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column name -> dtype mapping handed to pd.read_csv so every column is parsed
# with a fixed, predictable type instead of relying on pandas' inference.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Read the training split first, then the test split, both parsed with the
# column types declared in dtype_dict.
train_data, test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [4]:
# Inspect what selecting a single column returns (the cell output shows a pandas Series).
type(train_data['sqft_living'])

pandas.core.series.Series

In [5]:
def get_numpy_data(data,features,output):
    """Convert selected DataFrame columns into a design matrix and a target array.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is NOT modified: the intercept column is added to a
        copy, so calling this function repeatedly (or on a frame shown in an
        earlier cell) leaves the caller's data untouched.
    features : sequence of str
        Names of the feature columns (list, tuple, ...). An intercept column
        named 'constant' is always placed in front of them.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array) : (numpy.matrix, numpy.ndarray)
        feature_matrix has one row per observation; column 0 is all ones and
        the remaining columns follow the order of `features`.
        output_array is the 1-D array of target values.
    """
    # list(...) lets callers pass a tuple or any other sequence of names,
    # where `['constant'] + features` would raise for non-list input.
    columns = ['constant'] + list(features)

    # assign() returns a new frame, so the caller's DataFrame never gains
    # (or has overwritten) a 'constant' column as a hidden side effect.
    with_intercept = data.assign(constant=1)

    # np.matrix is kept as the return type because the rest of the notebook
    # (and its recorded outputs) were produced with it.
    feature_matrix = np.matrix(with_intercept[columns])
    output_array = np.array(with_intercept[output])
    return(feature_matrix,output_array)

In [6]:
# Build the design matrix and the price vector for a single-feature model.
# The feature name is wrapped in [] because a list of names is expected.
example_features, example_output = get_numpy_data(train_data, ['sqft_living'], 'price')

# First row of the matrix (':' selects all columns) ...
print(example_features[0, :])
# ... and the output value belonging to that row.
print(example_output[0])

[[  1.00000000e+00   1.18000000e+03]]
221900.0


In [29]:
# Example weights: one for the intercept column, one for the feature.
my_weights = np.array([1.0, 1.0])
# Use the first data point (row 0, all columns).
my_features = example_features[0, :]
# Dot product of the row with the weights, flattened to a plain scalar array.
predicted_value = np.asarray(np.dot(my_features, my_weights)).squeeze()
print(predicted_value)

1181.0


In [25]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions feature_matrix . weights.

    The dot product is converted to a plain ndarray and squeezed, so
    length-1 axes (e.g. the extra axis an np.matrix result carries) are
    dropped from the returned array.
    """
    raw_product = np.dot(feature_matrix, weights)
    return np.asarray(raw_product).squeeze()

In [26]:
# Predictions for every row of the example matrix using the example weights.
test_predictions = predict_output(example_features, my_weights)

In [31]:
# Show the first two predictions, one per line.
print(test_predictions[0], test_predictions[1], sep='\n')

1181.0
2571.0


In [32]:
def feature_derivative(errors,features):
    """Return twice the dot product of `features` with `errors`.

    This is the partial derivative of the residual sum of squares with
    respect to the weight attached to `features`. The value is returned as
    a squeezed ndarray (0-d for 1-D inputs).
    """
    inner_product = np.dot(features, errors)
    return np.squeeze(np.asarray(2 * inner_product))

In [34]:
# Rebuild the single-feature matrix, then predict with all-zero weights so
# that every prediction is 0.
example_features, example_output = get_numpy_data(train_data, ['sqft_living'], 'price')
my_weights = [0.0, 0.0]
test_predictions = predict_output(example_features, my_weights)

In [35]:
# Residuals: predicted value minus actual output, element-wise.
errors = test_predictions - example_output

In [36]:
# Display the residuals as the cell output.
errors

array([-221900., -538000., -180000., ..., -360000., -400000., -325000.])

In [42]:
# Column 0 of the design matrix (the intercept column), flattened to a 1-D array.
features = np.squeeze(np.asarray(example_features[:, 0]))

In [44]:
derivative = feature_derivative(errors, features)
print(derivative)
# Reference value: minus twice the sum of the outputs. The two printed
# numbers should be equal.
print(-2 * example_output.sum())

-18752698920.0
-18752698920.0


In [45]:
from math import sqrt

In [61]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Each iteration computes the residuals with the current weights, then
    updates every weight using the derivative of the residual sum of squares
    with respect to that weight. Iteration stops once the gradient magnitude
    drops below `tolerance`.

    Parameters
    ----------
    feature_matrix : 2-D array-like or numpy.matrix
        One row per observation, one column per weight.
    output : 1-D array-like
        Observed target values, one per row of `feature_matrix`.
    initial_weights : sequence of numbers
        Starting weights. The sequence is copied, never modified.
    step_size : float
        Multiplier applied to each partial derivative in the update.
    tolerance : float
        Convergence threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of iterations. None (the default) keeps
        the original behaviour of iterating until convergence.

    Returns
    -------
    numpy.ndarray
        The fitted weights (float dtype).

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN or infinite, which means the
        iteration has diverged (typically `step_size` is too large).
    """
    # Force a float copy: with integer initial weights np.array() would keep an
    # integer dtype and every in-place update below would be silently truncated.
    weights = np.array(initial_weights, dtype=float)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # Residuals are computed once per iteration, before any weight is
        # touched, so all weights are updated from the same gradient.
        predictions = predict_output(feature_matrix, weights)
        errors = predictions - output
        gradient_sum_of_squares = 0.0

        for i in range(len(weights)):
            feature_column = np.squeeze(np.asarray(feature_matrix[:, i]))
            partial = feature_derivative(errors, feature_column)
            gradient_sum_of_squares += partial ** 2
            weights[i] -= step_size * partial

        gradient_magnitude = sqrt(gradient_sum_of_squares)

        # A NaN/inf gradient can never satisfy `< tolerance`; without this
        # check a diverging run would loop forever.
        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size"
            )
        if gradient_magnitude < tolerance:
            break
        iterations += 1

    return(weights)

In [62]:
# Data for the simple one-feature model: price explained by sqft_living.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings: starting weights (intercept, slope), step size
# and the gradient-magnitude threshold used as the stopping criterion.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [63]:
# Fit the simple model and display the learned weights as the cell output.
we = regression_gradient_descent(
    feature_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)
we

array([-46999.88716555,    281.91211918])