In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [48]:
dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int,
                  'sqft_living15': float, 'grade': int, 'yr_renovated': int,
                  'price': float, 'bedrooms': float, 'zipcode': str,
                  'long': float, 'sqft_lot15': float, 'sqft_living': float,
                  'floors': str, 'condition': int, 'lat': float, 'date': str,
                  'sqft_basement': int, 'yr_built': int, 'id': str,
                  'sqft_lot': int, 'view': int}

In [49]:
train = pd.read_csv("kc_house_train_data.csv", dtype=dtype_dict)
test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

### Setup Helper Functions

In [51]:
def get_numpy_data(dataset, features, output):
    feature_matrix = np.ones((dataset.shape[0], len(features)+1))
    for f, i in zip(features, range(len(features))):
        feature_matrix[:, i+1] = dataset[f]
    output_array = np.array(dataset[output])
    return feature_matrix, output_array

In [22]:
#### predict outcome dot prod between matrix and weights
def predict_outcome(features_matrix, weights) :
    return (np.dot(features_matrix, weights))

In [25]:
# dot product errors and orginal independent variables squared
def feature_derivative(errors, features):
    return 2 * np.dot(features, errors)

In [63]:
# create regregsssion gradients descent to find minima
def gradient_descent(feature_matrix, outcome, intital_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - outcome
        gradient_sum_squares = 0
        for i in range(len(weights)):
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative**2
            weights[i] = weights[i] - step_size*derivative
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return weights

### Run for Simple Model

In [44]:
simple_feature = ['sqft_living']
outcome = 'price'
simple_feature_matrix, outcome = get_numpy_data(train, simple_feature, outcome)
intital_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [45]:
simple_weights = gradient_descent(simple_feature_matrix, outcome, intital_weights, step_size, tolerance)

test_simple_feature_matrix, test_outcome = get_numpy_data(test, ['sqft_living'], 'price')

test_preds = predict_outcome(test_simple_feature_matrix, simple_weights)

In [93]:
print "Simple Weight:" ,simple_weights[1]

Simple Weight: 281.912119175


In [92]:
print "Predicted Ouctome:", test_preds[0]
print "Actual Outcome:", outcome[0]

Predicted Ouctome: 356134.443255
Actual Outcome: 221900.0


### Run More Complex Model

In [70]:
mod_features = ['sqft_living', 'sqft_living15']
outcome = 'price'
initial_weights = [-100000.0, 1.0, 1.0]
step_size = 4e-12
tolerance = 1e9

In [71]:
feature_matrix, outcome = get_numpy_data(train, mod_features, outcome)

comp_weights = gradient_descent(feature_matrix, outcome, intital_weights, step_size, tolerance)

test_feature_matrix, test_outcome = get_numpy_data(test, ['sqft_living', 'sqft_living15'], 'price')

preds = predict_outcome(test_feature_matrix, comp_weights)

In [87]:
#comp_weights

In [91]:
print "1st Predicted Outcome :" , preds[0]
print "Real Outcome:", outcome[0]

1st Predicted Outcome : 366651.411629
Real Outcome: 221900.0


In [90]:
print "Compare RSS"
print "Mod 1 RSS:", np.sum((test_preds - test_outcome)**2)
print "Mod 2 RSS :", np.sum((preds - test_outcome)**2)

Compare RSS
Mod 1 RSS: 2.75400044902e+14
Mod 2 RSS : 2.7026344363e+14


('blah', '\n')


In [74]:
def gradient_descent(X,y, weights, step_size, tolerance):
    converged = False
    while not converged:
        preds = np.dot(X, weights)
        err = preds - y
        gradient_sum_squares = 0
        # while not converged update weigh
        for i in range(len(weights)):
            deriv = np.dot(X.T[i] , weights[i])
            weights = weights[i] - (step_size * deriv)
            gradient_sum_squares += deriv**2
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance :
                converged = True
    return weights

In [None]:
def gradient_descent(mat, depvar, weights, step_size, tolerance):
    converged = False
    weights = np.array(weights)
    while not converged:
        preds = predict_outcome(mat, weights)
        err = preds - depvar
        
        gradient_sum_squares = 0
        # while not converged update weight
        for i in range(len(weights)):
            deriv = invar_derivative(weights[:,i])
            gradient_sum_squares.append(derive)
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance :
            converged = True
    return(weights)                

# http://pythonfiddle.com/gradient-descent/

In [12]:
def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    """
    
    m = len(values)
    sse = np.square(np.dot(features, theta) - values).sum()
    cost = sse / (2*m)

    return cost

def gradient_descent(features, values, theta, alpha, num_iters):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    """
    
    m = len(values)
    cost_history = []

    for i in range( num_iters):
        predicted_values = np.dot(features, theta)
        theta = theta + alpha/m * np.dot((values - predicted_values), features)
        cost_history.append(compute_cost(features, values, theta))

    return theta, pd.Series(cost_history)

In [52]:
## run on instial data set
invar = ['sqft_living']
(X, y) = get_numpy_array(train,invar)
theta = np.zeros(len(X.T))
alpha =  7e-12
num_iters = 10000

In [53]:
theta_out, cost_hist  = gradient_descent(X,y, theta,alpha, num_iters )

In [54]:
theta_out

array([  3.15856075e-02,   7.99433422e+01])

In [55]:
# run on test data
(t_X, t_y) = get_numpy_array(test,invar)
coefs = t_X *theta_out
t_preds =coefs[:,0] + coefs[:,1]
t_preds[0]

114319.01096045914

In [56]:
np.sum((t_y - t_preds)**2)

1007424041956315.5

In [69]:
# fit another model
invar = ['sqft_living','sqft_living15']
(X, y) = get_numpy_array(train,invar)
theta = np.zeros(len(X.T))
alpha = 4e-12
num_iters = 100000

In [70]:
theta_out, cost_hist  = gradient_descent(X,y, theta,alpha, num_iters )

In [71]:
theta_out

array([  5.05851290e-02,   1.46030346e+02,   1.21159510e+02])

In [72]:
# run on test data
(t_X, t_y) = get_numpy_array(test,invar)
coefs = t_X *theta_out
t_preds =coefs[:,0] + coefs[:,1]
t_preds[0]

208823.4459724426

In [73]:
np.sum((t_y - t_preds)**2)

575131210799517.25

In [74]:
test.price[0]

310000.0

In [75]:
1007424041956315.5 >575131210799517.25

True

In [76]:
5 > 2

True