# Simple Linear Regression

## Import and load the data

In [17]:
# graphlab is used throughout this notebook; import it here so the notebook
# runs on a fresh kernel (Restart & Run All) instead of raising NameError.
import graphlab

# Load the house sales data from the local SFrame directory.
sales = graphlab.SFrame('kc_house_data.gl/')

In [18]:
# Peek at the first row to see which columns are available.
sales.head(1)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0


## Split data

In [15]:
# Split with fraction 0.8; the seed is fixed so the split is reproducible
# across runs.
train_data, test_data = sales.random_split(0.8, seed=0)

## Linear Regression

In [19]:
def simple_linear_regression(x, y):
    """Fit ``y ~ intercept + slope * x`` with the closed-form least-squares solution.

    Args:
        x: Input feature values. Must support elementwise ``*``, ``len()``,
            ``.sum()`` and ``.mean()`` (e.g. a graphlab SArray or a numpy array).
        y: Target values, aligned elementwise with ``x``.

    Returns:
        Tuple ``(intercept, slope)``.

    Note:
        If every value in ``x`` is identical the denominator is zero; that
        case is not guarded here and the division will fail or be undefined.
    """
    # Use a float count so the divisions below are true divisions even under
    # Python 2 with integer data, where int / int silently truncates.
    n = float(len(x))

    sum_x = x.sum()
    sum_y = y.sum()
    sum_xy = (x * y).sum()
    sum_xx = (x * x).sum()

    # slope = (sum(xy) - sum(x)*sum(y)/n) / (sum(x^2) - sum(x)^2/n)
    numerator = sum_xy - (sum_x * sum_y) / n
    denominator = sum_xx - (sum_x * sum_x) / n
    slope = numerator / denominator

    # The fitted line passes through the point (mean(x), mean(y)).
    intercept = y.mean() - (slope * x.mean())

    return (intercept, slope)

### Test function

In [20]:
# Sanity check on noise-free data built as y = 1 + 1 * x: the fit should
# recover an intercept of 1 and a slope of 1.
test_feature = graphlab.SArray(range(5))
test_output = graphlab.SArray(1 + 1 * test_feature)
test_intercept, test_slope = simple_linear_regression(test_feature, test_output)

print(test_feature)
print(test_output)
print("Intercept: " + str(test_intercept))
print("Slope: " + str(test_slope))

[0L, 1L, 2L, 3L, 4L]
[1L, 2L, 3L, 4L, 5L]
Intercept: 1.0
Slope: 1


In [21]:
# Fit price against sqft_living on the training split.
sqft_intercept, sqft_slope = simple_linear_regression(
    train_data['sqft_living'],
    train_data['price'],
)
print("Intercept: " + str(sqft_intercept))
print("Slope: " + str(sqft_slope))

Intercept: -47116.0765749
Slope: 281.958838568


### Calculate predicted output

In [22]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the line ``intercept + slope * input_feature``.

    Args:
        input_feature: A scalar or an array-like supporting elementwise
            arithmetic.
        intercept: Intercept of the fitted line.
        slope: Slope of the fitted line.

    Returns:
        The predicted value(s), with the same shape as ``input_feature``.
    """
    return intercept + (slope * input_feature)

In [23]:
# Predict the price of a single house from its living area using the
# sqft_living model fitted above.
my_house_sqft = 2650
estimated_price = get_regression_predictions(
    my_house_sqft, sqft_intercept, sqft_slope
)
print("The estimated price for a house with %d squarefeet is $%.2f"
      % (my_house_sqft, estimated_price))

The estimated price for a house with 2650 squarefeet is $700074.85


### Residual sum of squares

In [24]:
def get_residual_sum_of_squares(input_feature, actual_output, intercept, slope):
    """Compute the residual sum of squares (RSS) of a fitted line.

    Args:
        input_feature: Array-like of feature values supporting elementwise
            arithmetic.
        actual_output: Array-like of observed targets, aligned with
            ``input_feature``.
        intercept: Intercept of the fitted line.
        slope: Slope of the fitted line.

    Returns:
        The sum over all rows of ``(actual - predicted) ** 2``.
    """
    # Residual = observed value minus the value the line predicts.
    errors = actual_output - (intercept + (slope * input_feature))
    squared_errors = errors * errors
    return squared_errors.sum()

In [25]:
# RSS of the sqft_living model on the training split.
rss_prices_on_sqft = get_residual_sum_of_squares(
    train_data['sqft_living'],
    train_data['price'],
    sqft_intercept,
    sqft_slope,
)
print('The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft))

The RSS of predicting Prices based on Square Feet is : 1.20191835632e+15


### Predict the square footage of a house from a given price

In [26]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: find the feature value that yields ``output``.

    Solves ``output = intercept + slope * feature`` for ``feature``.

    Args:
        output: Target value(s), a scalar or an array-like supporting
            elementwise arithmetic.
        intercept: Intercept of the fitted line.
        slope: Slope of the fitted line. Must be non-zero; a zero slope
            makes the line non-invertible and the division fails.

    Returns:
        The estimated feature value(s).
    """
    # Coerce the slope to float so the division is a true division even when
    # every argument is an integer under Python 2 (int / int truncates).
    estimated_feature = (output - intercept) / float(slope)
    return estimated_feature

In [27]:
# Invert the sqft_living model: which living area corresponds to this price?
my_house_price = 800000
estimated_squarefeet = inverse_regression_predictions(
    my_house_price, sqft_intercept, sqft_slope
)
print("The estimated squarefeet for a house worth $%.2f is %d"
      % (my_house_price, estimated_squarefeet))

The estimated squarefeet for a house worth $800000.00 is 3004


### House price from number of bedrooms

In [28]:
# Fit price against the number of bedrooms on the training split.
bedrm_intercept, bedrm_slope = simple_linear_regression(
    train_data['bedrooms'],
    train_data['price'],
)
print("Intercept: " + str(bedrm_intercept))
print("Slope: " + str(bedrm_slope))

Intercept: 109473.180469
Slope: 127588.952175


### Evaluate both models on the test data (RSS)

In [29]:
# RSS of the sqft_living model on the held-out test split.
get_residual_sum_of_squares(
    test_data['sqft_living'],
    test_data['price'],
    sqft_intercept,
    sqft_slope,
)

275402936247141.3

In [30]:
# RSS of the bedrooms model on the held-out test split.
get_residual_sum_of_squares(
    test_data['bedrooms'],
    test_data['price'],
    bedrm_intercept,
    bedrm_slope,
)

493364582868287.94