# Regression Week 1: Simple Linear Regression

In [2]:
import graphlab as gl

In [11]:
sales = gl.SFrame('kc_house_data.gl/')
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [6]:
train_data,test_data = sales.random_split(.8,seed=0)

### Build a generic simple linear regression function

In [8]:
def simple_linear_regression(input_feature,output):
    mean_input=input_feature.mean();
    mean_output=output.mean();
    mean_input_output=(input_feature*output).mean();
    mean_squared_input=(input_feature*input_feature).mean();
    numerator=mean_input_output-(mean_input*mean_output);
    denominator=mean_squared_input-(mean_input*mean_input);
    slope=numerator/denominator;
    intercept=mean_output-(slope*mean_input);
    return(intercept,slope)

In [10]:
# Testing the function
test_feature = gl.SArray(range(5))
test_output = gl.SArray(1 + 1*test_feature)
(test_intercept, test_slope) =  simple_linear_regression(test_feature, test_output)
print "Intercept: " + str(test_intercept)
print "Slope: " + str(test_slope)

Intercept: 1.0
Slope: 1.0


### Build a regression model for predicting price

In [12]:
sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])
print "Intercept: " + str(sqft_intercept)
print "Slope: " + str(sqft_slope)

Intercept: -47116.0765749
Slope: 281.958838568


### Prediction function using learned intercept and slope

In [13]:
def get_regression_prediction(input_feature,intercept,slope):
    predicted_output=intercept+(slope*input_feature)
    return(predicted_output) 

### Predicting house price

In [15]:
my_house_sqft = 2650
estimated_price = get_regression_prediction(my_house_sqft, sqft_intercept, sqft_slope)
print "The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price)

The estimated price for a house with 2650 squarefeet is $700074.85


### Building Residual Sum of Squares function

In [16]:
def get_residual_sum_of_squares(input_feature,output,intercept,slope):
    predicted_output=get_regression_prediction(input_feature,intercept,slope)
    residuals=output-predicted_output;
    RSS=(residuals*residuals).sum();
    return(RSS)

In [17]:
#Testing the function
print get_residual_sum_of_squares(test_feature, test_output, test_intercept, test_slope)

0.0


### Computing RSS on training data

In [18]:
rss_prices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], 
                                                 train_data['price'], 
                                                 sqft_intercept, sqft_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 1.20191835632e+15


### Buiding inverse regression prediction function

In [22]:
def inverse_regression_prediction(output,intercept,slope):
    estimated_feature=(output-intercept)/slope
    return(estimated_feature)

### Predicting sqft given price

In [23]:
my_house_price = 800000
estimated_squarefeet = inverse_regression_prediction(my_house_price, sqft_intercept, sqft_slope)
print "The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, estimated_squarefeet)

The estimated squarefeet for a house worth $800000.00 is 3004


## Estimating price from bedrooms

In [24]:
bedroom_intercept,bedroom_slope=simple_linear_regression(train_data['bedrooms'],train_data['price'])

In [27]:
rss_prices_on_bedrooms=get_residual_sum_of_squares(train_data['bedrooms'],
                                                  train_data['price'],
                                                  bedroom_intercept,bedroom_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_bedrooms)

The RSS of predicting Prices based on Square Feet is : 2.14324449423e+15


RSS is less with sqft i.e. sqft is more related to price than number of bedrooms

## RSS on test data

In [28]:
# using sqft
rss_prices_on_sqft = get_residual_sum_of_squares(test_data['sqft_living'], 
                                                 test_data['price'], 
                                                 sqft_intercept, sqft_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 2.75402936247e+14


In [29]:
#using bedrooms
rss_prices_on_bedrooms = get_residual_sum_of_squares(test_data['sqft_living'], 
                                                 test_data['price'], 
                                                 bedroom_intercept, bedroom_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_bedrooms)

The RSS of predicting Prices based on Square Feet is : 3.52859832694e+20


RSS s