In [1]:
import pandas as pd
from sklearn import datasets, linear_model
import numpy as np
import matplotlib.pyplot as plt

## Read Data 讀取資料

In [2]:
# Load the training and test CSV files (paths are relative to the working directory).
train_data = pd.read_csv("kc_house_train_data.csv")
test_data = pd.read_csv("kc_house_test_data.csv")
# The test frame was originally bound to the misspelled name `test_date`;
# keep that name as an alias so any cell still using it keeps working.
test_date = test_data

## Define simple linear regression function
Using the formula below, we can calculate w0 (the intercept) and w1 (the slope).
根據下列公式，我們可以算出 w0 跟 w1
![simple linear regression coefficients formula](http://i.imgur.com/sXyQmo6.png)

In [3]:
def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by least squares.

    The slope (w1) is computed from mean-centred deviations,

        sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2)

    which is algebraically identical to the raw-sums closed form but never
    subtracts two huge, nearly equal quantities. That keeps the result
    accurate when the data sit far from zero, and avoids integer overflow or
    integer division on integer-typed columns because every intermediate is
    a floating-point deviation.

    Parameters
    ----------
    input_feature : pandas Series or NumPy array
        Values of the single explanatory variable. Only ``.mean()``,
        ``.sum()`` and element-wise arithmetic are required.
    output : pandas Series or NumPy array
        Observed target values, aligned element-wise with ``input_feature``.

    Returns
    -------
    tuple
        ``(intercept, slope)``, i.e. ``(w0, w1)``.

    Raises
    ------
    ValueError
        If ``input_feature`` has no spread (it is empty or all of its values
        are identical); the slope is undefined in that case.
    """
    input_mean = input_feature.mean()
    output_mean = output.mean()

    # deviations of every observation from its own mean
    input_deviation = input_feature - input_mean
    output_deviation = output - output_mean

    # numerator and denominator of the slope formula
    sum_of_cross_deviations = (input_deviation * output_deviation).sum()
    sum_of_squared_deviations = (input_deviation * input_deviation).sum()

    # a zero denominator would otherwise silently produce nan/inf coefficients
    if sum_of_squared_deviations == 0:
        raise ValueError(
            "input_feature must contain at least two distinct values "
            "to estimate a slope"
        )

    # use the formula for the slope(w1)
    slope = sum_of_cross_deviations / sum_of_squared_deviations
    # use the formula for the intercept(w0)
    intercept = output_mean - slope * input_mean
    return (intercept, slope)

### 測試看看結果對不對
用已知結果的測試資料，來看看結果跟自己想的有沒有一樣
這裡我們用 output = 1 + 1*input_feature 產生資料，所以可以知道 slope 跟 intercept 都要為 1

In [4]:
# Sanity check on data generated as output = 1 + 1 * input_feature:
# the fit must recover intercept == 1 and slope == 1.
test_date_11x = pd.read_csv("1+1x.csv")
(test_intercept, test_slope) =  simple_linear_regression(test_date_11x["x"], test_date_11x["y"])
print ("Intercept: " + str(test_intercept))
print ("Slope: " + str(test_slope))
# Fail loudly on a full re-run instead of relying on someone reading the printout.
np.testing.assert_allclose((test_intercept, test_slope), (1.0, 1.0))

Intercept: 1.0
Slope: 1.0


### 建立模型

In [5]:
# Fit `price` against `sqft_living` on the training data.
(sqft_intercept, sqft_slope) = simple_linear_regression(
    input_feature=train_data['sqft_living'],
    output=train_data['price'])

print("Intercept: " + str(sqft_intercept))
print("Slope: " + str(sqft_slope))

Intercept: -47116.0790729
Slope: 281.95883963


## 建立預測函式 以及反向預測函式 

In [6]:
# Predicting Values
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line at ``input_feature``.

    Returns ``intercept + slope * input_feature``; the result has whatever
    shape the arithmetic on ``input_feature`` produces (a scalar in gives a
    scalar out, a Series in gives a Series out).
    """
    return intercept + slope * input_feature

# inverse regression predictions
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: find the input that would predict ``output``.

    Solves ``output = intercept + slope * input_feature`` for
    ``input_feature`` and returns that estimate.
    """
    offset_from_intercept = output - intercept
    return offset_from_intercept / slope

## Residual Sum of Squares (RSS)

In [7]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of the line on the given data.

    The fitted values ``intercept + slope * input_feature`` are compared with
    ``output``; the differences are squared and summed. The sign of the
    difference is irrelevant because it is squared.
    """
    fitted_values = intercept + slope * input_feature
    errors = fitted_values - output
    return (errors ** 2).sum()

### Test RSS

In [8]:
# The line fitted on the 1 + 1x data passes through every point, so this should be 0.0.
print(get_residual_sum_of_squares(input_feature=test_date_11x["x"],
                                  output=test_date_11x["y"],
                                  intercept=test_intercept,
                                  slope=test_slope))

0.0


## HW

1. Using your slope and intercept from predicting prices from square feet, what is the predicted price for a house with 2650 sqft? Use American-style decimals without comma separators (e.g. 300000.34), and round your answer to 2 decimal places. Do not include the dollar sign.

In [9]:
# Predicted price at 2650 sqft under the square-feet model.
get_regression_predictions(input_feature=2650, intercept=sqft_intercept, slope=sqft_slope)

700074.84594751371

2. Using the learned slope and intercept from the square-feet model, what is the RSS for the simple linear regression using square feet to predict prices on the TRAINING data?

In [10]:
# RSS of the square-feet model evaluated on the training data.
get_residual_sum_of_squares(input_feature=train_data['sqft_living'],
                            output=train_data['price'],
                            intercept=sqft_intercept,
                            slope=sqft_slope)

1201918354177286.2

**Quiz Question: According to `inverse_regression_predictions` and the slope and intercept learned from the square-feet model, what is the estimated square footage for a house costing \$800,000?**

In [11]:
# Estimated square footage for a price of 800000 under the square-feet model.
inverse_regression_predictions(output=800000, intercept=sqft_intercept, slope=sqft_slope)

3004.3962451522766