In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column -> dtype mapping passed to pd.read_csv, grouped by target type so the
# parsing choice for every column can be read at a glance.
dtype_dict = {
    column_name: column_type
    for column_type, column_names in (
        (float, ('price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_living15',
                 'sqft_lot15', 'lat', 'long')),
        (int, ('sqft_lot', 'sqft_above', 'sqft_basement', 'waterfront', 'view',
               'condition', 'grade', 'yr_built', 'yr_renovated')),
        # Columns read as plain strings (this includes 'floors').
        (str, ('id', 'date', 'zipcode', 'floors')),
    )
    for column_name in column_names
}

In [3]:
# 1. Import King County House Sales data.
# 2. The data is already split into train and test sets.

# Both splits are parsed with the same explicit dtypes so their columns match.
kc_train = pd.read_csv(
    filepath_or_buffer='kc_house_train_data.csv',
    dtype=dtype_dict,
)
kc_test = pd.read_csv(
    filepath_or_buffer='kc_house_test_data.csv',
    dtype=dtype_dict,
)

In [4]:
# Preview the first five rows (the default for head) of the training split.
kc_train.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [5]:
# Count missing cells across every column of each split;
# 0 for both means there is nothing to impute or drop.
print(kc_train.isna().to_numpy().sum())
print(kc_test.isna().to_numpy().sum())

0
0


In [6]:
# Summary statistics of the numeric columns, one row per column.
kc_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,17384.0,539366.627934,369691.178858,75000.0,320000.0,450000.0,640000.0,7700000.0
bedrooms,17384.0,3.369363,0.906468,0.0,3.0,3.0,4.0,10.0
bathrooms,17384.0,2.115048,0.771783,0.0,1.75,2.25,2.5,8.0
sqft_living,17384.0,2080.02951,921.630888,290.0,1420.0,1910.0,2550.0,13540.0
sqft_lot,17384.0,15091.91124,41459.272327,520.0,5049.5,7616.0,10665.25,1651359.0
waterfront,17384.0,0.007651,0.087136,0.0,0.0,0.0,0.0,1.0
view,17384.0,0.236079,0.768008,0.0,0.0,0.0,0.0,4.0
condition,17384.0,3.41078,0.649792,1.0,3.0,3.0,4.0,5.0
grade,17384.0,7.655028,1.169818,1.0,7.0,7.0,8.0,13.0
sqft_above,17384.0,1787.844512,827.107595,290.0,1200.0,1560.0,2210.0,9410.0


In [7]:
# 3. Write a generic function that accepts a column of data ‘input_feature’ and another column ‘output’
#    and returns the Simple Linear Regression parameters ‘intercept’ and ‘slope’.
#    Use the closed form solution to calculate the slope and intercept.

def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by least squares.

    Uses the closed-form solution for a single feature, written with sums:

        slope     = (sum(x*y) - sum(x)*sum(y)/n) / (sum(x^2) - sum(x)^2/n)
        intercept = mean(y) - slope * mean(x)

    Parameters
    ----------
    input_feature : array-like
        Feature values ``x`` (list, NumPy array or pandas Series).
    output : array-like
        Target values ``y``, same shape as ``input_feature``; the two are
        paired by position.

    Returns
    -------
    tuple
        ``(intercept, slope)``.

    Raises
    ------
    ValueError
        If the inputs differ in shape, are empty, or ``input_feature`` is
        constant (the slope is undefined in that case).
    """
    # Coerce to float arrays: plain lists then work, integer columns cannot
    # wrap around int64 in the products below, and two Series are paired by
    # position instead of being re-aligned on their index labels.
    x = np.asarray(input_feature, dtype=float)
    y = np.asarray(output, dtype=float)

    if x.shape != y.shape:
        raise ValueError(
            "input_feature and output must have the same shape, got "
            "{} and {}".format(x.shape, y.shape)
        )
    n = x.size
    if n == 0:
        raise ValueError("cannot fit a regression on empty input")
    # Compare the values directly rather than testing the denominator for 0:
    # rounding can leave a tiny non-zero denominator for a constant feature,
    # which would produce a meaningless slope instead of an error.
    if np.all(x == x.flat[0]):
        raise ValueError("input_feature is constant; the slope is undefined")

    sum_of_x = np.sum(x)
    sum_of_y = np.sum(y)
    sum_of_x_mul_y = np.sum(x * y)
    sum_of_x_square = np.sum(x ** 2)

    numerator = sum_of_x_mul_y - (1 / n) * (sum_of_x * sum_of_y)
    denominator = sum_of_x_square - (1 / n) * (sum_of_x * sum_of_x)

    slope = numerator / denominator
    intercept = np.mean(y) - slope * np.mean(x)

    return (intercept, slope)

In [8]:
# The function above can equivalently be written with means: dividing both the numerator
# and the denominator of the slope by n turns every sum into a mean.

def simple_linear_regression2(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by least squares.

    Same closed-form solution as the sum-based formula, expressed with means:

        slope     = (mean(x*y) - mean(x)*mean(y)) / (mean(x^2) - mean(x)^2)
        intercept = mean(y) - slope * mean(x)

    Parameters
    ----------
    input_feature : array-like
        Feature values ``x`` (list, NumPy array or pandas Series).
    output : array-like
        Target values ``y``, same shape as ``input_feature``; the two are
        paired by position.

    Returns
    -------
    tuple
        ``(intercept, slope)``.

    Raises
    ------
    ValueError
        If the inputs differ in shape, are empty, or ``input_feature`` is
        constant (the slope is undefined in that case).
    """
    # Coerce to float arrays so lists are accepted, integer products cannot
    # wrap around int64, and two Series are paired by position.
    x = np.asarray(input_feature, dtype=float)
    y = np.asarray(output, dtype=float)

    if x.shape != y.shape:
        raise ValueError(
            "input_feature and output must have the same shape, got "
            "{} and {}".format(x.shape, y.shape)
        )
    if x.size == 0:
        raise ValueError("cannot fit a regression on empty input")
    # A constant feature has zero variance; check the values themselves since
    # rounding may leave the computed denominator slightly off zero.
    if np.all(x == x.flat[0]):
        raise ValueError("input_feature is constant; the slope is undefined")

    mean_of_x_mul_y = np.mean(x * y)
    mean_of_x = np.mean(x)
    mean_of_y = np.mean(y)
    mean_of_x_square = np.mean(x ** 2)

    numerator = mean_of_x_mul_y - (mean_of_x * mean_of_y)
    denominator = mean_of_x_square - (mean_of_x * mean_of_x)

    slope = numerator / denominator
    intercept = mean_of_y - slope * mean_of_x

    return (intercept, slope)

In [9]:
# 4. Use the simple_linear_regression function to estimate the slope and intercept on the training data
#    for predicting ‘price’ from ‘sqft_living’ (done here with pandas columns rather than SFrames):

# Feature / target columns of the square-feet model, taken from the training split.
squarefeet_input_feature, squarefeet_output = kc_train['sqft_living'], kc_train['price']

squarefeet_intercept, squarefeet_slope = simple_linear_regression(
    input_feature=squarefeet_input_feature,
    output=squarefeet_output,
)
print(squarefeet_intercept, squarefeet_slope)

-47116.07907289418 281.9588396303426


In [10]:
# Almost the same output can be obtained using the simple_linear_regression2 function;
# the two results differ only in the last digits because of floating-point rounding.
simple_linear_regression2(
    input_feature=squarefeet_input_feature,
    output=squarefeet_output,
)

(-47116.07907289488, 281.95883963034294)

In [11]:
# 5. Write a function that accepts a column of data ‘input_feature’, the ‘slope’, and the ‘intercept’,
#    and returns a column of predictions ‘predicted_output’ for each entry in the input column.

def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``intercept + slope * input_feature``.

    ``input_feature`` may be a single number or anything that supports
    element-wise arithmetic (e.g. a NumPy array or pandas Series); the result
    has the same form as the input.
    """
    scaled_feature = slope * input_feature
    return intercept + scaled_feature

In [12]:
# 6. Using Slope and Intercept from (4), What is the predicted price for a house with 2650 sqft?
price_of_2650_sqft_house = get_regression_predictions(
    input_feature=2650,
    intercept=squarefeet_intercept,
    slope=squarefeet_slope,
)
price_of_2650_sqft_house

700074.8459475137

In [13]:
# 7. Write a function that accepts column of data: ‘input_feature’, and ‘output’
#    and the regression parameters ‘slope’ and ‘intercept’ and outputs the Residual Sum of Squares (RSS).

def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of a fitted line.

    The residual of each observation is ``output`` minus the prediction
    ``intercept + slope * input_feature``; the squared residuals are summed
    with ``np.sum``.
    """
    residuals = output - (intercept + slope * input_feature)
    return np.sum(residuals ** 2)

In [14]:
# 8. According to function get_residual_sum_of_squares and the slope and intercept from (4),
#    What is the RSS for the simple linear regression using squarefeet to predict prices on TRAINING data?

get_residual_sum_of_squares(
    input_feature=squarefeet_input_feature,
    output=squarefeet_output,
    intercept=squarefeet_intercept,
    slope=squarefeet_slope,
)

1201918354177283.0

In [15]:
# 9. Note that although we estimated the regression slope and intercept in order to predict the output from the input,
#    since this is a simple linear relationship with only two variables we can invert the linear function
#    to estimate the input given the output!

def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: estimate the input that yields ``output``.

    Solves ``output = intercept + slope * input`` for ``input``.

    Parameters
    ----------
    output : number or array-like supporting element-wise arithmetic
        Observed or target output value(s).
    intercept : number
        Intercept of the fitted line.
    slope : number
        Slope of the fitted line; must be non-zero.

    Returns
    -------
    ``(output - intercept) / slope``, in the same form as ``output``.

    Raises
    ------
    ZeroDivisionError
        If ``slope`` is zero: a flat line cannot be inverted.
    """
    # NumPy scalars do not raise on division by zero; they return inf/nan with
    # only a RuntimeWarning. Check explicitly so a zero slope fails the same
    # way plain Python floats already do.
    if np.any(np.asarray(slope) == 0):
        raise ZeroDivisionError("slope is zero; the regression line cannot be inverted")

    estimated_input = (output - intercept) / slope

    return estimated_input

In [16]:
# 10. According to the function inverse_regression_predictions and the regression slope and intercept from (4),
#     what is the estimated square-feet for a house costing $800,000?
    
estimated_squarefeet_for_a_house_costing_800000 = inverse_regression_predictions(
    output=800000,
    intercept=squarefeet_intercept,
    slope=squarefeet_slope,
)
estimated_squarefeet_for_a_house_costing_800000

3004.3962451522766

In [17]:
# 11. Instead of using ‘sqft_living’ to estimate prices we could use ‘bedrooms’ (the number of bedrooms in the house)
#     to estimate prices. Using the function from (3), calculate the Simple Linear Regression slope and intercept
#     for estimating price based on bedrooms.

bedroom_intercept, bedroom_slope = simple_linear_regression(
    input_feature=kc_train['bedrooms'],
    output=kc_train['price'],
)
print(bedroom_intercept, bedroom_slope)

109473.17762295867 127588.9529339881


In [18]:
# 12. Now that we have 2 different models compute the RSS from BOTH models on TEST data.

# Both models are scored on the held-out split using the parameters fitted on training data.
RSS_square_feet_model = get_residual_sum_of_squares(
    input_feature=kc_test['sqft_living'],
    output=kc_test['price'],
    intercept=squarefeet_intercept,
    slope=squarefeet_slope,
)
RSS_bedrooms_model = get_residual_sum_of_squares(
    input_feature=kc_test['bedrooms'],
    output=kc_test['price'],
    intercept=bedroom_intercept,
    slope=bedroom_slope,
)
print(RSS_square_feet_model, RSS_bedrooms_model)

275402933617812.12 493364585960301.0


In [19]:
# 13. Which model (square feet or bedrooms) has lowest RSS on TEST data?

# The bedrooms model is reported only when its RSS is strictly smaller;
# a tie falls through to the square-feet model.
print(
    "RSS_bedrooms_model has lowest RSS on TEST data. Hence this is best model."
    if RSS_square_feet_model > RSS_bedrooms_model
    else "RSS_square_feet_model has lowest RSS on TEST data. Hence this is best model."
)

RSS_square_feet_model has lowest RSS on TEST data. Hence this is best model.
