In [1]:
# Mount Google Drive inside the Colab filesystem at /content/gdrive so that
# files stored on Drive can be reached through ordinary paths.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
# Record and print the directory the kernel started in.
cwd_old = os.getcwd()
print('\nWorking directory: ', cwd_old)

# Switch into the course folder on the mounted Drive. The CSV files loaded
# later are given as bare file names, so they resolve against this directory.
os.chdir('/content/gdrive/MyDrive/Machine Learning - Uni of Washington')
print('Working directory has changed')

# Read the working directory back to confirm that the change took effect.
cwd_new = os.getcwd()
print('\nCurrent Working directory: ', cwd_new)


Working directory:  /content
Working directory has changed

Current Working directory:  /content/gdrive/MyDrive/Machine Learning - Uni of Washington


### Import Modules

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Column name -> Python type, handed to pd.read_csv through its `dtype`
# argument so every CSV in this notebook is parsed with the same column types.
# Identifier-like columns (id, zipcode, date) and `floors` are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [5]:
# Load the full dataset and the files named as training and test sets, all
# parsed with the same dtype mapping.
# NOTE(review): the paths are relative, so this cell relies on the working
# directory having been changed to the course folder beforehand.
sales = pd.read_csv('kc_house_data.csv', dtype = dtype_dict)
house_train_data = pd.read_csv('kc_house_train_data.csv', dtype = dtype_dict)
house_test_data = pd.read_csv('kc_house_test_data.csv', dtype = dtype_dict) 

In [6]:
# Preview the first rows of the full dataset to sanity-check the load.
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [7]:
# Preview the first rows of the training set.
house_train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [8]:
# Preview the first rows of the test set.
house_test_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,114101516,20140528T000000,310000.0,3.0,1.0,1430.0,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780.0,12697.0
1,9297300055,20150124T000000,650000.0,4.0,3.0,2950.0,5000,2.0,0,3,...,9,1980,970,1979,0,98126,47.5714,-122.375,2140.0,4000.0
2,1202000200,20141103T000000,233000.0,3.0,2.0,1710.0,4697,1.5,0,0,...,6,1710,0,1941,0,98002,47.3048,-122.218,1030.0,4705.0
3,8562750320,20141110T000000,580500.0,3.0,2.5,2320.0,3980,2.0,0,0,...,8,2320,0,2003,0,98027,47.5391,-122.07,2580.0,3980.0
4,7589200193,20141110T000000,535000.0,3.0,1.0,1090.0,3000,1.5,0,0,...,8,1090,0,1929,0,98117,47.6889,-122.375,1570.0,5080.0


### Building a generic simple linear regression function

In [9]:
def simple_linear_regression(input_feature, output):
  """Fit ``output ~ intercept + slope * input_feature`` by least squares.

  Uses the closed-form solution for a single feature:

      slope     = (sum(x*y) - sum(x)*sum(y)/n) / (sum(x^2) - sum(x)^2/n)
      intercept = mean(y) - slope * mean(x)

  Parameters
  ----------
  input_feature : array-like
      Feature values (x). Any sequence NumPy can convert to floats
      (list, tuple, ndarray, pandas Series).
  output : array-like
      Target values (y), same shape as ``input_feature``.

  Returns
  -------
  (intercept, slope) : tuple of numpy.float64

  Raises
  ------
  ValueError
      If the inputs have different shapes, are empty, or all feature values
      are identical (the slope is undefined without variation in x).
  """
  # Work on float arrays: this accepts plain Python sequences and prevents
  # integer overflow in x**2 when the caller passes a narrow integer dtype.
  x = np.asarray(input_feature, dtype=float)
  y = np.asarray(output, dtype=float)

  if x.shape != y.shape:
    raise ValueError(
        "input_feature and output must have the same shape, got {} and {}".format(x.shape, y.shape))

  n = x.size
  if n == 0:
    raise ValueError("cannot fit a regression on empty input")

  # A constant feature makes the denominator below zero (up to rounding),
  # which would otherwise yield nan/inf or an arbitrary slope.
  if x.min() == x.max():
    raise ValueError("input_feature must contain at least two distinct values")

  sum_xy = (y * x).sum()
  xy_by_n = (y.sum() * x.sum())/n

  x_square = (x**2).sum()
  xx_by_n = (x.sum() * x.sum())/n

  slope = (sum_xy - xy_by_n)/(x_square - xx_by_n)
  intercept = y.mean() - (slope * x.mean())

  return (intercept, slope)

### Estimating Intercept and Slope

In [10]:
# Fit price against 'sqft_living' on the training set. `.values` hands the
# two columns to the fitting function as NumPy arrays.
sqft_intercept, sqft_slope = simple_linear_regression(house_train_data['sqft_living'].values, house_train_data['price'].values)
print("Intercept when input feature is sqft_living:", sqft_intercept)
print("Slope when input feature is sqft_living:", sqft_slope)

Intercept when input feature is sqft_living: -47116.07907289418
Slope when input feature is sqft_living: 281.9588396303426


In [11]:
# Fit price against 'bedrooms' on the training set, in the same way as the
# 'sqft_living' model, so the two single-feature models can be compared.
bedrooms_intercept, bedrooms_slope = simple_linear_regression(house_train_data['bedrooms'].values, house_train_data['price'].values)
print("Intercept when input feature is bedrooms:", bedrooms_intercept)
print("Slope when input feature is bedrooms:", bedrooms_slope)

Intercept when input feature is bedrooms: 109473.1776229596
Slope when input feature is bedrooms: 127588.95293398784


### Predicting Values

In [12]:
def get_regression_predictions(input_feature, intercept, slope):
  """Evaluate the fitted line at ``input_feature``.

  Parameters
  ----------
  input_feature : scalar or array-like supporting arithmetic
      Feature value(s) at which to predict.
  intercept, slope : scalar
      Coefficients of the line ``intercept + slope * x``.

  Returns
  -------
  The prediction(s), with whatever type the arithmetic on the inputs yields.
  """
  return intercept + (slope * input_feature)

### Residual Sum of Squares

In [13]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
  """Return the residual sum of squares of a fitted line on the given data.

  Parameters
  ----------
  input_feature : array-like supporting arithmetic (ndarray, pandas Series)
      Feature values.
  output : array-like of the same length
      Observed target values.
  intercept, slope : scalar
      Coefficients of the line ``intercept + slope * x``.

  Returns
  -------
  Sum over all observations of ``(output - prediction) ** 2``.
  """
  # Residual = observed value minus the value the line predicts.
  errors = output - (intercept + (slope * input_feature))
  return (errors * errors).sum()

### Predict the square feet given the price

In [14]:
def inverse_regression_predictions(output, intercept, slope):
  """Invert the fitted line: find the feature value that predicts ``output``.

  Solves ``output = intercept + slope * x`` for ``x``.

  Parameters
  ----------
  output : scalar or array-like supporting arithmetic
      Target value(s) to invert.
  intercept, slope : scalar
      Coefficients of the fitted line.

  Returns
  -------
  ``(output - intercept) / slope``.
  """
  return (output - intercept) / slope

### Quiz Questions

### 1. Using your slope and intercept from (4), i.e., 'sqft_living', what is the predicted price for a house with 2650 sqft?

In [15]:
# Predict the price of a 2650 sqft house with the 'sqft_living' model; the
# printed value is rounded to two decimals.
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)
print("The predicted price of the house with {} sqft is ${}".format(my_house_sqft, estimated_price.round(2)))

The predicted price of the house with 2650 sqft is $700074.85


### 2. According to this function and the slope and intercept from (4), i.e., 'sqft_living', what is the RSS for the simple linear regression using square feet to predict prices on TRAINING data?

In [16]:
# RSS of the 'sqft_living' model evaluated on the same training data that
# was used to estimate its intercept and slope.
rss_sqft = get_residual_sum_of_squares(house_train_data['sqft_living'], house_train_data['price'], sqft_intercept, sqft_slope)
print('The RSS to predict prices on training data based on sqft_living is:', rss_sqft)

The RSS to predict prices on training data based on sqft_living is: 1201918354177283.0


### 3. According to this function (inverse_regression_predictions) and the regression slope and intercept from (3), i.e., 'sqft_living', what is the estimated square footage for a house costing $800,000?

In [17]:
# Invert the 'sqft_living' model: solve for the living area whose predicted
# price equals my_house_price. The result is printed unrounded.
my_house_price = 800000
estimated_square_feet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)
print("For a house costing ${} the estimated square-feet for the house will be {} sqft".format(my_house_price, estimated_square_feet))

For a house costing $800000 the estimated square-feet for the house will be 3004.3962451522766 sqft


### 4. Which model (square feet or bedrooms) has lowest RSS on TEST data?

In [18]:
# 'sqft_living' model: RSS on the test set, using the intercept and slope
# that were estimated from the training set.
rss_sqft_test = get_residual_sum_of_squares(house_test_data['sqft_living'], house_test_data['price'], sqft_intercept, sqft_slope)

print('The RSS to predict prices on test data based on sqft_living is:', rss_sqft_test)

The RSS to predict prices on test data based on sqft_living is: 275402933617812.12


In [19]:
# 'bedrooms' model: RSS on the test set, using the intercept and slope that
# were estimated from the training set. Compare with rss_sqft_test.
rss_bedrooms_test = get_residual_sum_of_squares(house_test_data['bedrooms'], house_test_data['price'], bedrooms_intercept, bedrooms_slope)

print('The RSS to predict prices on test data based on bedrooms is:', rss_bedrooms_test)

The RSS to predict prices on test data based on bedrooms is: 493364585960300.9
