In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

home_df = pd.read_csv('home_data.csv')

home_df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# Question 1

**Selection and summary statistics**: We found the zip code with the highest average house price. What is the average house price of that zip code?

In [2]:
import numpy

zip_df = home_df[['zipcode','price']]

zip_df = zip_df.sort_values(by = ['zipcode'])

#zip_df = zip_df.set_index('zipcode')

zip_df = zip_df.groupby(['zipcode']).mean().sort_values(by = ['price'])

zip_df.tail(1)

Unnamed: 0_level_0,price
zipcode,Unnamed: 1_level_1
98039,2160606.6


# Question 2

**Filtering data**: What fraction of the houses have living space between 2000 sq.ft. and 4000 sq.ft.?

In [3]:
sqft_living_df = home_df[['id', 'sqft_living']]

sqft_living_2000_to_4000_df = sqft_living_df[sqft_living_df['sqft_living'].between(2000, 4000)]

print(float(sqft_living_2000_to_4000_df.shape[0]) / float(sqft_living_df.shape[0]))

0.4266413732475825


# Question 3

**Building a regression model with several more features**: What is the difference in RMSE between the model trained with my_features and the one trained with advanced_features?

my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

advanced_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house
'grade', # measure of quality of construction
'waterfront', # waterfront property
'view', # type of view
'sqft_above', # square feet above ground
'sqft_basement', # square feet in basement
'yr_built', # the year built
'yr_renovated', # the year renovated
'lat', 'long', # the lat-long of the parcel
'sqft_living15', # average sq.ft. of 15 nearest neighbors
'sqft_lot15', # average lot size of 15 nearest neighbors]

In [4]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']
y = home_df['price']


advanced_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house
'grade', # measure of quality of construction
'waterfront', # waterfront property
'view', # type of view
'sqft_above', # square feet above ground
'sqft_basement', # square feet in basement
'yr_built', # the year built
'yr_renovated', # the year renovated
'lat', 'long', # the lat-long of the parcel
'sqft_living15', # average sq.ft. of 15 nearest neighbors
'sqft_lot15' # average lot size of 15 nearest neighbors
                    ]

X_advanced = home_df[advanced_features]
X_advanced.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,zipcode,condition,grade,waterfront,view,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,3,1.0,1180,5650,1.0,98178,3,7,0,0,1180,0,1955,0,47.5112,-122.257,1340,5650
1,3,2.25,2570,7242,2.0,98125,3,7,0,0,2170,400,1951,1991,47.721,-122.319,1690,7639
2,2,1.0,770,10000,1.0,98028,3,6,0,0,770,0,1933,0,47.7379,-122.233,2720,8062
3,4,3.0,1960,5000,1.0,98136,5,7,0,0,1050,910,1965,0,47.5208,-122.393,1360,5000
4,3,2.0,1680,8080,1.0,98074,3,8,0,0,1680,0,1987,0,47.6168,-122.045,1800,7503


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt


X_advanced_train, X_advanced_test, y_train, y_test = train_test_split(X_advanced, y,test_size=0.2, random_state = 42)

X_my_train = X_advanced_train[my_features]
X_my_test = X_advanced_test[my_features]

linreg_my = LinearRegression().fit(X_my_train, y_train)
linreg_ad = LinearRegression().fit(X_advanced_train, y_train)
print(linreg_my.coef_)
print(linreg_ad.coef_)

print(linreg_my.score(X_my_test, y_test))
print(linreg_ad.score(X_advanced_test, y_test))

y_my_pred = linreg_my.predict(X_my_test)
y_advanced_pred = linreg_ad.predict(X_advanced_test)
print(sqrt(mean_squared_error(y_test, y_my_pred)) - sqrt(mean_squared_error(y_test, y_advanced_pred)))


[ -5.66723924e+04   1.65476422e+04   3.13063049e+02  -3.20232174e-01
  -5.76662072e+03   5.81860332e+02]
[ -3.43354189e+04   4.45645358e+04   1.09015816e+02   8.88473505e-02
   7.00312462e+03  -5.52253009e+02   2.45267105e+04   9.45678906e+04
   5.62413071e+05   5.36411067e+04   7.00227412e+01   3.89930748e+01
  -2.68076897e+03   2.04156320e+01   5.95968123e+05  -1.94585720e+05
   2.12143281e+01  -3.25831855e-01]
0.515217682596
0.701190442637
58177.67226499258


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

advance = advanced_features + ['price']

home_advance = home_df[advance]

train_data, test_data = train_test_split(home_advance, test_size=0.8, random_state=42)

lin_reg_ad = LinearRegression().fit(train_data[advanced_features], train_data['price'])
lin_reg_my = LinearRegression().fit(train_data[my_features], train_data['price'])

y_ad_predict = lin_reg_ad.predict(test_data[advanced_features])
y_my_predict = lin_reg_my.predict(test_data[my_features])

print(sqrt(mean_squared_error(test_data['price'], y_my_predict)) - sqrt(mean_squared_error(test_data['price'], y_ad_predict))) 

53745.86330736554
