In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

In [10]:
# Column name -> Python type mapping, listed in the CSV's column order.
# NOTE(review): the visible pd.read_csv calls do not pass this dict;
# presumably it was meant to be their dtype= argument -- confirm.
dtype_dict = {
    'id': str,
    'date': str,
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_lot': int,
    'floors': str,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'sqft_above': int,
    'sqft_basement': int,
    'yr_built': int,
    'yr_renovated': int,
    'zipcode': str,
    'lat': float,
    'long': float,
    'sqft_living15': float,
    'sqft_lot15': float,
}

In [11]:
# Read the training and test CSVs with pandas' default dtype inference.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ("kc_house_train_data.csv", "kc_house_test_data.csv")
]

In [12]:
# Show the dtype of every column in the test frame.
test.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [4]:
def fix_dtypes(d):
    """Return a copy of ``d`` with a few columns recast.

    ``bedrooms``, ``sqft_living15`` and ``sqft_lot15`` are cast to float64
    and ``zipcode`` is cast to str.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing the four columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the recast columns; ``d`` itself is not modified.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the calling cell always starts from the same input.
    d = d.copy()
    for col in ('bedrooms', 'sqft_living15', 'sqft_lot15'):
        d[col] = d[col].astype('float64')
    d['zipcode'] = d['zipcode'].astype('str')
    return d

In [5]:
# Recast the column dtypes on both splits.
train, test = fix_dtypes(train), fix_dtypes(test)

In [6]:
# create interaction and derived variables
def create_vars(d):
    """Return a copy of ``d`` with four derived columns appended.

    Added columns:

    * ``bedrooms_squared`` -- ``bedrooms ** 2``
    * ``bed_bath_rooms``   -- ``bedrooms * bathrooms``
    * ``log_sqft_living``  -- ``np.log(sqft_living)``
    * ``lat_plus_long``    -- ``lat + long``

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing ``bedrooms``, ``bathrooms``, ``sqft_living``,
        ``lat`` and ``long``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns; ``d`` itself is not modified.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    d = d.copy()
    d['bedrooms_squared'] = d['bedrooms'] ** 2
    d['bed_bath_rooms'] = d['bedrooms'] * d['bathrooms']
    d['log_sqft_living'] = np.log(d['sqft_living'])
    d['lat_plus_long'] = d['lat'] + d['long']
    return d

In [7]:
# Add the derived columns to both splits.
train, test = create_vars(train), create_vars(test)

In [8]:
# quiz question 4: test-set mean of each derived column
test.loc[:, [
    'bedrooms_squared',
    'bed_bath_rooms',
    'log_sqft_living',
    'lat_plus_long',
]].mean()

bedrooms_squared    12.446678
bed_bath_rooms       7.503902
log_sqft_living      7.550275
lat_plus_long      -74.653334
dtype: float64

In [22]:
# create helper function to build models
def make_mod(df, depvar, invar):
    """Fit an OLS regression of ``df[depvar]`` on ``df[invar]``.

    The predictor columns are passed through ``sm.add_constant`` before
    fitting.

    Parameters
    ----------
    df : frame indexable by column name(s)
    depvar : name of the response column
    invar : list of predictor column names

    Returns
    -------
    The object returned by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(df[invar])
    response = df[depvar]
    return sm.OLS(response, design).fit()

In [42]:
# Nested predictor sets: each list extends the previous one.
mod1_vars = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
mod2_vars = mod1_vars + ['bed_bath_rooms']
mod3_vars = mod2_vars + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [43]:
# building models: model 1 regresses price on the mod1_vars predictors
mod1 = make_mod(train, 'price', mod1_vars)

In [44]:
# model 2 regresses price on the mod2_vars predictors
mod2 = make_mod(train, 'price', mod2_vars)

In [50]:
# model 3 regresses price on the mod3_vars predictors
mod3 = make_mod(train, 'price', mod3_vars)

In [29]:
# Display the regression summary table for model 1.
mod1.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.593
Model:,OLS,Adj. R-squared:,0.592
Method:,Least Squares,F-statistic:,5056.0
Date:,"Thu, 10 Dec 2015",Prob (F-statistic):,0.0
Time:,20:43:36,Log-Likelihood:,-239730.0
No. Observations:,17384,AIC:,479500.0
Df Residuals:,17378,BIC:,479500.0
Df Model:,5,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-6.908e+07,1.65e+06,-41.940,0.000,-7.23e+07 -6.58e+07
sqft_living,312.2586,3.183,98.097,0.000,306.019 318.498
bedrooms,-5.959e+04,2482.861,-23.999,0.000,-6.45e+04 -5.47e+04
bathrooms,1.571e+04,3587.158,4.379,0.000,8675.552 2.27e+04
lat,6.586e+05,1.31e+04,50.286,0.000,6.33e+05 6.84e+05
long,-3.094e+05,1.33e+04,-23.331,0.000,-3.35e+05 -2.83e+05

0,1,2,3
Omnibus:,12962.917,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,615844.364
Skew:,3.114,Prob(JB):,0.0
Kurtosis:,31.486,Cond. No.,2100000.0


In [30]:
# Display the regression summary table for model 2.
mod2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.597
Model:,OLS,Adj. R-squared:,0.596
Method:,Least Squares,F-statistic:,4283.0
Date:,"Thu, 10 Dec 2015",Prob (F-statistic):,0.0
Time:,20:43:53,Log-Likelihood:,-239650.0
No. Observations:,17384,AIC:,479300.0
Df Residuals:,17377,BIC:,479400.0
Df Model:,6,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-6.687e+07,1.65e+06,-40.584,0.000,-7.01e+07 -6.36e+07
sqft_living,306.6101,3.197,95.909,0.000,300.344 312.876
bedrooms,-1.134e+05,4797.612,-23.646,0.000,-1.23e+05 -1.04e+05
bathrooms,-7.146e+04,7552.563,-9.462,0.000,-8.63e+04 -5.67e+04
lat,6.548e+05,1.3e+04,50.230,0.000,6.29e+05 6.8e+05
long,-2.943e+05,1.32e+04,-22.218,0.000,-3.2e+05 -2.68e+05
bed_bath_rooms,2.558e+04,1953.134,13.097,0.000,2.18e+04 2.94e+04

0,1,2,3
Omnibus:,12343.124,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,533516.1
Skew:,2.916,Prob(JB):,0.0
Kurtosis:,29.506,Cond. No.,2110000.0


In [40]:
# Residual sum of squares of each fitted model on its own (training) data.
# Single-argument print(...) is valid on both Python 2 and Python 3; the
# original print statements are a SyntaxError on Python 3.
for label, fitted in (("mod1", mod1), ("mod2", mod2), ("mod3", mod3)):
    print("%s RSS: %s" % (label, np.sum(fitted.resid ** 2)))

mod1 RSS: 9.6787996305e+14
mod2 RSS: 9.58419635074e+14
mod3 RSS: 9.0343645505e+14


In [46]:
# compute the RSS on the test data
def compute_rss(invar, model, data=None, depvar='price'):
    """Residual sum of squares of ``model``'s predictions on a data set.

    Parameters
    ----------
    invar : list of str
        Predictor column names; they are selected from the frame and passed
        through ``sm.add_constant`` before prediction.
    model : object with a ``predict(X)`` method
        Typically the value returned by ``make_mod``.
    data : DataFrame, optional
        Frame to score. When omitted, the notebook-level ``test`` frame is
        used, which is what this function always did before the parameter
        was added.
    depvar : str, optional
        Name of the observed-response column (default ``'price'``).

    Returns
    -------
    Sum of squared differences between ``data[depvar]`` and the predictions.
    """
    frame = test if data is None else data
    X = sm.add_constant(frame[invar])
    residuals = frame[depvar] - model.predict(X)
    return np.sum(residuals ** 2)

In [52]:
# Residual sum of squares of each model on the test data.
# Single-argument print(...) is valid on both Python 2 and Python 3; the
# original print statements are a SyntaxError on Python 3.
for label, predictors, fitted in (
    ("Mod 1", mod1_vars, mod1),
    ("Mod 2", mod2_vars, mod2),
    ("Mod 3", mod3_vars, mod3),
):
    print("%s RSS : %s" % (label, compute_rss(predictors, fitted)))

Mod 1 RSS : 2.25500469795e+14
Mod 2 RSS : 2.23377462976e+14
Mod 3 RSS : 2.59236319207e+14
