# Introduction/Abstract

This notebook analyzes the King County, Washington house sales data set (`kc_house_data.csv`) and models sale price with OLS regression.

# Packages Needed/Setting Global Variables

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

  import pandas.util.testing as tm


In [2]:
def _mile_zones(values, miles_per_degree):
    """Assign each coordinate a 1-based zone number counted from the minimum.

    Zone ``z`` covers the half-open interval
    ``[lowest + (z - 1) / miles_per_degree, lowest + z / miles_per_degree)``.
    Exactly one zone is returned per input entry, so the result always lines
    up with ``values``.

    Arguments:
        values - pandas.Series (or any sized iterable exposing ``.min()`` and
                 ``.max()``) of coordinates in degrees
        miles_per_degree - multiplier converting a span in degrees to a
                           number of zones
    Returns: list with one zone per entry (int, or NaN for a NaN entry)
    """
    if len(values) == 0:
        return []
    lowest = values.min()
    highest = values.max()
    # At least one zone, even when every coordinate is identical.
    n_zones = max(1, math.ceil((highest - lowest) * miles_per_degree))
    zones = []
    for entry in values:
        if entry != entry:
            # NaN cannot be placed; keep a placeholder so positions stay aligned.
            zones.append(float('nan'))
            continue
        # Direct computation instead of scanning every zone for every entry.
        zone = int((entry - lowest) * miles_per_degree) + 1
        zone = min(max(zone, 1), n_zones)
        # Re-check against the interval edges so floating-point rounding in the
        # direct formula never moves an entry across a zone boundary.
        while zone > 1 and entry < lowest + (zone - 1) / miles_per_degree:
            zone -= 1
        # Stopping at n_zones keeps the maximum in the last zone when it lands
        # exactly on the upper edge (a half-open test would otherwise drop it).
        while zone < n_zones and entry >= lowest + zone / miles_per_degree:
            zone += 1
        zones.append(zone)
    return zones


def zoningLat(List):
    """Return a 1-based latitude zone for every entry of ``List``.

    Uses 69 zones per degree (presumably ~69 miles per degree of latitude,
    i.e. zones about one mile tall - confirm).

    Arguments:
        List - pandas.Series of latitudes in degrees
    Returns: list of zone numbers, same length and order as ``List``
    """
    return _mile_zones(List, 69)


def zoningLong(List):
    """Return a 1-based longitude zone for every entry of ``List``.

    Uses 54.6 zones per degree (presumably the approximate miles per degree
    of longitude at this data set's latitude - confirm).

    Arguments:
        List - pandas.Series of longitudes in degrees
    Returns: list of zone numbers, same length and order as ``List``
    """
    return _mile_zones(List, 54.6)

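# TODO: work in progress - a single zoning function that handles latitude and
# longitude together. The draft below is commented out and not yet runnable.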

# def zoning(Lat,Long):
#     lowestLat = Lat.min()
#     highestLat = Lat.max()
#     lowestLong = Long.min()
#     highestLong = Long.max()
#     distanceLat = highestLat - lowestLat
#     distanceLat = math.ceil(distance*69)
#     distanceLong = highestLong - lowestLong
#     distanceLong = math.ceil(distance*54.6)
#     Latzone = []
#     Longzone = []
#     for entry in Lat:
#         for zoneLat in list(range(1,distance+1)):
#             if ((lowest+(zone-1)/69) <= entry) & (entry < (lowest + zone/69)):
#                 Latzone.append(zoneLat)
#     for entri in Long:
#         for zoneLong in list(range(1,distance+1)):
#             if ((lowestLong+(zoneLong-1)/54.6) <= entri) & (entri < (lowestLong + zoneLong/54.6)):
#                 Longzone.append(zoneLong)
#     total_zones = distanceLat*distanceLong
    
    
    
#     return 

In [3]:
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out = 0.05,
                       verbose=True):
    """ Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
            None (the default) starts from an empty selection
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features
    Always set threshold_in < threshold_out to avoid infinite looping.
    Every pass refits one OLS model per remaining candidate, so the run time
    grows quickly with the number of columns in X.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # Copy so the caller's list is never mutated.
    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False
        # forward step
        # Keep X's column order so ties between candidates resolve the same
        # way on every run (set difference has no stable order).
        excluded = [column for column in X.columns if column not in included]
        # Explicit dtype: an empty Series without one triggers a pandas warning.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when nothing is left to try
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            # idxmax returns the feature *label*; argmax returns an integer
            # position, which is not an element of `included`.
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

# EDA

## Questions about the Data

## Import and Clean the Data

In [28]:
# Import the dataset
df = pd.read_csv('kc_house_data.csv')

# Drop the id column
df = df.drop(columns=['id'])

# Create a column for the year sold (last four characters of the date string),
# then drop the original date column
df['year_sold'] = df['date'].str[-4:].astype(int)
df = df.drop(columns=['date'])

# Changing bedroom number for outlier, I think it was a typo of hitting 3 twice
# based off of the sqft
df.at[15856, 'bedrooms'] = 3

# Replacing NaN values in 'waterfront' with 0
df['waterfront'] = df['waterfront'].fillna(0)

# Replacing NaN values in view with the median/mode value which is 0 (63 entries)
# 0 means house was not viewed
df['view'] = df['view'].fillna(0)

# Replacing ? in sqft_basement with 0 and converting to float
df['sqft_basement'] = df['sqft_basement'].replace('?',0).astype(float)

# yr_renovated and Renovated columns: a missing renovation year is treated as
# "never renovated" (0), and Renovated is the matching 0/1 flag
df['yr_renovated'] = df['yr_renovated'].fillna(0)
df['Renovated'] = (df['yr_renovated'] > 0).astype(int)
# For houses never renovated, fall back to the build year. Series.mask fills
# row by row from yr_built; Series.replace is a value-to-value substitution and
# is not meant for a row-aligned replacement Series.
df['yr_renovated'] = df['yr_renovated'].mask(df['yr_renovated'] == 0, df['yr_built'])

# Adding column for age when sold
df['Age_When_Sold'] = df['year_sold'] - df['yr_built']

# Adding column for years since renovation when sold
df['Age_Since_Renovation'] = df['year_sold'] - df['yr_renovated']

# Adding interaction variable Renovated*Age since renovation
df['Renovated*Age'] = df['Renovated'] * df['Age_Since_Renovation']

# Adding zones for Lat and Long, combined into one "<lat zone>x<long zone>" label
df['Lat_Zones'] = zoningLat(df['lat'])
df['Long_Zones'] = zoningLong(df['long'])
df['Lat_Zones'] = df['Lat_Zones'].astype(str)
df['Long_Zones'] = df['Long_Zones'].astype(str)
df['Zones'] = df['Lat_Zones'] + 'x' + df['Long_Zones']
df = df.drop(columns=['lat', 'long', 'Long_Zones', 'Lat_Zones'])


# Columns dropped because they were highly correlated with another feature
df = df.drop(columns=['yr_renovated', 'Age_Since_Renovation',
                      'sqft_living', 'Renovated*Age'])

In [5]:
# Build dummy (one-hot) variables for the categorical columns.
# TODO: double check this list
dummy_list = [
    'bedrooms', 'bathrooms', 'floors', 'view', 'condition',
    'grade', 'yr_built', 'zipcode', 'year_sold',
    'Zones',
]
# Cast to category so every listed column is encoded, numeric ones included.
dum = df.loc[:, dummy_list].astype('category')
# drop_first removes one level per column to avoid the dummy-variable trap.
dummies = pd.get_dummies(dum, prefix=dummy_list, drop_first=True)
# Remaining (non-encoded) columns first, followed by the dummy columns.
df_preprocessed = pd.concat([df.drop(columns=dummy_list), dummies], axis=1)

In [29]:
# Preview the first rows of the cleaned frame (before dummy encoding).
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,sqft_living15,sqft_lot15,year_sold,Renovated,Age_When_Sold,Zones
0,221900.0,3,1.0,5650,1.0,0.0,0.0,3,7,1180,0.0,1955,98178,1340,5650,2014,0,59,25x15
1,538000.0,3,2.25,7242,2.0,0.0,0.0,3,7,2170,400.0,1951,98125,1690,7639,2014,1,63,39x11
2,180000.0,2,1.0,10000,1.0,0.0,0.0,3,6,770,0.0,1933,98028,2720,8062,2015,0,82,41x16
3,604000.0,4,3.0,5000,1.0,0.0,0.0,5,7,1050,910.0,1965,98136,1360,5000,2014,0,49,26x7
4,510000.0,3,2.0,8080,1.0,0.0,0.0,3,8,1680,0.0,1987,98074,1800,7503,2015,0,28,32x26


## Notes

# Modeling

In [9]:
# Target is the sale price; every other preprocessed column is a predictor.
y = df_preprocessed['price']
X = df_preprocessed.drop(['price'],axis=1)

# 80/20 train/test split. The fixed random_state makes the split - and every
# metric computed from it - reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Baseline model: OLS of price on every preprocessed feature, with an
# intercept column added to the training predictors.
X_train_int = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_int).fit()
model.summary()
# Adjusted R-squared recorded after each change to the feature set.
# NOTE(review): these were recorded on separate random train/test splits, so
# differences of a few thousandths may be split noise rather than a real effect.
# First run 0.853
# Second run 0.850 (after taking out yr_renovated)
# Third run 0.857 (after taking out Age_Since_Renovation)
# Fourth run 0.851 (after taking out sqft_living)
# Fifth run 0.851 (after taking out Renovated*Age)

0,1,2,3
Dep. Variable:,price,R-squared:,0.854
Model:,OLS,Adj. R-squared:,0.851
Method:,Least Squares,F-statistic:,285.4
Date:,"Sun, 18 Oct 2020",Prob (F-statistic):,0.0
Time:,18:20:55,Log-Likelihood:,-229410.0
No. Observations:,17277,AIC:,459500.0
Df Residuals:,16928,BIC:,462200.0
Df Model:,348,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-14.9509,1.191,-12.557,0.000,-17.285,-12.617
sqft_living,92.9736,14.622,6.358,0.000,64.313,121.634
sqft_lot,0.2663,0.041,6.541,0.000,0.186,0.346
waterfront,6.329e+05,1.71e+04,37.065,0.000,5.99e+05,6.66e+05
sqft_above,68.2426,14.681,4.648,0.000,39.466,97.019
sqft_basement,16.1630,14.457,1.118,0.264,-12.174,44.500
yr_renovated,64.5895,71.522,0.903,0.366,-75.601,204.780
sqft_living15,19.0425,3.051,6.241,0.000,13.062,25.023
sqft_lot15,-0.1433,0.065,-2.213,0.027,-0.270,-0.016

0,1,2,3
Omnibus:,11378.776,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1708007.077
Skew:,2.209,Prob(JB):,0.0
Kurtosis:,51.509,Cond. No.,4.39e+20


In [11]:
# Predictors kept as continuous values (i.e. not dummy encoded).
continuous = ['sqft_lot', 'sqft_above',
              'sqft_basement', 'sqft_living15', 'sqft_lot15','Age_When_Sold']

In [12]:
# Continuous columns of the training predictors, used to look for highly
# correlated feature pairs. Nothing is removed in this cell.
test = X_train[continuous]

In [13]:
# Long-form table of absolute pairwise correlations, strongest first.
test_corr = (
    test.corr()
        .abs()
        .stack()
        .reset_index()
        .sort_values(0, ascending=False)
)
# Index each row by its (feature, feature) pair and keep only the coefficient,
# named 'cc'. Dropping duplicate values removes the mirrored (a, b)/(b, a) rows.
test_corr = (
    test_corr
    .assign(pairs=list(zip(test_corr.level_0, test_corr.level_1)))
    .set_index(['pairs'])
    .drop(['level_0', 'level_1'], axis=1)
    .rename(columns={0: 'cc'})
    .drop_duplicates()
)
# Pairs above the 0.75 threshold, excluding perfect (self) correlations.
test_corr[(test_corr.cc > .75) & (test_corr.cc < 1)]

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1


In [17]:
# Highly correlated pairs found, and which column of each pair was dropped:
# - yr_built and yr_renovated are highly correlated; dropped yr_renovated
# - Age_When_Sold and Age_Since_Renovation are highly correlated; dropped Age_Since_Renovation
# - sqft_living and sqft_above are highly correlated; dropped sqft_living
# - Renovated*Age and Renovated are highly correlated; dropped Renovated*Age
# - sqft_living and sqft_living15 are highly correlated; dropped sqft_living

# These drops are applied in the "Import and Clean the Data" cell of the EDA section.

In [16]:
# Forward-backward stepwise feature selection on the training split.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# `results` may never have been assigned - presumably the search is too slow
# with this many dummy columns; confirm before relying on `results`.
results = stepwise_selection(X_train,y_train)

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Add  grade_10                       with p-value 0.0
Add  sqft_basement                  with p-value 0.0
Add  grade_9                        with p-value 0.0
Add  grade_11                       with p-value 0.0
Add  grade_12                       with p-value 0.0
Add  grade_13                       with p-value 0.0
Add  sqft_above                     with p-value 0.0
Add  zipcode_98004                  with p-value 5.35476e-294
Add  waterfront                     with p-value 6.47204e-299
Add  Age_When_Sold                  with p-value 2.6693e-287
Add  zipcode_98039                  with p-value 4.5108e-205
Add  grade_8                        with p-value 2.17857e-205
Add  zipcode_98112                  with p-value 4.57094e-148
Add  zipcode_98040                  with p-value 4.24952e-125
Add  Lat_Zones_37                   with p-value 4.80636e-110
Add  Lat_Zones_36                   with p-value 6.28492e-116
Add  Lat_Zones_35                   with p-value 2.17253e-122
Add  Lat_Zo

KeyboardInterrupt: 

In [None]:
# Looking at scaling: histograms of the continuous predictors to compare their
# ranges and distributions (the trailing ';' hides the text repr of the axes).
test.hist(figsize=(7,10));