# Introduction/Abstract

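This notebook uses the house sales data set from King County, Washington (`kc_house_data.csv`) and fits an ordinary least squares regression model of sale `price`.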

# Packages Needed/Setting Global Variables

In [32]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols


# Show up to 500 rows and 500 columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [33]:
def Decade(List):
    """Label every value with the 10-wide bin it falls in.

    Bins are anchored at the smallest value of ``List`` rather than at
    calendar decades: with a minimum of 1900 the bins are 1900-1910,
    1910-1920, and so on. Each bin includes its lower bound and excludes
    its upper bound.

    Arguments:
        List - pandas.Series (or other list-like) of numeric values
    Returns: list of 'start-end' strings, one per input value and in the
        same order. Missing values (NaN/None) are returned as None; the
        previous implementation never terminated when it met one.
    """
    values = pd.Series(List)
    oldest = values.min()  # NaN entries are skipped when taking the minimum
    decade_list = []
    for x in values:
        if pd.isna(x):
            decade_list.append(None)
            continue
        # Jump straight to the start of the bin that contains x instead of
        # walking up from the minimum ten at a time.
        start = oldest + ((x - oldest) // 10) * 10
        decade_list.append('{}-{}'.format(start, start + 10))
    return decade_list

In [34]:
# Number of zones per degree used for each coordinate.
# NOTE(review): 69 and 54.6 look like approximate miles per degree of latitude
# and of longitude for this region, which would make each zone roughly one
# mile wide - confirm.
LAT_ZONES_PER_DEGREE = 69
LONG_ZONES_PER_DEGREE = 54.6


def _zone_numbers(List, zones_per_degree):
    """Assign each value a 1-based zone number counted from the minimum.

    Zone ``k`` covers ``[lowest + (k-1)/zones_per_degree,
    lowest + k/zones_per_degree)``. The last zone also includes its upper
    edge, so the maximum value always receives a zone, and when all values
    are equal they are all placed in zone 1. The result therefore always has
    the same length as the input.

    Missing values (NaN/None) are returned as None.
    """
    from bisect import bisect_right

    lowest = List.min()
    highest = List.max()
    if pd.isna(lowest):
        # Empty input or nothing but missing values: there is nothing to zone.
        return [None] * len(List)

    # At least one zone, so identical coordinates still get a label.
    zone_count = max(math.ceil((highest - lowest) * zones_per_degree), 1)
    lower_edges = [lowest + (zone - 1) / zones_per_degree
                   for zone in range(1, zone_count + 1)]

    # The number of lower edges <= entry is the zone number; anything at or
    # beyond the last lower edge lands in the last zone.
    return [None if pd.isna(entry) else bisect_right(lower_edges, entry)
            for entry in List]


def zoningLat(List):
    """Return the latitude zone number (1 = lowest latitude) for each value.

    Arguments:
        List - pandas.Series of latitudes in degrees
    Returns: list of int zone numbers, one per input value (None for
        missing values).
    """
    return _zone_numbers(List, LAT_ZONES_PER_DEGREE)


def zoningLong(List):
    """Return the longitude zone number (1 = lowest longitude) for each value.

    Arguments:
        List - pandas.Series of longitudes in degrees
    Returns: list of int zone numbers, one per input value (None for
        missing values).
    """
    return _zone_numbers(List, LONG_ZONES_PER_DEGREE)

In [35]:
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out = 0.05,
                       verbose=True):
    """ Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
            None (the default) starts from an empty selection
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features
    Always set threshold_in < threshold_out to avoid infinite looping.
    Every forward step refits one OLS model per remaining candidate, so the
    run time grows quickly with the number of columns in X.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # Copy so the caller's list is never modified.
    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False

        # forward step: try each remaining column on top of the current
        # selection and record the p-value it gets.
        # Keeping X's column order makes ties resolve the same way on every run.
        already_included = set(included)
        excluded = [column for column in X.columns
                    if column not in already_included]
        # Explicit float dtype: an empty/object Series breaks min()/idxmin().
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN if nothing is left to try
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step: refit on the current selection and drop the least
        # significant feature if it is above threshold_out.
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            # idxmax gives the feature *name*; argmax gives an integer
            # position, which cannot be removed from `included`.
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

# EDA

## Questions about the Data

## Import and Clean the Data

In [36]:
# Import the dataset
df = pd.read_csv('kc_house_data.csv')

# Drop the id column
df = df.drop(['id'], axis=1)

# Create a column for the year sold (the last four characters of the date
# string), then drop the original date column
df['year_sold'] = df.date.apply(lambda x: x[-4:]).astype(int)
df = df.drop(['date'], axis=1)

# Changing bedroom number for outlier, I think it was a typo of hitting 3 twice
# based off of the sqft
df.at[15856, 'bedrooms'] = 3

# Replace NaN values in 'waterfront' with 0
df['waterfront'] = df['waterfront'].fillna(0)

# Replace NaN values in view with the median/mode value which is 0 (63 entries)
# 0 means house was not viewed
df['view'] = df['view'].fillna(0)

# Replace the '?' placeholder in sqft_basement with 0 and convert to float
df['sqft_basement'] = df['sqft_basement'].replace('?', 0).astype(float)

# yr_renovated and Renovated columns: a missing or 0 renovation year means
# the house was never renovated
df['yr_renovated'] = df['yr_renovated'].fillna(0)
df['Renovated'] = (df['yr_renovated'] > 0).astype(int)
# For never-renovated houses use the build year as the "renovation" year.
# mask() substitutes row by row; replace() does not accept a Series as the
# replacement value for a scalar.
df['yr_renovated'] = df['yr_renovated'].mask(df['yr_renovated'] == 0,
                                             df['yr_built'])

# Add a column for the age of the house when sold
df['Age_When_Sold'] = df['year_sold'] - df['yr_built']

# Add a column for the years since renovation when sold
df['Age_Since_Renovation'] = df['year_sold'] - df['yr_renovated']

# Add the interaction variable Renovated * age since renovation
df['Renovated*Age'] = df['Renovated'] * df['Age_Since_Renovation']

# Add zones for lat and long, combine them into one 'LATxLONG' label and
# drop the raw coordinates and the intermediate columns
df['Lat_Zones'] = zoningLat(df['lat'])
df['Long_Zones'] = zoningLong(df['long'])
df['Lat_Zones'] = df['Lat_Zones'].astype(str)
df['Long_Zones'] = df['Long_Zones'].astype(str)
df['Zones'] = df['Lat_Zones'] + 'x' + df['Long_Zones']
df = df.drop(['lat', 'long', 'Long_Zones', 'Lat_Zones'], axis=1)

# Group year built into 10-year bins (must come after the age columns,
# which need the numeric year)
df['yr_built'] = Decade(df.yr_built)

# Columns dropped because of high correlation with another feature
# (see the correlation check in the Modeling section)
df = df.drop(['yr_renovated'], axis=1)
df = df.drop(['Age_Since_Renovation'], axis=1)
df = df.drop(['sqft_living'], axis=1)
df = df.drop(['Renovated*Age'], axis=1)

In [37]:
# Create dummy variables for the categorical columns (double check this list)
dummy_list = ['bedrooms', 'bathrooms', 'floors', 'view', 'condition',
                'grade', 'yr_built', 'zipcode', 'year_sold',
                'Zones']
dum = df[dummy_list].astype('category')
# drop_first removes one level per column to avoid the dummy-variable trap.
# The dtype is pinned to a numeric type: newer pandas versions default to
# bool columns, and a float/bool mix turns the design matrix into an object
# array that statsmodels refuses to fit.
dummies = pd.get_dummies(dum, prefix=dummy_list, drop_first=True,
                         dtype=np.uint8)
# Swap the original categorical columns for their dummy encoding
df_preprocessed = df.drop(dummy_list, axis=1)
df_preprocessed = pd.concat([df_preprocessed, dummies], axis=1)

## Notes

# Modeling

In [38]:
# Target is the sale price; every other preprocessed column is a predictor.
y = df_preprocessed['price']
X = df_preprocessed.drop(['price'], axis=1)

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore the scores compared between runs, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [39]:
# Fit an OLS model on the training split; the intercept column is added
# explicitly before fitting.
X_train_int = sm.add_constant(X_train)
model = sm.OLS(endog=y_train, exog=X_train_int).fit()

# Adj. R-squared recorded for each iteration of feature dropping:
#   1st run 0.853
#   2nd run 0.850 (after taking out yr_renovated)
#   3rd run 0.857 (after taking out Age_Since_Renovation)
#   4th run 0.851 (after taking out sqft_living)
#   5th run 0.851 (after taking out Renovated*Age)
model.summary()

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,price,R-squared:,0.884
Model:,OLS,Adj. R-squared:,0.877
Method:,Least Squares,F-statistic:,115.1
Date:,"Mon, 19 Oct 2020",Prob (F-statistic):,0.0
Time:,18:18:11,Log-Likelihood:,-227190.0
No. Observations:,17277,AIC:,456500.0
Df Residuals:,16199,BIC:,464900.0
Df Model:,1077,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.001e+04,1.9e+05,-0.474,0.636,-4.62e+05,2.82e+05
sqft_lot,0.2589,0.041,6.387,0.000,0.179,0.338
waterfront,6.754e+05,1.68e+04,40.263,0.000,6.42e+05,7.08e+05
sqft_above,149.0809,3.048,48.914,0.000,143.107,155.055
sqft_basement,103.2180,3.392,30.427,0.000,96.569,109.867
sqft_living15,20.4915,2.978,6.882,0.000,14.655,26.328
sqft_lot15,-0.0030,0.094,-0.032,0.974,-0.186,0.180
Renovated,4.939e+04,6038.526,8.179,0.000,3.76e+04,6.12e+04
Age_When_Sold,106.9691,369.912,0.289,0.772,-618.099,832.037

0,1,2,3
Omnibus:,8938.892,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,529379.7
Skew:,1.704,Prob(JB):,0.0
Kurtosis:,29.903,Cond. No.,2.61e+22


In [11]:
# Columns treated as continuous predictors.
continuous = [
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
    'sqft_living15',
    'sqft_lot15',
    'Age_When_Sold',
]

In [9]:
#Remove Correlated Features
# The correlation check runs on every training column; restricting it to the
# `continuous` subset is left disabled.
test = X_train  # [continuous]

In [10]:
# Absolute pairwise correlations between all columns of `test`.
corr_abs = test.corr().abs()

# Keep only the cells above the diagonal so each unordered pair appears once
# and self-correlations are excluded. De-duplicating on the coefficient value
# instead would also discard unrelated pairs that happen to share the same
# correlation.
above_diagonal = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
test_corr = corr_abs.where(above_diagonal).stack().dropna().reset_index()
test_corr.columns = ['level_0', 'level_1', 'cc']

# One row per pair, indexed by the (feature, feature) tuple, strongest first.
test_corr = (
    test_corr
    .sort_values('cc', ascending=False)
    .assign(pairs=lambda frame: list(zip(frame.level_0, frame.level_1)))
    .set_index('pairs')
    .drop(['level_0', 'level_1'], axis=1)
)

# Pairs with high, but not perfect, correlation
test_corr[(test_corr.cc > .75) & (test_corr.cc < 1)]

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1
"(condition_3, condition_4)",0.813554


In [17]:
#yr_built and yr_renovated are highly correlated, so the yr_renovated column is dropped
#Age_When_Sold and Age_Since_Renovation are highly correlated, so Age_Since_Renovation is dropped
#sqft_living and sqft_above are highly correlated, so sqft_living is dropped
#Renovated*Age and Renovated are highly correlated, so Renovated*Age is dropped
#sqft_living and sqft_living15 are highly correlated, which is another reason to drop sqft_living
#condition_3 and condition_4 are also highly correlated; nothing is done about these since they are dummies from the same category
#All of these drops are done in the EDA section

In [40]:
# Stepwise feature selection over every training column, using the function's
# default thresholds.
results = stepwise_selection(X=X_train, y=y_train)

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


KeyboardInterrupt: 

In [None]:
# Histogram of every column in `test` to check how differently the features
# are scaled (the trailing semicolon hides the returned axes array).
test.hist(figsize=(7, 10));