# Introduction/Abstract

Data set from King County Washington.

# Packages Needed/Setting Global Variables

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

  import pandas.util.testing as tm


In [2]:
def zoningLat(List):
    lowest = List.min()
    highest = List.max()
    distance = highest - lowest
    distance = math.ceil(distance*69)
    zones = []
    for entry in List:
        for zone in list(range(1,distance+1)):
            if (lowest+(zone-1)/69) <= entry < (lowest + zone/69):
                zones.append(zone)
    return zones

def zoningLong(List):
    lowest = List.min()
    highest = List.max()
    distance = highest-lowest
    distance = math.ceil(distance*54.6)
    zones = []
    for entry in List:
        for zone in list(range(1,distance+1)):
            if (lowest+(zone-1)/54.6) <= entry < (lowest + zone/54.6):
                zones.append(zone)
    return zones

#Working on a better zoning code

# def zoning(Lat,Long):
#     lowestLat = Lat.min()
#     highestLat = Lat.max()
#     lowestLong = Long.min()
#     highestLong = Long.max()
#     distanceLat = highestLat - lowestLat
#     distanceLat = math.ceil(distance*69)
#     distanceLong = highestLong - lowestLong
#     distanceLong = math.ceil(distance*54.6)
#     Latzone = []
#     Longzone = []
#     for entry in Lat:
#         for zoneLat in list(range(1,distance+1)):
#             if ((lowest+(zone-1)/69) <= entry) & (entry < (lowest + zone/69)):
#                 Latzone.append(zoneLat)
#     for entri in Long:
#         for zoneLong in list(range(1,distance+1)):
#             if ((lowestLong+(zoneLong-1)/54.6) <= entri) & (entri < (lowestLong + zoneLong/54.6)):
#                 Longzone.append(zoneLong)
#     total_zones = distanceLat*distanceLong
    
    
    
#     return 

In [3]:
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

# EDA

## Questions about the Data

## Import and Clean the Data

In [4]:
# Import the dataset
df = pd.read_csv('kc_house_data.csv')

#Dropping id column
df = df.drop(['id'],axis=1)

# Creating a new column for the year sold, dropping origianl date
df['year_sold'] = df.date.apply(lambda x: x[-4:]).astype(int)
df = df.drop(['date'], axis=1)

# Changing bedroom number for outlier, I think it was a typo of hitting 3 twice
# based off of the sqft
df.at[15856, 'bedrooms'] = 3

# Replacing NaN values in 'waterfront' with 0
df['waterfront'] = df['waterfront'].fillna(0)

# Replacing NaN values in view with the median/mode value which is 0 (63 entries)
# 0 means house was not viewed
df['view'] = df['view'].fillna(0)

# Replacing ? in sqft_basement and converting to float
df['sqft_basement'] = df['sqft_basement'].replace('?',0).astype(float)

# yr_renovated and renovated columns
df['yr_renovated'] = df['yr_renovated'].fillna(0)
df['Renovated'] = df['yr_renovated'] > 0
df['Renovated'] = df['Renovated'].astype(int)
df['yr_renovated'] = df['yr_renovated'].replace(0,df['yr_built'])

#Adding column for age when sold
df['Age_When_Sold'] = df['year_sold'] - df['yr_built']

#Adding column for years since renovation when sold and dropping yr_renovated
df['Age_Since_Renovation'] = df['year_sold'] - df['yr_renovated']

#Adding interaction variable Renovated*Age since renovation
df['Renovated*Age'] = df['Renovated'] * df['Age_Since_Renovation']

#Adding zones for Lat and Long
df['Lat_Zones'] = zoningLat(df['lat'])
df['Long_Zones'] = zoningLong(df['long'])
df = df.drop(['lat','long'],axis=1)

In [5]:
#Creating Dummie Variables double check this list
dummy_list = ['bedrooms', 'bathrooms', 'floors', 'view', 'condition',
                'grade', 'yr_built', 'zipcode', 'year_sold',
                'Lat_Zones','Long_Zones']
dum=df[dummy_list].astype('category')
dummies = pd.get_dummies(dum[dummy_list],prefix=dummy_list,drop_first=True)
df_preprocessed = df.drop(dummy_list, axis=1)
df_preprocessed = pd.concat([df_preprocessed,dummies],axis=1)

In [6]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,year_sold,Renovated,Age_When_Sold,Age_Since_Renovation,Renovated*Age,Lat_Zones,Long_Zones
0,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,3,7,1180,0.0,1955,1955.0,98178,1340,5650,2014,0,59,59.0,0.0,25,15
1,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,400.0,1951,1991.0,98125,1690,7639,2014,1,63,23.0,23.0,39,11
2,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,0.0,1933,1933.0,98028,2720,8062,2015,0,82,82.0,0.0,41,16
3,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,910.0,1965,1965.0,98136,1360,5000,2014,0,49,49.0,0.0,26,7
4,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,0.0,1987,1987.0,98074,1800,7503,2015,0,28,28.0,0.0,32,26


## Notes

In [7]:
# There is 21 columns, one being the id (Need to change this list)
# Take out yr_build and put age in the continuous
# Add age since renovation in continuous can
# Change yr_renovated as a yes or a no
categoricals = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition',
                'grade', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long','year_sold']
continuous = ['sqft_living', 'sqft_lot', 'sqft_above',
              'sqft_basement', 'sqft_living15', 'sqft_lot15','Age_When_Sold','Age_Since_Renovation']

In [8]:
#Before creating dummy variables lets see correlation, don't take out correlation yet
#but make a list of what's highly correlated

# Modeling

In [9]:
y = df_preprocessed['price']
X = df_preprocessed.drop(['price'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
X_train_int = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_int).fit()
model.summary()
#Adj. R-squared values
#First run 0.853
#Second run 0.850 (after taking out yr_renovated)
#Third run 0.857 (after taking out Age_Since_Renovation)
#Fourth run 0.851 (after taking out sqft_living)
#Fifth run 0.851 (after taking out Renovated*Age)

0,1,2,3
Dep. Variable:,price,R-squared:,0.853
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,282.1
Date:,"Sun, 18 Oct 2020",Prob (F-statistic):,0.0
Time:,18:14:15,Log-Likelihood:,-229200.0
No. Observations:,17277,AIC:,459100.0
Df Residuals:,16928,BIC:,461800.0
Df Model:,348,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-14.4106,1.181,-12.200,0.000,-16.726,-12.095
sqft_living,94.6164,13.797,6.858,0.000,67.572,121.661
sqft_lot,0.2697,0.039,6.897,0.000,0.193,0.346
waterfront,5.934e+05,1.67e+04,35.600,0.000,5.61e+05,6.26e+05
sqft_above,59.0595,13.869,4.258,0.000,31.874,86.245
sqft_basement,11.6286,13.657,0.851,0.395,-15.142,38.399
yr_renovated,-0.5025,109.680,-0.005,0.996,-215.487,214.482
sqft_living15,24.8752,3.007,8.271,0.000,18.980,30.770
sqft_lot15,-0.1547,0.062,-2.501,0.012,-0.276,-0.033

0,1,2,3
Omnibus:,12266.167,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1788124.91
Skew:,2.52,Prob(JB):,0.0
Kurtosis:,52.584,Cond. No.,2.93e+20


In [11]:
continuous = ['sqft_lot', 'sqft_above',
              'sqft_basement', 'sqft_living15', 'sqft_lot15','Age_When_Sold']

In [12]:
#Remove Correlated Features
test = X_train[continuous]

In [13]:
test_corr=test.corr().abs().stack().reset_index().sort_values(0, ascending=False)
test_corr['pairs'] = list(zip(test_corr.level_0, test_corr.level_1))
test_corr.set_index(['pairs'],inplace=True)
test_corr.drop(['level_0','level_1'],axis=1,inplace=True)
test_corr.columns = ['cc']
test_corr.drop_duplicates(inplace=True)
test_corr[(test_corr.cc >.75) & (test_corr.cc<1)]

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1


In [14]:
#Looks like yr_built and yr_renovated is highly correlated, dropping yr_renovated column
#Looks like Age_When_sold and Age_Since_Renovation is highly correlated, dropping Age_Since_Renovation
#Looks like sqft_living and sqft_above is highly correlated, dropping sqft_living
#Looks like Renovated*Age and Renovated is highly correlated, dropping Renovated*Age
#Looks like sqft_living and sqft_living15 is highly correlated, dropping sqft_living

In [15]:
#Correlations that are getting dropped
X_train = X_train.drop(['yr_renovated'],axis=1)
X_train = X_train.drop(['Age_Since_Renovation'],axis=1)
X_train = X_train.drop(['sqft_living'],axis=1)
X_train = X_train.drop(['Renovated*Age'],axis=1)

In [16]:
stepwise_selection(X_train,y_train)

  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Add  sqft_basement                  with p-value 0.0
Add  grade_10                       with p-value 0.0
Add  grade_9                        with p-value 0.0
Add  grade_11                       with p-value 0.0
Add  sqft_living15                  with p-value 0.0
Add  sqft_above                     with p-value 0.0
Add  Age_When_Sold                  with p-value 2.52156e-307
Add  zipcode_98004                  with p-value 1.241e-271
Add  waterfront                     with p-value 1.5025e-270
Add  grade_13                       with p-value 8.379e-241
Add  grade_12                       with p-value 5.66348e-240
Add  zipcode_98039                  with p-value 6.40433e-189
Add  grade_8                        with p-value 1.49938e-187
Add  zipcode_98112                  with p-value 1.52358e-132
Add  zipcode_98040                  with p-value 3.48891e-135
Add  Lat_Zones_37                   with p-value 9.39082e-116
Add  Lat_Zones_36                   with p-value 2.62072e-125
Add  

KeyboardInterrupt: 

In [None]:
#Looking at scaling
test.hist(figsize=(7,10));