In [1]:
# numpy, pandas, matplotlib, seaborn, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and
# the old name was later removed, so prefer the new name when this Matplotlib
# provides it and fall back to the legacy name on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Apply seaborn's 'darkgrid' axes style on top of the Matplotlib style sheet
sns.set_style('darkgrid')
%matplotlib inline

# statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols

# scikit-learn
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.feature_selection import RFECV

In [2]:
# Read the housing CSV into a dataframe, then display its first five rows
df = pd.read_csv(filepath_or_buffer='data/kc_house_data.csv')
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,7 Average,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,6 Low Average,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,7 Average,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,8 Good,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [3]:
# Work on an independent (deep) copy so the freshly loaded `df` is never modified

housing_df = df.copy(deep=True)

In [4]:
# Fill missing / placeholder values and fix column dtypes.
# Every result is assigned back to its column instead of calling a method with
# `inplace=True` on a column selection: that form is chained assignment, which
# warns in pandas 2.x and leaves the dataframe unmodified under Copy-on-Write.

# 'waterfront': treat NaN as 'NO', then encode NO/YES as 0/1 integers.
# `.map` turns any label other than 'NO'/'YES' into NaN, so the explicit int cast
# fails loudly on unexpected data instead of leaving a mixed-type column behind.
housing_df['waterfront'] = (
    housing_df['waterfront']
    .fillna('NO')
    .map({'NO': 0, 'YES': 1})
    .astype('int64')
)

# 'yr_renovated': replace NaN with 0 and convert to dtype int
housing_df['yr_renovated'] = housing_df['yr_renovated'].fillna(0.0).astype(int)

# 'view': replace NaN with the 'NONE' label
housing_df['view'] = housing_df['view'].fillna('NONE')

# 'date': convert to dtype datetime
housing_df['date'] = pd.to_datetime(housing_df['date'])

# 'sqft_basement': replace the '?' placeholder with '0.0', then convert to dtype float
housing_df['sqft_basement'] = housing_df['sqft_basement'].replace('?', '0.0').astype('float')

In [5]:
# Intended category order for each ordinal feature (also used by the Ordinal Encoder)
view_list = ['NONE', 'FAIR', 'AVERAGE', 'GOOD', 'EXCELLENT']
condition_list = ['Poor', 'Fair', 'Average', 'Good', 'Very Good']
grade_list = ['3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good',
              '9 Better', '10 Very Good', '11 Excellent', '12 Luxury', '13 Mansion']

# Cast each ordinal column to dtype 'category' and put its categories in that order
for ordinal_col, ordered_levels in (('view', view_list),
                                    ('condition', condition_list),
                                    ('grade', grade_list)):
    housing_df[ordinal_col] = (
        housing_df[ordinal_col]
        .astype('category')
        .cat.reorder_categories(ordered_levels)
    )

# Subset of the dataframe holding only the category-typed columns
ordinals = housing_df.select_dtypes('category')

In [6]:
# Instantiate OrdinalEncoder with the explicit category orders and fit it to ordinals.
# The order lists are positional: they must line up with the column order of `ordinals`.
ord_enc = OrdinalEncoder(categories = [view_list, condition_list, grade_list])
ord_enc.fit(ordinals)

# Transform the ordinal categorical subset.
# The encoded frame keeps the index of `ordinals`, so assigning its columns back into
# the housing dataframe aligns row-for-row even when that index is not a default
# RangeIndex (for example after rows have been filtered out).
ordinals_encoded = pd.DataFrame(
    ord_enc.transform(ordinals),
    columns=ordinals.columns,
    index=ordinals.index,
)
ordinals_encoded

Unnamed: 0,view,condition,grade
0,0.0,2.0,4.0
1,0.0,2.0,4.0
2,0.0,2.0,3.0
3,0.0,4.0,4.0
4,0.0,2.0,5.0
...,...,...,...
21592,0.0,2.0,5.0
21593,0.0,2.0,5.0
21594,0.0,2.0,4.0
21595,0.0,2.0,5.0


In [7]:
# Overwrite each ordinal column in housing_df with its ordinal-encoded counterpart
for encoded_col in ('view', 'condition', 'grade'):
    housing_df[encoded_col] = ordinals_encoded[encoded_col]

In [8]:
# Sanity check after cleaning: the summary should report the full row count as
# non-null for every column, along with the converted dtypes

housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21597 non-null  int64         
 1   date           21597 non-null  datetime64[ns]
 2   price          21597 non-null  float64       
 3   bedrooms       21597 non-null  int64         
 4   bathrooms      21597 non-null  float64       
 5   sqft_living    21597 non-null  int64         
 6   sqft_lot       21597 non-null  int64         
 7   floors         21597 non-null  float64       
 8   waterfront     21597 non-null  int64         
 9   view           21597 non-null  float64       
 10  condition      21597 non-null  float64       
 11  grade          21597 non-null  float64       
 12  sqft_above     21597 non-null  int64         
 13  sqft_basement  21597 non-null  float64       
 14  yr_built       21597 non-null  int64         
 15  yr_renovated   2159