<a href="https://colab.research.google.com/github/smccracken13/Zestimate-Project/blob/main/Zestimate_Modeling_Linear%2C_KNN%2C_RF_(McCracken).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pre-processing and Training Data Development

The goals of this notebook are to:

1. Create dummies for categorical data
2. Split the data into train and test sets
3. Scale the data
4. Test out the following models: Linear Regression, KNN, Random Forest, XGBoost

In [1]:
from google.colab import files
# load zillow_clean.csv
files.upload()

Saving zillow_clean.csv to zillow_clean.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv('zillow_clean.csv', low_memory=False, index_col = 'Unnamed: 0')

In [4]:
# Remove absolute log error column
df.drop(columns=['fips', 'abs_log_error'], inplace = True)

In [5]:
# set index to parcelid
df.set_index('parcelid')

Unnamed: 0_level_0,logerror,transaction_month,transaction_day,transaction_quarter,aircon,architecture,basementsqft,bathroomcnt,bedroomcnt,framing,...,numberofstories,fireplaceflag,tav_built,tax_assessed_value,assessmentyear,tav_land,property_tax,taxdelinquencyflag,taxdelinquencyyear,age
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11016594,0.0276,Jan,1,1st,Central,not given,not given,2.0,3.0,not given,...,1.0,not given,122754.0,360170.0,2015.0,237416.0,6735.88,0.0,not applicable,57.0
14366692,-0.1684,Jan,1,1st,not given,not given,not given,3.5,4.0,not given,...,1.0,not given,346458.0,585529.0,2015.0,239071.0,10153.02,0.0,not applicable,2.0
12098116,-0.0040,Jan,1,1st,Central,not given,not given,3.0,2.0,not given,...,1.0,not given,61994.0,119906.0,2015.0,57912.0,11484.48,0.0,not applicable,76.0
12643413,0.0218,Jan,2,1st,Central,not given,not given,2.0,2.0,not given,...,1.0,not given,171518.0,244880.0,2015.0,73362.0,3048.74,0.0,not applicable,29.0
14432541,-0.0050,Jan,2,1st,not given,not given,not given,2.5,4.0,not given,...,2.0,not given,169574.0,434551.0,2015.0,264977.0,5488.96,0.0,not applicable,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10774160,-0.0356,Dec,30,4th,Central,not given,not given,1.0,1.0,not given,...,1.0,not given,43800.0,191000.0,2015.0,147200.0,2495.24,0.0,not applicable,37.0
12046695,0.0070,Dec,30,4th,not given,not given,not given,3.0,3.0,not given,...,1.0,not given,117893.0,161111.0,2015.0,43218.0,1886.54,0.0,not applicable,51.0
12995401,-0.2679,Dec,30,4th,not given,not given,not given,2.0,4.0,not given,...,1.0,not given,22008.0,38096.0,2015.0,16088.0,1925.70,1.0,2014.0,92.0
11402105,0.0602,Dec,30,4th,not given,not given,not given,2.0,2.0,not given,...,1.0,not given,132991.0,165869.0,2015.0,32878.0,2285.57,0.0,not applicable,35.0


# One-hot encoding

In [6]:
# get list of categorical columns
cat_cols = ['transaction_month', 'transaction_day','transaction_quarter','aircon',
            'architecture', 'basementsqft', 'framing', 'deck', 'heating',
            'poolsizesum', 'county_land_use_code', 'land_use_code','zoning_code',
            'city', 'county', 'neighborhood','zipcode', 'storytypeid', 'material',
            'patio_sqft', 'shed_sqft','assessmentyear', 'taxdelinquencyyear','has_spa',
            'pool_with_spa', 'pool_without_spa', 'fireplaceflag']

prefix_list = ['tm', 'td', 'tq', 'air', 'arch', 'bsqft', 'fram', 'deck', 'heat',
               'poolsize', 'county_lu_code', 'lu_code', 'zoning', 'city',
               'county', 'neigh', 'zip', 'storyid', 'material', 'patiosqft', 'shedsqft',
               'assessyear', 'taxdelyear', 'has_spa', 'pool_with_spa', 'pool_without_spa', 'fireplaceflag']

prefix_dict = dict(zip(cat_cols, prefix_list))

In [7]:
# Identify columns that have 'not given' to make sure the get one-hot encoded
not_given_cols = df.columns[df.isin(['not given']).any()]
print(not_given_cols)

Index(['aircon', 'architecture', 'basementsqft', 'framing', 'deck', 'has_spa',
       'heating', 'poolsizesum', 'pool_with_spa', 'pool_without_spa',
       'zoning_code', 'city', 'neighborhood', 'storytypeid', 'material',
       'patio_sqft', 'shed_sqft', 'fireplaceflag'],
      dtype='object')


In [8]:
# one-hot encode cat cols
df = pd.get_dummies(df, columns = cat_cols, prefix= prefix_dict, drop_first=True)
print(len(df.columns))

984


# Train and Test Split

In [9]:
# Create train_test_split
X = df.loc[:, df.columns != 'logerror']
y = df['logerror']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Scaling



In [10]:
num_cols = ['bathroomcnt', 'bedroomcnt', 'quality', 'home_sqft', 'fireplacecnt', 'garagecarcnt',
            'garage_sqft', 'latitude', 'longitude', 'lot_sqft', 'poolcnt', 'roomcnt', 'unitcnt',
            'numberofstories','tav_built','tax_assessed_value','tav_land','property_tax','age']

In [11]:
# Instantiate StandardScaler
scaler = StandardScaler()

# Fit scaler to the training data
scaler.fit(X_train)

# Transform the train and test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Instantiate StandardScaler on num_cols only
scaler_num = StandardScaler()

# Fit scaler to the training data
scaler_num.fit(X_train[num_cols])

# Transform the train and test data
X_train_scaled_num = scaler_num.transform(X_train[num_cols])
X_test_scaled_num = scaler_num.transform(X_test[num_cols])

In [13]:
print('X_train shape:', X_train.shape)
print('X_train_scaled shape:', X_train_scaled.shape)
print('y_train shape:', y_train.shape)

print('X_test shape:', X_test.shape)
print('X_test_scaled shape:', X_test_scaled.shape)
print('y_test shape:', y_test.shape)

X_train shape: (72220, 983)
X_train_scaled shape: (72220, 983)
y_train shape: (72220,)
X_test shape: (18055, 983)
X_test_scaled shape: (18055, 983)
y_test shape: (18055,)


# Modeling

1. Linear Regression
2. K Nearest Neighbors
3. Random Forest

In [14]:
# load modeling packages
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# load metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error

1. Linear Regression

In [15]:
# Create linear regression model
lin_reg = LinearRegression()

# fit and predict
lin_reg.fit(X_train_scaled, y_train)
y_pred = lin_reg.predict(X_test_scaled)

# model evaluation
print('Linear Regression Model (all cols)')
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('MAE :', mean_absolute_error(y_test, y_pred))

Linear Regression Model (all cols)
MSE : 1.2180978301552139e+20
RMSE : 11036746939.90586
MAE : 145131378.3934689


In [16]:
# Create linear regression model for numeric columns only
# Use only numerical data (that is scaled)
lin_reg = LinearRegression()

# fit and predict
lin_reg.fit(X_train_scaled_num, y_train)
y_pred = lin_reg.predict(X_test_scaled_num)

# model evaluation
print('Linear Regression Model (numeric only)')
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('MAE :', mean_absolute_error(y_test, y_pred))

Linear Regression Model (numeric only)
MSE : 0.023877017503954212
RMSE : 0.15452189975519395
MAE : 0.06717579397639704


2. K Nearest Neighbors

In [17]:
# Create KNN regression model
knn_reg = KNeighborsRegressor()

# fit and predict
knn_reg.fit(X_train_scaled, y_train)
y_pred = knn_reg.predict(X_test_scaled)

# model evaluation
print('KNN Regression Model (all cols)')
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('MAE :', mean_absolute_error(y_test, y_pred))

KNN Regression Model (all cols)
MSE : 0.028453071250711715
RMSE : 0.16868038193788784
MAE : 0.08334299086125727


In [18]:
# Create KNN regression model for numeric columns only
# Use only numerical data (that is scaled)
knn_reg = KNeighborsRegressor()

# fit and predict
knn_reg.fit(X_train_scaled_num, y_train)
y_pred = knn_reg.predict(X_test_scaled_num)

# model evaluation
print('KNN Regression Model (numeric only)')
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('MAE :', mean_absolute_error(y_test, y_pred))

KNN Regression Model (numeric only)
MSE : 0.027959422561107724
RMSE : 0.16721071305723126
MAE : 0.08199119357518693


3. Random Forest Regressor

In [19]:
# Create Random Forest Regression Model
# Use all data, not scaled
rfc_reg = RandomForestRegressor()

# fit and predict
rfc_reg.fit(X_train, y_train)
y_pred = rfc_reg.predict(X_test)

# model evaluation
print('Random Forest Regression Model')
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('MAE :', mean_absolute_error(y_test, y_pred))

Random Forest Regression Model
MSE : 0.025152852861078868
RMSE : 0.15859650961190436
MAE : 0.07276532013292716
