<a href="https://colab.research.google.com/github/smccracken13/Zestimate-Project/blob/main/Zestimate_Modeling_(McCracken).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pre-processing and Training Data Development

The goals of this notebook are to:

1. Create dummies for categorical data
2. Split the data into train and test sets
3. Scale the data
4. Test out the following models: Linear Regression, KNN, Random Forest, XGBoost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files

In [3]:
# load zillow_clean.csv
files.upload()

Saving zillow_clean.csv to zillow_clean.csv


In [19]:
df = pd.read_csv('zillow_clean.csv', low_memory=False, index_col = 'Unnamed: 0')

In [None]:
# Remove absolute log error column
df.drop(columns=['fips', 'abs_log_error'], inplace = True)

In [None]:
# set index to parcelid
df.set_index('parcelid')

# One-hot encoding

In [20]:
# get list of categorical columns
cat_cols = ['transaction_month', 'transaction_day','transaction_quarter','aircon',
            'architecture', 'basementsqft', 'framing', 'deck', 'heating',
            'poolsizesum', 'county_land_use_code', 'land_use_code','zoning_code',
            'city', 'county', 'neighborhood','zipcode', 'storytypeid', 'material',
            'patio_sqft', 'shed_sqft','assessmentyear', 'taxdelinquencyyear','has_spa',
            'pool_with_spa', 'pool_without_spa', 'fireplaceflag']

prefix_list = ['tm', 'td', 'tq', 'air', 'arch', 'bsqft', 'fram', 'deck', 'heat',
               'poolsize', 'county_lu_code', 'lu_code', 'zoning', 'city',
               'county', 'neigh', 'zip', 'storyid', 'material', 'patiosqft', 'shedsqft',
               'assessyear', 'taxdelyear', 'has_spa', 'pool_with_spa', 'pool_without_spa', 'fireplaceflag']

prefix_dict = dict(zip(cat_cols, prefix_list))

In [None]:
not_given_cols = df.columns[df.isin(['not given']).any()]
print(not_given_cols)

In [None]:
# one-hot encode cat cols
df = pd.get_dummies(df, columns = cat_cols, prefix= prefix_dict, drop_first=True)
print(len(df.columns))

# Train and Test Split

In [None]:
# Create train_test_split
X = df.loc[:, df.columns != 'logerror']
y = df['logerror']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#Scaling



In [None]:
num_cols = ['bathroomcnt', 'bedroomcnt', 'quality', 'home_sqft', 'fireplacecnt', 'garagecarcnt',
            'garage_sqft', 'latitude', 'longitude', 'lot_sqft', 'poolcnt', 'roomcnt', 'unitcnt',
            'numberofstories','tav_built','tax_assessed_value','tav_land','property_tax','age']

In [None]:
# Instantiate StandardScaler
scaler = StandardScaler()

# Fit scaler to the training data
scaler.fit(X_train)

# Transform the train and test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Instantiate StandardScaler on num_cols only
scaler_num = StandardScaler()

# Fit scaler to the training data
scaler_num.fit(X_train[num_cols])

# Transform the train and test data
X_train_scaled_num = scaler_num.transform(X_train[num_cols])
X_test_scaled_num = scaler_num.transform(X_test[num_cols])

In [None]:
print('X_train shape:', X_train.shape)
print('X_train_scaled shape:', X_train_scaled.shape)
print('y_train shape:', y_train.shape)

print('X_test shape:', X_test.shape)
print('X_test_scaled shape:', X_test_scaled.shape)
print('y_test shape:', y_test.shape)

# Modeling

1. Linear Regression
2. K Nearest Neighbors
3. Random Forest
4. XGBoost

In [None]:
# load modeling packages
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# load metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error

1. Linear Regression

In [21]:
# Create linear regression model
lin_reg = LinearRegression()

# fit and predict
lin_reg.fit(X_train_scaled, y_train)
y_pred = lin_reg.predict(X_test_scaled)

# model evaluation
print('Linear Regression Model (all cols)')
print('MSE : ', mean_squared_error(y_test, y_pred))
print('MAE : ', mean_absolute_error(y_test, y_pred))

Linear Regression Model (all cols)
MSE :  1.8465567909347742e+21
MAE :  679009761.0687196


In [22]:
# Create linear regression model for numeric columns only
# Use only numerical data (that is scaled)
lin_reg = LinearRegression()

# fit and predict
lin_reg.fit(X_train_scaled_num, y_train)
y_pred = lin_reg.predict(X_test_scaled_num)

# model evaluation
print('Linear Regression Model (numeric only)')
print('MSE : ', mean_squared_error(y_test, y_pred))
print('MAE : ', mean_absolute_error(y_test, y_pred))

Linear Regression Model (numeric only)
MSE :  0.023877017503954212
MAE :  0.06717579397639704


2. K Nearest Neighbors

In [23]:
# Create KNN regression model
knn_reg = KNeighborsRegressor()

# fit and predict
knn_reg.fit(X_train_scaled, y_train)
y_pred = knn_reg.predict(X_test_scaled)

# model evaluation
print('KNN Regression Model (all cols)')
print('MSE : ', mean_squared_error(y_test, y_pred))
print('MAE : ', mean_absolute_error(y_test, y_pred))

KNN Regression Model (all cols)
MSE :  0.028453071250711715
MAE :  0.08334299086125727


In [24]:
# Create KNN regression model for numeric columns only
# Use only numerical data (that is scaled)
knn_reg = KNeighborsRegressor()

# fit and predict
knn_reg.fit(X_train_scaled_num, y_train)
y_pred = knn_reg.predict(X_test_scaled_num)

# model evaluation
print('KNN Regression Model (numeric only)')
print('MSE : ', mean_squared_error(y_test, y_pred))
print('MAE : ', mean_absolute_error(y_test, y_pred))

KNN Regression Model (numeric only)
MSE :  0.027959422561107724
MAE :  0.08199119357518693


3. Random Forest Regressor

In [25]:
# Create Random Forest Regression Model
# Use all data, not scaled
rfc_reg = RandomForestRegressor()

# fit and predict
rfc_reg.fit(X_train, y_train)
y_pred = rfc_reg.predict(X_test)

# model evaluation
print('Random Forest Regression Model')
print('MSE : ', mean_squared_error(y_test, y_pred))
print('MAE : ', mean_absolute_error(y_test, y_pred))

Random Forest Regression Model
MSE :  0.025097058371440988
MAE :  0.07279008811963446


4. XGBoost

In [28]:
# Create XGBoost regression model
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators = 10, seed=1)

# fit and predict
xgb_reg.fit(X_train_scaled_num, y_train)
y_pred = xgb_reg.predict(X_test_scaled_num)

# model evaluation
print('XGBoost Regression Model')
print('MSE : ', mean_squared_error(y_test, y_pred))
print('MAE : ', mean_absolute_error(y_test, y_pred))

XGBoost Squared Error Model
MSE :  0.02416183620303665
MAE :  0.07049195631184317
