In [1]:
import warnings
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# modeling methods
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

import wrangle

warnings.filterwarnings("ignore")



In [2]:
# Acquire and clean the data through the project helper, then keep a
# reproducible (seeded) sample of 100,000 rows.
df = wrangle.wrangle_zillow().sample(n=100000, random_state=21)

# One-hot encode the `fips` column, dropping its first level.
full = pd.get_dummies(data=df, columns=['fips'], drop_first=True)

# Split into train / validate / test pieces with 'value' as the target.
# split_data is a project helper; the seven-way unpacking below mirrors
# what it returns.
(train,
 X_train, y_train,
 X_val, y_val,
 X_test, y_test) = wrangle.split_data(full, 'value')

# Display the shapes of the sample and of the resulting splits.
df.shape, train.shape, X_val.shape, X_test.shape

((100000, 7), (56000, 8), (24000, 7), (20000, 7))

In [3]:
# Wrap the validation target in a DataFrame so that per-model prediction
# columns can be stored next to the actual values in later cells.
y_val = pd.DataFrame(y_val)


In [4]:
# Baseline predictions: two constant columns holding the mean and the median
# of the training target, each rounded to one decimal place.
y_train = pd.DataFrame(y_train).assign(
    base_mean=lambda frame: round(frame['value'].mean(), 1),
    base_median=lambda frame: round(frame['value'].median(), 1),
)


In [5]:
# Evaluate both constant baselines on the training set using RMSE.
# The mean baseline is the one kept as the reference for later models.
# sqrt() is used here to match how every other cell computes RMSE.
rmse_train_mean = sqrt(mean_squared_error(y_train.value, y_train.base_mean))
rmse_train_median = sqrt(mean_squared_error(y_train.value, y_train.base_median))

# One print call per statement; the previous `print(...),print(...)` form
# built and discarded a tuple of None values. Printed output is unchanged.
print('RMSE Mean: ')
print(rmse_train_mean)
print('\n')
print('RMSE Median: ')
print(rmse_train_median)

# Training-set RMSE per model, keyed by model name; seeded with the baseline.
rmse = {'baseline': rmse_train_mean}

RMSE Mean: 
465755.3753754042


RMSE Median: 
475355.5144812572


In [6]:
# Validation-set RMSE per model, keyed by model name; filled in by later cells.
rmse_val = {}

In [7]:
# Ordinary least squares on the training features. Predictions are rounded
# to one decimal, stored beside the target, and the training RMSE is
# recorded under the 'lm' key.
lm = LinearRegression().fit(X_train, y_train['value'])
y_train['lm'] = np.round(lm.predict(X_train), 1)
rmse['lm'] = sqrt(mean_squared_error(y_train['value'], y_train['lm']))


In [8]:
# LassoLars with alpha=0.1 on the training features. Predictions are rounded
# to one decimal, stored beside the target, and the training RMSE is
# recorded under the 'lasso' key.
lasso = LassoLars(alpha=.1).fit(X_train, y_train['value'])
y_train['lasso'] = np.round(lasso.predict(X_train), 1)
rmse['lasso'] = sqrt(mean_squared_error(y_train['value'], y_train['lasso']))

In [9]:
# TweedieRegressor with power=0 on the training features. Predictions are
# rounded to one decimal and the training RMSE is recorded under 'tweedy0'.
# NOTE(review): the recorded training RMSE for this model is about the same
# as the baseline, and warnings are silenced globally in the import cell, so
# a solver convergence warning would not be visible - worth confirming.
tweedy = TweedieRegressor(power=0).fit(X_train, y_train['value'])
y_train['tweedy0'] = np.round(tweedy.predict(X_train), 1)
rmse['tweedy0'] = sqrt(mean_squared_error(y_train['value'], y_train['tweedy0']))

In [10]:
# TweedieRegressor with power=1 on the training features. Predictions are
# rounded to one decimal and the training RMSE is recorded under 'tweedy1'.
# NOTE(review): the recorded training RMSE for this model is worse than the
# baseline, and warnings are silenced globally in the import cell, so a
# solver convergence warning would not be visible - worth confirming.
tweedy1 = TweedieRegressor(power=1).fit(X_train, y_train['value'])
y_train['tweedy1'] = np.round(tweedy1.predict(X_train), 1)
rmse['tweedy1'] = sqrt(mean_squared_error(y_train['value'], y_train['tweedy1']))

In [11]:
# Expand the feature sets into polynomial terms of degree 1, 2 and 3.
# Each transformer is fitted on the training features only. Validation and
# test features go through transform(), not fit_transform(), so held-out data
# never re-fits the transformer and always gets the train-fitted expansion.
pf1 = PolynomialFeatures(degree=1)
X_train_degree1 = pf1.fit_transform(X_train)
X_val_degree1 = pf1.transform(X_val)

pf2 = PolynomialFeatures(degree=2)
X_train_degree2 = pf2.fit_transform(X_train)
X_val_degree2 = pf2.transform(X_val)

pf3 = PolynomialFeatures(degree=3)
X_train_degree3 = pf3.fit_transform(X_train)
X_val_degree3 = pf3.transform(X_val)
# Only the degree-3 expansion is applied to the test set.
X_test_degree3 = pf3.transform(X_test)

In [12]:
# Fit a linear regression on each polynomial expansion of X_train and record
# the training and validation RMSE for each degree.
def _fit_poly_lm(key, X_train_poly, X_val_poly):
    """Fit a scaled linear regression on one polynomial expansion and score it.

    Side effects (all on notebook-level objects):
      * adds a `key` column of predictions to `y_train` and `y_val`
      * stores the training RMSE in `rmse[key]`
      * stores the validation RMSE in `rmse_val[key]`

    Parameters
    ----------
    key : str
        Name used for the prediction columns and the RMSE dictionary entries.
    X_train_poly, X_val_poly : array-like
        Polynomial-expanded training and validation features.

    Returns
    -------
    The fitted pipeline, which exposes `.predict()` like a plain estimator.
    """
    # `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0
    # and removed in 1.2. Scaling in a pipeline step is the replacement the
    # scikit-learn deprecation notice recommends, and it works on old and new
    # versions alike.
    model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
    model.fit(X_train_poly, y_train['value'])

    y_train[key] = model.predict(X_train_poly)
    rmse[key] = sqrt(mean_squared_error(y_train['value'], y_train[key]))

    y_val[key] = model.predict(X_val_poly)
    rmse_val[key] = sqrt(mean_squared_error(y_val['value'], y_val[key]))
    return model


# The three model names are kept because later cells refer to them.
lm_poly = _fit_poly_lm('lmpoly1', X_train_degree1, X_val_degree1)
lm_poly2 = _fit_poly_lm('lmpoly2', X_train_degree2, X_val_degree2)
lm_poly3 = _fit_poly_lm('lmpoly3', X_train_degree3, X_val_degree3)



In [15]:
# Apply the degree-3 polynomial model to the held-out test set and display
# its RMSE (about $93,070 in the recorded run). Predictions are computed
# once; the earlier stand-alone predict() call discarded its result.
sqrt(mean_squared_error(y_test, lm_poly3.predict(X_test_degree3)))

93070.720569808

In [13]:
# Display the training-set RMSE recorded for the baseline and each model.
rmse

{'baseline': 465755.3753754042,
 'lm': 113880.82936547349,
 'lasso': 113880.86787754115,
 'tweedy0': 463635.4309044032,
 'tweedy1': 697190.8871938162,
 'lmpoly1': 113880.82926910011,
 'lmpoly2': 86838.36858790288,
 'lmpoly3': 64139.40683789053}

In [14]:
# Display the validation-set RMSE recorded for the polynomial models.
rmse_val

{'lmpoly1': 65633.88749626571,
 'lmpoly2': 70887.8244599154,
 'lmpoly3': 66043.62389005676}