# Log Transformations

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style('darkgrid')

import warnings
warnings.filterwarnings('ignore')

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import sklearn.metrics as metrics

import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
import statsmodels.stats.api as sms

In [2]:
df = pd.read_csv('model_data.csv')
df.head()

Unnamed: 0,price,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15,br_2,...,gd_4,gd_5,gd_6,gd_7,gd_8,gd_9,gd_10,gd_11,gd_12,wf_1.0
0,221900.0,-1.096635,-0.304539,1180,0.0,1955,1955.0,-0.992008,-0.319209,0,...,0,0,0,1,0,0,0,0,0,0
1,538000.0,0.870195,0.046284,2170,400.0,1951,1991.0,-0.245102,0.137043,0,...,0,0,0,1,0,0,0,0,0,0
2,180000.0,-2.175269,0.502317,770,0.0,1933,1933.0,1.286644,0.218571,1,...,0,0,1,0,0,0,0,0,0,0
3,604000.0,0.185533,-0.477261,1050,910.0,1965,1965.0,-0.944324,-0.50409,0,...,0,0,0,1,0,0,0,0,0,0
4,510000.0,-0.203973,0.201026,1680,0.0,1987,1987.0,-0.042143,0.109869,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
linreg = LinearRegression()

# our feature variables
X = df.drop(['price'], axis=1)

# our target variable
y = df['price']

# separate our data into testing and training subsets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

# fitting our training feature variables to our training target variable (price)
linreg.fit(X_train, y_train)

# predicted prices from our training subset
y_hat_train = linreg.predict(X_train)

# predicted prices from our testing subset
y_hat_test = linreg.predict(X_test)

# calculating residuals
train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test

# calculate mean square error for our test and training results
train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squared Error:', round(train_mse,2))
print('Test Mean Squared Error:', round(test_mse,2))

# calculate r-squared
r2 = round(linreg.score(X,y), 3)
print("R Squared:", r2)

# calculate mean absolute error
mean_abs_err = metrics.mean_absolute_error(y_test, y_hat_test)
print("Mean Absolute Error:", round(mean_abs_err,2))

# calculate root mean squared error
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat_test))
print("Root Mean Squared Error:", round(rmse,2))

# calculate mean predicted price and mean actual price
average_predicted_price = y_hat_test.mean()
average_actual_price = y_test.mean()
print("Average Predicted Price:", round(average_predicted_price,2))
print("Average Actual Price:", round(average_actual_price,2))
print("Difference:", round(average_predicted_price - average_actual_price, 2))

Train Mean Squared Error: 18734722588.94
Test Mean Squared Error: 18450854084.6
R Squared: 0.612
Mean Absolute Error: 104553.88
Root Mean Squared Error: 135833.92
Average Predicted Price: 480476.9
Average Actual Price: 480593.35
Difference: -116.45


In [7]:
## R-Squared of .61, but RSME is 135,833.92 meaning that my model is about $135,833 off of the price value of a home.
# This will be my baseline as I start to look further into how to tighten this model up. 