In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.linear_model import LinearRegression
from math import sqrt
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

import env
import wrangle_z
import z_split_scale

In [2]:
df_z = wrangle_z.wrangle_zillow()

## Descriptive statistics

In [3]:
df_z.head()

Unnamed: 0,propertylandusedesc,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,taxamount,fips
0,Single Family Residential,2.0,4.0,1604.0,498347.0,6089.82,6037.0
1,Single Family Residential,3.0,3.0,2384.0,549917.0,6679.55,6037.0
2,Single Family Residential,2.0,3.0,1574.0,235272.0,3876.31,6037.0
3,Single Family Residential,2.0,2.0,1619.0,340000.0,4206.15,6037.0
4,Single Family Residential,3.0,2.0,2408.0,2017254.0,24353.42,6037.0


In [4]:
df_z.tail(10)

Unnamed: 0,propertylandusedesc,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,taxamount,fips
15953,Single Family Residential,2.0,3.0,1284.0,208601.0,2808.39,6037.0
15954,Single Family Residential,2.0,3.0,1284.0,364080.0,4647.69,6037.0
15955,Single Family Residential,5.0,6.0,2745.0,337869.0,5243.39,6037.0
15956,Single Family Residential,1.0,3.0,1357.0,192065.0,3124.43,6037.0
15957,Single Family Residential,1.0,2.0,872.0,59830.0,1084.21,6037.0
15958,Single Family Residential,2.0,3.0,2040.0,321351.0,4532.87,6037.0
15959,Single Family Residential,1.0,2.0,1292.0,32830.0,813.11,6037.0
15960,Single Family Residential,1.0,1.0,684.0,324000.0,4495.39,6037.0
15961,Single Family Residential,2.0,4.0,1536.0,284770.0,4014.27,6037.0
15962,Single Family Residential,2.0,4.0,1706.0,441000.0,6350.89,6037.0


In [5]:
df_z.sample(5)

Unnamed: 0,propertylandusedesc,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,taxamount,fips
14085,Single Family Residential,2.0,3.0,1229.0,550000.0,6625.01,6037.0
603,Single Family Residential,3.0,4.0,1748.0,101513.0,1358.66,6037.0
3701,Single Family Residential,1.0,2.0,814.0,117310.0,2476.13,6037.0
4136,Single Family Residential,2.0,4.0,1258.0,366879.0,4308.61,6037.0
11090,Single Family Residential,2.0,3.0,1874.0,229244.0,2950.49,6037.0


In [6]:
df_z.describe()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,taxamount,fips
count,15963.0,15963.0,15963.0,15963.0,15963.0,15963.0
mean,2.215749,3.252835,1837.059199,463316.5,5680.536779,6044.516758
std,1.026447,0.949797,984.656553,643297.6,7465.277971,17.335701
min,1.0,1.0,320.0,12168.0,49.18,6037.0
25%,2.0,3.0,1233.0,182063.5,2495.545,6037.0
50%,2.0,3.0,1592.0,320432.0,4063.98,6037.0
75%,3.0,4.0,2162.5,528425.5,6408.97,6037.0
max,20.0,25.0,26345.0,19129820.0,228999.21,6111.0


In [7]:
df_z.shape

(15963, 7)

In [8]:
df_z.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15963 entries, 0 to 15962
Data columns (total 7 columns):
propertylandusedesc             15963 non-null object
bathroomcnt                     15963 non-null float64
bedroomcnt                      15963 non-null float64
calculatedfinishedsquarefeet    15963 non-null float64
taxvaluedollarcnt               15963 non-null float64
taxamount                       15963 non-null float64
fips                            15963 non-null float64
dtypes: float64(6), object(1)
memory usage: 997.7+ KB


In [9]:
df_z.isnull().sum()

propertylandusedesc             0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
taxvaluedollarcnt               0
taxamount                       0
fips                            0
dtype: int64

## Split variables into X and y, then train and test

In [45]:
X = df_z.drop(columns=['propertylandusedesc', 'taxvaluedollarcnt', 'taxamount', 'fips'])
y=df_z[['taxvaluedollarcnt']]

In [46]:
x_train, x_test, y_train, y_test = z_split_scale.split_my_data(X,y)

In [47]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12770 entries, 8477 to 3582
Data columns (total 3 columns):
bathroomcnt                     12770 non-null float64
bedroomcnt                      12770 non-null float64
calculatedfinishedsquarefeet    12770 non-null float64
dtypes: float64(3)
memory usage: 399.1 KB


In [15]:
x_train.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet
8477,2.5,3.0,1781.0
10275,2.0,4.0,1851.0
1659,3.0,6.0,2788.0
13290,1.0,3.0,1264.0
7209,2.0,3.0,1802.0


In [16]:
y_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3193 entries, 10657 to 3305
Data columns (total 1 columns):
taxvaluedollarcnt    3193 non-null float64
dtypes: float64(1)
memory usage: 49.9 KB


In [17]:
y_test.head()

Unnamed: 0,taxvaluedollarcnt
10657,208546.0
5788,137176.0
1185,227220.0
9415,220308.0
6987,395686.0


## Apply standard scaler to x_train and x_test

In [18]:
train_x_scaled_data, test_x_scaled_data,scaler_x_train, scaler_x_test = z_split_scale.standard_scaler(x_train,x_test)

In [19]:
train_x_scaled_data.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet
8477,0.278822,-0.262844,-0.054008
10275,-0.20766,0.78378,0.01688
1659,0.765304,2.877028,0.965763
13290,-1.180624,-0.262844,-0.577564
7209,-0.20766,-0.262844,-0.032742


In [20]:
test_x_scaled_data.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet
10657,-1.199997,-1.359338,-1.057453
5788,-1.199997,-0.280182,-0.867332
1185,0.759144,-0.280182,0.151103
9415,0.759144,0.798974,0.844789
6987,-0.710212,-0.280182,-0.861165


## Run regression model on data and baseline

In [21]:
lm1 = LinearRegression()

In [28]:
lm1.fit(train_x_scaled_data, y_train)
print("Linear Model:", lm1)

lm1_y_intercept = lm1.intercept_
print("intercept: ", lm1_y_intercept)

lm1_coefficients = lm1.coef_
print("coefficients: ", lm1_coefficients)


Linear Model: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
intercept:  [462043.62960063]
coefficients:  [[  54558.47376137 -137406.19237539  451263.14727761]]


In [30]:
y_pred_lm1 = lm1.predict(train_x_scaled_data)
y_pred_lm1

array([[489000.3946324 ],
       [350635.07111573],
       [544289.11543087],
       ...,
       [364555.6651571 ],
       [302651.54268246],
       [663497.55574983]])

In [25]:
mse_lm1 = mean_squared_error(y_train, y_pred_lm1)
print("linear model\n  mean squared error: {:.3}".format(mse_lm1)) 

r2_lm1 = r2_score(y_train, y_pred_lm1)
print('  {:.2%} of the variance in the tax value can be explained by the model.'.format(r2_lm1))

linear model
  mean squared error: 2.4e+11
  42.98% of the variance in the tax value can be explained by the model.


In [49]:
y_pred_baseline = np.array([y_train.mean()[0]]*len(y_train))
MSE = mean_squared_error(y_train, y_pred_baseline)
SSE = MSE*len(y_train)
RMSE = sqrt(MSE)

evs = explained_variance_score(y_train, y_pred_baseline)

print('sum of squared errors\n model: {:.5}'.format(SSE))
print('  {:.2%} of the variance in the tax value can be explained by the baseline model.'.format(evs))

sum of squared errors
 model: 5.3753e+15
  0.00% of the variance in the tax value can be explained by the baseline model.
