In [1]:
from prepare import calc_rmse
import prepare
import acquire


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from math import sqrt

# hypothesis testing
from scipy import stats

# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

import plotly.express as px

In [2]:
# Pull the 2017 Zillow data and run it straight through the project's prepare step
df = prepare.prepare_zillow(acquire.get_zillow_2017())

In [13]:
# Sanity check: list any prepared rows whose bath value is exactly 0
df.loc[df['bath'] == 0]

Unnamed: 0,bed,bath,squarefeet,lotsquarefeet,value,yearbuilt,fips


In [7]:
# Split on the 'value' target: one full train frame plus X / y pairs for
# the train, validate and test sets
(train,
 X_train, y_train,
 X_val, y_val,
 X_test, y_test) = prepare.split_data(df, 'value')

In [15]:
# Rows of the training features where bath == 0.
# NOTE(review): the displayed feature values all fall between 0 and 1, so X_train
# looks scaled; a 0 here is presumably the smallest bath value in the data, not a
# home with zero bathrooms. Confirm against prepare.split_data.
X_train.loc[X_train['bath'] == 0]

Unnamed: 0,bed,bath,squarefeet,lotsquarefeet,yearbuilt,fips_6059.0,fips_6111.0
9519,0.222222,0.0,0.057973,0.000707,0.518519,1.0,0.0
16947,0.000000,0.0,0.041978,0.000270,0.444444,1.0,0.0
18231,0.222222,0.0,0.058461,0.000934,0.555556,0.0,0.0
15098,0.222222,0.0,0.056025,0.000660,0.481481,0.0,0.0
27647,0.111111,0.0,0.058379,0.000760,0.333333,1.0,0.0
...,...,...,...,...,...,...,...
5093,0.222222,0.0,0.067636,0.000601,0.318519,0.0,0.0
49882,0.222222,0.0,0.058542,0.000915,0.474074,0.0,0.0
48620,0.222222,0.0,0.062114,0.001185,0.555556,0.0,0.0
31905,0.222222,0.0,0.054238,0.000659,0.533333,0.0,0.0


In [None]:
# Peek at the first five rows of the training frame
train.head(5)

In [None]:
# Re-check which training-feature rows have bath == 0
X_train.loc[X_train['bath'] == 0]

In [None]:
# Apply prepare.new_features to every X split and bind the results back to the
# real names. Assigning to the loop variable (`x = ...`) only rebinds `x`, so the
# frames returned by new_features never reached X_train / X_val / X_test.
# NOTE(review): assumes prepare.new_features returns the augmented frame — confirm.
# NOTE(review): re-running this cell applies new_features a second time to the
# already-augmented frames; re-run the split cell first if that is not safe.
X_sets = [prepare.new_features(x) for x in (X_train, X_val, X_test)]
X_train, X_val, X_test = X_sets

In [None]:
# (rows, columns) of every split, in the order returned by prepare.split_data
tuple(
    split.shape
    for split in (train, X_train, y_train, X_val, y_val, X_test, y_test)
)

### Four Questions:  
1. Are newer houses worth more?
2. Does increasing the number of beds and baths per square foot change the value?
3. Does increasing lot size increase value?
4. Does the percentage of house sq ft per lot sq ft change the value?

# Are newer houses worth more?  
Continuous vs. continuous = Pearson's r, scatter plot

In [None]:
# Scatter of value against year built, with a title and axis labels so the
# figure can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(train.yearbuilt, train.value)
ax.set(title='value vs. yearbuilt', xlabel='yearbuilt', ylabel='value')
plt.show()  # render the figure without echoing the artist repr


In [None]:
# Pearson correlation (and its p-value) between year built and value
corr, p = stats.pearsonr(train['yearbuilt'], train['value'])
(corr, p)

### It's hard to see a correlation in the scatter plot, but Pearson's r (`stats.pearsonr`) confirms a relationship

# Does increasing the number of beds and baths per square foot change the value?

In [None]:
# Engineered feature: (bed + bath) divided by squarefeet
train['bb_sqft'] = train['bed'].add(train['bath']).div(train['squarefeet'])

In [None]:
# Confirm the new bb_sqft column landed on the training frame
train.head(5)

In [None]:
# Scatter of value against (bed + bath) per squarefeet, with a title and axis
# labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(train.bb_sqft, train.value)
ax.set(title='value vs. bb_sqft', xlabel='bb_sqft', ylabel='value')
plt.show()  # render the figure without echoing the artist repr

In [None]:
# Pearson correlation (and its p-value) between bb_sqft and value
corr, p = stats.pearsonr(train['bb_sqft'], train['value'])
(corr, p)

### The plot suggests a negative correlation between bb/sqft and value, and Pearson's r confirms it

# Does increasing lot size increase value?

In [None]:
# Scatter of value against lot size, with a title and axis labels so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(train.lotsquarefeet, train.value)
ax.set(title='value vs. lotsquarefeet', xlabel='lotsquarefeet', ylabel='value')
plt.show()  # render the figure without echoing the artist repr

In [None]:
# One-tailed test of "larger lots go with higher value" at significance level alpha.
alpha = .05
corr, p = stats.pearsonr(train.lotsquarefeet, train.value)

# pearsonr reports a two-sided p-value. Halving it gives the one-tailed p-value only
# when corr has the hypothesized (positive) sign; otherwise the one-tailed
# p-value is 1 - p/2.
p_one_tailed = p / 2 if corr > 0 else 1 - p / 2

print(f'alpha: {alpha}')
print(f'corr: {corr}')
print(f'p: {p_one_tailed}')

# Only state the conclusion when the numbers actually support it.
if corr > 0 and p_one_tailed < alpha:
    print('corr is positive and p<alpha so we can say that greater lotsquarefeet increases the value')
else:
    print('corr is not positive or p>=alpha so we cannot say that greater lotsquarefeet increases the value')



# Does the percentage of house sq ft per lot sq ft change the value?

In [None]:
# Engineered feature: house square feet divided by lot square feet
train['hsf_lsf'] = train['squarefeet'].div(train['lotsquarefeet'])

In [None]:
# Scatter of value against the house-to-lot square-footage ratio, with a title
# and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(train.hsf_lsf, train.value)
ax.set(title='value vs. hsf_lsf', xlabel='hsf_lsf', ylabel='value')
plt.show()  # render the figure without echoing the artist repr

In [None]:
# Significance level, followed by the Pearson correlation (and its p-value)
# between the house-to-lot ratio and value
alpha = .05
corr, p = stats.pearsonr(train['hsf_lsf'], train['value'])
(corr, p)

Houses that occupy a larger percentage of their lot are worth more than houses that take up a smaller share of a larger lot.

# Let's run some models!

In [None]:
# Establish baseline predictions 
y_train['base_mean'] = round(y_train.value.mean(),1)
y_train['base_median'] = round(y_train.value.median(),1)

# Evaluate baseline models using RMSE - we will use Mean as the baseline
rmse_train_mean = calc_rmse(y_train.value,y_train.base_mean)
rmse_train_median = calc_rmse(y_train.value, y_train.base_median)

print('RMSE Mean: '),print(rmse_train_mean)
print('\n')
print('RMSE Median: '),print(rmse_train_median)

# Create a dict to store rmse values from our models
rmse = {'baseline':rmse_train_mean}
rmse_val = {}

In [None]:
# Check that the baseline columns were added to y_train
y_train.head(5)

In [None]:
# Rows of the training features where bed == 0
X_train.loc[X_train['bed'] == 0]

In [None]:
# Candidate regressors to compare against the baseline
models = [
    LinearRegression(),
    LassoLars(alpha=.1),
    TweedieRegressor(power=0),
    TweedieRegressor(power=1),
]

# Fit each model on the training split and keep its training predictions.
# preds[i] holds the rounded predictions of models[i]; previously `preds` was
# created but never filled, so every fit result was thrown away.
preds = []
for model in models:
    print(type(model))
    model.fit(X_train, y_train.value)
    preds.append(model.predict(X_train).round(1))
    
 



In [None]:
    # Refit and predict on the training split with `model`, i.e. whichever estimator
    # the preceding loop cell left bound (the last entry of `models`).
    # The original had the closing parenthesis of fit() on the wrong line, which
    # made the cell a syntax error.
    model.fit(X_train, y_train.value)
    model.predict(X_train).round(1)