In [None]:
from acquire import acquire_data
from wrangle import wrangle_data
import env

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from scipy.stats import linregress

from math import sqrt

# Acquisition

In [None]:
# Load the dataset through the project's acquire module (source and schema are defined there)
df = acquire_data()

In [None]:
# Preview the first rows of the acquired data
df.head()

In [None]:
# Summarize columns, dtypes and non-null counts of the acquired data
df.info()

In [None]:
# Report how many rows were acquired before any preparation
row_count = len(df)
print(f'Total number of rows: {row_count}')

# Preparation

In [None]:
# Split configuration handed to wrangle_data.
# NOTE(review): train_size is presumably the fraction of rows assigned to train,
# and seed presumably fixes the shuffle; confirm against wrangle.py.
seed, train_size = 115, 0.8

# wrangle_data returns the prepared train and test frames, in that order
train, test = wrangle_data(df, seed, train_size)

In [None]:
# Preview the first rows of the prepared training split
train.head()

In [None]:
# Summarize columns, dtypes and non-null counts of the training split
train.info()

# Exploration

## Exploration Goals

* Create graphs that highlight the effect the three features (total square feet, number of bedrooms, number of bathrooms) have on the price of a home
* Get an idea of underlying patterns in the data

## Graphs

In [None]:
# Figure: Scatterplot of Features
# One panel per candidate feature, each plotted against the tax amount of the
# training rows.

y = train.tax_amount

# (column, panel title, x-axis label) for each panel, left to right
panels = [
    ('total_sqft', 'Total Square Feet', 'Total Square Feet'),
    ('bedroom_count', 'Number of Bedrooms', '# of Bedrooms'),
    ('bathroom_count', 'Number of Bathrooms', '# of Bathrooms'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(14, 6))

for ax, (column, title, xlabel) in zip(axes, panels):
    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally and raises a TypeError if they are.
    sns.scatterplot(x=train[column], y=y, ax=ax)
    ax.set(title=title, xlabel=xlabel, ylabel='Tax Amount')

# Trailing semicolon suppresses the Text(...) repr in the cell output
fig.suptitle('What Affects Taxes Most');

In [None]:
# Figure: Pairplot of Features

pairplot_cols = ['total_sqft', 'bedroom_count', 'bathroom_count', 'tax_amount']

# Explore the training split only, like the other exploration figures, so that
# test rows do not inform the choice of features.
# The figure size is set through `height` (inches per facet): 4 facets x 4 in
# gives the intended 16 x 16 in grid. Calling plt.figure(figsize=...) after the
# pairplot would only open a second, empty figure.
sns.pairplot(train[pairplot_cols], kind='reg', height=4)

# pairplot leaves its own figure current, so suptitle lands on the grid
plt.suptitle('Correlation Between Square Feet, Bedrooms and Bathrooms', size=14, y=1.02);

In [None]:
# Pairwise correlations between the training columns, drawn as a heatmap
corr = train.corr()
sns.heatmap(data=corr, cmap='BuGn')

# Modeling

## Create a Dataframe to Hold Actual and Predicted Values

In [None]:
# Start the results table with the observed tax amounts of the training rows;
# baseline and model outputs are added as further columns later on.
actual_tax = train['tax_amount']
predictions = pd.DataFrame({'actual': actual_tax})

## Create the Models

In [None]:
# Single-feature linear model: total square feet -> tax amount

x = train.loc[:, ['total_sqft']]   # 2-D frame, as scikit-learn expects for X
y = train['tax_amount']

lm_sqft = LinearRegression()
lm_sqft.fit(x, y)

In [None]:
# Single-feature linear model: number of bedrooms -> tax amount

x = train.loc[:, ['bedroom_count']]   # 2-D frame, as scikit-learn expects for X
y = train['tax_amount']

lm_bedroom = LinearRegression()
lm_bedroom.fit(x, y)

In [None]:
# Single-feature linear model: number of bathrooms -> tax amount

x = train.loc[:, ['bathroom_count']]   # 2-D frame, as scikit-learn expects for X
y = train['tax_amount']

lm_bathroom = LinearRegression()
lm_bathroom.fit(x, y)

## Use the Models to Create Predictions

In [None]:
# Baseline: always predict the mean observed tax amount of the training rows.
# Read it from `train` directly rather than from the leftover global `y`.
predictions['baseline'] = train.tax_amount.mean()

# Each model must be given the same feature it was fitted on. Reusing the
# leftover global `x` (bathroom_count, from the last model cell) would feed
# bathroom counts to the sqft and bedroom models and invalidate the comparison.
predictions['lm_sqft'] = lm_sqft.predict(train[['total_sqft']])
predictions['lm_bedroom'] = lm_bedroom.predict(train[['bedroom_count']])
predictions['lm_bathroom'] = lm_bathroom.predict(train[['bathroom_count']])

In [None]:
# Preview actual values next to the baseline and each model's predictions
predictions.head()

# Evaluation

## Use RMSE to Compare Models

In [None]:
# Root-mean-squared error of every prediction column against the observed values
observed = predictions['actual']

rmse_baseline = sqrt(mse(observed, predictions['baseline']))
rmse_sqft = sqrt(mse(observed, predictions['lm_sqft']))
rmse_bedroom = sqrt(mse(observed, predictions['lm_bedroom']))
rmse_bathroom = sqrt(mse(observed, predictions['lm_bathroom']))

In [None]:
# One-line RMSE summary, two decimals each (the space flag pads positive numbers)
rmse_template = 'sqft: {: .2f}, bedroom: {: .2f}, bathroom: {: .2f}, baseline: {: .2f}'
rmse_values = (rmse_sqft, rmse_bedroom, rmse_bathroom, rmse_baseline)
print(rmse_template.format(*rmse_values))

## Result

__The model using bathrooms as the driver performed the best__. Additionally, the model beats the baseline (lower RMSE), indicating that it adds predictive value over always predicting the mean.