In [1]:
from math import sqrt
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols

import acquire
from env import get_db_url
import prepare
from prepare import percentage_stacked_plot
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas copy warnings raised by later cells.
warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'env'

# Explore data (on train only)

In [None]:
# Load the raw data by calling get_zillow_data() from the local acquire module.
df = acquire.get_zillow_data()

In [None]:
# Run the prepare module's prep_zillow() on the acquired frame.
# `df` is rebound to the result, so re-running only this cell passes the
# already-prepared frame back into prep_zillow().
df = prepare.prep_zillow(df)

In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [None]:
# (rows, columns) of the prepared frame before splitting.
df.shape

In [None]:
# Two-stage split with a fixed seed: 10% of df is held out as test,
# then 20% of the remaining rows become validate and the rest is train.
train_validate, test = train_test_split(df, random_state=123, test_size=0.10)
train, validate = train_test_split(train_validate, random_state=123, test_size=0.20)
print(*(part.shape for part in (train, validate, test)))

In [None]:
# (rows, columns) of the train split; the same value is printed by the split cell.
train.shape

In [None]:
# Discrete columns: int64/float64 columns of train with fewer than 50 distinct
# values (NaN counts as a value), skipping any column whose name contains
# 'sqft', 'cnt', 'nbr' or 'number'.
discrete = [
    name
    for name in train.columns
    if train[name].dtypes in ['int64', 'float64']
    and not any(token in name for token in ('sqft', 'cnt', 'nbr', 'number'))
    and train[name].nunique(dropna=False) < 50
]

In [None]:
# Continuous columns: every int64/float64 column of train that is not in `discrete`
# (this includes the name-excluded columns regardless of how many values they take).
continuous = [
    name
    for name in train.columns
    if train[name].dtypes in ['int64', 'float64'] and name not in discrete
]

In [None]:
#function to plot parameter values in graph
def univariate(data,col,vartype=[0,1],hue =None):
    '''
    Univariate function will plot the values of one column in graphs.
    data    : dataframe holding the column (it is copied, never modified)
    col     : Column name
    vartype : variable type, must be passed explicitly as 0 or 1
                Continuous(0)  : Distribution, Violin, Box and Strip plots of `col`,
                                 plus a scatter of log(`col`) against the log-error
                                 column ('logerror' or 'log_error', whichever exists).
                Categorical(1) : Countplot ordered by frequency will be plotted.
    hue     : Only applicable in categorical analysis. When given, bars are
              annotated with their share of all rows; otherwise with raw counts.

    Raises ValueError when vartype is neither 0 nor 1. The signature default
    [0, 1] is only a placeholder and is rejected the same way.
    Raises KeyError when vartype is 0 and no log-error column is present.
    '''
    # Validate before touching any plotting state so a bad call leaves no empty figure behind.
    if vartype not in (0, 1):
        raise ValueError(
            "vartype must be 0 (continuous) or 1 (categorical), got {!r}".format(vartype)
        )

    sns.set(style="darkgrid")
    df = data.copy()

    if vartype == 0:
        # The notebook refers to the target both as 'logerror' and as 'log_error';
        # accept either spelling instead of hard-coding one of them.
        target = next((name for name in ("logerror", "log_error") if name in df.columns), None)
        if target is None:
            raise KeyError("logerror")

        fig, ax=plt.subplots(nrows =1,ncols=5,figsize=(20,6))

        ax[0].set_title(col+" Distribution Plot")
        sns.distplot(df[col],ax=ax[0])

        ax[1].set_title(col+" Violin Plot")
        sns.violinplot(data =df, x=col,ax=ax[1], inner="quartile")

        ax[2].set_title(col+" Box Plot")
        sns.boxplot(data =df, x=col,ax=ax[2],orient='v')

        ax[3].set_title(col+" strip Plot")
        sns.stripplot(data =df, x=col,ax=ax[3])

        # Only positive values have a finite log; everything else becomes NaN
        # rather than -inf so it cannot distort the axis limits.
        positive = df[col].where(df[col] > 0)
        ax[4].set_title(col+" scatter Plot")
        sns.scatterplot(x =np.log(positive), y=df[target],ax=ax[4])
        ax[4].set_xlabel("log({})".format(col))
    else:
        # Number of distinct hue values; 0 means "no hue supplied".
        hue_levels = 0 if hue is None else len(pd.Series(data = hue).unique())
        fig, ax = plt.subplots()

        # Widen the figure with the number of categories and hue levels.
        width = len(df[col].unique()) + 3 + 2*hue_levels
        fig.set_size_inches(width , 4)
        ax = sns.countplot(data = df, x= col, order=df[col].value_counts().index,hue = hue)
        for p in ax.patches:
            height = p.get_height()
            if np.isnan(height):
                # An empty category/hue combination has no bar to label.
                continue
            if hue_levels > 0:
                label = '{:1.1f}%'.format((height*100)/float(len(df)))
                x_offset = 0.05
            else:
                label = str(height)
                x_offset = 0.16
            ax.annotate(label, (p.get_x()+x_offset, height+10))

    fig.tight_layout()
    plt.show()

In [None]:
#for col in train[continuous].columns:
#univariate(train,col,0,hue =None)

In [None]:
# Keep only discrete columns that take more than one distinct non-null value in train.
discrete = list(filter(lambda name: train[name].nunique() > 1, discrete))

In [None]:
# One count plot per discrete column, with bars split by transaction month.
for col in train[discrete].columns:
    univariate(data=train, col=col, vartype=1, hue=train.transaction_month)

In [None]:
# Temporal data distribution: number of train transactions per month.
plt.figure(figsize=(15, 10))
month_ax = sns.countplot(x=train["transaction_month"])
month_ax.set_title("Transaction distribution based on month");

## May has the highest number of transactions

In [None]:
# Log error of every train row (x is the row index), colored by transaction month.
plt.figure(figsize=(15, 10))
error_ax = sns.scatterplot(
    x=train.index,
    y=train["log_error"],
    hue=train["transaction_month"],
)
error_ax.set_title("Logerror based on transaction month");

In [None]:
# Categorical columns: every column of train stored with the object dtype.
categorical = [name for name, dtype in train.dtypes.items() if dtype in ['object']]
categorical

In [None]:
# List every column available in the train split.
train.columns

In [None]:
# Preview the first rows of the train split.
train.head()

In [None]:
# Object-dtype columns of train (same computation as the earlier `categorical` cell).
categorical = [name for name, dtype in train.dtypes.items() if dtype in ['object']]
categorical

### Identifying relationships between tax_amount and other features

In [None]:
# Tax amount of each train row against the month of its transaction.
sns.scatterplot(data=train, x="transaction_month", y="tax_amount")

In [None]:
# Number of train transactions per month, most frequent month first.
train["transaction_month"].value_counts()

### May is the highest month for transactions

In [None]:
# Plot property locations colored by tax amount.
# Longitude goes on x and latitude on y so the points are laid out like a map
# instead of a transposed one.
sns.scatterplot(x=train.longitude, y=train.latitude, hue=train.tax_amount)

In [None]:
# Square footage against tax rate, colored by tax value.
sns.scatterplot(data=train, x="tax_rate", y="square_feet", hue="tax_value")

### the higher the square_feet, the higher the tax_value

In [None]:
# Bedroom count against tax amount.
sns.scatterplot(data=train, x="tax_amount", y="bedrooms")

### Homes with 3 and 4 bedrooms show the highest tax_amount values in this plot

In [None]:
# Number of train rows per bedroom count, most common first.
train["bedrooms"].value_counts()

In [None]:
# Tax amount against tax value, colored by square footage.
sns.scatterplot(data=train, x="tax_value", y="tax_amount", hue="square_feet")

### the higher the square_feet the higher the tax_amount and tax value

In [None]:
# Tax value distribution per county for homes below a single cut-off.
# One filtered frame feeds both axes, so x and y always describe the same rows.
# NOTE(review): the x filter previously used 1000000 while y/data used 100000;
# 100000 is kept here because it governed the plotted values. Confirm the intended cap.
tax_value_cap = 100000
capped = train[train.tax_value < tax_value_cap]
plt.figure(figsize=(12,6))
sns.boxplot(data=capped, x="county", y="tax_value")

### Highest tax values are in Orange county

In [None]:
# Distribution (histogram with density curve) of tax_value in the train split.
sns.distplot(train["tax_value"])

In [None]:
# Subset of train holding the numeric columns used for the correlation analysis.
train_corr = train.loc[:, ['bathrooms', 'bedrooms', 'square_feet', 'lot_size', 'tax_value']]

In [None]:
# Pairwise correlation between the selected columns.
# Computed straight from train (not from the previous value of train_corr) so
# re-running this cell always yields the same matrix instead of the
# correlation of a correlation matrix.
train_corr = train[['bathrooms', 'bedrooms', 'square_feet', 'lot_size', 'tax_value']].corr()
train_corr

In [None]:
# Heatmap of the correlation matrix with the coefficient printed in every cell.
plt.figure(figsize=(8, 6))
sns.heatmap(data=train_corr, cmap='inferno', annot=True)

In [None]:
# Pairwise scatter plots of the raw feature values.
# train_corr holds the 5x5 correlation matrix by this point, so the columns are
# selected from train directly; plotting train_corr would chart coefficients, not homes.
sns.pairplot(train[['bathrooms', 'bedrooms', 'square_feet', 'lot_size', 'tax_value']])

### $H_0$: there is no relationship between tax values and number of bedrooms, bathrooms and square_feet
### $H_a$: There is a relationship between tax values and number of bedrooms, bathrooms and square_feet

In [None]:
# Let's take a look at a simple model based on bathrooms, bedrooms and square feet
# and compare it against the baseline (median home value).

# Target frame for train: the actual tax_value plus a constant baseline
# prediction (the train median). assign() returns a new frame, so nothing is
# written into a slice of `train`.
y_train = train[['tax_value']].assign(y_baseline=train['tax_value'].median())

# predictor/independent features: every column except the target
X_train = train.drop(columns=['tax_value'])
X_validate = validate.drop(columns=['tax_value'])
X_test = test.drop(columns=['tax_value'])

# target variables for validate and test (y_train is the two-column frame built above)
y_validate = validate.tax_value
y_test = test.tax_value

In [None]:
# Preview the target frame: actual tax_value next to the baseline column.
y_train.head()

In [None]:
# Baseline RMSE on train: the error of always predicting the median tax_value.
# sqrt and mean_squared_error are imported in the notebook's first cell.
rmse_baseline = sqrt(mean_squared_error(y_train.tax_value, y_train.y_baseline))

rmse_baseline

In [None]:
# Fit an OLS model of tax_value on bedrooms, bathrooms and square_feet using the
# train split (ols is imported in the notebook's first cell).
ols_model = ols('tax_value ~ bedrooms + bathrooms + square_feet', data = train).fit()

# Store the model's train predictions next to the actual values in y_train.
y_train['yhat'] = ols_model.predict(X_train)

In [None]:
 y_train.head()

In [None]:
# Show the fitted OLS model's summary table.
ols_model.summary()

### My p-values are <0.05, I can reject my null hypothesis.
### This means that there is a statistically significant relationship between tax_value and the independent variables I used.

In [None]:
# RMSE of the OLS predictions on the train split.
rmse = sqrt(
    mean_squared_error(y_true=y_train["tax_value"], y_pred=y_train["yhat"])
)
rmse

### RMSE for the OLS model with bedrooms, bathrooms and square feet (201147) is lower than the baseline (219730). $R^2$ is 0.15 and the p-value is 0.

In [None]:
# Start the 'predictions' frame with a single 'actual' column holding the train tax values.
predictions = y_train[["tax_value"]].rename(columns={"tax_value": "actual"})
predictions.head()

In [None]:
# Model inputs: the three predictor columns and the target kept as a one-column frame.
X_train1 = X_train.loc[:, ['bedrooms', 'bathrooms', 'square_feet']]
y_train1 = y_train.loc[:, ['tax_value']]

In [None]:
# Create the linear regression model and fit it on the train predictors and target.
lm = LinearRegression()
lm.fit(X=X_train1, y=y_train1)

In [None]:
# Print the fitted estimator, its intercept and its coefficients, one per line.
for label, value in (
    ("Linear Model:", lm),
    ("intercept: ", lm.intercept_),
    ("coefficients: ", lm.coef_),
):
    print(label, value)

In [None]:
# Add two prediction columns to compare against 'actual':
#   yhat_lm  - linear regression predictions for the train predictors
#   baseline - the median train tax_value repeated on every row
predictions = predictions.assign(
    yhat_lm=lm.predict(X_train1),
    baseline=y_train["tax_value"].median(),
)

In [None]:
# Preview actual values next to the model and baseline predictions.
predictions.head()

In [None]:
# RMSE of every column in 'predictions' against the 'actual' column
# (the 'actual' column scores 0 against itself). Floats are shown with 3 decimals.
pd.set_option("display.float_format", '{:.3f}'.format)
predictions.apply(
    lambda column: sqrt(mean_squared_error(predictions["actual"], column))
)

In [None]:
# Show the OLS train RMSE again for comparison with the table above.
rmse

### The model's RMSE is lower than the baseline RMSE, so the model performs better than the baseline

In [None]:
# Predict tax values for the test split with the linear model fitted above.
X_test = test[['bedrooms', 'bathrooms', 'square_feet']]
# assign() builds a new frame instead of writing a column into a slice of `test`.
# lm was fitted on a one-column target frame, so predict() may return an (n, 1)
# array; ravel() flattens it to one value per row.
y_test = test[['tax_value']].assign(test_prediction=lm.predict(X_test).ravel())

In [None]:
# Rename the target column to 'actual'. rename() returns a new frame that is
# rebound to y_test, avoiding an in-place change on a frame derived from `test`.
y_test = y_test.rename(columns = {'tax_value': 'actual'})
y_test.head()

In [None]:
# RMSE of every y_test column against 'actual' (the 'actual' column itself scores 0).
y_test.apply(
    lambda column: sqrt(mean_squared_error(y_test["actual"], column))
)

In [None]:
# Show the baseline RMSE for comparison; it was computed on the train split.
rmse_baseline

### RMSE for test data is lower than train data and shows LM is better than baseline

In [None]:
# Bedroom-count densities for homes below vs. at/above the median train tax_value.
# tax_value is treated as a continuous variable elsewhere in this notebook, so the
# previous `== 0` / `== 1` filters and the churn-related labels did not describe this data.
# NOTE(review): the median split is an assumed replacement for that binary flag — confirm.
# The name `Mth` is kept in case a later cell refers to it.
median_tax_value = train["tax_value"].median()
below_median = train["tax_value"] < median_tax_value
Mth = sns.kdeplot(train.bedrooms[below_median],
                color="#0072BD", shade = True)
Mth = sns.kdeplot(train.bedrooms[~below_median],
                ax =Mth, color="#ebb086", shade= True)
Mth.legend(["Below median tax value","At or above median tax value"])
Mth.set_ylabel('Density')
Mth.set_xlabel('Bedrooms')
Mth.set_title('Bedrooms by tax value (split at the train median)')