# Explore Zillow

### Imports

In [2]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import env

from sklearn.model_selection import train_test_split

import sklearn.preprocessing

In [3]:

# connection function for accessing mysql 
def get_connection(db, user=env.user, host=env.host, password=env.password):
    '''
    Build the SQLAlchemy-style connection URL for a MySQL database reached
    through the pymysql driver.

    db: name of the database to connect to.
    user, host, password: connection credentials; each defaults to the
        matching attribute of the env module.

    Returns the URL as a string.
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return f'mysql+pymysql://{credentials}@{location}'

In [4]:

def split_data(df, stratify_by=""):
    '''
    Split a DataFrame into train, validate, and test DataFrames.

    The test set is 20% of df. The remaining 80% is then split again,
    with 30% of it going to validate and the rest to train. Both splits
    use random_state=123 so the result is repeatable.

    stratify_by is accepted but is not used by this function.

    Returns train, validate, test.
    '''
    seed = 123
    remainder, test = train_test_split(df, test_size=.2, random_state=seed)
    train, validate = train_test_split(remainder, test_size=.3, random_state=seed)
    return train, validate, test

In [5]:

# Acquire
# The query joins properties_2017 to predictions_2017 on parcelid and keeps
# rows whose transactiondate falls between 2017-05-01 and 2017-06-30 and
# whose unitcnt equals 1. All columns from both tables are selected.

query = """
SELECT * 
FROM properties_2017
JOIN predictions_2017 using(parcelid)
WHERE transactiondate between "2017-05-01" and "2017-06-30"
and unitcnt = 1;
"""

# Run the query against the 'zillow' database and preview the first rows.
df = pd.read_sql(query, get_connection('zillow'))
df.head()

Unnamed: 0,parcelid,id,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate
0,11289917,2061546,1.0,,,2.0,3.0,,6.0,2.0,...,136104.0,2016.0,27214.0,2319.9,Y,15.0,60379010000000.0,1248,-0.362001,2017-06-23
1,11705026,1834372,,,,1.0,2.0,,6.0,1.0,...,35606.0,2016.0,23624.0,543.69,,,60372320000000.0,1772,-0.146056,2017-06-30
2,11389003,2121349,,,,2.0,3.0,,6.0,2.0,...,614000.0,2016.0,449000.0,7673.19,,,60377030000000.0,3273,-0.325393,2017-06-01
3,11967869,2093710,,,,1.0,2.0,,5.0,1.0,...,274237.0,2016.0,191821.0,3267.47,,,60371850000000.0,3429,-0.005566,2017-06-29
4,12035176,1288537,,,,1.0,1.0,,11.0,1.0,...,245906.0,2016.0,146810.0,2926.19,,,60371890000000.0,5444,-0.114435,2017-05-24


In [None]:

# Some prep: give the columns used below shorter, readable names.
df = df.rename(
    columns={
        "bedroomcnt": "bedrooms",
        "bathroomcnt": "bathrooms",
        "calculatedfinishedsquarefeet": "square_feet",
        "taxamount": "taxes",
        "taxvaluedollarcnt": "tax_value",
    }
)

In [None]:
features = ["parcelid", "bedrooms", "bathrooms", "square_feet", "tax_value"]

# Keep only the listed columns, index the rows by parcelid,
# and drop every row that still contains a null.
df = df[features].set_index("parcelid").dropna()

In [None]:
# stratify_by is passed here, but split_data does not use it: the split is
# random with a fixed seed, not stratified on tax_value.
train, validate, test = split_data(df, stratify_by="tax_value")

In [None]:
# Preview the first rows of the training split.
train.head()

## Exercises

### 1. Write a function named plot_variable_pairs that accepts a dataframe as input and plots all of the pairwise relationships along with the regression line for each pair.

In [None]:

# .corr() returns the pairwise correlation coefficient between columns
# (Pearson, i.e. linear correlation, is the pandas default method).
train.corr()

In [None]:

# Scatter of bathrooms against bedrooms on the training split.
sns.scatterplot(x="bedrooms", y="bathrooms", data=train)

In [None]:
def plot_variable_pairs(df):
    '''
    Plot every pairwise relationship between the columns of df, drawing a
    fitted regression line on each off-diagonal panel.

    df: DataFrame whose numeric columns should be compared.

    Draws the figure as a side effect; returns nothing.
    '''
    # Plot the frame that was passed in rather than a notebook-level global,
    # so the function works for any DataFrame (train, validate, another dataset).
    sns.pairplot(df, kind="reg")
    

In [None]:
# Pairwise plots for the training split.
plot_variable_pairs(train)

In [None]:
# One histogram per column.
# NOTE(review): this plots the full df rather than the train split — confirm that is intended.
df.hist(grid=False, figsize=(16,12), color='lightslategrey')

### 2. Write a function named months_to_years that accepts your telco churn dataframe and returns a dataframe with a new feature tenure_years, in complete years as a customer.

In [6]:
# Run the same acquisition query again to get a fresh copy with all columns.
df2 = pd.read_sql(query, get_connection('zillow'))

In [7]:
# Apply the same readable column names to the second copy of the data.
df2 = df2.rename(
    columns={
        "bedroomcnt": "bedrooms",
        "bathroomcnt": "bathrooms",
        "calculatedfinishedsquarefeet": "square_feet",
        "taxamount": "taxes",
        "taxvaluedollarcnt": "tax_value",
    }
)

In [8]:
# Preview the renamed columns.
df2.head()

Unnamed: 0,parcelid,id,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathrooms,bedrooms,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,tax_value,assessmentyear,landtaxvaluedollarcnt,taxes,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,logerror,transactiondate
0,11289917,2061546,1.0,,,2.0,3.0,,6.0,2.0,...,136104.0,2016.0,27214.0,2319.9,Y,15.0,60379010000000.0,1248,-0.362001,2017-06-23
1,11705026,1834372,,,,1.0,2.0,,6.0,1.0,...,35606.0,2016.0,23624.0,543.69,,,60372320000000.0,1772,-0.146056,2017-06-30
2,11389003,2121349,,,,2.0,3.0,,6.0,2.0,...,614000.0,2016.0,449000.0,7673.19,,,60377030000000.0,3273,-0.325393,2017-06-01
3,11967869,2093710,,,,1.0,2.0,,5.0,1.0,...,274237.0,2016.0,191821.0,3267.47,,,60371850000000.0,3429,-0.005566,2017-06-29
4,12035176,1288537,,,,1.0,1.0,,11.0,1.0,...,245906.0,2016.0,146810.0,2926.19,,,60371890000000.0,5444,-0.114435,2017-05-24


In [9]:
features = [
    "parcelid", "bedrooms", "bathrooms",
    "square_feet", "tax_value", "yearbuilt",
]

# Keep only the listed columns (now including yearbuilt), index the rows
# by parcelid, and drop every row that still contains a null.
df2 = df2[features].set_index("parcelid").dropna()

In [None]:
# Re-split using df2 (which includes yearbuilt); this rebinds the earlier
# train/validate/test names. split_data does not use stratify_by.
train, validate, test = split_data(df2, stratify_by="tax_value")

In [None]:

def house_age(df, reference_year=2017):
    '''
    Add a home_age column holding the age of each home in years.

    df: DataFrame with a numeric 'yearbuilt' column. It is modified in place.
    reference_year: the year ages are measured against. Defaults to 2017,
        the transaction year selected by this notebook's query.

    Returns the same DataFrame, so the call can also be used in a chain.
    '''
    # Age is the distance from the build year to the reference year;
    # copying yearbuilt unchanged would store a calendar year, not an age.
    df['home_age'] = reference_year - df['yearbuilt']
    return df
    

### 3. Write a function named plot_categorical_and_continuous_vars that accepts your dataframe and the name of the columns that hold the continuous and categorical features and outputs 3 different plots for visualizing a categorical variable and a continuous variable.

In [None]:
categorical_vars = []
quantitative_vars = ['bedrooms', 'bathrooms', 'square_feet', 'tax_value']


def plot_categorical_and_continuous_vars(df, categorical=None, continuous=None):
    '''
    For every (categorical, continuous) column pair, draw three plots side
    by side: a box plot, a violin plot, and a bar plot.

    df: DataFrame holding the columns to plot.
    categorical: list of categorical column names. Defaults to the
        notebook-level categorical_vars list.
    continuous: list of continuous column names. Defaults to the
        notebook-level quantitative_vars list.

    Draws the figures as a side effect; returns nothing. If either list is
    empty, nothing is drawn.
    '''
    if categorical is None:
        categorical = categorical_vars
    if continuous is None:
        continuous = quantitative_vars

    for cat in categorical:
        for cont in continuous:
            # One row of three axes per pair so the views can be compared directly.
            fig, axes = plt.subplots(1, 3, figsize=(18, 5))
            sns.boxplot(x=cat, y=cont, data=df, ax=axes[0])
            sns.violinplot(x=cat, y=cont, data=df, ax=axes[1])
            sns.barplot(x=cat, y=cont, data=df, ax=axes[2])
            fig.suptitle(f'{cont} by {cat}')
            plt.show()
    

### 4. Save the functions you have written to create visualizations in a file named explore.py. Rewrite your notebook code so that you are using the functions imported from this file.

### 5. Explore your dataset with any other visualizations you think will be helpful.

### 6. In a separate notebook, use the functions you have developed in this exercise with the mall customers dataset.