In [1]:
from acquire import acquire_data
from wrangle import wrangle_data

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Get the Data 

In [2]:
# Load the raw property data through the project-local acquire helper.
df = acquire_data()

csv has been previously generated.
Data Acquired


In [3]:
# Peek at the first rows to see the raw column names before cleaning.
df.head()

Unnamed: 0.1,Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,taxamount,fips,FIPS,Name,State
0,0,3.0,2.0,1458.0,136104.0,2319.9,6037.0,6037,Los Angeles,CA
1,1,2.0,1.0,1421.0,35606.0,543.69,6037.0,6037,Los Angeles,CA
2,2,3.0,2.0,1650.0,614000.0,7673.19,6037.0,6037,Los Angeles,CA
3,3,2.0,1.0,693.0,274237.0,3267.47,6037.0,6037,Los Angeles,CA
4,4,0.0,0.0,1378.0,168828.0,2135.39,6037.0,6037,Los Angeles,CA


## Clean the Data

In [4]:
# Drop the leftover 'Unnamed: 0' column that came over from the csv.
# The fips columns were useful for merging the two dataframes together, but now we can drop them.
# NOTE: inplace=True mutates df, so re-running this cell raises KeyError once the columns are gone.
df.drop(columns=['Unnamed: 0','fips', 'FIPS'], inplace=True)

In [5]:
# Confirm which columns remain after the drop.
df.columns

Index(['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet',
       'taxvaluedollarcnt', 'taxamount', 'Name', 'State'],
      dtype='object')

In [6]:
# Rename the raw source column names to readable snake_case names (modifies df in place).
df.rename(columns={'bedroomcnt':'bedroom_count', 'bathroomcnt':'bathroom_count', 'calculatedfinishedsquarefeet':'total_sqft',
       'taxvaluedollarcnt':'property_value', 'taxamount':'tax_amount', 'Name':'county', 'State':'state'}, inplace=True)

In [7]:
# Check the renamed headers on a single row.
df.head(1)

Unnamed: 0,bedroom_count,bathroom_count,total_sqft,property_value,tax_amount,county,state
0,3.0,2.0,1458.0,136104.0,2319.9,Los Angeles,CA


# Drop Missing Values

## Convert zeros to nulls

In [8]:
# Treat a 0 in any column as a missing value, then drop every row that contains a null.
df.replace(0, np.nan, inplace=True)
df.dropna(inplace=True)

In [9]:
# Verify the remaining row count and that every column is fully non-null.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14989 entries, 0 to 15035
Data columns (total 7 columns):
bedroom_count     14989 non-null float64
bathroom_count    14989 non-null float64
total_sqft        14989 non-null float64
property_value    14989 non-null float64
tax_amount        14989 non-null float64
county            14989 non-null object
state             14989 non-null object
dtypes: float64(5), object(2)
memory usage: 936.8+ KB


## Split the Data

In [10]:
# Keep 80% of the rows for training and hold out the rest; the fixed random_state makes the split repeatable.
train, test = train_test_split(df, random_state=115, train_size=.8)

In [11]:
# Report the (rows, columns) shape of each split.
print(f'train: {train.shape}, test: {test.shape}')

train: (11991, 7), test: (2998, 7)


# Module: Reusable Wrangle Functions

In [12]:
def drop_columns(df):
    """Remove the csv index artifact and the fips merge keys from ``df``.

    The frame is modified in place and the same object is returned, so the
    caller's DataFrame loses the columns as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Unnamed: 0', 'fips' and 'FIPS'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without those three columns.

    Raises
    ------
    KeyError
        If any of the three columns is absent (for example on a second call).
    """
    unwanted = ['Unnamed: 0', 'fips', 'FIPS']
    df.drop(labels=unwanted, axis='columns', inplace=True)
    return df

In [13]:
def rename_columns(df):
    """Give the raw source columns readable snake_case names.

    The rename happens in place and the same object is returned. Columns
    that are not listed in the mapping keep their current names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the renamed columns.
    """
    readable_names = {
        'bedroomcnt': 'bedroom_count',
        'bathroomcnt': 'bathroom_count',
        'calculatedfinishedsquarefeet': 'total_sqft',
        'taxvaluedollarcnt': 'property_value',
        'taxamount': 'tax_amount',
        'Name': 'county',
        'State': 'state',
    }
    df.rename(readable_names, axis='columns', inplace=True)
    return df

In [14]:
def clean_data(df):
    """Drop every row that holds a zero or a null in any column.

    Zeros anywhere in the frame are first converted to NaN, then all rows
    with at least one NaN are removed. Both steps modify ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, reduced to the fully populated rows. The
        original index labels are kept (no reset).
    """
    # A recorded 0 is treated the same as a missing value.
    df.replace(to_replace=0, value=np.nan, inplace=True)
    df.dropna(axis=0, how='any', inplace=True)
    return df

In [15]:
def split_data(df, seed, train_size):
    """Split ``df`` into train and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    seed : int
        Passed to ``train_test_split`` as ``random_state``.
    train_size : float or int
        Passed to ``train_test_split`` as ``train_size``.

    Returns
    -------
    tuple
        ``(train, test)`` as produced by ``train_test_split``.
    """
    partitions = train_test_split(df, train_size=train_size, random_state=seed)
    train_part, test_part = partitions
    return train_part, test_part

In [16]:
def wrangle(df, seed, train_size):
    """Run the full preparation pipeline and return a train/test split.

    Applies ``drop_columns``, ``rename_columns`` and ``clean_data`` in that
    order, then splits the result with ``split_data``. The preparation steps
    work in place, so the frame passed in is modified as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame, as accepted by ``drop_columns``.
    seed : int
        Forwarded to ``split_data``.
    train_size : float or int
        Forwarded to ``split_data``.

    Returns
    -------
    tuple
        ``(train, test)`` from ``split_data``.
    """
    prepared = df
    for step in (drop_columns, rename_columns, clean_data):
        prepared = step(prepared)
    train_set, test_set = split_data(prepared, seed, train_size)
    print('Data Prepared')
    return train_set, test_set

# Test the Functions

In [17]:
# Acquire a separate frame to run the functions defined above against.
test_df = acquire_data()

csv has been previously generated.
Data Acquired


In [18]:
# Run the full pipeline with the same seed (115) and train size (.8) used in the step-by-step split.
train, test = wrangle(test_df, 115, .8)

Data Prepared


In [19]:
# wrangle works in place, so test_df itself now shows the renamed columns and the cleaned rows.
test_df.head()

Unnamed: 0,bedroom_count,bathroom_count,total_sqft,property_value,tax_amount,county,state
0,3.0,2.0,1458.0,136104.0,2319.9,Los Angeles,CA
1,2.0,1.0,1421.0,35606.0,543.69,Los Angeles,CA
2,3.0,2.0,1650.0,614000.0,7673.19,Los Angeles,CA
3,2.0,1.0,693.0,274237.0,3267.47,Los Angeles,CA
5,3.0,2.0,1108.0,486866.0,5990.5,Los Angeles,CA


# Test the Module

In [20]:
# Acquire the data again (rebinding df) to test the imported wrangle_data function.
df = acquire_data()

csv has been previously generated.
Data Acquired


In [21]:
# Same seed and train size as before, this time through wrangle_data imported from the wrangle module.
train, test = wrangle_data(df, 115, .8)

Data Prepared and Split


In [22]:
# df shows renamed columns and cleaned rows after the call, i.e. wrangle_data also modified it in place.
df.head()

Unnamed: 0,bedroom_count,bathroom_count,total_sqft,property_value,tax_amount,county,state
0,3.0,2.0,1458.0,136104.0,2319.9,Los Angeles,CA
1,2.0,1.0,1421.0,35606.0,543.69,Los Angeles,CA
2,3.0,2.0,1650.0,614000.0,7673.19,Los Angeles,CA
3,2.0,1.0,693.0,274237.0,3267.47,Los Angeles,CA
5,3.0,2.0,1108.0,486866.0,5990.5,Los Angeles,CA
