# Mod 4 Project - Starter Notebook

This notebook has been provided to you so that you can make use of the following starter code to help with the trickier parts of preprocessing the Zillow dataset. 

The notebook contains a rough outline of the general order you'll likely want to follow in this project. You'll notice that most of the areas are left blank. This is so that it's more obvious exactly when you should make use of the starter code provided for preprocessing.


# Step 1: Load the Data/Filtering for Chosen Zipcodes

In [22]:
# Import necessary functions

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report

from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Apply seaborn's global plot styling; a font_scale of 1 leaves font sizes unscaled.
sns.set(font_scale = 1)

In [23]:
# Load the raw Zillow table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/zillow_data.csv')

In [24]:
# (rows, columns) of the raw table as loaded.
df.shape

(14723, 272)

# Step 2: Data Preprocessing

In [25]:
# Reshape the wide table into long format: the identifier columns are kept,
# every remaining column header becomes a `time` entry and its cell a `value`.
melted = pd.melt(
    df,
    id_vars=['RegionName', 'RegionID', 'SizeRank', 'City', 'State', 'Metro', 'CountyName'],
    var_name='time',
    value_name='value',
)
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# now the default behaviour), so it is omitted to avoid the deprecation warning.
melted['time'] = pd.to_datetime(melted['time'])
# Discard rows that have no recorded value.
melted = melted.dropna(subset=['value'])

In [26]:
# Preview the long-format frame produced by the melt.
melted

Unnamed: 0,RegionName,RegionID,SizeRank,City,State,Metro,CountyName,time,value
0,60657,84654,1,Chicago,IL,Chicago,Cook,1996-04-01,334200.0
1,75070,90668,2,McKinney,TX,Dallas-Fort Worth,Collin,1996-04-01,235700.0
2,77494,91982,3,Katy,TX,Houston,Harris,1996-04-01,210400.0
3,60614,84616,4,Chicago,IL,Chicago,Cook,1996-04-01,498100.0
4,79936,93144,5,El Paso,TX,El Paso,El Paso,1996-04-01,77300.0
...,...,...,...,...,...,...,...,...,...
3901590,1338,58333,14719,Ashfield,MA,Greenfield Town,Franklin,2018-04-01,209300.0
3901591,3293,59107,14720,Woodstock,NH,Claremont,Grafton,2018-04-01,225800.0
3901592,40404,75672,14721,Berea,KY,Richmond,Madison,2018-04-01,133400.0
3901593,81225,93733,14722,Mount Crested Butte,CO,,Gunnison,2018-04-01,664400.0


In [27]:
# RegionName holds the zip code, so give the column a self-describing name.
melted = melted.rename({'RegionName': 'ZipCode'}, axis='columns')

In [28]:
# RegionID is not used in the rest of the analysis; remove the column.
melted = melted.drop(columns=['RegionID'])

In [29]:
# Check the frame after renaming RegionName to ZipCode and dropping RegionID.
melted

Unnamed: 0,ZipCode,SizeRank,City,State,Metro,CountyName,time,value
0,60657,1,Chicago,IL,Chicago,Cook,1996-04-01,334200.0
1,75070,2,McKinney,TX,Dallas-Fort Worth,Collin,1996-04-01,235700.0
2,77494,3,Katy,TX,Houston,Harris,1996-04-01,210400.0
3,60614,4,Chicago,IL,Chicago,Cook,1996-04-01,498100.0
4,79936,5,El Paso,TX,El Paso,El Paso,1996-04-01,77300.0
...,...,...,...,...,...,...,...,...
3901590,1338,14719,Ashfield,MA,Greenfield Town,Franklin,2018-04-01,209300.0
3901591,3293,14720,Woodstock,NH,Claremont,Grafton,2018-04-01,225800.0
3901592,40404,14721,Berea,KY,Richmond,Madison,2018-04-01,133400.0
3901593,81225,14722,Mount Crested Butte,CO,,Gunnison,2018-04-01,664400.0


In [30]:
# Index the frame by observation date.
melted = melted.set_index('time')

In [31]:
# Render each zip code as a zero-padded, width-5 string (e.g. 1338 -> '01338').
melted['ZipCode'] = melted['ZipCode'].map(lambda zip_code: format(zip_code, "05"))

In [32]:
# Check the frame after setting the time index and zero-padding ZipCode.
melted

Unnamed: 0_level_0,ZipCode,SizeRank,City,State,Metro,CountyName,value
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1996-04-01,60657,1,Chicago,IL,Chicago,Cook,334200.0
1996-04-01,75070,2,McKinney,TX,Dallas-Fort Worth,Collin,235700.0
1996-04-01,77494,3,Katy,TX,Houston,Harris,210400.0
1996-04-01,60614,4,Chicago,IL,Chicago,Cook,498100.0
1996-04-01,79936,5,El Paso,TX,El Paso,El Paso,77300.0
...,...,...,...,...,...,...,...
2018-04-01,01338,14719,Ashfield,MA,Greenfield Town,Franklin,209300.0
2018-04-01,03293,14720,Woodstock,NH,Claremont,Grafton,225800.0
2018-04-01,40404,14721,Berea,KY,Richmond,Madison,133400.0
2018-04-01,81225,14722,Mount Crested Butte,CO,,Gunnison,664400.0


# Step 3: EDA and Visualization

In [33]:
# font = {'family' : 'normal',
        # 'weight' : 'bold',
        # 'size'   : 22}

# plt.rc('font', **font)

# NOTE: if your visualizations are too cluttered to read, try calling 'plt.gcf().autofmt_xdate()'!

In [34]:
# Mean of the numeric columns (SizeRank, value) per state, with State restored
# as a regular column.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text
# columns (ZipCode, City, Metro, CountyName) raises TypeError instead of
# silently dropping them.
melted_states = melted.groupby('State').mean(numeric_only=True).reset_index()

In [35]:
# Ten states with the highest mean value.
melted_states.sort_values(by='value', ascending=False).head(10)

Unnamed: 0,State,SizeRank,value
7,DC,2675.0,487971.048218
4,CA,4994.285179,457429.433338
11,HI,6866.973086,456644.65392
31,NJ,7584.605578,325240.623919
19,MA,7838.689916,322503.757992
20,MD,7649.189295,264829.234655
34,NY,8758.210618,264731.711413
6,CT,7825.217742,260539.637858
5,CO,6627.960242,258393.456322
47,WA,6471.047953,246150.326746


In [36]:
import plotly.express as px

In [37]:
# Choropleth of the per-state means: each US state is shaded by its `value`
# column using the reversed Viridis scale. `update_layout` returns the same
# figure, so the title styling is chained onto the constructor call.
fig = px.choropleth(
    melted_states,
    locations='State',
    locationmode="USA-states",
    scope="usa",
    color='value',
    color_continuous_scale="Viridis_r",
).update_layout(
    title_text='Mean Home Value by State, 1996 - 2018',
    title_font_family="Arial",
    title_font_size=22,
    title_font_color="black",
    title_x=0.45,
)
fig.show()

In [38]:
# Mean of the numeric columns (SizeRank, value) per zip code, with ZipCode
# restored as a regular column.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text
# columns (City, State, Metro, CountyName) raises TypeError instead of
# silently dropping them.
melted_zip = melted.groupby('ZipCode').mean(numeric_only=True).reset_index()

In [39]:
# Ten zip codes with the highest mean value.
melted_zip.sort_values(by='value', ascending=False).head(10)

Unnamed: 0,ZipCode,SizeRank,value
1405,10021,273.0,12854270.0
1403,10011,21.0,7755844.0
1404,10014,509.0,6836902.0
1406,10128,22.0,5085436.0
13590,94027,10238.0,3487129.0
12180,81611,7597.0,3147124.0
12902,90210,4817.0,2789977.0
5528,33480,4724.0,2634498.0
13621,94123,2027.0,2630977.0
5038,31561,14623.0,2403194.0


In [40]:
# Mean of the numeric columns (SizeRank, value) per county name, with
# CountyName restored as a regular column.
# numeric_only=True is needed on pandas >= 2.0, where averaging the text
# columns (ZipCode, City, State, Metro) raises TypeError instead of silently
# dropping them.
# NOTE(review): grouping on CountyName alone merges same-named counties from
# different states — confirm that is intended.
melted_county = melted.groupby('CountyName').mean(numeric_only=True).reset_index()

In [41]:
# Ten county names with the highest mean value.
melted_county.sort_values(by='value', ascending=False).head(10)

Unnamed: 0,CountyName,SizeRank,value
766,New York,638.056385,5314744.0
856,Pitkin,11277.666667,2092172.0
963,San Francisco,1827.105263,1160127.0
747,Nantucket,5781.0,1059878.0
968,San Mateo,4724.528487,972512.1
666,Marin,7582.8125,923662.7
975,Santa Clara,3730.038462,831519.1
974,Santa Barbara,6253.944444,647131.0
327,Dukes,10633.4,643844.2
28,Arlington,3957.444444,564767.6


# Step 5: ARIMA Modeling

# Step 6: Interpreting Results