# Step 1: Load the Data
Our main data set is stored in the `zillow_data.csv` spreadsheet.

In [3]:
import pandas as pd

In [4]:
# Load the Zillow CSV into a DataFrame. The path is relative, so it resolves
# against the kernel's current working directory.
raw = pd.read_csv('../data/zillow_data.csv')

In [5]:
# Keep only the rows whose City value is exactly 'Baltimore', then show the
# resulting dimensions and the first few rows.
filtered = raw.loc[raw['City'] == 'Baltimore']
print(filtered.shape)
filtered.head()

(15, 272)


Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,...,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04
316,66825,21215,Baltimore,MD,Baltimore,Baltimore City,317,53500.0,53500.0,53400.0,...,59400,61400,63200,64100,65100,66800,67400,69500,72900,75000
441,66834,21224,Baltimore,MD,Baltimore,Baltimore City,442,49600.0,49400.0,49200.0,...,156900,161000,165800,168800,171100,173800,176400,177900,177500,176600
605,66828,21218,Baltimore,MD,Baltimore,Baltimore City,606,44700.0,44700.0,44700.0,...,73000,74500,76400,78100,81100,84800,88200,91800,96400,99900
677,66816,21206,Baltimore,MD,Baltimore,Baltimore City,678,73200.0,73300.0,73500.0,...,113100,115700,118400,119800,121100,123400,126600,129600,132400,135200
1004,66839,21229,Baltimore,MD,Baltimore,Baltimore City,1005,56000.0,56200.0,56500.0,...,77300,80000,83000,84400,85400,86100,86500,88400,92400,95600


# Step 2: Data Preprocessing

In [None]:
# Reshape the Baltimore rows from wide (one column per month) to long (one row
# per region per month). The first 7 columns (RegionID ... SizeRank in the
# preview above) identify the region; every remaining column is a monthly
# value whose column label becomes the 'Date' field.
# Uses `filtered` from the loading step -- the previous reference to `df` was
# never defined in this notebook and raised NameError on a fresh kernel.
melted = filtered.melt(
    id_vars=list(filtered.columns[:7]),
    value_vars=list(filtered.columns[7:]),
    var_name='Date',
    value_name='MedianSales',
)
melted.head()

In [None]:
# Parse the month labels (e.g. '1996-04') into timestamps. assign() returns a
# new frame, so `melted` keeps its original string dates.
converted = melted.assign(Date=pd.to_datetime(melted['Date']))
print(converted.shape)
converted.head()

In [None]:
# Discard observations that have no MedianSales value; all other columns are
# allowed to contain missing data.
cleaned = converted.loc[converted['MedianSales'].notna()]
print(cleaned.shape)
cleaned.head()

In [None]:
# Collapse to a single series: for each Date, average MedianSales over all
# remaining rows. The double brackets keep the result a one-column DataFrame.
grouped = cleaned.groupby('Date')[['MedianSales']].mean()
print(grouped.shape)
grouped.head()

# Step 3: EDA and Visualization

In [None]:
# Line plot of the per-date mean series. A title and axis labels are set so the
# figure can be read on its own; the trailing semicolon keeps the cell from
# echoing a text repr underneath the figure.
ax = grouped.plot(title='Mean MedianSales by date (Baltimore rows)')
ax.set(xlabel='Date', ylabel='Mean MedianSales');

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from matplotlib import pyplot as plt

# Draw the autocorrelation and then the partial-autocorrelation correlogram,
# each on its own wide figure and limited to the first 9 lags.
for draw_correlogram in (plot_acf, plot_pacf):
    fig, ax = plt.subplots(figsize=(16, 3))
    draw_correlogram(grouped, ax=ax, lags=9)

# Step 4: ARIMA Modeling

In [None]:
# Import SARIMAX (seasonal ARIMA with optional exogenous regressors)
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm

# Specify a baseline seasonal ARIMA on the grouped series: non-seasonal order
# (1, 1, 1), seasonal order (1, 1, 0) with a period of 12, and a month-start
# ('MS') frequency for the date index. Nothing is estimated until fit() is called.
mod_arima = SARIMAX(
    grouped,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 0, 12),
    freq='MS',
)

In [None]:
# Estimate the parameters of mod_arima; the returned results object is kept in
# res_arima.
res_arima = mod_arima.fit()

In [None]:
from itertools import product

# AIC grid search over the non-seasonal (p, q) and seasonal (P, Q) orders, each
# drawn independently from {0, 1, 2}; d = D = 1 and the seasonal period of 12
# are held fixed.
# product(..., repeat=4) visits all 3**4 = 81 specifications. The previous
# combinations_with_replacement() only produced the 15 non-decreasing tuples
# (p <= q <= P <= Q), so e.g. (p=1, q=0) was never tried.
best_aic = float('inf')
best_model = None
for p, q, P, Q in product([0, 1, 2], repeat=4):
    mod_sarima = SARIMAX(grouped, order=(p, 1, q), seasonal_order=(P, 1, Q, 12), freq='MS')
    res_sarima = mod_sarima.fit()
    aic = res_sarima.aic
    if aic < best_aic:
        # Report every new best together with the orders that produced it.
        print((p, q, P, Q), aic)
        best_aic = aic
        best_model = mod_sarima
        
        


In [None]:
# Fit the model object that achieved the lowest AIC in the search and keep its
# results in best_res.
# NOTE(review): the search loop already fitted this specification once; this
# call estimates it a second time because only the model, not its results,
# was stored.
best_res = best_model.fit()

In [None]:
# Print the plain-text summary of the fitted best model.
print(best_res.summary())

In [None]:
# Plot the fitted model's predictions. predict() is called with no start/end,
# so it covers the sample the model was fitted on. An explicit figure/axes pair
# with a title and axis labels lets the chart stand alone; the trailing
# semicolon suppresses the text repr under the figure.
fig, ax = plt.subplots()
ax.plot(best_res.predict())
ax.set(title='SARIMA in-sample predictions', xlabel='Date', ylabel='Predicted MedianSales');

# Step 5: Interpreting Results