In [1]:
import warnings
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


import statsmodels.api as sm
from matplotlib.pylab import rcParams
plt.style.use('ggplot')

In [10]:
! pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-1.8.2-cp38-cp38-macosx_10_15_x86_64.whl (592 kB)
[K     |████████████████████████████████| 592 kB 2.4 MB/s eta 0:00:01
Collecting numpy~=1.19.0
  Downloading numpy-1.19.5-cp38-cp38-macosx_10_9_x86_64.whl (15.6 MB)
[K     |████████████████████████████████| 15.6 MB 16.0 MB/s eta 0:00:01
Collecting statsmodels!=0.12.0,>=0.11
  Downloading statsmodels-0.12.2-cp38-cp38-macosx_10_15_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 12.6 MB/s eta 0:00:01     |████████████▏                   | 3.6 MB 12.6 MB/s eta 0:00:01
Installing collected packages: numpy, statsmodels, pmdarima
  Attempting uninstall: numpy
    Found existing installation: numpy 1.18.5
    Uninstalling numpy-1.18.5:
      Successfully uninstalled numpy-1.18.5
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.12.0
    Uninstalling statsmodels-0.12.0:
      Successfully uninstalled statsmodels-0.12.0
[31mERROR: After October 202

In [2]:
def get_datetimes(df):
    """
    Takes a dataframe:
    returns only those column names that can be converted into datetime objects 
    as datetime objects.
    NOTE number of returned columns may not match total number of columns in passed dataframe
    """
    
    return pd.to_datetime(df.columns.values[7:], format='%Y-%m')


def melt_data(df):
    """
    Takes the zillow_data dataset in wide form or a subset of the zillow_dataset.  
    Returns a long-form datetime dataframe 
    with the datetime column names as the index and the values as the 'values' column.
    
    If more than one row is passes in the wide-form dataset, the values column
    will be the mean of the values from the datetime columns in all of the rows.
    """
    
    melted = pd.melt(df, id_vars=['RegionName', 'RegionID', 'SizeRank', 'City', 'State', 'Metro', 'CountyName'], var_name='time')
    melted['time'] = pd.to_datetime(melted['time'], infer_datetime_format=True)
    melted = melted.dropna(subset=['value'])
    return melted.groupby('time').aggregate({'value':'mean'})



# Import the check_stationarity function from previous lab
def stationarity_check(TS):
    
    # Import adfuller
    from statsmodels.tsa.stattools import adfuller
    
    # Calculate rolling statistics
    roll_mean = TS.rolling(window=8, center=False).mean()
    roll_std = TS.rolling(window=8, center=False).std()
    
    # Perform the Dickey Fuller test
    dftest = adfuller(TS) 
    
    # Plot rolling statistics:
    fig = plt.figure(figsize=(12,6))
    orig = plt.plot(TS, color='blue',label='Original')
    mean = plt.plot(roll_mean, color='red', label='Rolling Mean')
    std = plt.plot(roll_std, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    # Print Dickey-Fuller test results
    print('Results of Dickey-Fuller Test: \n')

    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 
                                             '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)
    
    return None

In [3]:
#Loading our dataset
df = pd.read_csv('data/zillow_data.csv')

In [6]:
#dataframe containing regions within the Miami-Fort Lauderdale metro region
miami_metro = df[df['Metro']== 'Miami-Fort Lauderdale']

In [7]:
#Dataframe containining historical prices of the 33480 Zip Code.

zip33480 = miami_metro[miami_metro['RegionName']==33480]

In [8]:
# Time Series dataframe of 33480 Zip Code prices

series33480 = melt_data(zip33480)

In [15]:
import pmdarima as pm

model = pm.auto_arima(series33480, start_p=0, start_q=0,
                     test='adf',
                     max_p=5, max_q=5,
                     m=1,
                     d=0,
                     seasonal = True,
                     start_P=0,start_Q=0,
                     D=0,
                     trace=True,
                     error_action= 'ignore',
                     suppress_warnings=True,
                     stepwise=True,with_intercept=False)

output = model.fit()

print(model.summary())
model.plot_diagnostics(figsize=(18,18))
plt.show()

Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=8625.676, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=inf, Time=0.11 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=8436.157, Time=0.06 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=5828.589, Time=0.33 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=5403.561, Time=0.46 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=inf, Time=0.25 sec
 ARIMA(3,0,1)(0,0,0)[0]             : AIC=inf, Time=0.71 sec
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=5392.761, Time=0.61 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.53 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=5401.190, Time=0.65 sec
 ARIMA(2,0,3)(0,0,0)[0]             : AIC=inf, Time=0.97 sec
 ARIMA(1,0,3)(0,0,0)[0]             : AIC=5781.442, Time=0.61 sec
 ARIMA(3,0,3)(0,0,0)[0]             : AIC=inf, Time=0.99 sec
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=5392.860, Time=0.37 sec

Best model:  ARIMA(2,0,2)(0,0,0)[0]          
Total fit time: 

TypeError: fit() missing 1 required positional argument: 'y'