In [1]:
# Record which sktime version this notebook was executed against.
import sktime
print(sktime.__version__)

0.17.0


In [2]:
import pandas as pd
import numpy as np
import os

In [58]:
# Load the NYC city-level CSV and print its schema
# (column names, non-null counts, dtypes).
csv_path = '../../data/final/FINAL_city_nyc.csv'
dfs = pd.read_csv(csv_path)
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              974 non-null    object 
 1   city              974 non-null    object 
 2   dayofweek         974 non-null    int64  
 3   month             974 non-null    int64  
 4   year              974 non-null    int64  
 5   day_condition_le  974 non-null    int64  
 6   temp_mean         974 non-null    float64
 7   precip            974 non-null    int64  
 8   visibility_mean   974 non-null    float64
 9   uvindex_mean      974 non-null    float64
 10  nyc_gasprice_avg  974 non-null    float64
 11  traffic_stdS_PC1  974 non-null    float64
 12  traffic_stdS_PC2  974 non-null    float64
 13  city_rco_T        974 non-null    int64  
dtypes: float64(6), int64(6), object(2)
memory usage: 106.7+ KB


In [59]:
# Display the column names of the loaded frame.
dfs.columns

Index(['date', 'city', 'dayofweek', 'month', 'year', 'day_condition_le',
       'temp_mean', 'precip', 'visibility_mean', 'uvindex_mean',
       'nyc_gasprice_avg', 'traffic_stdS_PC1', 'traffic_stdS_PC2',
       'city_rco_T'],
      dtype='object')

# Moving averages

In [5]:
# Smooth the daily series with a 14-row moving average and plot both curves.
#
# The loaded frame has no 'ride_count_outflow' column (see dfs.info() /
# dfs.columns above), so indexing it raises KeyError on a fresh kernel.
# The series stored in this frame, and used as the forecasting target further
# down, is 'city_rco_T'.
import plotly.express as px

# create 14 day moving average (the first 13 rows are NaN until the window fills)
dfs['MA14'] = dfs['city_rco_T'].rolling(14).mean()

# plot the data and MA
# NOTE(review): the title names a single station, while the file loaded above
# is the city-level CSV -- confirm and update the title if needed.
fig = px.line(dfs, x="date",
              y=["city_rco_T", "MA14"],
              template='plotly_dark',
              title='demand for busiest city bike NYC station: W 21 St & 6 Ave (ID-6140.05)')
fig.show()

# Time series with external factors

In [6]:
# Star import: pulls PyCaret's functional time-series API into the notebook
# namespace. The cells below rely on names that come from here
# (setup, create_model, predict_model, plot_model, check_stats, tune_model).
from pycaret.time_series import *
# Alternative object-oriented entry point, currently unused:
# from pycaret.time_series import TSForecastingExperiment
# exp = TSForecastingExperiment()

In [60]:
# Re-check the available columns before configuring the experiment below.
dfs.columns

Index(['date', 'city', 'dayofweek', 'month', 'year', 'day_condition_le',
       'temp_mean', 'precip', 'visibility_mean', 'uvindex_mean',
       'nyc_gasprice_avg', 'traffic_stdS_PC1', 'traffic_stdS_PC2',
       'city_rco_T'],
      dtype='object')

In [61]:
# Configure the PyCaret forecasting experiment.
#   index='date'          -> the date column becomes the time index
#   target                -> the series to forecast
#   fh=14                 -> forecast horizon of 14 steps
#   fold=5, 'rolling'     -> 5 rolling cross-validation folds
#   session_id            -> fixed seed so repeated runs are reproducible
#
# Every column other than the index and the target is ignored, so the models
# are fit on the target series alone. The ignore list is derived from the
# frame instead of being spelled out by hand, so it stays correct whether or
# not helper columns such as 'MA14' have been added to dfs.
target_col = 'city_rco_T'
ignored_cols = [col for col in dfs.columns if col not in ('date', target_col)]

s = setup(dfs, index='date', target=target_col, ignore_features=ignored_cols,
          fh=14, fold=5, fold_strategy='rolling', n_jobs=4, use_gpu=False,
          session_id=42, verbose=True)

In [62]:
# Inspect the experiment's 'X' config entry.
# NOTE(review): presumably the exogenous-feature frame; since setup() ignores
# every non-target column it is expected to carry no features -- verify.
s.get_config('X')

In [63]:
# Drop the metrics that should not appear in the score tables;
# the result tables below report RMSE, MAPE and R2 only.
unwanted_metrics = ('mae', 'smape', 'mase', 'rmsse')
for metric_id in unwanted_metrics:
    s.remove_metric(metric_id)

In [64]:
# Fit the baseline ARIMA model; the per-fold scores are shown as cell output.
arima = create_model('arima')

Unnamed: 0,cutoff,RMSE,MAPE,R2
0,2022-07-23,4089.2409,0.0632,0.2308
1,2022-08-06,3488.2437,0.0515,0.3455
2,2022-08-20,5237.8745,0.0804,-0.3879
3,2022-09-03,12830.9564,0.2566,0.095
4,2022-09-17,16627.4281,0.2523,-2.5195
Mean,NaT,8454.7487,0.1408,-0.4472
SD,NaT,5291.8048,0.0933,1.0658


In [50]:
# Show the hyperparameters of the baseline ARIMA model
# (see 'order' and 'seasonal_order' in the output).
arima.get_params()

{'concentrate_scale': False,
 'enforce_invertibility': True,
 'enforce_stationarity': True,
 'hamilton_representation': False,
 'maxiter': 50,
 'measurement_error': False,
 'method': 'lbfgs',
 'mle_regression': True,
 'order': (1, 0, 0),
 'out_of_sample_size': 0,
 'scoring': 'mse',
 'scoring_args': None,
 'seasonal_order': (0, 1, 0, 14),
 'simple_differencing': False,
 'start_params': None,
 'time_varying_regression': False,
 'trend': None,
 'with_intercept': True}

In [65]:
# Forecast 14 steps ahead with the baseline ARIMA model and keep the predictions.
a_pred = predict_model(arima, fh=14)

Unnamed: 0,Model,RMSE,MAPE,R2
0,ARIMA,17768.3316,0.3969,-0.8199


In [66]:
# Forecast plot for the baseline ARIMA model.
plot_model(estimator=arima, plot='forecast')

In [24]:
# Run PyCaret's statistical checks for the baseline ARIMA model and display
# only the summary-statistics and stationarity rows.
tests = check_stats(estimator=arima)
wanted_tests = ['Stationarity', 'Summary']
tests[tests['Test'].isin(wanted_tests)]

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Summary,Statistics,Residual,Length,,960.0
1,Summary,Statistics,Residual,# Missing Values,,0.0
2,Summary,Statistics,Residual,Mean,,3.364065
3,Summary,Statistics,Residual,Median,,2.0634
4,Summary,Statistics,Residual,Standard Deviation,,93.929419
5,Summary,Statistics,Residual,Variance,,8822.735701
6,Summary,Statistics,Residual,Kurtosis,,1.44207
7,Summary,Statistics,Residual,Skewness,,0.159177
8,Summary,Statistics,Residual,# Distinct Values,,959.0
15,Stationarity,ADF,Residual,Stationarity,{'alpha': 0.05},True


In [67]:
# Hyperparameter search for the ARIMA model, selecting the candidate by RMSE.
t_arima = tune_model(arima, optimize='rmse')

Unnamed: 0,cutoff,RMSE,MAPE,R2
0,2022-07-23,3761.3063,0.0532,0.3492
1,2022-08-06,4352.8302,0.0685,-0.0191
2,2022-08-20,4614.8096,0.0628,-0.0774
3,2022-09-03,12196.0642,0.2477,0.1823
4,2022-09-17,9578.4587,0.1478,-0.1679
Mean,NaT,6900.6938,0.116,0.0534
SD,NaT,3369.9852,0.074,0.1873


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.9s


In [68]:
# Forecast 14 steps ahead with the tuned ARIMA model and keep the predictions.
at_pred = predict_model(t_arima, fh=14)

Unnamed: 0,Model,RMSE,MAPE,R2
0,ARIMA,10062.7088,0.2215,0.4163


In [28]:
# Show the hyperparameters selected by tune_model for the tuned ARIMA.
t_arima.get_params()

{'concentrate_scale': False,
 'enforce_invertibility': True,
 'enforce_stationarity': True,
 'hamilton_representation': False,
 'maxiter': 50,
 'measurement_error': False,
 'method': 'lbfgs',
 'mle_regression': True,
 'order': (1, 1, 1),
 'out_of_sample_size': 0,
 'scoring': 'mse',
 'scoring_args': None,
 'seasonal_order': (1, 0, 0, 13),
 'simple_differencing': False,
 'start_params': None,
 'time_varying_regression': False,
 'trend': None,
 'with_intercept': True}

In [29]:
# Show the baseline ARIMA hyperparameters again, for comparison with the tuned model.
arima.get_params()

{'concentrate_scale': False,
 'enforce_invertibility': True,
 'enforce_stationarity': True,
 'hamilton_representation': False,
 'maxiter': 50,
 'measurement_error': False,
 'method': 'lbfgs',
 'mle_regression': True,
 'order': (1, 0, 0),
 'out_of_sample_size': 0,
 'scoring': 'mse',
 'scoring_args': None,
 'seasonal_order': (0, 1, 0, 13),
 'simple_differencing': False,
 'start_params': None,
 'time_varying_regression': False,
 'trend': None,
 'with_intercept': True}

In [36]:
# Forecast plot for the tuned ARIMA model.
plot_model(estimator=t_arima, plot='forecast')

In [31]:
# Residuals plot for the tuned ARIMA model.
plot_model(estimator=t_arima, plot='residuals')

In [37]:
# Autocorrelation (ACF) plot for the tuned ARIMA model.
plot_model(estimator=t_arima, plot='acf')

In [42]:
# 'ts' plot for the tuned ARIMA model.
plot_model(estimator=t_arima, plot='ts')

In [41]:
# Interactive lookup of plot_model's signature and docstring; leftover from
# exploration and safe to drop once the notebook is finalised.
help(plot_model)

Help on function plot_model in module pycaret.time_series.forecasting.functional:

plot_model(estimator: Union[Any, NoneType] = None, plot: Union[str, NoneType] = None, return_fig: bool = False, return_data: bool = False, verbose: bool = False, display_format: Union[str, NoneType] = None, data_kwargs: Union[Dict, NoneType] = None, fig_kwargs: Union[Dict, NoneType] = None, save: Union[str, bool] = False) -> Union[Tuple[str, list], NoneType]
    This function analyzes the performance of a trained model on holdout set.
    When used without any estimator, this function generates plots on the
    original data set. When used with an estimator, it will generate plots on
    the model residuals.
    
    
    Example
    --------
    >>> from pycaret.datasets import get_data
    >>> airline = get_data('airline')
    >>> from pycaret.time_series import *
    >>> exp_name = setup(data = airline,  fh = 12)
    >>> plot_model(plot="diff", data_kwargs={"order_list": [1, 2], "acf": True, "pacf": T