# How to Develop Simple Methods for Univariate Forecasting

In [26]:
# import libraries
from math import sqrt
from numpy import mean
from numpy import median
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from sklearn.metrics import mean_squared_error
from pandas import read_csv

In [27]:
# one-step simple forecast
def simple_forecast(history, config):
    """Make a one-step forecast from ``history`` with a naive strategy.

    ``config`` unpacks to ``(n, offset, avg_type)``:

    * ``avg_type == 'persist'``: return ``history[-n]``; ``offset`` is
      not consulted.
    * ``avg_type == 'mean'``: return the mean of the collected values.
    * any other ``avg_type``: return the median of the collected values.

    With ``offset == 1`` the collected values are the last ``n``
    observations. Otherwise they are the observations found ``offset``,
    ``2 * offset``, ..., ``n * offset`` steps back from the end.

    Raises ``Exception`` when ``offset != 1`` and ``n * offset`` exceeds
    ``len(history)``, or when fewer than two values were collected.
    """
    window, stride, strategy = config
    # persistence needs no averaging, so answer straight away
    if strategy == 'persist':
        return history[-window]
    if stride == 1:
        # contiguous tail of the series
        samples = history[-window:]
    else:
        # reject configs that would reach back past the first observation
        if window * stride > len(history):
            raise Exception('Config beyond end of data: %d %d' % (window, stride))
        # one value every `stride` steps, most recent first
        samples = [history[-(step * stride)] for step in range(1, window + 1)]
    # a single value cannot be averaged
    if len(samples) < 2:
        raise Exception('Cannot calculate average')
    # anything that is not 'mean' falls through to the median
    reducer = mean if strategy == 'mean' else median
    return reducer(samples)

In [28]:
# measure root mean squared error or rmse
def measure_rmse(actual, predicted):
    """Return the root mean squared error of ``predicted`` against ``actual``."""
    # RMSE is the square root of the mean squared error
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

In [29]:
# split a univariate dataset into train-test sets
def train_test_split(data, n_test):
    """Split a series into a leading train part and a trailing test part.

    ``data`` is any sliceable sequence and ``n_test`` is the number of
    trailing observations to hold out. Returns ``(train, test)`` where
    ``test`` is the last ``n_test`` items and ``train`` is everything
    before them.
    """
    # -0 == 0, so data[:-0] would be empty and data[-0:] the whole series,
    # i.e. train and test swapped; handle an empty test set explicitly.
    if n_test == 0:
        return data[:], data[:0]
    return data[:-n_test], data[-n_test:]


In [30]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    """Score ``cfg`` by walk-forward validation and return its RMSE.

    The last ``n_test`` observations of ``data`` form the test set. Each
    test step is forecast with ``simple_forecast`` from everything seen so
    far, after which the true observation is revealed to the history.
    """
    train, test = train_test_split(data, n_test)
    # forecasts start out knowing only the training portion
    seen = list(train)
    forecasts = []
    for observed in test:
        # predict the next step before its true value is revealed
        forecasts.append(simple_forecast(seen, cfg))
        # the actual observation becomes available for the next forecast
        seen.append(observed)
    # single error figure over the whole test span
    return measure_rmse(test, forecasts)

In [31]:
# score a model, return None on failure
def score_model(data, n_test, cfg, debug=False):
    """Evaluate one config and return ``(key, result)``.

    ``key`` is ``str(cfg)``. ``result`` is the value returned by
    ``walk_forward_validation`` or ``None`` if the evaluation raised.
    When ``debug`` is true, warnings are shown and exceptions propagate
    to the caller instead of being turned into ``None``.
    Successful scores are printed as they are produced.
    """
    # convert config to a key
    key = str(cfg)
    result = None
    if debug:
        # show all warnings and fail on exception if debugging
        result = walk_forward_validation(data, n_test, cfg)
    else:
        # one failure during model validation suggests an unstable config
        try:
            # never show warnings when grid searching, too noisy; the
            # evaluation must run inside the context manager, because the
            # "ignore" filter is undone as soon as the block exits
            with catch_warnings():
                filterwarnings("ignore")
                result = walk_forward_validation(data, n_test, cfg)
        except Exception:
            # treat the config as unusable, but let KeyboardInterrupt and
            # SystemExit through so a long grid search can be stopped
            result = None
    # check for an interesting result
    if result is not None:
        print(' > Model[%s] %.3f' % (key, result))
    return (key, result)

In [32]:
def grid_search(data, cfg_list, n_test, parallel=True):
    """Score every config in ``cfg_list`` and return the usable results.

    Each config is evaluated with ``score_model(data, n_test, cfg)``.
    With ``parallel`` true the evaluations are dispatched through joblib
    using one job per CPU; otherwise they run sequentially in this
    process.

    Returns a list of ``(key, score)`` tuples, with failed configs
    (score ``None``) removed, sorted by ascending score.
    """
    if parallel:
        # execute configs in parallel
        executor = Parallel(n_jobs=cpu_count(), backend='multiprocessing')
        tasks = (delayed(score_model)(data, n_test, cfg) for cfg in cfg_list)
        scores = executor(tasks)
    else:
        scores = [score_model(data, n_test, cfg) for cfg in cfg_list]
    # remove empty results; identity test is the idiomatic None check
    scores = [r for r in scores if r[1] is not None]
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    return scores

In [33]:
# create a set of simple configs to try
def simple_configs(max_length, offsets=(1,)):
    """Build the list of ``[n, offset, avg_type]`` configs to grid search.

    ``n`` runs from 1 to ``max_length`` inclusive, ``offset`` over
    ``offsets`` and ``avg_type`` over ``'persist'``, ``'mean'`` and
    ``'median'``, in that nesting order. ``offsets`` may be any iterable
    that can be traversed repeatedly; the default is an immutable tuple so
    that no mutable object is shared between calls.

    Every combination is emitted, including ones ``simple_forecast``
    treats alike ('persist' does not consult the offset).
    """
    configs = []
    for n in range(1, max_length + 1):
        for offset in offsets:
            for avg_type in ('persist', 'mean', 'median'):
                configs.append([n, offset, avg_type])
    return configs

In [34]:
# Smoke test: run the grid search on a small hand-written series.
if __name__ == '__main__':
    # define dataset: ten evenly spaced, strictly increasing values
    data = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
    # split data: the last 4 observations are held out for testing
    n_test = 4
    # model configs: try every n from 1 up to the size of the training part
    max_length = len(data) - n_test
    cfg_list = simple_configs(max_length)
    # grid search, evaluated sequentially (parallel execution disabled)
    scores = grid_search(data, cfg_list, n_test, parallel = False)
    print('done')
    # list top 3 configs; scores come back sorted by ascending error and
    # cfg is the string form of the config
    for cfg, error in scores[:3]:
        print(cfg, error)

 > Model[[1, 1, 'persist']] 10.000
 > Model[[2, 1, 'persist']] 20.000
 > Model[[2, 1, 'mean']] 15.000
 > Model[[2, 1, 'median']] 15.000
 > Model[[3, 1, 'persist']] 30.000
 > Model[[3, 1, 'mean']] 20.000
 > Model[[3, 1, 'median']] 20.000
 > Model[[4, 1, 'persist']] 40.000
 > Model[[4, 1, 'mean']] 25.000
 > Model[[4, 1, 'median']] 25.000
 > Model[[5, 1, 'persist']] 50.000
 > Model[[5, 1, 'mean']] 30.000
 > Model[[5, 1, 'median']] 30.000
 > Model[[6, 1, 'persist']] 60.000
 > Model[[6, 1, 'mean']] 35.000
 > Model[[6, 1, 'median']] 35.000
done
[1, 1, 'persist'] 10.0
[2, 1, 'mean'] 15.0
[2, 1, 'median'] 15.0


## Case Study 1: No Trend or Seasonality

In [35]:
# Grid search the simple forecasting configs on the daily female births data.
if __name__ == '__main__':
    # define dataset: first row is the header, first column is the index
    series = read_csv('../Data/Chapter 11/daily-total-female-births.csv', header=0, index_col=0)
    # NOTE(review): .values on a DataFrame is presumably a 2-D (rows, columns)
    # array, so each observation would be a length-1 row - confirm
    data = series.values
    # data split: the last 165 rows are held out for testing
    n_test = 165
    # model configs: try every n from 1 up to the size of the training part
    max_length = len(data) - n_test
    cfg_list = simple_configs(max_length)
    # grid search, evaluated sequentially (parallel execution disabled)
    scores = grid_search(data, cfg_list, n_test, parallel = False)
    print('done')
    # list top 3 configs; scores come back sorted by ascending error
    for cfg, error in scores[:3]:
        print(cfg, error)

 > Model[[1, 1, 'persist']] 8.722
 > Model[[2, 1, 'persist']] 9.284
 > Model[[2, 1, 'mean']] 7.884
 > Model[[2, 1, 'median']] 7.884
 > Model[[3, 1, 'persist']] 9.320
 > Model[[3, 1, 'mean']] 7.518
 > Model[[3, 1, 'median']] 7.324
 > Model[[4, 1, 'persist']] 9.249
 > Model[[4, 1, 'mean']] 7.290
 > Model[[4, 1, 'median']] 7.217
 > Model[[5, 1, 'persist']] 9.415
 > Model[[5, 1, 'mean']] 7.204
 > Model[[5, 1, 'median']] 7.337
 > Model[[6, 1, 'persist']] 9.737
 > Model[[6, 1, 'mean']] 7.222
 > Model[[6, 1, 'median']] 7.230
 > Model[[7, 1, 'persist']] 8.791
 > Model[[7, 1, 'mean']] 7.042
 > Model[[7, 1, 'median']] 7.340
 > Model[[8, 1, 'persist']] 10.172
 > Model[[8, 1, 'mean']] 7.140
 > Model[[8, 1, 'median']] 7.331
 > Model[[9, 1, 'persist']] 10.175
 > Model[[9, 1, 'mean']] 7.200
 > Model[[9, 1, 'median']] 7.302
 > Model[[10, 1, 'persist']] 9.956
 > Model[[10, 1, 'mean']] 7.209
 > Model[[10, 1, 'median']] 7.349
 > Model[[11, 1, 'persist']] 9.873
 > Model[[11, 1, 'mean']] 7.201
 > Model[[11

 > Model[[22, 1, 'median']] 7.104
 > Model[[23, 1, 'persist']] 9.759
 > Model[[23, 1, 'mean']] 6.932
 > Model[[23, 1, 'median']] 7.096
 > Model[[24, 1, 'persist']] 10.065
 > Model[[24, 1, 'mean']] 6.952
 > Model[[24, 1, 'median']] 7.099
 > Model[[25, 1, 'persist']] 9.973
 > Model[[25, 1, 'mean']] 6.963
 > Model[[25, 1, 'median']] 7.106
 > Model[[26, 1, 'persist']] 10.258
 > Model[[26, 1, 'mean']] 6.988
 > Model[[26, 1, 'median']] 7.117
 > Model[[27, 1, 'persist']] 10.059
 > Model[[27, 1, 'mean']] 7.000
 > Model[[27, 1, 'median']] 7.095
 > Model[[28, 1, 'persist']] 9.473
 > Model[[28, 1, 'mean']] 6.981
 > Model[[28, 1, 'median']] 7.111
 > Model[[29, 1, 'persist']] 10.185
 > Model[[29, 1, 'mean']] 6.999
 > Model[[29, 1, 'median']] 7.160
 > Model[[30, 1, 'persist']] 10.326
 > Model[[30, 1, 'mean']] 7.023
 > Model[[30, 1, 'median']] 7.140
 > Model[[31, 1, 'persist']] 10.180
 > Model[[31, 1, 'mean']] 7.038
 > Model[[31, 1, 'median']] 7.106
 > Model[[32, 1, 'persist']] 10.292
 > Model[[32, 1

## Case Study 2: Trend

In [36]:
# Grid search the simple forecasting configs on the shampoo sales data.
if __name__ == '__main__':
    # load dataset: first row is the header, first column is the index
    series = read_csv('../Data/Chapter 11/shampoo.csv', header=0, index_col=0)
    # NOTE(review): .values on a DataFrame is presumably a 2-D (rows, columns)
    # array, so each observation would be a length-1 row - confirm
    data = series.values
    # data split: the last 12 rows are held out for testing
    n_test = 12
    # model configs: try every n from 1 up to the size of the training part
    max_length = len(data) - n_test
    cfg_list = simple_configs(max_length)
    # grid search, evaluated sequentially (parallel execution disabled)
    scores = grid_search(data, cfg_list, n_test, parallel = False)
    print('done')
    # list top 3 configs; scores come back sorted by ascending error
    for cfg, error in scores[:3]:
        print(cfg, error)


 > Model[[1, 1, 'persist']] 136.761
 > Model[[2, 1, 'persist']] 95.695
 > Model[[2, 1, 'mean']] 96.011
 > Model[[2, 1, 'median']] 96.011
 > Model[[3, 1, 'persist']] 123.969
 > Model[[3, 1, 'mean']] 96.403
 > Model[[3, 1, 'median']] 103.634
 > Model[[4, 1, 'persist']] 136.813
 > Model[[4, 1, 'mean']] 100.609
 > Model[[4, 1, 'median']] 104.878
 > Model[[5, 1, 'persist']] 127.656
 > Model[[5, 1, 'mean']] 101.862
 > Model[[5, 1, 'median']] 101.819
 > Model[[6, 1, 'persist']] 165.500
 > Model[[6, 1, 'mean']] 110.425
 > Model[[6, 1, 'median']] 114.959
 > Model[[7, 1, 'persist']] 164.514
 > Model[[7, 1, 'mean']] 115.500
 > Model[[7, 1, 'median']] 124.605
 > Model[[8, 1, 'persist']] 186.545
 > Model[[8, 1, 'mean']] 122.807
 > Model[[8, 1, 'median']] 132.409
 > Model[[9, 1, 'persist']] 193.951
 > Model[[9, 1, 'mean']] 128.902
 > Model[[9, 1, 'median']] 136.633
 > Model[[10, 1, 'persist']] 205.511
 > Model[[10, 1, 'mean']] 135.664
 > Model[[10, 1, 'median']] 142.336
 > Model[[11, 1, 'persist']] 

## Case Study 3: Seasonality

In [37]:
# Grid search the simple forecasting configs on the monthly mean temperature data.
if __name__ == '__main__':
    # define dataset: first row is the header, first column is the index
    series = read_csv('../Data/Chapter 11/monthly-mean-temp.csv', header=0, index_col=0)
    # NOTE(review): .values on a DataFrame is presumably a 2-D (rows, columns)
    # array, so each observation would be a length-1 row - confirm
    data = series.values
    # data split: the last 12 rows are held out for testing
    n_test = 12
    # model configs: besides consecutive observations (offset 1), also sample
    # every 12th observation back - presumably one year for this monthly file
    max_length = len(data) - n_test
    cfg_list = simple_configs(max_length, offsets=[1,12])
    # grid search, evaluated sequentially (parallel execution disabled)
    scores = grid_search(data, cfg_list, n_test, parallel = False)
    print('done')
    # list top 3 configs; scores come back sorted by ascending error
    for cfg, error in scores[:3]:
        print(cfg, error)

 > Model[[1, 1, 'persist']] 5.143
 > Model[[1, 12, 'persist']] 5.143
 > Model[[2, 1, 'persist']] 8.492
 > Model[[2, 1, 'mean']] 6.538
 > Model[[2, 1, 'median']] 6.538
 > Model[[2, 12, 'persist']] 8.492
 > Model[[2, 12, 'mean']] 1.589
 > Model[[2, 12, 'median']] 1.589
 > Model[[3, 1, 'persist']] 12.006
 > Model[[3, 1, 'mean']] 8.155
 > Model[[3, 1, 'median']] 8.331
 > Model[[3, 12, 'persist']] 12.006
 > Model[[3, 12, 'mean']] 1.714
 > Model[[3, 12, 'median']] 2.078
 > Model[[4, 1, 'persist']] 14.252
 > Model[[4, 1, 'mean']] 9.442
 > Model[[4, 1, 'median']] 9.787
 > Model[[4, 12, 'persist']] 14.252
 > Model[[4, 12, 'mean']] 1.502
 > Model[[4, 12, 'median']] 1.643
 > Model[[5, 1, 'persist']] 15.418
 > Model[[5, 1, 'mean']] 10.311
 > Model[[5, 1, 'median']] 11.323
 > Model[[5, 12, 'persist']] 15.418
 > Model[[5, 12, 'mean']] 1.719
 > Model[[5, 12, 'median']] 1.801
 > Model[[6, 1, 'persist']] 15.725
 > Model[[6, 1, 'mean']] 10.813
 > Model[[6, 1, 'median']] 12.133
 > Model[[6, 12, 'persist'

## Case Study 4: Trend and Seasonality

In [38]:
# Grid search the simple forecasting configs on the monthly car sales data.
if __name__ == '__main__':
    # define dataset: first row is the header, first column is the index
    series = read_csv('../Data/Chapter 11/monthly-car-sales.csv', header=0, index_col=0)
    # NOTE(review): .values on a DataFrame is presumably a 2-D (rows, columns)
    # array, so each observation would be a length-1 row - confirm
    data = series.values
    # data split: the last 12 rows are held out for testing
    n_test = 12
    # model configs: besides consecutive observations (offset 1), also sample
    # every 12th observation back - presumably one year for this monthly file
    max_length = len(data) - n_test
    cfg_list = simple_configs(max_length, offsets=[1,12])
    # grid search, evaluated sequentially (parallel execution disabled)
    scores = grid_search(data, cfg_list, n_test, parallel = False)
    print('done')
    # list top 3 configs; scores come back sorted by ascending error
    for cfg, error in scores[:3]:
        print(cfg, error)

 > Model[[1, 1, 'persist']] 3783.966
 > Model[[1, 12, 'persist']] 3783.966
 > Model[[2, 1, 'persist']] 5171.658
 > Model[[2, 1, 'mean']] 4119.852
 > Model[[2, 1, 'median']] 4119.852
 > Model[[2, 12, 'persist']] 5171.658
 > Model[[2, 12, 'mean']] 2215.186
 > Model[[2, 12, 'median']] 2215.186
 > Model[[3, 1, 'persist']] 6000.907
 > Model[[3, 1, 'mean']] 4474.174
 > Model[[3, 1, 'median']] 4243.965
 > Model[[3, 12, 'persist']] 6000.907
 > Model[[3, 12, 'mean']] 2115.198
 > Model[[3, 12, 'median']] 1841.156
 > Model[[4, 1, 'persist']] 6285.170
 > Model[[4, 1, 'mean']] 4646.905
 > Model[[4, 1, 'median']] 4672.775
 > Model[[4, 12, 'persist']] 6285.170
 > Model[[4, 12, 'mean']] 2385.125
 > Model[[4, 12, 'median']] 2184.377
 > Model[[5, 1, 'persist']] 5668.407
 > Model[[5, 1, 'mean']] 4535.757
 > Model[[5, 1, 'median']] 5196.987
 > Model[[5, 12, 'persist']] 5668.407
 > Model[[5, 12, 'mean']] 2843.253
 > Model[[5, 12, 'median']] 2621.677
 > Model[[6, 1, 'persist']] 5798.153
 > Model[[6, 1, 'mea

## Case Study 5: GLO Historical Stock Price Data

In [40]:
# Grid search the simple forecasting configs on the GLO price history.
if __name__ == '__main__':
    # define dataset: first row is the header, first column is the index
    series = read_csv('../Data/csv/GLO.csv', header=0, index_col=0)
    # NOTE(review): .values keeps every non-index column; if GLO.csv has
    # several value columns the error is computed over all of them -
    # confirm which column is meant to be forecast
    data = series.values
    # keep only the 300 most recent rows
    data = data[-300:]
    # data split: the last 100 of those rows are held out for testing
    n_test = 100
    # model configs: consecutive observations (offset 1) plus every 12th
    # and every 24th observation back
    max_length = len(data) - n_test
    cfg_list = simple_configs(max_length, offsets=[1,12, 24])
    # grid search, evaluated sequentially (parallel execution disabled)
    scores = grid_search(data, cfg_list, n_test, parallel = False)
    print('done')
    # list top 3 configs; scores come back sorted by ascending error
    for cfg, error in scores[:3]:
        print(cfg, error)

 > Model[[1, 1, 'persist']] 18625.170
 > Model[[1, 12, 'persist']] 18625.170
 > Model[[1, 24, 'persist']] 18625.170
 > Model[[2, 1, 'persist']] 20166.062
 > Model[[2, 12, 'persist']] 20166.062
 > Model[[2, 24, 'persist']] 20166.062
 > Model[[3, 1, 'persist']] 20406.409
 > Model[[3, 12, 'persist']] 20406.409
 > Model[[3, 24, 'persist']] 20406.409
 > Model[[4, 1, 'persist']] 24135.980
 > Model[[4, 12, 'persist']] 24135.980
 > Model[[4, 24, 'persist']] 24135.980
 > Model[[5, 1, 'persist']] 24224.517
 > Model[[5, 12, 'persist']] 24224.517
 > Model[[5, 24, 'persist']] 24224.517
 > Model[[6, 1, 'persist']] 24299.264
 > Model[[6, 12, 'persist']] 24299.264
 > Model[[6, 24, 'persist']] 24299.264
 > Model[[7, 1, 'persist']] 24510.853
 > Model[[7, 12, 'persist']] 24510.853
 > Model[[7, 24, 'persist']] 24510.853
 > Model[[8, 1, 'persist']] 24920.722
 > Model[[8, 12, 'persist']] 24920.722
 > Model[[8, 24, 'persist']] 24920.722
 > Model[[9, 1, 'persist']] 24906.764
 > Model[[9, 12, 'persist']] 24906