In [47]:
import glob
import math
import os
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
import statsmodels
from pmdarima import arima
from pmdarima import model_selection
from pmdarima import preprocessing
from pmdarima.pipeline import Pipeline

In [48]:
# Mean Absolute Percentage Error
def mape(preds, labels):
    """Print and return the mean absolute percentage error as a fraction.

    Each absolute error is divided by the absolute value of its label and the
    ratios are averaged over the first axis. The result is NOT multiplied by
    100 (0.1 means 10%).

    Args:
        preds: array-like of predicted values.
        labels: array-like of true values with the same shape as ``preds``.

    Returns:
        The averaged ratio as a NumPy float.

    Raises:
        ValueError: if ``preds`` and ``labels`` do not have the same shape.
            (Previously a length mismatch was silently truncated by ``zip``
            while still dividing by ``len(preds)``, giving a wrong score.)
    """
    preds = np.asarray(preds, dtype=float)
    labels = np.asarray(labels, dtype=float)
    if preds.shape != labels.shape:
        raise ValueError(
            "preds and labels must have the same shape, got {} and {}".format(
                preds.shape, labels.shape
            )
        )

    # A zero label would divide by zero. The fallback denominator of 100 is a
    # heuristic kept from the original implementation (there flagged as
    # "this might be wrong"); it effectively treats the label as 100.
    denominators = np.where(labels != 0, np.absolute(labels), 100.0)
    err = np.sum(np.absolute(preds - labels) / denominators) / preds.shape[0]
    print("MAPE - {}".format(err))
    return err
    
# Mean Squared Error (also known as the Brier Score)
def mse(preds, labels):
    """Print and return the mean squared error between preds and labels.

    The element-wise differences are squared, summed, and divided by
    ``preds.shape[0]``.
    """
    residuals = preds - labels
    squared_total = np.sum(np.power(residuals, 2))
    sample_count = preds.shape[0]
    err = squared_total / sample_count
    print("MSE - {}".format(err))
    return err

# Mean Absolute Error
def mae(preds, labels):
    """Print and return the mean absolute error between preds and labels.

    The absolute element-wise differences are summed and divided by
    ``preds.shape[0]``.
    """
    absolute_errors = np.abs(preds - labels)
    sample_count = preds.shape[0]
    err = np.sum(absolute_errors) / sample_count
    print("MAE - {}".format(err))
    return err
    
# Root Mean Squared Error
def rmse(preds, labels):
    """Print and return the square root of ``mse(preds, labels)``.

    Because the value is obtained by calling ``mse``, that function's
    "MSE - ..." line is printed first, followed by the "RMSE - ..." line.
    """
    mean_squared = mse(preds, labels)
    err = np.power(mean_squared, 0.5)
    print("RMSE - {}".format(err))
    return err

# Calculates the MAE, RMSE, and MAPE of the predictions, in that order.
# Returns the three results in a list
def calculate_errors(preds, labels):
    """Evaluate ``mae``, ``rmse`` and ``mape`` on the same inputs.

    The metrics are called in that order (each prints its own value) and the
    results are returned as ``[mae, rmse, mape]``.
    """
    metrics = (mae, rmse, mape)
    results = []
    for metric in metrics:
        results.append(metric(preds, labels))
    return results


Trains an ARIMA model for every dataset, then forecasts the held-out test set with it. Fitted models are saved to disk as pickle files so they can be reused. Returns a dictionary of predictions.

In [4]:
save_path = 'arima_output'


# Freq is the frequency in minutes to downsample to,
# split is train/test set split. Parameters are (p,d,q)
# Seasonal parameters are (P, D, Q). Assumes daily seasons
# Each prediction set is in an (ARIMA, mean, naive) tuple
def train_arima_models(freq=20, split=0.8, parameters=(1,0,1), seasonal_parameters=(3, 0, 3), load_from_disk=True, samples_dir='samples'):
    """Fit (or load) one seasonal ARIMA model per CSV and evaluate its forecast.

    For every ``*.csv`` in ``samples_dir`` the ``CpuUtilizationAverage`` column
    is downsampled to ``freq``-minute means and split into train/test. An
    ARIMA forecast of the test span is compared against two baselines (the
    training mean and the last training value). Per-dataset plots, the fitted
    models, three result CSVs and a MAE bar chart are written under the
    module-level ``save_path``.

    Args:
        freq: downsampling interval in minutes; also determines the seasonal
            period as the number of intervals per day.
        split: fraction of the downsampled series used for training.
        parameters: non-seasonal ARIMA order ``(p, d, q)``.
        seasonal_parameters: seasonal ARIMA order ``(P, D, Q)``.
        load_from_disk: when True, reuse a previously pickled model if one
            exists for the dataset; otherwise (or when no pickle is found)
            the model is trained and pickled.
        samples_dir: directory that holds the input CSV files.

    Returns:
        dict mapping dataset name to an
        ``(arima_forecast, mean_forecast, naive_forecast)`` tuple of arrays.

    Raises:
        FileNotFoundError: if ``samples_dir`` contains no CSV files.
    """
    p, d, q = parameters
    P, D, Q = seasonal_parameters
    # Number of downsampled points per day, used as the seasonal period.
    seasonality = (24 * 60) // freq
    # Number of leading test points used for the "short term" error columns.
    short_horizon = 3

    # Sorted so that rows, plots and bars come out in a reproducible order.
    filenames = sorted(glob.glob(os.path.join(samples_dir, '*.csv')))
    if not filenames:
        raise FileNotFoundError(f"No CSV files found in '{samples_dir}'")

    models_dir = os.path.join(save_path, 'models')
    plots_dir = os.path.join(save_path, 'plots')
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    # Column layout mirrors the order in which errors are collected below:
    # for each forecast (ARIMA, mean, naive) the full-horizon MAE/RMSE/MAPE
    # followed by the short-horizon MAE/RMSE/MAPE.
    metric_names = ['MAE', 'RMSE', 'MAPE']
    columns = []
    for prefix in ['', 'MEAN', 'NAIVE']:
        for metric_name in metric_names:
            columns.append(f"{prefix}_{metric_name}" if prefix else metric_name)
        for metric_name in metric_names:
            short_label = f"{prefix} {metric_name}" if prefix else metric_name
            columns.append(f"{short_horizon}pt {short_label}")

    datasets = []
    rows = []
    predictions = {}
    arima_maes = []
    mean_maes = []
    for filename in filenames:
        dataset = os.path.splitext(os.path.basename(filename))[0]
        datasets.append(dataset)

        # Generates train and test sets
        data = pd.read_csv(filename, delimiter=',', index_col=0, parse_dates=True)
        # 'min' is the minute alias; the older 'T' spelling is deprecated.
        # NOTE(review): empty resample bins would yield NaN here - confirm
        # the input series has no gaps longer than `freq` minutes.
        downsampled = data.resample(f"{freq}min").mean()
        raw_values = np.asarray(downsampled['CpuUtilizationAverage'])
        tsize = math.floor(raw_values.shape[0] * split)
        train, test = model_selection.train_test_split(raw_values, train_size=tsize)

        model_filename = os.path.join(models_dir, f"{dataset}.arima")

        if load_from_disk and os.path.exists(model_filename):
            # SECURITY: pickle.load can execute arbitrary code; only load
            # model files that this notebook produced itself.
            with open(model_filename, "rb") as model_file:
                model = pickle.load(model_file)
            print(f"Loaded {dataset} from disk")
        else:
            # Train ARIMA model
            start = datetime.now()
            model = arima.ARIMA(
                order=(p, d, q),
                seasonal_order=(P, D, Q, seasonality),
                method='powell',
            )
            model.fit(train)
            end = datetime.now()
            print(f"Trained {dataset} in " + str(end - start))

            # Save model to disk
            with open(model_filename, "wb") as model_file:
                pickle.dump(model, model_file)

        # Predict future
        arima_forecast = np.asarray(model.predict(n_periods=test.shape[0]))

        # Calculate baseline predictions
        mean_forecast = np.full(test.shape[0], train.mean())
        naive_forecast = np.full(test.shape[0], train[-1])
        predictions[dataset] = (arima_forecast, mean_forecast, naive_forecast)

        # Create plot of prediction vs actual
        fig, ax = plt.subplots()
        x = np.arange(raw_values.shape[0])
        ax.plot(x[:tsize], train, c='blue')
        ax.plot(x[tsize:], arima_forecast, c='green')
        ax.plot(x[tsize:], test, c='blue', alpha=0.3)
        ax.set_xlabel('Time (Days)')
        ax.set_ylabel('CPU Usage')
        # One tick per day, labelled with the day number.
        ticks = np.arange(0, raw_values.shape[0], seasonality)
        ax.set_xticks(ticks)
        ax.set_xticklabels([str(day) for day in ticks // seasonality])
        fig.savefig(os.path.join(plots_dir, f"{dataset}.png"), format='png')
        # Close the figure so a long loop does not accumulate open figures.
        plt.close(fig)

        # Calculate errors
        errors = []
        for forecast in [arima_forecast, mean_forecast, naive_forecast]:
            errors.extend(calculate_errors(forecast, test))
            errors.extend(calculate_errors(forecast[:short_horizon], test[:short_horizon]))
        row = dict(zip(columns, errors))
        rows.append(row)

        arima_maes.append(row['MAE'])
        mean_maes.append(row['MEAN_MAE'])

    # Build the results table once instead of concatenating row by row.
    df = pd.DataFrame(rows, index=datasets, columns=columns, dtype=np.float64)

    # Calculate summary stats over the per-dataset rows only
    summary = df.agg(['mean', 'max', 'min']).rename(
        index={'mean': 'Average', 'max': 'Maximum', 'min': 'Minimum'}
    )
    df = pd.concat([df, summary])

    # Saves everything to CSV
    df.to_csv(os.path.join(save_path, 'all_results.csv'))

    # Separate CSV just for short term results
    short_term_columns = [
        f"{short_horizon}pt MAE",
        f"{short_horizon}pt MAPE",
        f"{short_horizon}pt MEAN MAE",
        f"{short_horizon}pt MEAN MAPE",
    ]
    df[short_term_columns].to_csv(
        os.path.join(save_path, 'short_term_results.csv'), float_format="%.2f"
    )

    # Separate CSV just for long term results
    long_term_columns = ['MAE', 'MAPE', 'MEAN_MAE', 'MEAN_MAPE']
    df[long_term_columns].to_csv(
        os.path.join(save_path, 'long_term_results.csv'), float_format="%.2f"
    )

    # Creates graph showing MAE of all files
    fig, ax = plt.subplots(figsize=(16, 6), dpi=80, facecolor='w', edgecolor='k')
    positions = np.arange(len(datasets))
    bar_width = 0.25
    arima_bars = ax.bar(positions, arima_maes, color='blue', width=bar_width, label='ARIMA')
    mean_bars = ax.bar(positions + bar_width, mean_maes, color='red', width=bar_width, label='Naive Mean')
    # Centre each label between the two bars of its dataset.
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels(datasets, rotation=90)
    ax.legend(handles=[arima_bars, mean_bars])
    ax.set_ylabel("Mean Absolute Error")
    ax.set_xlabel("Time Series")
    fig.savefig(os.path.join(save_path, 'mae_plot.png'), dpi=200, bbox_inches='tight')

    # Prints percent of times ARIMA outperformed Naive mean
    wins = sum(
        1 for arima_mae, mean_mae in zip(arima_maes, mean_maes)
        if arima_mae < mean_mae
    )
    print("Percentage of times ARIMA outperformed mean:")
    print(wins / len(datasets) * 100)

    return predictions
