In [1]:
import pandas as pd
from statsforecast import StatsForecast
from utilsforecast.losses import mse
from utilsforecast.evaluation import evaluate
from prophet import Prophet
from prophet.diagnostics import performance_metrics
from prophet.diagnostics import cross_validation
from prophet.plot import plot_cross_validation_metric

# Load the dataset
file_path = 'unique_dataset_rnd.csv'
data = pd.read_csv(file_path, parse_dates=True)

uids = data['unique_id'].unique()[:2] # Select 10 ids to make the example faster
data = data.query('unique_id in @uids')


# Creating the two univariate datasets with the corrected timestamp
cpu_usage_dataset_with_corrected_timestamp = data[['timestamp', 'CPU usage [%]', 'unique_id']].copy()
cpu_usage_dataset_with_corrected_timestamp  = cpu_usage_dataset_with_corrected_timestamp.rename(columns={'CPU usage [%]': 'y', 'timestamp': 'ds'})


#Resample to hours
cpu_usage_dataset_with_corrected_timestamp['ds'] = pd.to_datetime(cpu_usage_dataset_with_corrected_timestamp['ds'])
cpu_usage_dataset_with_corrected_timestamp.set_index('ds', inplace=True)
cpu_usage_dataset_with_corrected_timestamp = cpu_usage_dataset_with_corrected_timestamp.groupby('unique_id').resample('H').mean()


cpu_usage_dataset_with_corrected_timestamp = cpu_usage_dataset_with_corrected_timestamp.reset_index()

cpu_usage_dataset_with_corrected_timestamp = cpu_usage_dataset_with_corrected_timestamp .groupby('unique_id').tail(7 * 24)


cpu_usage_dataset_with_corrected_timestamp

  from tqdm.autonotebook import tqdm
Importing plotly failed. Interactive plots will not work.


Unnamed: 0,unique_id,ds,y
2016,253_CPU,2013-09-23 00:00:00,14.368750
2017,253_CPU,2013-09-23 01:00:00,2.156944
2018,253_CPU,2013-09-23 02:00:00,2.161111
2019,253_CPU,2013-09-23 03:00:00,2.150000
2020,253_CPU,2013-09-23 04:00:00,2.170833
...,...,...,...
4363,306_CPU,2013-09-29 19:00:00,55.133333
4364,306_CPU,2013-09-29 20:00:00,55.102778
4365,306_CPU,2013-09-29 21:00:00,55.211111
4366,306_CPU,2013-09-29 22:00:00,55.147222


In [2]:
import os
# this makes it so that the outputs of the predict methods have the id as a column 
# instead of as the index
os.environ['NIXTLA_ID_AS_COL'] = '1'

def evaluate_cross_validation(df, metric):
    models = df.drop(columns=['unique_id', 'ds', 'cutoff', 'y']).columns.tolist()
    evals = []
    # Calculate loss for every unique_id and cutoff.    
    for cutoff in df['cutoff'].unique():
        eval_ = evaluate(df[df['cutoff'] == cutoff], metrics=[metric], models=models)
        evals.append(eval_)
    evals = pd.concat(evals)
    evals = evals.groupby('unique_id').mean(numeric_only=True) # Averages the error metrics for all cutoffs for every combination of model and unique_id
    evals['best_model'] = evals.idxmin(axis=1)
    return evals


In [3]:
from statsforecast.models import (
    AutoARIMA,
    AutoTheta,
    AutoETS,
    AutoCES,
    SeasonalNaive,
    WindowAverage,
    SeasonalWindowAverage,
    Naive
)

models = [
    AutoARIMA(season_length=24),
    AutoTheta(season_length=24),
    AutoETS(season_length=24),
    AutoCES(season_length=24),
    SeasonalNaive(season_length=24), 
    WindowAverage(window_size=24), 
    SeasonalWindowAverage(window_size=1, season_length=24),
    Naive()
]

sf = StatsForecast( 
    models=models,
    freq='H',
    fallback_model = SeasonalNaive(season_length=24),
    n_jobs=-1,
)

crossvaldation_df = sf.cross_validation(
    df=cpu_usage_dataset_with_corrected_timestamp,
    h=24,
    step_size=48,
    n_windows=1
)



In [4]:
crossvaldation_df

Unnamed: 0,unique_id,ds,cutoff,y,AutoARIMA,AutoTheta,AutoETS,CES,SeasonalNaive,WindowAverage,SeasWA,Naive
0,253_CPU,2013-09-29 00:00:00,2013-09-28 23:00:00,2.093055,2.068147,2.096954,2.095835,3.21097,2.0625,2.095023,2.0625,2.095833
1,253_CPU,2013-09-29 01:00:00,2013-09-28 23:00:00,2.106945,2.040827,2.096206,2.095835,1.859447,2.075,2.095023,2.075,2.095833
2,253_CPU,2013-09-29 02:00:00,2013-09-28 23:00:00,2.051389,2.013868,2.095459,2.095835,1.882877,2.130556,2.095023,2.130556,2.095833
3,253_CPU,2013-09-29 03:00:00,2013-09-28 23:00:00,2.094445,1.987264,2.094711,2.095835,1.830173,2.073611,2.095023,2.073611,2.095833
4,253_CPU,2013-09-29 04:00:00,2013-09-28 23:00:00,2.0625,1.961012,2.093963,2.095835,1.853774,2.05,2.095023,2.05,2.095833
5,253_CPU,2013-09-29 05:00:00,2013-09-28 23:00:00,2.130556,1.935107,2.093215,2.095835,1.865206,2.145833,2.095023,2.145833,2.095833
6,253_CPU,2013-09-29 06:00:00,2013-09-28 23:00:00,2.133333,1.909544,2.092467,2.095835,1.819296,2.080555,2.095023,2.080555,2.095833
7,253_CPU,2013-09-29 07:00:00,2013-09-28 23:00:00,2.070833,1.884319,2.091719,2.095835,1.824863,2.151389,2.095023,2.151389,2.095833
8,253_CPU,2013-09-29 08:00:00,2013-09-28 23:00:00,2.098611,1.859427,2.090971,2.095835,1.81757,2.080555,2.095023,2.080555,2.095833
9,253_CPU,2013-09-29 09:00:00,2013-09-28 23:00:00,2.086111,1.834864,2.090223,2.095835,1.830566,2.090278,2.095023,2.090278,2.095833


In [5]:
grouped = cpu_usage_dataset_with_corrected_timestamp.groupby('unique_id')

for unique_id, group_df in grouped:
    # Initialize and fit the Prophet model
    model = Prophet()
    model.fit(group_df)
    df_cv = cross_validation(model, horizon='2 days', initial='3 days')
    df_cv = df_cv.sort_values(by='ds')
    df_cv['unique_id'] = unique_id
    df_new = df_cv[['ds', 'unique_id', 'yhat']].rename(columns={'yhat': 'prophet'})
    print(df_cv)
    # If 'prophet' already exists in crossvaldation_df, prepare to merge and resolve the column values
    if 'prophet' in crossvaldation_df.columns:
        # Temporarily rename 'prophet' in crossvaldation_df to avoid automatic suffixing
        crossvaldation_df.rename(columns={'prophet': 'prophet_temp'}, inplace=True)

        # Merge df1 and df_new
        crossvaldation_df = pd.merge(crossvaldation_df, df_new, on=['ds','unique_id'], how='left')

        # Update 'prophet_temp' with 'prophet' from df_new where available
        crossvaldation_df['prophet'] = crossvaldation_df['prophet'].combine_first(crossvaldation_df['prophet_temp'])

        # Drop the temporary and '_new' columns
        crossvaldation_df.drop(columns=['prophet_temp'], inplace=True)
    else:
        # If 'prophet' does not exist yet, simply merge
        crossvaldation_df = pd.merge(crossvaldation_df, df_new, on=['ds','unique_id'], how='left')
    


crossvaldation_df
print(crossvaldation_df)

17:45:29 - cmdstanpy - INFO - Chain [1] start processing
17:45:29 - cmdstanpy - INFO - Chain [1] done processing


  0%|          | 0/2 [00:00<?, ?it/s]

17:45:29 - cmdstanpy - INFO - Chain [1] start processing
17:45:29 - cmdstanpy - INFO - Chain [1] done processing
17:45:29 - cmdstanpy - INFO - Chain [1] start processing
17:45:30 - cmdstanpy - INFO - Chain [1] done processing
17:45:30 - cmdstanpy - INFO - Chain [1] start processing
17:45:30 - cmdstanpy - INFO - Chain [1] done processing


                    ds      yhat  yhat_lower  yhat_upper         y  \
0  2013-09-27 00:00:00  2.839216    1.334053    4.425003  2.056944   
1  2013-09-27 01:00:00  2.545657    1.002909    4.025338  2.088889   
2  2013-09-27 02:00:00  1.979439    0.474819    3.522417  2.101389   
3  2013-09-27 03:00:00  1.511791   -0.022859    2.935773  2.079167   
4  2013-09-27 04:00:00  1.388862   -0.029814    2.907749  2.211111   
..                 ...       ...         ...         ...       ...   
91 2013-09-29 19:00:00  1.656134    0.361938    3.049401  2.106944   
92 2013-09-29 20:00:00  1.533533    0.173786    2.955790  2.111111   
93 2013-09-29 21:00:00  1.636925    0.329813    2.905918  2.093056   
94 2013-09-29 22:00:00  1.982653    0.698444    3.346104  2.100000   
95 2013-09-29 23:00:00  2.358840    1.096477    3.789346  2.033333   

                cutoff unique_id  
0  2013-09-26 23:00:00   253_CPU  
1  2013-09-26 23:00:00   253_CPU  
2  2013-09-26 23:00:00   253_CPU  
3  2013-09-26 23:00

  0%|          | 0/2 [00:00<?, ?it/s]

17:45:30 - cmdstanpy - INFO - Chain [1] start processing
17:45:30 - cmdstanpy - INFO - Chain [1] done processing
17:45:30 - cmdstanpy - INFO - Chain [1] start processing
17:45:30 - cmdstanpy - INFO - Chain [1] done processing


                    ds       yhat  yhat_lower  yhat_upper          y  \
0  2013-09-27 00:00:00  55.038536   54.773794   55.306682  55.102778   
1  2013-09-27 01:00:00  55.012242   54.776609   55.259902  55.175000   
2  2013-09-27 02:00:00  55.030213   54.776138   55.285974  55.122222   
3  2013-09-27 03:00:00  55.071243   54.812333   55.307609  55.138889   
4  2013-09-27 04:00:00  55.093601   54.842215   55.355279  55.105556   
..                 ...        ...         ...         ...        ...   
91 2013-09-29 19:00:00  55.412449   55.185059   55.659976  55.133333   
92 2013-09-29 20:00:00  55.446957   55.228837   55.685912  55.102778   
93 2013-09-29 21:00:00  55.467400   55.237289   55.698318  55.211111   
94 2013-09-29 22:00:00  55.458554   55.224986   55.698811  55.147222   
95 2013-09-29 23:00:00  55.421654   55.174655   55.665012  55.100000   

                cutoff unique_id  
0  2013-09-26 23:00:00   306_CPU  
1  2013-09-26 23:00:00   306_CPU  
2  2013-09-26 23:00:00   306_C

In [6]:
crossvaldation_df

Unnamed: 0,unique_id,ds,cutoff,y,AutoARIMA,AutoTheta,AutoETS,CES,SeasonalNaive,WindowAverage,SeasWA,Naive,prophet
0,253_CPU,2013-09-29 00:00:00,2013-09-28 23:00:00,2.093055,2.068147,2.096954,2.095835,3.21097,2.0625,2.095023,2.0625,2.095833,2.591269
1,253_CPU,2013-09-29 01:00:00,2013-09-28 23:00:00,2.106945,2.040827,2.096206,2.095835,1.859447,2.075,2.095023,2.075,2.095833,2.353537
2,253_CPU,2013-09-29 02:00:00,2013-09-28 23:00:00,2.051389,2.013868,2.095459,2.095835,1.882877,2.130556,2.095023,2.130556,2.095833,1.907406
3,253_CPU,2013-09-29 03:00:00,2013-09-28 23:00:00,2.094445,1.987264,2.094711,2.095835,1.830173,2.073611,2.095023,2.073611,2.095833,1.545569
4,253_CPU,2013-09-29 04:00:00,2013-09-28 23:00:00,2.0625,1.961012,2.093963,2.095835,1.853774,2.05,2.095023,2.05,2.095833,1.457019
5,253_CPU,2013-09-29 05:00:00,2013-09-28 23:00:00,2.130556,1.935107,2.093215,2.095835,1.865206,2.145833,2.095023,2.145833,2.095833,1.591917
6,253_CPU,2013-09-29 06:00:00,2013-09-28 23:00:00,2.133333,1.909544,2.092467,2.095835,1.819296,2.080555,2.095023,2.080555,2.095833,1.743981
7,253_CPU,2013-09-29 07:00:00,2013-09-28 23:00:00,2.070833,1.884319,2.091719,2.095835,1.824863,2.151389,2.095023,2.151389,2.095833,1.756415
8,253_CPU,2013-09-29 08:00:00,2013-09-28 23:00:00,2.098611,1.859427,2.090971,2.095835,1.81757,2.080555,2.095023,2.080555,2.095833,1.651272
9,253_CPU,2013-09-29 09:00:00,2013-09-28 23:00:00,2.086111,1.834864,2.090223,2.095835,1.830566,2.090278,2.095023,2.090278,2.095833,1.571523


In [7]:
evaluation_df = evaluate_cross_validation(crossvaldation_df, mse)
evaluation_df.head()

Unnamed: 0_level_0,AutoARIMA,AutoTheta,AutoETS,CES,SeasonalNaive,WindowAverage,SeasWA,Naive,prophet,best_model
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
253_CPU,0.127438,0.001599,0.001513,0.14359,0.002841,0.001519,0.002841,0.001513,0.180293,AutoETS
306_CPU,0.019419,0.017164,0.017438,0.023938,0.015574,0.017096,0.015574,0.024209,0.04824,SeasonalNaive
