In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np


# Expanding-window cross-validation: TimeSeriesSplit yields five
# (train, test) index pairs over the series, in the order it is stored.
tscv = TimeSeriesSplit(n_splits=5)

# One list of per-fold scores for each error metric.
cv_results = {metric: [] for metric in ('rmse', 'mae')}

# Series that is scored below.
time_series = data['utilizationRate_total']

for train_index, test_index in tscv.split(time_series):
    # The splitter returns integer positions, so select rows with .iloc.
    train = time_series.iloc[train_index]
    test = time_series.iloc[test_index]

    # Fit a fresh ARIMA(2, 1, 2) on this fold's training window only.
    model_cv = ARIMA(train, order=(2, 1, 2))
    fitted_model_cv = model_cv.fit()

    # Forecast exactly as many steps as the fold's test window contains.
    forecast_cv = fitted_model_cv.forecast(steps=len(test))

    # Score the forecast against the test window.
    rmse = np.sqrt(mean_squared_error(test, forecast_cv))
    mae = mean_absolute_error(test, forecast_cv)

    # Record this fold's scores.
    for metric, score in (('rmse', rmse), ('mae', mae)):
        cv_results[metric].append(score)

# Mean of each metric over all folds.
avg_rmse, avg_mae = (np.mean(cv_results[metric]) for metric in ('rmse', 'mae'))

# Train the final model on everything except a trailing holdout window.
# Size of the holdout window, in rows. The original note describes it as the
# last 720 hours, which presumably means one observation per hour -- confirm.
HOLDOUT_HOURS = 720

# With HOLDOUT_HOURS or fewer rows the training slice below would be empty,
# so stop here with an explicit message instead of fitting on no data.
if len(time_series) <= HOLDOUT_HOURS:
    raise ValueError(
        f"Need more than {HOLDOUT_HOURS} observations to hold out the last "
        f"{HOLDOUT_HOURS}; got {len(time_series)}."
    )

# Slice by position with .iloc (as the cross-validation loop does) so the
# split does not depend on how plain [] slicing treats the index type.
final_train_data = time_series.iloc[:-HOLDOUT_HOURS]  # everything before the holdout
final_test_data = time_series.iloc[-HOLDOUT_HOURS:]  # holdout set
final_model = ARIMA(final_train_data, order=(2, 1, 2))  # same order as the CV folds
final_fitted_model = final_model.fit()

# Predict one step per holdout row.
final_forecast = final_fitted_model.forecast(steps=len(final_test_data))

# Calculate performance metrics on the holdout set.
final_rmse = np.sqrt(mean_squared_error(final_test_data, final_forecast))
final_mae = mean_absolute_error(final_test_data, final_forecast)

# Combine actual and predicted values for visualization.
# The forecast is passed as a plain array so it is placed row-by-row against
# the holdout index. Passing it as a Series would make pandas align on labels;
# if the forecast carries a different index than the holdout data (statsmodels
# falls back to an integer index when the training index has no usable
# frequency), every 'Predicted' value would silently become NaN.
results_df = pd.DataFrame({
    'Actual': final_test_data,
    'Predicted': np.asarray(final_forecast),
}, index=final_test_data.index)

# Plot actual vs. predicted values on a single figure.
plt.figure(figsize=(12, 6))
results_df['Actual'].plot(label='Actual (Holdout Set)', color='blue')
results_df['Predicted'].plot(label='Predicted (ARIMA)', color='red')
plt.title("ARIMA Model - Actual vs. Predicted (Holdout Set)")
plt.xlabel("Time")
plt.ylabel("Utilization Rate (%)")
plt.legend()
plt.show()

# Summary table: one row per metric, cross-validation averages first and
# holdout scores after them.
metric_values = {
    'Average RMSE (CV)': avg_rmse,
    'Average MAE (CV)': avg_mae,
    'Holdout RMSE': final_rmse,
    'Holdout MAE': final_mae,
}
metrics_df = pd.DataFrame({
    'Metric': list(metric_values),
    'Value': list(metric_values.values()),
})

# Show the metrics table to the user.
tools.display_dataframe_to_user(
    name="Cross-Validation and Holdout Set Performance Metrics",
    dataframe=metrics_df,
)

# Show the row-by-row actual vs. predicted values for the holdout window.
tools.display_dataframe_to_user(
    name="Actual vs Predicted Utilization (Holdout Set)",
    dataframe=results_df,
)