In [17]:
!pip install xgboost statsmodels tensorflow scikit-learn --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [3]:
# 1. Load full dataset

data = pd.concat([
    pd.read_csv("train.csv", parse_dates=['date']),
    pd.read_csv("val.csv", parse_dates=['date']),
    pd.read_csv("test.csv", parse_dates=['date'])
])
data = data.sort_values("date")

print("Regions:", data['Region'].unique())
print("Data shape:", data.shape)

Regions: ['East US' 'West US' 'North Europe' 'Southeast Asia']
Data shape: (1080, 38)


In [None]:
# 2. Utility: Metrics

def get_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, rmse, mape


In [19]:
# 3. Backtesting - ARIMA
# -------------------------------
def backtest_arima(data, region, train_window, forecast_horizon, order=(5,1,0)):
    region_data = data[data['Region']==region].groupby('date')['usage_cpu'].mean()
    dates = region_data.index
    metrics = []

    if len(dates) <= train_window + forecast_horizon:
        print(f"⚠️ Not enough data for ARIMA in {region}")
        return []

    for i in range(train_window, len(dates)-forecast_horizon, forecast_horizon):
        train_series = region_data.iloc[:i]
        test_series = region_data.iloc[i:i+forecast_horizon]

        try:
            model = ARIMA(train_series, order=order)
            model_fit = model.fit()
            forecast = model_fit.forecast(steps=forecast_horizon)

            mae, rmse, mape = get_metrics(test_series, forecast)
            metrics.append([region, "ARIMA", test_series.index[0], test_series.index[-1], mae, rmse, mape])
        except Exception as e:
            print(f"⚠️ ARIMA failed in {region}: {e}")
            continue
    return metrics

In [20]:
# 4. Backtesting - XGBoost
# -------------------------------
def backtest_xgb(data, region, features, train_window, forecast_horizon, target='usage_cpu'):
    region_data = data[data['Region']==region].copy().sort_values("date")
    dates = region_data['date'].unique()
    metrics = []

    if len(dates) <= train_window + forecast_horizon:
        print(f"⚠️ Not enough data for XGBoost in {region}")
        return []

    for i in range(train_window, len(dates)-forecast_horizon, forecast_horizon):
        train_dates = dates[:i]
        test_dates = dates[i:i+forecast_horizon]

        train_df = region_data[region_data['date'].isin(train_dates)]
        test_df = region_data[region_data['date'].isin(test_dates)]

        X_train, y_train = train_df[features], train_df[target]
        X_test, y_test = test_df[features], test_df[target]

        model = xgb.XGBRegressor(
            n_estimators=200, learning_rate=0.1, max_depth=6,
            random_state=42, n_jobs=-1
        )
        model.fit(X_train, y_train, verbose=False)
        y_pred = model.predict(X_test)

        mae, rmse, mape = get_metrics(y_test, y_pred)
        metrics.append([region, "XGBoost", test_dates[0], test_dates[-1], mae, rmse, mape])

    return metrics

In [21]:
# 5. Backtesting - LSTM
# -------------------------------
def create_sequences(data, seq_length=7):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

def backtest_lstm(data, region, train_window, forecast_horizon, seq_length=7):
    region_series = data[data['Region']==region].groupby('date')['usage_cpu'].mean().values
    metrics = []

    if len(region_series) <= train_window + forecast_horizon:
        print(f"⚠️ Not enough data for LSTM in {region}")
        return []

    scaler = MinMaxScaler()
    region_scaled = scaler.fit_transform(region_series.reshape(-1,1))

    for i in range(train_window, len(region_scaled)-forecast_horizon, forecast_horizon):
        train_seq = region_scaled[:i]
        test_seq = region_scaled[i-seq_length:i+forecast_horizon]

        X_train, y_train = create_sequences(train_seq, seq_length)
        X_test, y_test = create_sequences(test_seq, seq_length)

        if len(X_test) == 0:
            continue

        # Define LSTM
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(seq_length,1)),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse')
        model.fit(X_train, y_train, epochs=5, batch_size=16, verbose=0)

        y_pred = model.predict(X_test, verbose=0)

        y_test_inv = scaler.inverse_transform(y_test.reshape(-1,1)).flatten()
        y_pred_inv = scaler.inverse_transform(y_pred).flatten()[:forecast_horizon]

        mae, rmse, mape = get_metrics(y_test_inv[:forecast_horizon], y_pred_inv)
        metrics.append([region, "LSTM", None, None, mae, rmse, mape])
    return metrics


In [22]:
# 6. Run Adaptive Backtesting
# -------------------------------
all_metrics = []

# Features for XGBoost
exclude_cols = ['date','Region','ResourceType','usage_cpu',
                'ResourceType_Storage','ResourceType_VM']
features = [col for col in data.columns if col not in exclude_cols]

for region in data['Region'].unique():
    region_len = len(data[data['Region']==region]['date'].unique())

    # Adaptive windows
    train_window = max(30, int(region_len * 0.6))
    forecast_horizon = max(7, int(region_len * 0.1))

    print(f"\n⏳ Backtesting {region}: train_window={train_window}, forecast_horizon={forecast_horizon}")

    all_metrics.extend(backtest_arima(data, region, train_window, forecast_horizon))
    all_metrics.extend(backtest_xgb(data, region, features, train_window, forecast_horizon))
    all_metrics.extend(backtest_lstm(data, region, train_window, forecast_horizon))



⏳ Backtesting East US: train_window=54, forecast_horizon=9

⏳ Backtesting West US: train_window=54, forecast_horizon=9

⏳ Backtesting North Europe: train_window=54, forecast_horizon=9

⏳ Backtesting Southeast Asia: train_window=54, forecast_horizon=9


In [23]:
# 7. Save & Show Results
# -------------------------------
df_backtest = pd.DataFrame(all_metrics, columns=["Region","Model","Start","End","MAE","RMSE","MAPE"])
df_backtest.to_csv("backtest_metrics.csv", index=False)

print("\n✅ Backtesting Completed!")
print(df_backtest.groupby(["Region","Model"]).mean(numeric_only=True))


✅ Backtesting Completed!
                              MAE       RMSE       MAPE
Region         Model                                   
East US        ARIMA     9.042368  10.427872  12.777746
               LSTM      9.910331  12.231820  12.997841
               XGBoost   0.064375   0.203382   0.104866
North Europe   ARIMA     5.934640   7.294230   8.110310
               LSTM     10.911699  12.449236  14.113892
               XGBoost   0.011653   0.033567   0.013158
Southeast Asia ARIMA     7.599975   9.261109  10.154881
               LSTM     13.612378  15.782995  17.394923
               XGBoost   0.002165   0.005533   0.002796
West US        ARIMA     9.479601  10.721238  12.564169
               LSTM     11.480525  13.527392  14.487433
               XGBoost   0.060546   0.211460   0.074240
