In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from itertools import product
import pickle
import os
import warnings
from google.colab import drive


# === Constants ===
# Input: cleaned CSV that the training script reads.
DATA_PATH = 'C:/Swinburne/2025Sem1/COS40007-Artificial Intelligence for Engineering/Group Assignment/5G Zone Prediction System/ProcessedData/clean_data_clst.csv'
# Output: pickle file that receives the fitted ARIMA bundle.
MODEL_SAVE_PATH = 'C:/Swinburne/2025Sem1/COS40007-Artificial Intelligence for Engineering/Group Assignment/5G Zone Prediction System/TrainedModel/TimeSeries/arima_model.pkl'
# Number of trailing rows reserved for the validation and test splits.
VAL_SIZE = 24
TEST_SIZE = 24
# Every (p, d, q) combination with each term in 0..2, q varying fastest.
ARIMA_PARAM_GRID = [(p, d, q) for p in range(3) for d in range(3) for q in range(3)]

# === Feature Engineering Function ===
def create_features(df, target_col='total_throughput'):
    """Return a copy of *df* extended with calendar, rolling and interaction features.

    The input frame is not modified. Its index must expose ``hour``,
    ``dayofweek`` and ``minute`` (e.g. a ``DatetimeIndex``).

    Added columns, in order:
      * ``hour``, ``day_of_week``, ``minute`` taken from the index;
      * sine/cosine encodings of those three with periods 24, 7 and 60;
      * for each window of 3, 6, 12 and 24 rows: rolling mean, std, min,
        max and skew of ``target_col`` plus its percentage change over the
        same number of rows;
      * ``hour_interaction`` = ``hour * target_col``.

    Infinite values are turned into NaN, then gaps are forward-filled and
    finally back-filled.
    """
    out = df.copy()
    stamps = out.index

    out['hour'] = stamps.hour
    out['day_of_week'] = stamps.dayofweek
    out['minute'] = stamps.minute

    # Cyclical encodings: (output prefix, source column, period length).
    cycles = (
        ('hour', 'hour', 24),
        ('day', 'day_of_week', 7),
        ('minute', 'minute', 60),
    )
    for prefix, source, period in cycles:
        angle = 2 * np.pi * out[source] / period
        out[f'{prefix}_sin'] = np.sin(angle)
        out[f'{prefix}_cos'] = np.cos(angle)

    # Rolling statistics of the target; min_periods=1 lets partial windows
    # at the start of the series contribute.
    target = out[target_col]
    for span in (3, 6, 12, 24):
        windowed = target.rolling(window=span, min_periods=1)
        out[f'{target_col}_mean_{span}'] = windowed.mean()
        out[f'{target_col}_std_{span}'] = windowed.std()
        out[f'{target_col}_min_{span}'] = windowed.min()
        out[f'{target_col}_max_{span}'] = windowed.max()
        out[f'{target_col}_skew_{span}'] = windowed.skew()
        out[f'{target_col}_roc_{span}'] = target.pct_change(periods=span)

    # Interaction between hour of day and the target value.
    out['hour_interaction'] = out['hour'] * target

    # Cleanup: +/-inf (e.g. from dividing by zero in pct_change) becomes NaN,
    # then holes are filled forward and whatever remains is filled backward.
    out = out.replace([np.inf, -np.inf], np.nan)
    out = out.ffill()
    out = out.bfill()

    return out

# === Load and preprocess raw data ===
# The CSV must provide 'DATES', 'TIME' and 'total_throughput' columns; the
# first two are joined into a single timestamp that becomes the index.
df = pd.read_csv(DATA_PATH)
df['Convert_time'] = pd.to_datetime(df['DATES'] + ' ' + df['TIME'])
df.set_index('Convert_time', inplace=True)

# === Resample and feature engineering ===
# Hourly mean throughput; empty hours are back-filled first, then forward-filled.
hourly_data = df.resample('h').agg({'total_throughput': 'mean'}).bfill().ffill()
data = create_features(hourly_data)

# === Train/Val/Test split ===
# At least one training row must remain after holding out the last
# VAL_SIZE + TEST_SIZE rows.
if len(data) < TEST_SIZE + VAL_SIZE + 1:
    raise ValueError("Not enough data for ARIMA training")

train_data = data[:-TEST_SIZE - VAL_SIZE]
val_data = data[-TEST_SIZE - VAL_SIZE:-TEST_SIZE]
test_data = data[-TEST_SIZE:]

# === Select exogenous features ===
# Only include time-based features that can be recreated during future forecasting
time_exog_cols = ['hour', 'day_of_week', 'minute',
                  'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'minute_sin', 'minute_cos']

# Filter to what's available in data (safe subset)
candidate_exog_cols = [col for col in time_exog_cols if col in data.columns]

# Drop regressors that never vary over the training window. After hourly
# resampling every timestamp sits on minute 0, so the minute-based columns are
# constant. ARIMA adds its own constant term when d == 0 and refuses an exog
# matrix that already contains a constant column, which made every d == 0
# order fail and silently drop out of the search.
exog_cols = [col for col in candidate_exog_cols if train_data[col].nunique() > 1]
dropped_exog_cols = [col for col in candidate_exog_cols if col not in exog_cols]
if dropped_exog_cols:
    print(f"Dropping constant exogenous features: {dropped_exog_cols}")

# ARIMA accepts exog=None; use that when no informative regressor remains.
train_exog = train_data[exog_cols] if exog_cols else None

# === Grid Search for ARIMA(p,d,q) ===
# NOTE(review): the winner is picked by AIC across orders with different d.
# AIC values from differently differenced models are not strictly comparable,
# and val_data is currently unused -- consider selecting on validation error.
print("\nSearching best ARIMA(p,d,q) order...")
best_aic = float('inf')
best_order = None
fitted_model = None

for order in ARIMA_PARAM_GRID:
    try:
        model = ARIMA(train_data['total_throughput'], order=order, exog=train_exog)
        fitted = model.fit()
        aic = fitted.aic
        print(f"ARIMA{order} AIC: {aic:.2f}")
        if aic < best_aic:
            best_aic = aic
            best_order = order
            fitted_model = fitted
    except Exception as e:
        # Best-effort search: report the failing order and keep going.
        print(f"Failed ARIMA{order}: {e}")
        continue

if fitted_model is None:
    raise RuntimeError("All ARIMA configurations failed.")

# === Save model ===
# 'features' holds exactly the exogenous columns the model was fitted with, so
# a consumer can rebuild a matching exog matrix when forecasting.
last_train_timestamp = train_data.index.max()
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
with open(MODEL_SAVE_PATH, 'wb') as f:
    pickle.dump({
        'fitted_model': fitted_model,
        'target_column': 'total_throughput',
        'last_train_timestamp': last_train_timestamp,
        'best_order': best_order,
        'features': exog_cols
    }, f)

print(f"\nARIMA{best_order} model saved to: {MODEL_SAVE_PATH}")
print(f"Model trained up to: {last_train_timestamp}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Searching best ARIMA(p,d,q) order...
Failed ARIMA(0, 0, 0): A constant trend was included in the model specification, but the `exog` data already contains a column of constants.
Failed ARIMA(0, 0, 1): A constant trend was included in the model specification, but the `exog` data already contains a column of constants.
Failed ARIMA(0, 0, 2): A constant trend was included in the model specification, but the `exog` data already contains a column of constants.
ARIMA(0, 1, 0) AIC: 3140.83
ARIMA(0, 1, 1) AIC: 3119.43
ARIMA(0, 1, 2) AIC: 3117.99
ARIMA(0, 2, 0) AIC: 3481.03
ARIMA(0, 2, 1) AIC: 3141.84
ARIMA(0, 2, 2) AIC: 3121.13
Failed ARIMA(1, 0, 0): A constant trend was included in the model specification, but the `exog` data already contains a column of constants.
Failed ARIMA(1, 0, 1): A constant trend was included in the model specification, but the `exog` data 