In [7]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [None]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
import plotly.express as px
import plotly.graph_objects as go

# 1. Prepare Data with Proper ID Column
def prepare_data(filepath):
    """Load and prepare data with required __id__ column"""
    df = pd.read_csv(filepath, parse_dates=['date'])
    
    # Add required ID column (same ID for single time series)
    df['__id__'] = "main_series"  # Single time series ID
    
    # Add temporal features
    for col in ['Crude Oil Brent Price', 'Cocoa Price', 'Gold Price']:
        df[f'{col}_3mo_ma'] = df[col].rolling(3, min_periods=1).mean()
        df[f'{col}_12mo_ma'] = df[col].rolling(12, min_periods=6).mean()
    
    df.set_index('date', inplace=True)
    return df

# 2. Convert to TimeSeriesDataFrame
def create_ts_dataframe(df, target_col):
    """Create properly formatted TimeSeriesDataFrame"""
    # Keep only relevant columns
    keep_cols = ['__id__', target_col] + [col for col in df.columns if col not in ['Crude Oil Brent Price', 'Cocoa Price', 'Gold Price', '__id__']]
    
    return TimeSeriesDataFrame.from_data_frame(
        df[keep_cols].reset_index(),
        id_column="__id__",
        timestamp_column="date"
    )

# 3. AutoML Forecasting Function
def run_automl_forecasting(ts_data, target_col, prediction_length=12):
    """Run AutoML forecasting for a single target"""
    predictor = TimeSeriesPredictor(
        target=target_col,
        prediction_length=prediction_length,
        eval_metric="MAPE",
        path=f"autogluon_models/{target_col.replace(' ', '_')}",
    )
    
    predictor.fit(
        train_data=ts_data,
        time_limit=3600,  # 1 hour training
        presets="medium_quality",
        hyperparameters={
            "ETS": {},
            "ARIMA": {"maxiter": 50},
            "DeepAR": {"num_layers": 2},
            "SimpleFeedForward": {"epochs": 50},
        },
        num_val_windows=3,  # 3-fold time-based validation
        verbosity=2
    )
    
    return predictor

def calculate_metrics(y_true, y_pred):
    """Calculate evaluation metrics for time series forecasts"""
    metrics = {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred) * 100,  # as percentage
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred))
    }
    return metrics

def evaluate_forecasts(predictor, test_data, target_col, prediction_length=12):
    """Evaluate model on test period"""
    # Get the actual test values (last prediction_length points)
    y_true = test_data.tail(prediction_length)[target_col]
    
    # Generate predictions for test period
    forecast = predictor.predict(test_data)
    forecast = forecast.reset_index()
    
    # Align predictions with actuals
    y_pred = forecast['mean']  # Using the mean forecast
    
    # Calculate metrics
    return calculate_metrics(y_true, y_pred)

def plot_actual_vs_predicted(train_df, test_df, forecast_df, target_col):
    """Interactive plot of actual vs predicted values"""
    fig = go.Figure()
    
    # Training data
    fig.add_trace(go.Scatter(
        x=train_df.index,
        y=train_df[target_col],
        name='Training Data',
        line=dict(color='blue')
    ))
    
    # Test actuals
    fig.add_trace(go.Scatter(
        x=test_df.index,
        y=test_df[target_col],
        name='Actual Values',
        line=dict(color='green')
    ))
    
    # Forecasts
    fig.add_trace(go.Scatter(
        x=forecast_df.index,
        y=forecast_df['mean'],
        name='Forecast',
        line=dict(color='red')
    ))
    
    # Confidence interval
    fig.add_trace(go.Scatter(
        x=forecast_df.index.tolist() + forecast_df.index[::-1].tolist(),
        y=forecast_df['0.9'].tolist() + forecast_df['0.1'][::-1].tolist(),
        fill='toself',
        fillcolor='rgba(255,0,0,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        name='80% Confidence'
    ))
    
    fig.update_layout(
        title=f'{target_col} Forecast vs Actual',
        xaxis_title='Date',
        yaxis_title='Price',
        hovermode='x unified',
        template='plotly_white'
    )
    fig.show()

def plot_metrics(metrics_dict):
    """Bar plot comparing metrics across commodities"""
    metrics_df = pd.DataFrame(metrics_dict).T
    fig = px.bar(metrics_df, 
                 barmode='group',
                 title='Model Performance Across Commodities',
                 labels={'value': 'Metric Value', 'variable': 'Metric'})
    fig.update_layout(template='plotly_white')
    fig.show()

# 4. Main Execution
if __name__ == "__main__":
    # Configuration
    DATA_PATH = "merged_macro_commodity.csv"
    COMMODITIES = ['Crude Oil Brent Price', 'Cocoa Price', 'Gold Price']
    
    # Prepare base dataframe
    df = prepare_data(DATA_PATH)
    
    train_data = df.iloc[:-12]  # all but last year
    test_data = df.iloc[-12:]   # last year for testing
    
    results = {}
    for target in COMMODITIES:
        # Prepare data
        train_ts = create_ts_dataframe(train_data, target)
        test_ts = create_ts_dataframe(test_data, target)
        
        # Train model
        predictor = run_automl_forecasting(train_ts, target)
        
        # Evaluate on test set
        metrics = evaluate_forecasts(predictor, test_ts, target)
        
        # Store results
        results[target] = {
            'metrics': metrics,
            'model': predictor,
            'forecast': predictor.predict(train_ts)
        }

        forecast_df = results[target]['forecast']

        # Show interactive plot
        plot_actual_vs_predicted(train_data, test_data, forecast_df, target)
        
        print(f"\n{target} Evaluation:")
        print(f"best_model: {results[target]['model'].best_model}")
        print(f"- MAE: {metrics['MAE']:.2f}")
        print(f"- MAPE: {metrics['MAPE']:.2f}%")
        print(f"- RMSE: {metrics['RMSE']:.2f}")
    
    # You can also access the full leaderboard for each commodity:
    for target in COMMODITIES:
        leaderboard = results[target]['model'].leaderboard()
        print(f"\n{target} Model Rankings:")
        print(leaderboard[['model', 'score_val']])
    
    # Compare metrics across commodities
    metrics_dict = {k: v['metrics'] for k, v in results.items()}
    plot_metrics(metrics_dict)

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to '/Users/purplegeminii/Desktop/Undergraduate Thesis/commodity_price_prediction/autogluon_models/Crude_Oil_Brent_Price'
AutoGluon Version:  1.2
Python Version:     3.12.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:06:57 PDT 2024; root:xnu-11215.41.3~3/RELEASE_ARM64_T6041
CPU Count:          14
GPU Count:          0
Memory Avail:       5.10 GB / 24.00 GB (21.2%)
Disk Space Avail:   122.10 GB / 460.43 GB (26.5%)
Setting presets to: medium_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAPE,
 'hyperparameters': {'ARIMA': {'maxiter': 50},
                     'DeepAR': {'num_layers': 2},
                     'ETS': {},
                     'SimpleFeedForward': {'epochs': 50}},
 'known_covariates_names': [],
 'num_val_windows': 3,
 'prediction_length': 12,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to '/Users/purplegeminii/Desktop/Undergraduate Thesis/commodity_price_prediction/autogluon_models/Cocoa_Price'
AutoGluon Version:  1.2
Python Version:     3.12.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.1.0: Thu Oct 10 21:06:57 PDT 2024; root:xnu-11215.41.3~3/RELEASE_ARM64_T6041
CPU Count:          14
GPU Count:          0
Memory Avail:       4.29 GB / 24.00 GB (17.9%)
Disk Space Avail:   122.10 GB / 460.43 GB (26.5%)
Setting presets to: medium_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAPE,
 'hyperparameters': {'ARIMA': {'maxiter': 50},
                     'DeepAR': {'num_layers': 2},
                     'ETS': {},
                     'SimpleFeedForward': {'epochs': 50}},
 'known_covariates_names': [],
 'num_val_windows': 3,
 'prediction_length': 12,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],



Crude Oil Brent Price Evaluation:
- MAE: 6.67
- MAPE: 7.92%
- RMSE: 8.30


KeyboardInterrupt: 

In [None]:
df.head()

Unnamed: 0_level_0,"Revenue, excluding grants (% of GDP)",GDP (constant 2015 US$),"Employment to population ratio (15+, total %)",Crude Oil Brent Price,Cocoa Price,Gold Price,__id__,Crude Oil Brent Price_3mo_ma,Crude Oil Brent Price_12mo_ma,Cocoa Price_3mo_ma,Cocoa Price_12mo_ma,Gold Price_3mo_ma,Gold Price_12mo_ma
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1991-01-01,14.597519,13862820000.0,74.834,23.65,1.2417,383.64,main_series,23.65,,1.2417,,383.64,
1991-02-01,14.372773,13913550000.0,74.690804,19.4,1.2147,363.83,main_series,21.525,,1.2282,,373.735,
1991-03-01,14.148027,13961240000.0,74.556498,19.45,1.2103,363.34,main_series,20.833333,,1.222233,,370.27,
1991-04-01,13.923281,14006480000.0,74.430605,19.25,1.1458,358.38,main_series,19.366667,,1.190267,,361.85,
1991-05-01,13.698535,14049850000.0,74.312644,19.3,1.0622,356.95,main_series,19.333333,,1.139433,,359.556667,


In [None]:
# testing plots
"""
    date does not appear on x axis if index not set
    forecast index is not right
"""
target = "Crude Oil Brent Price"

forecast_df = results[target]['forecast'].set_index('date')

# Show interactive plot
plot_actual_vs_predicted(train_data.set_index('date'), test_data.set_index('date'), forecast_df, target)

KeyError: "None of ['date'] are in the columns"