# XGBoost Forecasting Model for Azure Cost Management

This notebook implements XGBoost (Extreme Gradient Boosting) for Azure cost prediction. XGBoost is a powerful machine learning algorithm that can capture complex non-linear relationships and handle various data types effectively.

## XGBoost Model Features
- **Gradient Boosting**: Uses an ensemble of decision trees trained with gradient descent optimization
- **Feature Engineering**: Can incorporate multiple feature types, including time-based, categorical, and numerical features
- **Non-linear Relationships**: Captures complex patterns that traditional time series methods might miss
- **Robust Performance**: Handles missing values and outliers well
- **Feature Importance**: Provides insights into which features drive cost predictions

## Objectives
1. Load and prepare time series data for XGBoost
2. Create comprehensive feature engineering pipeline
3. Train XGBoost models for different cost categories
4. Implement time series cross-validation
5. Generate forecasts and evaluate model performance
6. Analyze feature importance and model interpretability


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# XGBoost and ML specific imports
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import shap

# Load forecasting data
import pickle
from pathlib import Path

# Single place to repoint the notebook at another checkout or machine.
DATA_DIR = Path('/Users/sabbineni/projects/acm/data')

# NOTE(security): pickle.load executes whatever code the file embeds, so only
# load forecasting_data.pkl when it was produced by a trusted pipeline step.
with open(DATA_DIR / 'forecasting_data.pkl', 'rb') as f:
    forecasting_data = pickle.load(f)

# Load the main dataset for feature engineering
df = pd.read_csv(DATA_DIR / 'sample_azure_costs.csv')
df['UsageDateTime'] = pd.to_datetime(df['UsageDateTime'])

print("Libraries imported successfully!")
print(f"XGBoost version: {xgb.__version__}")
print(f"Available time series: {list(forecasting_data.keys())}")
print(f"Main dataset shape: {df.shape}")


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/sabbineni/projects/acm/venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <89AD948E-E564-3266-867D-7AF89D6488F0> /Users/sabbineni/projects/acm/venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


In [None]:
# Feature Engineering for XGBoost
def create_features(df, target_col='PreTaxCost'):
    """
    Create comprehensive features for XGBoost model.

    The input frame is not modified. Rows are returned in chronological
    (``UsageDateTime``) order with a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``UsageDateTime`` (datetime64), ``MeterCategory``,
        ``UsageQuantity``, ``ResourceRate`` and ``target_col``.
    target_col : str, default 'PreTaxCost'
        Column holding the value to be predicted.

    Returns
    -------
    pandas.DataFrame
        The original columns plus calendar, cyclical, lag, rolling, EMA,
        encoded-categorical, interaction and per-category statistic columns.

    Raises
    ------
    KeyError
        If a required column is missing.

    Notes
    -----
    * Lag, rolling and EMA features are computed per ``MeterCategory`` and use
      only rows *before* the current one, so they never contain the value
      being predicted.
    * ``Cost_per_Unit``, ``Cost_vs_Category_mean`` and ``Cost_vs_Category_std``
      are computed from the current row's target. They are kept for analysis
      but must not be fed to a model as predictors.
    """
    print("Creating features for XGBoost model...")

    required = ['UsageDateTime', 'MeterCategory', 'UsageQuantity', 'ResourceRate', target_col]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"create_features: missing required columns {missing}")

    # sort_values returns a new frame, so the caller's data stays untouched.
    # The stable sort keeps same-timestamp rows in input order, and the fresh
    # index guarantees the per-group results below align row by row.
    df_features = df.sort_values('UsageDateTime', kind='stable').reset_index(drop=True)
    dates = df_features['UsageDateTime']

    # Time-based features
    df_features['Year'] = dates.dt.year
    df_features['Month'] = dates.dt.month
    df_features['Day'] = dates.dt.day
    df_features['DayOfWeek'] = dates.dt.dayofweek
    df_features['DayOfYear'] = dates.dt.dayofyear
    # isocalendar() yields the nullable UInt32 extension dtype; convert it to a
    # plain NumPy dtype (float only if missing dates force NaN) so the column
    # behaves like the other calendar features.
    iso_week = dates.dt.isocalendar().week
    df_features['WeekOfYear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df_features['Quarter'] = dates.dt.quarter
    df_features['IsWeekend'] = (df_features['DayOfWeek'] >= 5).astype(int)
    df_features['IsMonthStart'] = dates.dt.is_month_start.astype(int)
    df_features['IsMonthEnd'] = dates.dt.is_month_end.astype(int)
    df_features['IsQuarterStart'] = dates.dt.is_quarter_start.astype(int)
    df_features['IsQuarterEnd'] = dates.dt.is_quarter_end.astype(int)

    # Cyclical encoding for time features
    df_features['Month_sin'] = np.sin(2 * np.pi * df_features['Month'] / 12)
    df_features['Month_cos'] = np.cos(2 * np.pi * df_features['Month'] / 12)
    df_features['DayOfWeek_sin'] = np.sin(2 * np.pi * df_features['DayOfWeek'] / 7)
    df_features['DayOfWeek_cos'] = np.cos(2 * np.pi * df_features['DayOfWeek'] / 7)
    df_features['DayOfYear_sin'] = np.sin(2 * np.pi * df_features['DayOfYear'] / 365)
    df_features['DayOfYear_cos'] = np.cos(2 * np.pi * df_features['DayOfYear'] / 365)

    category = df_features['MeterCategory']
    cost = df_features[target_col]

    # Lag features (past values within the same meter category)
    for lag in [1, 2, 3, 7, 14, 30]:
        df_features[f'{target_col}_lag_{lag}'] = cost.groupby(category).shift(lag)

    # Rolling and exponential statistics are taken over the series shifted by
    # one row, so each window ends at the previous observation and never
    # includes the target of the row it describes.
    past_cost = cost.groupby(category).shift(1)

    # Rolling window features
    for window in [3, 7, 14, 30]:
        rolling = past_cost.groupby(category).rolling(window=window)
        df_features[f'{target_col}_rolling_mean_{window}'] = rolling.mean().reset_index(level=0, drop=True)
        df_features[f'{target_col}_rolling_std_{window}'] = rolling.std().reset_index(level=0, drop=True)
        df_features[f'{target_col}_rolling_max_{window}'] = rolling.max().reset_index(level=0, drop=True)
        df_features[f'{target_col}_rolling_min_{window}'] = rolling.min().reset_index(level=0, drop=True)

    # Exponential moving averages
    for span in [3, 7, 14]:
        ema = past_cost.groupby(category).ewm(span=span).mean()
        df_features[f'{target_col}_ema_{span}'] = ema.reset_index(level=0, drop=True)

    # Categorical features encoding: integer codes in sorted order of the
    # string values (the same codes a freshly fitted label encoder assigns).
    categorical_features = ['MeterCategory', 'MeterSubCategory', 'ResourceLocation', 'ServiceTier', 'Currency']

    for feature in categorical_features:
        if feature in df_features.columns:
            codes, _ = pd.factorize(df_features[feature].astype(str), sort=True)
            df_features[f'{feature}_encoded'] = codes

    # Interaction features (the epsilon guards against division by zero)
    df_features['UsageQuantity_x_ResourceRate'] = df_features['UsageQuantity'] * df_features['ResourceRate']
    df_features['Cost_per_Unit'] = df_features[target_col] / (df_features['UsageQuantity'] + 1e-8)

    # Statistical features by category
    category_stats = df_features.groupby('MeterCategory')[target_col].agg(['mean', 'std', 'min', 'max']).reset_index()
    category_stats.columns = ['MeterCategory', 'Category_mean', 'Category_std', 'Category_min', 'Category_max']
    df_features = df_features.merge(category_stats, on='MeterCategory', how='left')

    # Relative features (target-derived, see Notes in the docstring)
    df_features['Cost_vs_Category_mean'] = df_features[target_col] / (df_features['Category_mean'] + 1e-8)
    df_features['Cost_vs_Category_std'] = (df_features[target_col] - df_features['Category_mean']) / (df_features['Category_std'] + 1e-8)

    print(f"Created {len(df_features.columns)} features")
    return df_features

# Build the engineered feature table from the raw cost records
df_features = create_features(df)
print(f"Feature engineering completed. Dataset shape: {df_features.shape}")


def _columns_containing(*markers):
    """Return the df_features column names that contain any of the given substrings."""
    matched = []
    for name in df_features.columns:
        if any(marker in name for marker in markers):
            matched.append(name)
    return matched


# Bucket the columns by substring match on their names and report the counts
print("\nFeature categories:")
time_features = _columns_containing('Year', 'Month', 'Day', 'Week', 'Quarter', 'sin', 'cos')
lag_features = _columns_containing('lag')
rolling_features = _columns_containing('rolling')
categorical_features = _columns_containing('encoded')

print(f"Time features: {len(time_features)}")
print(f"Lag features: {len(lag_features)}")
print(f"Rolling features: {len(rolling_features)}")
print(f"Categorical features: {len(categorical_features)}")
print(f"Other features: {len(df_features.columns) - len(time_features) - len(lag_features) - len(rolling_features) - len(categorical_features)}")


In [None]:
# Prepare Data for XGBoost Training
def prepare_xgboost_data(df_features, target_col='PreTaxCost', test_size=0.2):
    """
    Prepare data for XGBoost training with proper train/test split for time series.

    Rows are ordered by ``UsageDateTime`` and split chronologically: the first
    ``1 - test_size`` share is the training set, the remainder the test set.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Engineered feature table containing ``UsageDateTime`` and ``target_col``.
    target_col : str, default 'PreTaxCost'
        Name of the column to predict.
    test_size : float, default 0.2
        Share of the most recent rows held out for testing; must lie in [0, 1].

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, valid_features)`` where the X
        frames are float64 with NaN replaced by 0 and ``valid_features`` is the
        ordered list of feature column names.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1].
    """
    print("Preparing data for XGBoost training...")

    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # Sort by date (stable, so rows sharing a timestamp keep their input order)
    df_sorted = df_features.sort_values('UsageDateTime', kind='stable').reset_index(drop=True)

    # Select features (exclude target and non-predictive columns)
    exclude_cols = {
        'UsageDateTime', target_col, 'SubscriptionGuid', 'MeterId', 'InstanceId',
        'Tags', 'OfferId', 'AdditionalInfo', 'ServiceInfo1', 'ServiceInfo2',
        'MeterName', 'MeterRegion', 'ConsumedService', 'ResourceType',
        'MeterCategory', 'MeterSubCategory', 'ResourceLocation', 'ServiceTier', 'Currency'
    }
    # Columns the feature-engineering step derives from the current row's
    # target. Training on them lets the model read the answer off its inputs,
    # and they are unknown at forecast time.
    target_derived_cols = {'Cost_per_Unit', 'Cost_vs_Category_mean', 'Cost_vs_Category_std'}

    feature_cols = []
    for col in df_sorted.columns:
        if col in exclude_cols:
            continue
        if col in target_derived_cols:
            print(f"Removing {col} because it is computed from the target")
            continue
        # Text/object columns cannot go into the numeric feature matrix.
        if not pd.api.types.is_numeric_dtype(df_sorted[col]):
            print(f"Removing {col} because it is not numeric")
            continue
        feature_cols.append(col)

    # Remove columns with too many NaN values
    nan_threshold = 0.5
    valid_features = []
    for col in feature_cols:
        if df_sorted[col].isna().sum() / len(df_sorted) < nan_threshold:
            valid_features.append(col)
        else:
            print(f"Removing {col} due to high NaN ratio")

    print(f"Using {len(valid_features)} features for training")

    # Create feature matrix and target. Casting to float64 removes pandas
    # extension dtypes (e.g. nullable UInt32) before the remaining NaN are
    # filled with 0.
    X = df_sorted[valid_features].astype('float64').fillna(0)
    y = df_sorted[target_col]

    # Time series split (use last portion for testing)
    split_idx = int(len(X) * (1 - test_size))

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")
    print(f"Features: {X_train.shape[1]}")

    return X_train, X_test, y_train, y_test, valid_features

# Build one train/test bundle per cost category of interest
key_categories = ['Total', 'Compute', 'Storage', 'Database']
xgboost_data = {}

for category in key_categories:
    if category != 'Total':
        # Restrict to the rows of this meter category; too-small subsets are skipped
        category_data = df_features[df_features['MeterCategory'] == category].copy()
        if not len(category_data) > 100:
            print(f"Skipping {category} - insufficient data")
            continue
        X_train, X_test, y_train, y_test, features = prepare_xgboost_data(category_data)
    else:
        # 'Total' is modelled on every available record
        X_train, X_test, y_train, y_test, features = prepare_xgboost_data(df_features)

    xgboost_data[category] = dict(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        features=features,
    )

    print(f"\n{category} data prepared:")
    print(f"  Training samples: {len(X_train)}")
    print(f"  Test samples: {len(X_test)}")
    print(f"  Features: {len(features)}")

print(f"\nData prepared for categories: {list(xgboost_data.keys())}")


In [None]:
# Train XGBoost Models
def train_xgboost_model(X_train, X_test, y_train, y_test, category_name, validation_fraction=0.2):
    """
    Train an XGBoost regressor with early stopping and report its metrics.

    Early stopping is driven by a validation slice taken from the END of the
    training data (the data is assumed to be in chronological order), so the
    test set plays no part in choosing the number of boosting rounds and the
    reported test metrics stay unbiased.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by ``xgb.DMatrix``
    y_train, y_test : matching target vectors
    category_name : str
        Label used in progress messages.
    validation_fraction : float, default 0.2
        Share of the training rows (taken from the end) used only for early
        stopping. Pass 0 to early-stop on the test set instead, which is what
        earlier versions of this function did.

    Returns
    -------
    dict
        ``model`` (trained Booster), ``y_pred_train`` / ``y_pred_test``
        (predictions for the full X_train / X_test), ``metrics`` (train/test
        RMSE, MAE, R2) and ``feature_importance`` (DataFrame sorted by
        split count).

    Raises
    ------
    ValueError
        If ``validation_fraction`` is outside [0, 1).
    """
    print(f"\nTraining XGBoost model for {category_name}...")

    if not 0 <= validation_fraction < 1:
        raise ValueError(f"validation_fraction must be in [0, 1), got {validation_fraction}")

    # XGBoost parameters. The number of trees is set through num_boost_round
    # below; 'n_estimators' belongs to the scikit-learn wrapper and is not a
    # parameter of xgb.train.
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'n_jobs': -1
    }
    num_boost_round = 1000

    def _rows(data, rows):
        # Positional row slice that works for pandas objects and plain arrays alike.
        return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]

    # Create DMatrix for XGBoost (full train/test sets are used for predictions)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    n_train = len(X_train)
    n_val = int(n_train * validation_fraction)

    if 0 < n_val < n_train:
        cut = n_train - n_val
        dfit = xgb.DMatrix(_rows(X_train, slice(None, cut)), label=_rows(y_train, slice(None, cut)))
        dval = xgb.DMatrix(_rows(X_train, slice(cut, None)), label=_rows(y_train, slice(cut, None)))
        print(f"Early stopping on the last {n_val} training rows")
        # xgb.train early-stops on the LAST entry of evals, so the validation
        # slice goes last; the test set is listed for progress logging only.
        fit_matrix = dfit
        evals = [(dfit, 'train'), (dtest, 'test'), (dval, 'validation')]
    else:
        # No validation slice available or requested: early-stop on the test set.
        fit_matrix = dtrain
        evals = [(dtrain, 'train'), (dtest, 'test')]

    # Train model with early stopping
    model = xgb.train(
        params=params,
        dtrain=fit_matrix,
        num_boost_round=num_boost_round,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=100
    )

    # Predict with the trees up to the best round, not the rounds added while
    # waiting for early stopping to trigger.
    predict_kwargs = {}
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is not None:
        predict_kwargs['iteration_range'] = (0, int(best_iteration) + 1)

    # Make predictions
    y_pred_train = model.predict(dtrain, **predict_kwargs)
    y_pred_test = model.predict(dtest, **predict_kwargs)

    # Calculate metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    print(f"Training RMSE: {train_rmse:.4f}")
    print(f"Test RMSE: {test_rmse:.4f}")
    print(f"Training MAE: {train_mae:.4f}")
    print(f"Test MAE: {test_mae:.4f}")
    print(f"Training R²: {train_r2:.4f}")
    print(f"Test R²: {test_r2:.4f}")

    # Feature importance ('weight' = number of splits using the feature;
    # features never used in a split are absent from the result)
    importance = model.get_score(importance_type='weight')
    feature_importance = pd.DataFrame({
        'feature': list(importance.keys()),
        'importance': list(importance.values())
    }).sort_values('importance', ascending=False)

    return {
        'model': model,
        'y_pred_train': y_pred_train,
        'y_pred_test': y_pred_test,
        'metrics': {
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'train_mae': train_mae,
            'test_mae': test_mae,
            'train_r2': train_r2,
            'test_r2': test_r2
        },
        'feature_importance': feature_importance
    }

# Fit one model per prepared category and keep the results keyed by category
xgboost_models = {}

for category in xgboost_data:
    data = xgboost_data[category]
    result = train_xgboost_model(
        X_train=data['X_train'],
        X_test=data['X_test'],
        y_train=data['y_train'],
        y_test=data['y_test'],
        category_name=category,
    )
    xgboost_models[category] = result

print(f"\nSuccessfully trained XGBoost models for: {list(xgboost_models.keys())}")


In [None]:
# Feature Importance Analysis
def plot_feature_importance(feature_importance, category_name, top_n=20):
    """
    Show a horizontal bar chart of the first ``top_n`` rows of a feature-importance table.

    ``feature_importance`` must provide 'feature' and 'importance' columns;
    ``category_name`` is used in the chart title. The figure is displayed and
    nothing is returned.
    """
    # Keep only the leading rows of the table
    leading = feature_importance.head(top_n)

    # One horizontal bar per feature
    bars = go.Bar(
        x=leading['importance'],
        y=leading['feature'],
        orientation='h',
        marker_color='lightblue',
    )
    fig = go.Figure(data=[bars])

    layout_options = dict(
        title=f'Top {top_n} Feature Importance - {category_name}',
        xaxis_title='Importance',
        yaxis_title='Features',
        height=600,
        yaxis={'categoryorder': 'total ascending'},
    )
    fig.update_layout(**layout_options)

    fig.show()

# Plot feature importance for each model
for category, result in xgboost_models.items():
    plot_feature_importance(result['feature_importance'], category)

# Model Performance Comparison
print("=== Model Performance Summary ===")

# Row label -> key in each model's metrics dict. Building the frame in one
# step with an explicit index also works when no model was trained (the
# result is then an empty table with the three metric rows).
metric_keys = {'RMSE': 'test_rmse', 'MAE': 'test_mae', 'R²': 'test_r2'}
performance_summary = pd.DataFrame(
    {
        category: [result['metrics'][key] for key in metric_keys.values()]
        for category, result in xgboost_models.items()
    },
    index=list(metric_keys),
).round(4)

print(performance_summary)

# Create performance visualization: one bar chart per metric, side by side
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=('RMSE Comparison', 'MAE Comparison', 'R² Comparison'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}, {"secondary_y": False}]]
)

metric_colors = [('RMSE', 'red'), ('MAE', 'blue'), ('R²', 'green')]
for subplot_col, (metric_label, bar_color) in enumerate(metric_colors, start=1):
    fig.add_trace(
        go.Bar(x=performance_summary.columns, y=performance_summary.loc[metric_label],
               name=metric_label, marker_color=bar_color),
        row=1, col=subplot_col
    )

fig.update_layout(height=400, title_text="XGBoost Model Performance Comparison")
fig.show()


In [None]:
# Generate Future Forecasts
def generate_future_forecasts(model, last_data, features, periods=30, target_col='PreTaxCost'):
    """
    Generate future forecasts using trained XGBoost model.

    This is a simplified, non-recursive forecast: every future day gets its
    own calendar features, while history-based features are held constant.

    * lag / rolling-mean / EMA features are set to the mean of the 7 most
      recent ``target_col`` values; rolling std / max / min are set to the
      std / max / min of those same 7 values;
    * every other requested feature is carried forward from the most recent
      row of ``last_data`` (0 if the column is absent or the value is missing).

    Parameters
    ----------
    model : trained booster exposing ``predict(DMatrix)``
    last_data : pandas.DataFrame
        Recent history with ``UsageDateTime`` and ``target_col``; it does not
        need to be sorted.
    features : list of str
        Feature names, in the column order the model was trained with.
    periods : int, default 30
        Number of daily steps to forecast after the last observed date.
    target_col : str, default 'PreTaxCost'
        Name of the cost column in ``last_data``.

    Returns
    -------
    tuple
        ``(future_dates, forecasts)``: a DatetimeIndex and the model output.

    Raises
    ------
    ValueError
        If ``last_data`` has no rows.
    """
    print(f"Generating {periods}-day forecasts...")

    if len(last_data) == 0:
        raise ValueError("last_data must contain at least one row")

    def _nan_to_zero(value):
        # Training data had missing values replaced by 0; mirror that here.
        return 0 if pd.isna(value) else value

    # Order the history explicitly so "most recent" does not depend on how the
    # caller sliced the frame.
    history = last_data.sort_values('UsageDateTime', kind='stable')

    # Create future dates
    last_date = history['UsageDateTime'].iloc[-1]
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=periods, freq='D')

    # History-based values are identical for every future day: compute once.
    recent_costs = history[target_col].tail(7)
    recent_avg = _nan_to_zero(recent_costs.mean())
    recent_std = _nan_to_zero(recent_costs.std())  # std of a single row is NaN
    recent_max = _nan_to_zero(recent_costs.max())
    recent_min = _nan_to_zero(recent_costs.min())

    # Start from the latest observed value of each requested feature, so the
    # model sees realistic inputs instead of all-zero placeholders.
    latest_row = history.iloc[-1]
    constant_features = {
        feature: _nan_to_zero(latest_row[feature]) if feature in history.columns else 0
        for feature in features
    }

    # Lag features (use recent averages)
    for lag in [1, 2, 3, 7, 14, 30]:
        constant_features[f'{target_col}_lag_{lag}'] = recent_avg

    # Rolling features
    for window in [3, 7, 14, 30]:
        constant_features[f'{target_col}_rolling_mean_{window}'] = recent_avg
        constant_features[f'{target_col}_rolling_std_{window}'] = recent_std
        constant_features[f'{target_col}_rolling_max_{window}'] = recent_max
        constant_features[f'{target_col}_rolling_min_{window}'] = recent_min

    # EMA features
    for span in [3, 7, 14]:
        constant_features[f'{target_col}_ema_{span}'] = recent_avg

    future_data = []
    for date in future_dates:
        future_row = dict(constant_features)

        # Calendar features. The is_* attributes look only at the calendar
        # date, so they stay correct when timestamps carry a time of day.
        future_row.update({
            'Year': date.year,
            'Month': date.month,
            'Day': date.day,
            'DayOfWeek': date.dayofweek,
            'DayOfYear': date.dayofyear,
            'WeekOfYear': int(date.isocalendar()[1]),
            'Quarter': date.quarter,
            'IsWeekend': 1 if date.dayofweek >= 5 else 0,
            'IsMonthStart': int(date.is_month_start),
            'IsMonthEnd': int(date.is_month_end),
            'IsQuarterStart': int(date.is_quarter_start),
            'IsQuarterEnd': int(date.is_quarter_end),
        })

        # Cyclical features
        future_row['Month_sin'] = np.sin(2 * np.pi * date.month / 12)
        future_row['Month_cos'] = np.cos(2 * np.pi * date.month / 12)
        future_row['DayOfWeek_sin'] = np.sin(2 * np.pi * date.dayofweek / 7)
        future_row['DayOfWeek_cos'] = np.cos(2 * np.pi * date.dayofweek / 7)
        future_row['DayOfYear_sin'] = np.sin(2 * np.pi * date.dayofyear / 365)
        future_row['DayOfYear_cos'] = np.cos(2 * np.pi * date.dayofyear / 365)

        future_data.append(future_row)

    # Keep exactly the training features, in training order, as plain floats.
    future_df = (
        pd.DataFrame(future_data)
        .reindex(columns=list(features), fill_value=0)
        .astype(float)
    )

    # Make predictions
    dmatrix = xgb.DMatrix(future_df)
    forecasts = model.predict(dmatrix)

    return future_dates, forecasts

# Produce a forecast per trained model and collect them by category
future_forecasts = {}

for category, result in xgboost_models.items():
    print(f"\nGenerating forecasts for {category}...")

    # 'Total' draws on every record; other categories only on their own rows.
    # In both cases the last 100 rows serve as recent history.
    if category == 'Total':
        history_rows = df_features
    else:
        history_rows = df_features[df_features['MeterCategory'] == category]
    last_data = history_rows.tail(100)

    future_dates, forecasts = generate_future_forecasts(
        model=result['model'],
        last_data=last_data,
        features=xgboost_data[category]['features'],
    )

    future_forecasts[category] = dict(dates=future_dates, forecasts=forecasts)

    print(f"Forecast period: {future_dates[0]} to {future_dates[-1]}")
    print(f"Average predicted cost: ${forecasts.mean():.2f}")
    print(f"Total predicted cost: ${forecasts.sum():.2f}")

print(f"\nFuture forecasts generated for: {list(future_forecasts.keys())}")


In [None]:
# Save XGBoost Results
print("=== Saving XGBoost Results ===")

# Save models and results
import joblib
import os

# Create results directory
# NOTE(review): absolute, machine-specific path; repoint it when running elsewhere.
results_dir = '/Users/sabbineni/projects/acm/results/xgboost'
os.makedirs(results_dir, exist_ok=True)

# Save models. The pickle is kept for existing consumers; the JSON file uses
# XGBoost's own model format, which stays loadable across XGBoost versions.
for category, result in xgboost_models.items():
    model_path = f"{results_dir}/xgboost_model_{category.lower()}.pkl"
    joblib.dump(result['model'], model_path)
    print(f"Saved model: {model_path}")

    model_json_path = f"{results_dir}/xgboost_model_{category.lower()}.json"
    result['model'].save_model(model_json_path)
    print(f"Saved model: {model_json_path}")

# Save performance metrics (one row per category, one column per metric)
performance_data = {}
for category, result in xgboost_models.items():
    performance_data[category] = result['metrics']

performance_df = pd.DataFrame(performance_data).T
performance_path = f"{results_dir}/xgboost_performance.csv"
performance_df.to_csv(performance_path)
print(f"Saved performance metrics: {performance_path}")

# Save feature importance
for category, result in xgboost_models.items():
    importance_path = f"{results_dir}/xgboost_feature_importance_{category.lower()}.csv"
    result['feature_importance'].to_csv(importance_path, index=False)
    print(f"Saved feature importance: {importance_path}")

# Save future forecasts
for category, forecast_data in future_forecasts.items():
    forecast_df = pd.DataFrame({
        'date': forecast_data['dates'],
        'forecast': forecast_data['forecasts']
    })

    forecast_path = f"{results_dir}/xgboost_forecast_{category.lower()}.csv"
    forecast_df.to_csv(forecast_path, index=False)
    print(f"Saved forecast: {forecast_path}")

# Create forecast comparison. Each category is indexed by ITS OWN forecast
# dates and the columns are aligned on the date index, so a category whose
# history ends on a different day is not mislabelled, and the table does not
# depend on a 'Total' forecast being present.
forecast_comparison = pd.DataFrame({
    category: pd.Series(forecast_data['forecasts'], index=forecast_data['dates'])
    for category, forecast_data in future_forecasts.items()
})
forecast_comparison.index.name = 'Date'

comparison_path = f"{results_dir}/xgboost_forecast_comparison.csv"
forecast_comparison.to_csv(comparison_path)
print(f"Saved forecast comparison: {comparison_path}")

# Save feature engineering data
features_path = f"{results_dir}/feature_engineering_data.csv"
df_features.to_csv(features_path, index=False)
print(f"Saved feature engineering data: {features_path}")

print("\n✅ XGBoost model implementation completed successfully!")
print("📊 Models trained, evaluated, and saved")
print("🔮 Future forecasts generated for 30 days")
print("📈 Feature importance analysis completed")
print("🎯 Results ready for comparison with other models")
