# Enhanced Bandwidth Prediction with Exogenous Events
This notebook demonstrates the enhanced bandwidth prediction system using shared modules and exogenous event features.

## Features
- Shared feature engineering modules
- Exogenous event processing
- Multi-combination model support
- Enhanced temporal features

In [None]:
# Install required packages if needed
!pip install optuna xgboost scikit-learn pandas numpy matplotlib seaborn

In [None]:
# Import libraries
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Add src to path to import our modules
sys.path.append(os.path.join('..'))  # Go up one level from notebooks/

# Import our custom modules
from src.data.loader import BandwidthDataLoader
from src.models.trainer import BandwidthModelTrainer
from src.models.predictor import BandwidthPredictor
from src.features.temporal_features import get_temporal_feature_names
from src.features.event_features import get_event_feature_names

# Plot styling. The palette is captured first and the style sheet applied second;
# NOTE(review): keep this order - swapping the two calls may change `color_pal`.
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

# Startup banner shown once the imports above have succeeded.
print(
    "✅ Enhanced Bandwidth Prediction System Loaded!",
    "📊 Using shared modules for consistency with the application",
    sep="\n",
)

## 1. Initialize Data Loader
Load bandwidth data and events with our enhanced data loader.

In [None]:
# Input CSV locations (relative paths, one directory above the working directory).
bandwidth_file = '../sample_data/internet_details.csv'
events_file = '../sample_data/internet_event_details.csv'

# The loader is handed both the bandwidth file and the events file.
data_loader = BandwidthDataLoader(bandwidth_file, events_file)

# Echo the configured inputs so the run is self-describing.
print(
    "📂 Data loader initialized with:",
    f"   📊 Bandwidth file: {bandwidth_file}",
    f"   📅 Events file: {events_file}",
    sep="\n",
)

## 2. Explore Available Combinations
See what item/service_type combinations are available in the data.

In [None]:
# List every (item, service_type) pair reported by the data loader.
combinations = data_loader.get_available_combinations()

print("🌐 Available Item/Service Type Combinations:")
print("-" * 60)
# The loop variables are deliberately NOT named `item` / `service_type`:
# notebook loop variables leak into the global namespace, so re-running this
# cell would silently overwrite the combination selected for training and
# prediction in the later cells.
for i, (combo_item, combo_service_type) in enumerate(combinations, 1):
    print(f"{i:2d}. {combo_item:<25} | {combo_service_type}")

print(f"\n📊 Total combinations: {len(combinations)}")

## 3. Load and Analyze Data for Google/Cache
Let's focus on the Google/Cache combination (item `Google`, service type `cache`), as in the original notebook.

In [None]:
# Combination analysed throughout the rest of the notebook
item = 'Google'
service_type = 'cache'

# Arguments for the loader: cut-off date, event features on, 7-day lookback
split_options = dict(
    item=item,
    service_type=service_type,
    train_end_date='2025-08-22',
    include_events=True,
    lookback_days=7,
)
train_df, test_df = data_loader.prepare_training_data(**split_options)

# First/last index value of each partition
train_start, train_end = train_df.index.min(), train_df.index.max()
test_start, test_end = test_df.index.min(), test_df.index.max()

print(f"📊 Data for {item}/{service_type}:")
print(f"   📈 Training samples: {len(train_df)}")
print(f"   🧪 Test samples: {len(test_df)}")
print(f"   📅 Training period: {train_start} to {train_end}")
print(f"   📅 Test period: {test_start} to {test_end}")

In [None]:
# Preview both ends of the training frame
print("📋 Sample of Enhanced Data with Events:")
print("=" * 50)
for heading, preview in (
    ("First 5 rows:", train_df.head()),
    ("\nLast 5 rows:", train_df.tail()),
):
    print(heading)
    display(preview)

## 4. Feature Analysis
Examine the temporal and event features we've created.

In [None]:
# Feature name lists: temporal, event (7-day lookback), and the full column
# list reported by the loader
temporal_features = get_temporal_feature_names()
event_features = get_event_feature_names(lookback_days=7)
all_features = data_loader.get_feature_columns(include_events=True, lookback_days=7)

print("🔧 Feature Engineering Summary:")
print("=" * 40)
for summary_label, names in (
    ("📅 Temporal features", temporal_features),
    ("📊 Event features", event_features),
    ("🎯 Total features", all_features),
):
    print(f"{summary_label}: {len(names)}")

print("\n📅 Temporal Features:")
for name in temporal_features:
    print(f"   • {name}")

# Only the first ten event features are listed; the remainder is summarised
shown_events = event_features[:10]
hidden_count = len(event_features) - len(shown_events)

print("\n📊 Event Features (sample):")
for name in shown_events:
    print(f"   • {name}")
if hidden_count > 0:
    print(f"   ... and {hidden_count} more")

## 5. Visualize Data with Train/Test Split

In [None]:
# Stitch train and test back together for whole-period statistics
combined_df = pd.concat([train_df, test_df], sort=False)

target_col = 'peak_bandwidth_utilization'
split_date = '2025-08-22'

fig, ax = plt.subplots(figsize=(15, 6))

# Draw each partition as its own coloured line (training first, then test)
for frame, legend_label, line_color in (
    (train_df, 'Training Set', 'blue'),
    (test_df, 'Test Set', 'orange'),
):
    frame[target_col].plot(ax=ax, label=legend_label, color=line_color, linewidth=2)

# Dashed vertical marker at the split date, captioned near the top of the axes
ax.axvline(split_date, color='red', linestyle='--', linewidth=2, alpha=0.7)
y_top = ax.get_ylim()[1]
ax.text(
    split_date, y_top * 0.9, 'Train/Test Split',
    rotation=90, color='red', verticalalignment='center', fontweight='bold',
)

# Titles, axis labels, grid and legend
ax.set_title(
    f'Enhanced {item}/{service_type} Bandwidth Usage with Events',
    fontsize=16, fontweight='bold', pad=20,
)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Peak Bandwidth Usage (Gbps)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(loc='upper left')

plt.tight_layout()
plt.show()

# Headline statistics over the combined series
usage = combined_df[target_col]
print(f"📊 Dataset includes {len(combined_df)} total observations")
print(f"📈 Peak usage: {usage.max():.2f} Gbps")
print(f"📉 Minimum usage: {usage.min():.2f} Gbps")
print(f"📊 Average usage: {usage.mean():.2f} Gbps")

## 6. Event Analysis
Analyze the impact of exogenous events on bandwidth usage.

In [None]:
# Summarise the event-related columns of the combined train+test frame.
# Event columns are those whose name starts with 'event_'; capacity columns
# are those whose name contains 'capacity_change'.
event_cols = [col for col in combined_df.columns if col.startswith('event_')]
capacity_cols = [col for col in combined_df.columns if 'capacity_change' in col]

if event_cols:
    print("📅 Event Impact Analysis:")
    print("=" * 30)
    
    # Count events by type (column sum); columns that sum to zero are omitted
    event_counts = {}
    for col in event_cols:
        count = combined_df[col].sum()
        if count > 0:
            # Strip only the leading 'event_' prefix. str.replace('event_', '')
            # would also delete any later 'event_' inside the column name.
            event_type = col[len('event_'):].replace('_recent_7d', ' (recent)')
            event_counts[event_type] = count
    
    # Display event counts, most frequent first
    print("📊 Event occurrences:")
    for event_type, count in sorted(event_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"   • {event_type}: {count}")
    
    # Capacity changes
    # NOTE(review): only the first matching capacity column is summarised.
    if capacity_cols:
        capacity_changes = combined_df[capacity_cols[0]]
        total_increases = capacity_changes[capacity_changes > 0].sum()
        total_decreases = abs(capacity_changes[capacity_changes < 0].sum())
        
        print(f"\n🔧 Capacity Changes:")
        print(f"   ⬆️ Total increases: {total_increases:.0f} Gbps")
        print(f"   ⬇️ Total decreases: {total_decreases:.0f} Gbps")
        print(f"   📊 Net change: {total_increases - total_decreases:.0f} Gbps")
else:
    print("❌ No event features found in the data.")
    print("   This might indicate an issue with event data loading.")

## 7. Model Training with Enhanced Features
Train the XGBoost model using our enhanced trainer with event features.

In [None]:
# Trainer bound to the shared data loader
trainer = BandwidthModelTrainer(data_loader)

# Announce the training configuration before the long-running cell
print(
    "🚀 Starting Enhanced Model Training",
    "=" * 40,
    f"🎯 Target: {item}/{service_type}",
    "📊 Including exogenous events: ✅",
    "📅 Event lookback window: 7 days",
    "🔧 Hyperparameter optimization: 30 trials",
    "\n⏱️ This may take a few minutes...",
    sep="\n",
)

In [None]:
# Fit the model for the selected combination (30 trials: reduced for notebook)
training_results = trainer.train_model(
    item=item,
    service_type=service_type,
    train_end_date='2025-08-22',
    include_events=True,
    lookback_days=7,
    n_trials=30,
)

# Pull out the reported values before printing them
metrics = training_results['metrics']
test_rmse = metrics['test_rmse']
test_mae = metrics['test_mae']
test_mape = metrics['test_mape']
feature_count = len(training_results['feature_columns'])
best_optuna_rmse = metrics['best_optuna_rmse']

print("\n🎯 TRAINING RESULTS")
print("=" * 25)
print(f"📊 Test RMSE: {test_rmse:.4f}")
print(f"📐 Test MAE: {test_mae:.4f}")
print(f"📈 Test MAPE: {test_mape:.2f}%")
print(f"🔧 Features used: {feature_count}")
print(f"⚡ Best Optuna RMSE: {best_optuna_rmse:.4f}")

## 8. Feature Importance Analysis

In [None]:
# Plot feature importance of the trained model and list the top features
import xgboost as xgb

model = training_results['model']

# Bar chart of the 15 features with the highest "weight" importance
fig, ax = plt.subplots(figsize=(12, 8))
xgb.plot_importance(model, importance_type="weight", max_num_features=15, ax=ax)
ax.set_title('Top 15 Feature Importance (Enhanced Model with Events)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Same scores as a dataframe, sorted from most to least important
feature_importance = model.get_booster().get_score(importance_type='weight')
importance_df = pd.DataFrame([
    {'feature': k, 'importance': v} for k, v in feature_importance.items()
]).sort_values('importance', ascending=False)

# Look the temporal feature names up once instead of on every loop iteration
temporal_names = set(get_temporal_feature_names())

print("🔝 Top 10 Most Important Features:")
print("-" * 35)
for _, row in importance_df.head(10).iterrows():
    # NOTE(review): every feature that is not temporal is labelled "Event";
    # confirm the model has no other feature families.
    feature_type = "📅 Temporal" if row['feature'] in temporal_names else "📊 Event"
    print(f"{feature_type:12} | {row['feature']:<25} | {row['importance']:>6}")

## 9. Save the Enhanced Model

In [None]:
# Persist the training results; the returned path is reused by the prediction cells
model_path = trainer.save_model(training_results)

print(
    "💾 Model Saved Successfully!",
    f"📁 File: {model_path}",
    f"🎯 Combination: {item}/{service_type}",
    f"📊 Performance: {metrics['test_mape']:.2f}% MAPE",
    "✅ Ready for predictions via CLI or interactive app!",
    sep="\n",
)

## 10. Make Predictions with Enhanced Model

In [None]:
# Forecast horizon in days. Defined once so the predictor call, the printed
# heading and the plot title cannot drift apart.
forecast_days = 14

# Initialize predictor and make future predictions
predictor = BandwidthPredictor(data_loader)
future_predictions = predictor.predict_future(model_path, n_days=forecast_days)

print(f"🔮 {forecast_days}-Day Future Forecast")
print("=" * 30)
display(future_predictions)

# Plot the predicted values against their dates
fig, ax = plt.subplots(figsize=(12, 6))
future_predictions.set_index('date')['predicted'].plot(
    ax=ax, marker='o', linewidth=2, markersize=6, color='green'
)
ax.set_title(f'{forecast_days}-Day Bandwidth Forecast (Enhanced Model with Events)', 
             fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Predicted Bandwidth (Gbps)')
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Summary statistics of the forecast
mean_pred = future_predictions['predicted'].mean()
max_pred = future_predictions['predicted'].max()
min_pred = future_predictions['predicted'].min()

print(f"\n📊 Forecast Summary:")
print(f"   📈 Average: {mean_pred:.2f} Gbps")
print(f"   ⬆️ Maximum: {max_pred:.2f} Gbps")
print(f"   ⬇️ Minimum: {min_pred:.2f} Gbps")
print(f"   📊 Range: {max_pred - min_pred:.2f} Gbps")

## 11. Compare with Historical Performance

In [None]:
# Score the saved model against known actuals for a fixed date window.
# NOTE(review): the window starts on the same date that is passed as
# train_end_date when training - confirm that day is not part of the training set.
historical_results = predictor.predict_historical(
    model_path, 
    start_date='2025-08-22', 
    end_date='2025-08-31'
)

print("📊 Historical Prediction Performance")
print("=" * 40)
display(historical_results[['date', 'actual', 'predicted', 'error', 'abs_error']])

# Plot actual vs predicted
fig, ax = plt.subplots(figsize=(12, 6))
historical_results.set_index('date')[['actual', 'predicted']].plot(
    ax=ax, marker='o', linewidth=2, markersize=6
)
ax.set_title('Actual vs Predicted (Enhanced Model)', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Bandwidth (Gbps)')
ax.grid(True, alpha=0.3)
ax.legend(['Actual', 'Predicted'])
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Performance metrics
rmse = (historical_results['error'] ** 2).mean() ** 0.5
mae = historical_results['abs_error'].mean()

# MAPE is undefined where the actual value is zero; those rows are excluded so a
# single zero does not turn the whole metric into inf/NaN. The denominator uses
# |actual| so the percentage is never negative.
actual = historical_results['actual']
nonzero_actual = actual != 0
mape = (
    historical_results.loc[nonzero_actual, 'abs_error'] / actual[nonzero_actual].abs()
).mean() * 100

print(f"\n📈 Historical Performance:")
print(f"   📏 RMSE: {rmse:.4f}")
print(f"   📐 MAE: {mae:.4f}")
print(f"   📊 MAPE: {mape:.2f}%")

## 12. Integration with Production System

This notebook demonstrates the enhanced bandwidth prediction system. The trained model is now available for use in the production application.

### Using the CLI:
```bash
# Make future predictions
python main.py predict --item "Google" --service-type "cache" --days 14

# Predict for specific date
python main.py predict --item "Google" --service-type "cache" --date "2025-09-15"
```

### Using the Interactive Interface:
```bash
python main.py interactive
```

### Key Enhancements:
1. **📊 Exogenous Event Features**: Network events, capacity changes, external factors
2. **🔧 Shared Modules**: Consistent feature engineering across notebook and application
3. **🎯 Multi-Combination Support**: Train models for any item/service_type combination
4. **💾 Model Persistence**: Save and load trained models with metadata
5. **🚀 Production Ready**: CLI and interactive interfaces for end users