In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
import torch
import torch.nn as nn
from tqdm import tqdm  # For progress bars

from google.colab import drive
# Colab only: attach Google Drive so files under MyDrive become readable.
drive.mount('/content/drive')

# Read the traffic dataset into `df`; the path is named so it is easy to spot
# and change when the notebook runs somewhere else.
TRAFFIC_CSV_PATH = '/content/drive/MyDrive/Deep Learning Group 12/Deep Learning Group 12/traffic.csv'
df = pd.read_csv(TRAFFIC_CSV_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Derive calendar features from the timestamp held in the first CSV column.
#
# The target is taken to be the LAST column of the file as loaded, so its name
# is captured before any engineered column is appended. Previously `hour`,
# `dayofweek` and `month` were appended first, which made `df.iloc[:, -1]`
# select `month` as the target and left the real target among the features.
# NOTE(review): if the label is not the last CSV column, set TARGET_COLUMN to
# its name explicitly.
# NOTE(review): this cell rewrites `df`, so it is not idempotent -- re-run the
# loading cell before running it a second time.
DATETIME_COLUMN = df.columns[0]
TARGET_COLUMN = df.columns[-1]

timestamps = pd.to_datetime(df.iloc[:, 0])  # First column is datetime
df['hour'] = timestamps.dt.hour
df['dayofweek'] = timestamps.dt.dayofweek
df['month'] = timestamps.dt.month

# Drop the raw timestamp column now that its parts have been extracted
df = df.drop(columns=[DATETIME_COLUMN])

# Define features and target by name rather than by position
X = df.drop(columns=[TARGET_COLUMN]).values  # all columns except target
y = df[TARGET_COLUMN].values                 # target column

# Inspect the target: more than 10 distinct values is treated as regression,
# anything else as classification.
unique_values, counts = np.unique(y, return_counts=True)
is_regression = len(unique_values) > 10

# Per-value frequencies are only listed when there are few enough to read.
if not is_regression:
    for value, count in zip(unique_values, counts):
        print(f"  Value {value}: {count} occurrences")

# Untransformed copy of the targets, kept for evaluation on the original scale
y_original = y.copy()

if is_regression:
    # Min-max scale the target into [0, 1] for neural network training
    y_normalized = (y - y.min()) / (y.max() - y.min())
    y = y_normalized
elif len(unique_values) == 2:
    # Binary case: the larger label becomes 1.0, the other 0.0
    y = (y == y.max()).astype(float)
# Any other classification target (multi-class) is left exactly as it is.

# Train-test split FIRST, then scale, so the scaler's mean/variance come from
# the training rows only. Fitting the scaler on every row (as was done before)
# leaks test-set statistics into the training features.
# Splitting X, y and y_original in one call keeps the three row-aligned and
# replaces the second, duplicate split that existed only to recover the
# original-scale targets.
(
    X_train_unscaled, X_test_unscaled,
    y_train, y_test,
    y_train_original, y_test_original,
) = train_test_split(
    X, y, y_original, test_size=0.2, random_state=42
)

# Scale the features using training-set statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Whole feature matrix under the same (training-fitted) scaling; retained
# because this cell has always defined `X_scaled`.
X_scaled = scaler.transform(X)

# Turn each split into a float32 tensor with an extra axis at position 1:
# features become (rows, 1, n_features) and targets become (rows, 1).
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.float32).unsqueeze(1)
    for split in (X_train, X_test, y_train, y_test)
)

In [None]:
# CNN-LSTM Model Definition
class CNNLSTMModel(nn.Module):
    """Conv1d stack feeding an LSTM, reduced to one sigmoid score per sample.

    Expects a float tensor shaped (batch, 1, length) -- ``conv1`` takes exactly
    one input channel -- and returns a (batch, 1) tensor of sigmoid outputs.
    """

    def __init__(self):
        super().__init__()
        # Kernel 3 with padding 1 keeps the length axis unchanged while the
        # channel count goes 1 -> 32 -> 64 -> 64.
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(64, 64, kernel_size=3, padding=1)
        # The LSTM reads the 64 conv channels as the per-step feature vector.
        self.lstm = nn.LSTM(input_size=64, hidden_size=64, batch_first=True)
        # Two-layer head: 64 -> 32 -> 1.
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        """Run the conv stack, the LSTM and the dense head on ``x``."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = torch.relu(conv(x))

        # (batch, channels, length) -> (batch, length, channels) for batch_first
        sequence = x.transpose(1, 2)
        lstm_out, _ = self.lstm(sequence)

        # Only the LSTM output at the final step feeds the head.
        final_step = lstm_out[:, -1]
        hidden = torch.relu(self.fc1(final_step))
        return torch.sigmoid(self.fc2(hidden))

# Training function for neural networks
def train_model(model, X_tensor, y_tensor, epochs=10, batch_size=64, lr=0.001, criterion=None):
    """Train ``model`` in place with Adam over sequential mini-batches.

    Batches are taken in stored order (rows ``0:batch_size``, then the next
    slice, ...) and the order is the same in every epoch; no shuffling is done.

    Args:
        model: ``nn.Module`` whose output for a batch of ``X_tensor`` has the
            shape the loss expects for the matching slice of ``y_tensor``.
        X_tensor: Input tensor; the first dimension indexes samples.
        y_tensor: Target tensor with the same number of samples.
        epochs: Number of full passes over the data.
        batch_size: Samples per optimisation step (the last batch may be
            smaller).
        lr: Adam learning rate.
        criterion: Loss module to minimise. Defaults to ``nn.BCELoss()``, which
            requires model outputs and targets in [0, 1]; pass e.g.
            ``nn.MSELoss()`` for unbounded regression targets.

    Returns:
        The same ``model`` object, switched to evaluation mode.

    Raises:
        ValueError: If the dataset is empty, ``X_tensor`` and ``y_tensor``
            disagree on the number of samples, or ``batch_size`` is below 1.
    """
    dataset_size = len(X_tensor)
    if dataset_size == 0:
        raise ValueError("train_model received an empty dataset")
    if len(y_tensor) != dataset_size:
        raise ValueError(
            f"X_tensor has {dataset_size} samples but y_tensor has {len(y_tensor)}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if criterion is None:
        criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Ceiling division: number of optimisation steps per epoch.
    num_batches = (dataset_size + batch_size - 1) // batch_size

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Batch processing
        for start in range(0, dataset_size, batch_size):
            batch_x = X_tensor[start:start + batch_size]
            batch_y = y_tensor[start:start + batch_size]

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        if epoch % 2 == 0:  # Print every 2 epochs
            # Mean of the per-batch losses for this epoch.
            epoch_loss = running_loss / num_batches
            print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    model.eval()  # Set to evaluation mode
    return model

In [None]:
print("\nIMPLEMENTING BAGGING FOR CNN+LSTM")
n_bagging_models = 5
bagged_models = []

# Fixed seeds so a fresh "Restart & Run All" draws the same bootstrap samples
# and the same initial weights; both previously came from unseeded global
# RNGs, so the reported metrics changed on every run.
BAGGING_SEED = 42
bagging_rng = np.random.default_rng(BAGGING_SEED)
n_train = len(X_train)

print(f"Training {n_bagging_models} bagged CNN+LSTM models...")
for i in range(n_bagging_models):
    print(f"  Training bagged model {i+1}/{n_bagging_models}...")

    # Sample with replacement (bootstrap): same size as the training set
    indices = bagging_rng.choice(n_train, size=n_train, replace=True)
    X_bag = X_train[indices]
    y_bag = y_train[indices]

    # Convert to tensors (extra axis at position 1, as for the other splits)
    X_bag_tensor = torch.tensor(X_bag, dtype=torch.float32).unsqueeze(1)
    y_bag_tensor = torch.tensor(y_bag, dtype=torch.float32).unsqueeze(1)

    # Create and train model; a distinct but reproducible torch seed per
    # ensemble member keeps the members from sharing an initialisation.
    torch.manual_seed(BAGGING_SEED + i)
    model = CNNLSTMModel()
    model = train_model(model, X_bag_tensor, y_bag_tensor, epochs=5)
    bagged_models.append(model)

# Prediction function for bagged models
def predict_bagged(X_tensor, models):
    """Average the flattened outputs of every model in ``models``.

    Each model is called on ``X_tensor`` with gradients disabled, its output
    is moved to CPU and flattened to 1-D, and the element-wise mean across
    models is returned as a NumPy array.
    """
    with torch.no_grad():
        per_model = [member(X_tensor).cpu().numpy().flatten() for member in models]

    # Axis 0 runs over ensemble members.
    return np.mean(per_model, axis=0)

# Score the bagged ensemble on the held-out split
bagged_preds = predict_bagged(X_test_tensor, bagged_models)

if not is_regression:
    # Classification: threshold both the averaged scores and the labels at 0.5
    y_test_labels = (y_test > 0.5).astype(int)
    bagged_classes = (bagged_preds > 0.5).astype(int)
    bagged_acc = accuracy_score(y_test_labels, bagged_classes)
    bagged_f1 = f1_score(y_test_labels, bagged_classes)
    print(f"Bagged CNN+LSTM - Acc: {bagged_acc:.4f}, F1: {bagged_f1:.4f}")
else:
    # Regression: undo the min-max scaling so metrics are on the original scale
    target_min = y_original.min()
    target_span = y_original.max() - target_min
    bagged_preds_orig = bagged_preds * target_span + target_min
    bagged_mse = mean_squared_error(y_test_original, bagged_preds_orig)
    bagged_r2 = r2_score(y_test_original, bagged_preds_orig)
    print(f"Bagged CNN+LSTM - MSE: {bagged_mse:.4f}, R²: {bagged_r2:.4f}")


IMPLEMENTING BAGGING FOR CNN+LSTM
Training 5 bagged CNN+LSTM models...
  Training bagged model 1/5...
Epoch 1/5, Loss: 0.5645
Epoch 3/5, Loss: 0.4617
Epoch 5/5, Loss: 0.4550
  Training bagged model 2/5...
Epoch 1/5, Loss: 0.5759
Epoch 3/5, Loss: 0.4643
Epoch 5/5, Loss: 0.4570
  Training bagged model 3/5...
Epoch 1/5, Loss: 0.5645
Epoch 3/5, Loss: 0.4622
Epoch 5/5, Loss: 0.4570
  Training bagged model 4/5...
Epoch 1/5, Loss: 0.5494
Epoch 3/5, Loss: 0.4652
Epoch 5/5, Loss: 0.4593
  Training bagged model 5/5...
Epoch 1/5, Loss: 0.5651
Epoch 3/5, Loss: 0.4662
Epoch 5/5, Loss: 0.4602
Bagged CNN+LSTM - MSE: 0.8171, R²: 0.9356


In [None]:
print("\nIMPLEMENTING BOOSTING FOR CNN+LSTM")
n_boost_models = 3
boosted_models = []
n_train = len(X_train)
sample_weights = np.ones(n_train) / n_train

# Fixed seeds so resampling and weight initialisation are reproducible on a
# fresh kernel; both previously used unseeded global RNGs.
BOOSTING_SEED = 43
boosting_rng = np.random.default_rng(BOOSTING_SEED)

# Built once: the whole training set is re-scored after every round, so there
# is no need to rebuild this tensor inside the loop.
X_train_full_tensor = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)

print(f"Training {n_boost_models} boosted CNN+LSTM models...")
for i in range(n_boost_models):
    print(f"  Training boosted model {i+1}/{n_boost_models}...")

    # Sample based on weights (with replacement)
    indices = boosting_rng.choice(n_train, size=n_train, replace=True, p=sample_weights)
    X_boost = X_train[indices]
    y_boost = y_train[indices]

    # Convert to tensors
    X_boost_tensor = torch.tensor(X_boost, dtype=torch.float32).unsqueeze(1)
    y_boost_tensor = torch.tensor(y_boost, dtype=torch.float32).unsqueeze(1)

    # Create and train model
    torch.manual_seed(BOOSTING_SEED + i)
    model = CNNLSTMModel()
    model = train_model(model, X_boost_tensor, y_boost_tensor, epochs=5)
    boosted_models.append(model)

    # Update weights based on errors (key boosting step): the next round's
    # sampling probability is proportional to this model's absolute error.
    with torch.no_grad():
        preds = model(X_train_full_tensor).cpu().numpy().flatten()
    errors = np.abs(preds - y_train)
    total_error = errors.sum()
    if np.isfinite(total_error) and total_error > 0:
        sample_weights = errors / total_error
    else:
        # A zero or non-finite total would give NaN probabilities and make the
        # next `choice` call fail; fall back to uniform sampling instead.
        sample_weights = np.ones(n_train) / n_train

# Function to make predictions with boosted models
def predict_boosted(X_tensor, models, weights=None):
    """Combine the models' flattened outputs as a weighted sum.

    When ``weights`` is omitted, weights rise linearly from 1 to 2 across
    ``models`` (later models count more) and are normalised to sum to 1.
    Caller-supplied weights are used exactly as given and indexed by model
    position. Forward passes run with gradients disabled.
    """
    if weights is None:
        ramp = np.linspace(1, 2, len(models))
        weights = ramp / np.sum(ramp)

    with torch.no_grad():
        weighted = [
            member(X_tensor).cpu().numpy().flatten() * weights[idx]
            for idx, member in enumerate(models)
        ]

    # Sum over the ensemble axis to obtain one value per sample.
    return np.sum(weighted, axis=0)

# Score the boosted ensemble on the held-out split
boosted_preds = predict_boosted(X_test_tensor, boosted_models)

if not is_regression:
    # Classification: threshold both the weighted scores and the labels at 0.5
    y_test_labels = (y_test > 0.5).astype(int)
    boosted_classes = (boosted_preds > 0.5).astype(int)
    boosted_acc = accuracy_score(y_test_labels, boosted_classes)
    boosted_f1 = f1_score(y_test_labels, boosted_classes)
    print(f"Boosted CNN+LSTM - Acc: {boosted_acc:.4f}, F1: {boosted_f1:.4f}")
else:
    # Regression: undo the min-max scaling so metrics are on the original scale
    target_min = y_original.min()
    target_span = y_original.max() - target_min
    boosted_preds_orig = boosted_preds * target_span + target_min
    boosted_mse = mean_squared_error(y_test_original, boosted_preds_orig)
    boosted_r2 = r2_score(y_test_original, boosted_preds_orig)
    print(f"Boosted CNN+LSTM - MSE: {boosted_mse:.4f}, R²: {boosted_r2:.4f}")


IMPLEMENTING BOOSTING FOR CNN+LSTM
Training 3 boosted CNN+LSTM models...
  Training boosted model 1/3...
Epoch 1/5, Loss: 0.5585
Epoch 3/5, Loss: 0.4626
Epoch 5/5, Loss: 0.4533
  Training boosted model 2/3...
Epoch 1/5, Loss: 0.5614
Epoch 3/5, Loss: 0.4835
Epoch 5/5, Loss: 0.4705
  Training boosted model 3/3...
Epoch 1/5, Loss: 0.5801
Epoch 3/5, Loss: 0.4946
Epoch 5/5, Loss: 0.4747
Boosted CNN+LSTM - MSE: 0.3275, R²: 0.9742


In [None]:
print("\nSUMMARY OF ENSEMBLE METHODS")
# Side-by-side comparison of the two ensembles; only the metrics for the
# active task type exist, so the lines are built inside the matching branch.
if not is_regression:
    summary_lines = [
        "\nClassification Performance Metrics (Accuracy, F1):",
        f"Bagged CNN+LSTM:    Acc={bagged_acc:.4f}, F1={bagged_f1:.4f}",
        f"Boosted CNN+LSTM:   Acc={boosted_acc:.4f}, F1={boosted_f1:.4f}",
    ]
else:
    summary_lines = [
        "\nRegression Performance Metrics (MSE, R²):",
        f"Bagged CNN+LSTM:    MSE={bagged_mse:.4f}, R²={bagged_r2:.4f}",
        f"Boosted CNN+LSTM:   MSE={boosted_mse:.4f}, R²={boosted_r2:.4f}",
    ]

for summary_line in summary_lines:
    print(summary_line)


SUMMARY OF ENSEMBLE METHODS

Regression Performance Metrics (MSE, R²):
Bagged CNN+LSTM:    MSE=0.8171, R²=0.9356
Boosted CNN+LSTM:   MSE=0.3275, R²=0.9742
