In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import random
import docker
import docker.errors
import pandas as pd
import time
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split



print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 3060


In [None]:
# Generate Training Data (Images must be build first, see Makefile)
client = docker.from_env()

def run_container(image, container_name, duration=200, interval=1):
    container = client.containers.run(
        image, 
        name=container_name,
        detach=True,
        auto_remove=True,
        cpu_period=100000,
        cpu_quota=50000,
    )

    container = client.containers.get(container_name)
    stats = []

    stat_gen = container.stats(decode=True, stream=True)
    for _ in range(int(duration / interval)):
        stat = next(stat_gen)
        cpu_stats = stat.get("cpu_stats", {})
        precpu_stats = stat.get("precpu_stats", {})
        cpu_delta = cpu_stats.get("cpu_usage", {}).get("total_usage", 0) - precpu_stats.get("cpu_usage", {}).get("total_usage", 0)
        system_cpu_delta = cpu_stats.get("system_cpu_usage", 0) - precpu_stats.get("system_cpu_usage", 0)
        cpu_percent = (cpu_delta / system_cpu_delta) * len(cpu_stats.get("cpu_usage", {}).get("percpu_usage", [])) * 100 if system_cpu_delta else 0.0

        stats.append(cpu_percent)
        print(f"{container_name}: CPU {cpu_percent}%")
        time.sleep(interval)
    
    container.kill()
    return stats

def collect_sequential_stats(image_list, duration=200):
    data = {}
    for i, image in enumerate(image_list):
        container_name = f"container_{i}"
        cpu_data = run_container(image, container_name, duration=duration)
        data[container_name] = cpu_data
    
    df = pd.DataFrame(data)
    return df

def aggregate(df):
    df['mean_cpu'] = df.mean(axis=1)
    df['max_cpu'] = df.max(axis=1)
    return df[['mean_cpu', 'max_cpu']]

def create_sliding_windows(df_agg, window_size=50):
    sequences = []
    targets = []
    for i in range(len(df_agg) - window_size):
        seq = df_agg.iloc[i:i+window_size].values  # shape (50, 2)
        target = df_agg.iloc[i+window_size].values  # predict next step (mean_cpu, max_cpu)
        sequences.append(seq)
        targets.append(target)
    return np.array(sequences), np.array(targets)


def generate_cpu_dataset(num_containers=5, duration_per_run=200, no_runs=10, window_size=50):
    all_sequences = []
    all_targets = []
    image_names = ["cpu_max", "periodic_cpu_spikes", "random_cpu"]
    for run_idx in range(no_runs):
        print(f"\n=== Generating sample {run_idx + 1}/{no_runs} ===")
        rand_image_set = [random.choice(image_names) for _ in range(num_containers)]

        df_raw = collect_sequential_stats(rand_image_set, duration_per_run)
        df_agg = aggregate(df_raw)

        seqs, targets = create_sliding_windows(df_agg, window_size=window_size)

        all_sequences.append(seqs)
        all_targets.append(targets)

    X = np.concatenate(all_sequences, axis=0)
    y = np.concatenate(all_targets, axis=0)

    return X, y

X, y = generate_cpu_dataset(num_containers=1, duration_per_run=30, no_runs=5, window_size=10)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=0.5, random_state=42, shuffle=True)

In [8]:
print(X.shape)
print(y.shape)

(100, 10, 2)
(100, 2)


In [10]:
class CPULSTM(nn.Module):
    def __init__(self, input_size=2, hidden_size=64, num_layers=2, output_size=2):
        super(CPULSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # x shape: (batch, seq_len, input_size)
        out, _ = self.lstm(x)  # out shape: (batch, seq_len, hidden_size)
        out = out[:, -1, :]    # Take last time step's output
        out = self.fc(out)     # Final prediction: (batch, output_size)
        return out

# Set Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create TensorDatasets from your prepared data
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32))
val_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model, loss, optimizer
model = CPULSTM().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# init tensorboard, (ensure it is running first)
writer = SummaryWriter(log_dir='runs/cpu_forecast_experiment')

# Training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch_x, batch_y in train_dataloader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
         # Validation
    
    avg_train_loss = total_train_loss / len(train_dataloader)

    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in val_dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            total_val_loss += loss.item()
    
    avg_val_loss = total_val_loss / len(val_dataloader)

    # Log to TensorBoard
    writer.add_scalar('Loss/Train', avg_train_loss, epoch)
    writer.add_scalar('Loss/Val', avg_val_loss, epoch)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {total_train_loss / len(train_dataloader):.4f}, Val Loss: {total_val_loss / len(val_dataloader):.4f}")

writer.close()

Using device: cuda
Epoch 1/50, Train Loss: 1442.9646, Val Loss: 1337.1406
Epoch 2/50, Train Loss: 1438.7935, Val Loss: 1382.7479
Epoch 3/50, Train Loss: 1350.3973, Val Loss: 1374.4821
Epoch 4/50, Train Loss: 1295.0834, Val Loss: 1365.8193
Epoch 5/50, Train Loss: 1363.6473, Val Loss: 1339.4604
Epoch 6/50, Train Loss: 1358.0842, Val Loss: 1254.1259
Epoch 7/50, Train Loss: 1319.7946, Val Loss: 1268.2215
Epoch 8/50, Train Loss: 1217.0345, Val Loss: 1238.9847
Epoch 9/50, Train Loss: 1196.2984, Val Loss: 1237.6154
Epoch 10/50, Train Loss: 1222.5677, Val Loss: 1187.6682
Epoch 11/50, Train Loss: 1165.8196, Val Loss: 1175.0207
Epoch 12/50, Train Loss: 1212.9906, Val Loss: 1160.5990
Epoch 13/50, Train Loss: 1158.2549, Val Loss: 1143.1648
Epoch 14/50, Train Loss: 1152.2730, Val Loss: 1100.0191
Epoch 15/50, Train Loss: 1065.5382, Val Loss: 1122.1763
Epoch 16/50, Train Loss: 1101.8694, Val Loss: 1114.5014
Epoch 17/50, Train Loss: 1079.3014, Val Loss: 1087.8870
Epoch 18/50, Train Loss: 1075.6380, Va

In [None]:
# Testing

# Evaluate performance with test data
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

    preds = model(X_test_tensor)
    preds = preds.cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()

# Calculate MAE or RMSE on test set
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_true, preds))
mae = mean_absolute_error(y_true, preds)
print(f"Test RMSE: {rmse:.4f}, Test MAE: {mae:.4f}")

# Plot prediction from a test sequence


import matplotlib.pyplot as plt

# Choose one random test sequence
sample_idx = np.random.randint(len(X_test))
input_seq = X_test[sample_idx]  # shape (50, 2)
true_target = y_test[sample_idx]  # shape (2, )
predicted_target = preds[sample_idx]  # shape (2, )

# Plotting the input sequence and the true vs predicted next step
plt.figure(figsize=(12, 5))

# Plot mean_cpu sequence
plt.subplot(1, 2, 1)
plt.plot(input_seq[:, 0], label="Input mean_cpu (past 50s)")
plt.axhline(y=true_target[0], color='green', linestyle='--', label="True mean_cpu (next)")
plt.axhline(y=predicted_target[0], color='red', linestyle='--', label="Predicted mean_cpu (next)")
plt.title("Mean CPU usage")
plt.legend()

# Plot max_cpu sequence
plt.subplot(1, 2, 2)
plt.plot(input_seq[:, 1], label="Input max_cpu (past 50s)")
plt.axhline(y=true_target[1], color='green', linestyle='--', label="True max_cpu (next)")
plt.axhline(y=predicted_target[1], color='red', linestyle='--', label="Predicted max_cpu (next)")
plt.title("Max CPU usage")
plt.legend()

plt.show()