In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMClassifier(nn.Module):
    def __init__(self,input_dim, hidden_dim, output_dim, num_layers=1, fc_hidden_dim=64, head_dropout: float = 0.0):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )

        self.fc1 = nn.Linear(hidden_dim, fc_hidden_dim)
        self.fc2 = nn.Linear(fc_hidden_dim, output_dim)
        self.dropout = nn.Dropout(head_dropout)

    def forward(self, x):
        batch_size = x.size(0)
        device = x.device

        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=device)

        out, (h_n, c_n) = self.lstm(x, (h0, c0))   # out: (B, T, H)

        # Take last time step
        feat = out[:, -1, :]                        # (B, H)

        # Two-layer head
        feat = F.relu(self.fc1(feat))
        feat = self.dropout(feat)
        logits = self.fc2(feat)                     # raw logits (B, C)

        return logits, (h_n, c_n)
        



In [4]:
import torch
import utils as utils
import os

evaluate_order = ["blackhole_var5_base", "disflooding_var5_base", "worstparent_var5_base", "localrepair_var5_base"]

current_directory = os.getcwd()
domains_path = current_directory + '/data/attack_data'

domains = utils.create_domains(domains_path)

train_domains_loader = {}
test_domains_loader = {}

         
domains = utils.create_domains(domains_path)

train_domains_loader = {}
test_domains_loader = {}

for key, files in domains.items():
    _, test_domains_loader[key] = utils.load_data(domains_path, key, files, window_size=10, step_size=3, batch_size=256)
    
print(test_domains_loader.keys())

dict_keys(['blackhole_var10_base', 'blackhole_var10_dec', 'blackhole_var10_oo', 'blackhole_var15_base', 'blackhole_var15_dec', 'blackhole_var15_oo', 'blackhole_var20_base', 'blackhole_var20_dec', 'blackhole_var20_oo', 'blackhole_var5_base', 'blackhole_var5_dec', 'blackhole_var5_oo', 'disflooding_var10_base', 'disflooding_var10_dec', 'disflooding_var10_oo', 'disflooding_var15_base', 'disflooding_var15_dec', 'disflooding_var15_oo', 'disflooding_var20_base', 'disflooding_var20_dec', 'disflooding_var20_oo', 'disflooding_var5_base', 'disflooding_var5_dec', 'disflooding_var5_oo', 'localrepair_var10_base', 'localrepair_var10_dec', 'localrepair_var10_oo', 'localrepair_var15_base', 'localrepair_var15_dec', 'localrepair_var15_oo', 'localrepair_var20_base', 'localrepair_var20_dec', 'localrepair_var20_oo', 'localrepair_var5_base', 'localrepair_var5_dec', 'localrepair_var5_oo', 'worstparent_var10_base', 'worstparent_var10_dec', 'worstparent_var10_oo', 'worstparent_var15_base', 'worstparent_var15_de

In [7]:
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
    roc_auc_score, roc_curve, auc, average_precision_score, confusion_matrix, balanced_accuracy_score
)
import matplotlib.pyplot as plt
import logging
import os
import torch


def evaluate_metrics(y_true, y_pred, y_prob, test_domain_name, training_domain_name):
    """
    Compute evaluation metrics including confusion matrix, specificity, and balanced accuracy,
    plot the ROC curve, log and print the results, and return all metrics in a dictionary.
    
    Args:
      y_true (np.array): True labels.
      y_pred (np.array): Predicted labels.
      y_prob (np.array): Predicted probabilities for the positive class.
      test_domain_name (str): Name of the test domain.
      training_domain_name (str): Name of the training domain.
      
    Returns:
      metrics (dict): Dictionary with evaluation metrics.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_true, y_prob)
    avg_precision = average_precision_score(y_true, y_prob)

    # Compute confusion matrix and additional metrics
    cm = confusion_matrix(y_true, y_pred)
    # logging.info(f"Confusion matrix shape: {cm.shape}, values: \n{cm}")
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    
    log_msg = (f"Train Domain: {training_domain_name} | Test Domain: {test_domain_name} | "
               f"Acc: {accuracy:.4f} | Prec: {precision:.4f} | Rec: {recall:.4f} | "
               f"F1: {f1:.4f} | "
               f"Specificity: {specificity:.4f} | Balanced Acc: {balanced_acc:.4f} | "
               f"CM: {cm.tolist()}")    # ROC-AUC: {roc_auc:.4f} 
    # print(log_msg)
    # logging.info(log_msg)
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "average_precision": avg_precision,
        "specificity": specificity,
        "balanced_accuracy": balanced_acc,
        "confusion_matrix": cm
    }

In [13]:
import torch
import numpy as np
import logging
import evaluation as evaluate
from utils import confidence_from_logits


def eval_model(model,test_domain_loader, train_domain, device, domain_id=None):
    all_y_true, all_y_pred, all_y_prob = [], [], []
    all_confidences, all_conf_correct, all_conf_incorrect = [], [], []
    with torch.no_grad():
        for X_batch, y_batch in test_domain_loader[train_domain]:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            if domain_id is not None:
                outputs, _ = model(X_batch, domain_id=domain_id)  # Pass domain_id to the model
            else:
                outputs, _ = model(X_batch)  # Pass domain_id to the model
            
            # === confidence here (eval only) ===
            probs, preds, confs = confidence_from_logits(outputs)
            all_y_true.extend(y_batch.cpu().numpy())
            all_y_pred.extend(preds.cpu().numpy())
            all_y_prob.extend(probs[:, 1].cpu().numpy())   # you already use class-1 prob for ROC
            all_confidences.extend(confs.cpu().numpy())

            correct_mask = (preds == y_batch)
            if correct_mask.any():
                all_conf_correct.extend(confs[correct_mask].cpu().numpy())
            if (~correct_mask).any():
                all_conf_incorrect.extend(confs[~correct_mask].cpu().numpy())
            _, predicted = torch.max(outputs.data, 1)
            all_y_true.extend(y_batch.cpu().numpy())
            all_y_pred.extend(predicted.cpu().numpy())
            all_y_prob.extend(torch.nn.functional.softmax(outputs, dim=1)[:, 1].cpu().numpy())
        
    metrics = evaluate.evaluate_metrics(np.array(all_y_true), np.array(all_y_pred),
                        np.array(all_y_prob), train_domain, train_domain)
    
    avg_conf           = float(np.mean(all_confidences))   if all_confidences else float("nan")
    avg_conf_correct   = float(np.mean(all_conf_correct))  if all_conf_correct else float("nan")
    avg_conf_incorrect = float(np.mean(all_conf_incorrect))if all_conf_incorrect else float("nan")
    metrics["avg_conf"] = avg_conf
    metrics["avg_conf_correct"] = avg_conf_correct
    metrics["avg_conf_incorrect"] = avg_conf_incorrect
    
    return metrics

In [16]:
for scenario in evaluate_order:
    print(f"Evaluating scenario: {scenario}")
    domains = utils.create_domains(domains_path)

    # Add your evaluation code here
    model = LSTMClassifier(input_dim=140, hidden_dim=10, output_dim=2, num_layers=1, fc_hidden_dim=10, head_dropout=0.05)
    state_dict = torch.load(
    current_directory + f"/models/exp_no_1_LSTM_WCL_random/best_model_after_{scenario}.pt",
    map_location=torch.device("cpu"), weights_only=True
    )
    model.load_state_dict(state_dict)
    model.eval()
    for eval_scenario in evaluate_order:
        metrics = eval_model(model=model,test_domain_loader=test_domains_loader, train_domain=eval_scenario, device="cpu", domain_id=None)
        print(f"Metrics for training on {scenario} and testing on {eval_scenario}: {metrics}")
    
    

Evaluating scenario: blackhole_var5_base


  return self.fget.__get__(instance, owner)()


Metrics for training on blackhole_var5_base and testing on blackhole_var5_base: {'accuracy': 0.837037037037037, 'precision': 0.8517064234045367, 'recall': 0.837037037037037, 'f1': 0.8357618331286198, 'roc_auc': 0.8117537467061924, 'average_precision': 0.8856491704770661, 'specificity': 0.9356060606060606, 'balanced_accuracy': 0.8391798418972332, 'confusion_matrix': array([[1976,  136],
       [ 568, 1640]]), 'avg_conf': 0.8798449635505676, 'avg_conf_correct': 0.8805820941925049, 'avg_conf_incorrect': 0.8760589361190796}
Metrics for training on blackhole_var5_base and testing on disflooding_var5_base: {'accuracy': 0.5005841121495327, 'precision': 0.5006905730029898, 'recall': 0.5005841121495327, 'f1': 0.5006084795275568, 'roc_auc': 0.3037864186341101, 'average_precision': 0.40984617005376145, 'specificity': 0.5011848341232228, 'balanced_accuracy': 0.5005924170616114, 'confusion_matrix': array([[1692, 1684],
       [1736, 1736]]), 'avg_conf': 0.9929722547531128, 'avg_conf_correct': 0.986

  return self.fget.__get__(instance, owner)()


Metrics for training on disflooding_var5_base and testing on blackhole_var5_base: {'accuracy': 0.5550925925925926, 'precision': 0.5578818118902508, 'recall': 0.5550925925925926, 'f1': 0.5417206680364576, 'roc_auc': 0.5992062266688626, 'average_precision': 0.5942305460311652, 'specificity': 0.3816287878787879, 'balanced_accuracy': 0.5513216403162056, 'confusion_matrix': array([[ 806, 1306],
       [ 616, 1592]]), 'avg_conf': 0.97314453125, 'avg_conf_correct': 0.9790336489677429, 'avg_conf_incorrect': 0.9657968282699585}
Metrics for training on disflooding_var5_base and testing on disflooding_var5_base: {'accuracy': 0.9985397196261683, 'precision': 0.9985440323266226, 'recall': 0.9985397196261683, 'f1': 0.9985397464101843, 'roc_auc': 0.9999331142027212, 'average_precision': 0.9999374703870548, 'specificity': 1.0, 'balanced_accuracy': 0.9985599078341014, 'confusion_matrix': array([[3376,    0],
       [  10, 3462]]), 'avg_conf': 0.9954902529716492, 'avg_conf_correct': 0.9957225918769836, 

  return self.fget.__get__(instance, owner)()
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Metrics for training on worstparent_var5_base and testing on blackhole_var5_base: {'accuracy': 0.5111111111111111, 'precision': 0.26123456790123456, 'recall': 0.5111111111111111, 'f1': 0.3457516339869281, 'roc_auc': 0.5, 'average_precision': 0.5111111111111111, 'specificity': 0.0, 'balanced_accuracy': 0.5, 'confusion_matrix': array([[   0, 2112],
       [   0, 2208]]), 'avg_conf': 1.0, 'avg_conf_correct': 1.0, 'avg_conf_incorrect': 1.0}
Metrics for training on worstparent_var5_base and testing on disflooding_var5_base: {'accuracy': 0.5070093457943925, 'precision': 0.25705847672285786, 'recall': 0.5070093457943925, 'f1': 0.3411504745345215, 'roc_auc': 0.5, 'average_precision': 0.5070093457943925, 'specificity': 0.0, 'balanced_accuracy': 0.5, 'confusion_matrix': array([[   0, 3376],
       [   0, 3472]]), 'avg_conf': 1.0, 'avg_conf_correct': 1.0, 'avg_conf_incorrect': 1.0}
Metrics for training on worstparent_var5_base and testing on worstparent_var5_base: {'accuracy': 0.7452247191011236,

  return self.fget.__get__(instance, owner)()
  _warn_prf(average, modifier, msg_start, len(result))


Metrics for training on localrepair_var5_base and testing on blackhole_var5_base: {'accuracy': 0.5111111111111111, 'precision': 0.26123456790123456, 'recall': 0.5111111111111111, 'f1': 0.3457516339869281, 'roc_auc': 0.5, 'average_precision': 0.5111111111111111, 'specificity': 0.0, 'balanced_accuracy': 0.5, 'confusion_matrix': array([[   0, 2112],
       [   0, 2208]]), 'avg_conf': 1.0, 'avg_conf_correct': 1.0, 'avg_conf_incorrect': 1.0}
Metrics for training on localrepair_var5_base and testing on disflooding_var5_base: {'accuracy': 0.5467289719626168, 'precision': 0.7606817199899948, 'recall': 0.5467289719626168, 'f1': 0.4239015897980561, 'roc_auc': 0.7448750464105531, 'average_precision': 0.6686503916895805, 'specificity': 0.08056872037914692, 'balanced_accuracy': 0.5402843601895735, 'confusion_matrix': array([[ 272, 3104],
       [   0, 3472]]), 'avg_conf': 0.9900760650634766, 'avg_conf_correct': 0.9909858107566833, 'avg_conf_incorrect': 0.9889787435531616}
Metrics for training on lo