In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

# ------------------------------
# 1. Select the compute device
# ------------------------------
# Use CUDA when PyTorch reports an available GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"\n✅ Using device: {device}")

# ------------------------------
# 2. Load the Dataset
# ------------------------------
file_path = "IL_T500_cleaned.csv"
if not os.path.isfile(file_path):
    # Raise instead of print + exit(): `exit` is an interactive helper (injected by
    # `site`, replaced by IPython in notebooks), so it is not a dependable way to
    # abort. Raising also matches how the missing-'label' check reports its error.
    # isfile() additionally rejects a directory that happens to carry this name.
    raise FileNotFoundError(
        f"❌ ERROR: File '{file_path}' not found! Place it in the same directory."
    )

df = pd.read_csv(file_path)

# Normalise the target column name: a capitalised 'Label' header becomes 'label'.
# rename() skips mapping keys that are absent, so this is a no-op otherwise.
df = df.rename(columns={'Label': 'label'})

# ------------------------------
# 3. Handle Missing Values
# ------------------------------
print("Missing values per column before handling:\n", df.isnull().sum())

# Impute only the numeric *feature* columns with their column mean.
# - 'label' is excluded on purpose: a mean-imputed label is a fractional value,
#   not a real class, so rows with a missing label are dropped below instead.
# - Restricting to numeric columns keeps the mean computation from failing when
#   the CSV carries a text column.
numeric_feature_cols = df.select_dtypes(include="number").columns.drop(
    "label", errors="ignore"
)
df[numeric_feature_cols] = df[numeric_feature_cols].fillna(
    df[numeric_feature_cols].mean()
)

# Drop every row that still contains a NaN (e.g. a missing label).
df.dropna(inplace=True)

print("Missing values per column after handling:\n", df.isnull().sum())

# ------------------------------
# 4. Split the frame into features and target
# ------------------------------
# Everything below depends on the target column, so fail fast when it is absent.
label_present = 'label' in df.columns
if not label_present:
    raise ValueError("❌ 'label' column is missing from the CSV. Please check your data.")

# Features: every column except the target.
X = df.drop('label', axis=1)

# Target as a NumPy array (0 = Normal, 1 = Trojan, assuming binary labels).
label_column = df['label']
y = label_column.values

# ------------------------------
# 5. Rescale the features
# ------------------------------
# Learn per-column min/max from X, then apply the scaling to the same rows.
# NOTE(review): the scaler is fit on all rows, including the ones later used as
# test nodes — confirm this is acceptable for the evaluation.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print("✅ Features normalized.")

# ------------------------------
# 6. Build the Graph Edges
# ------------------------------
num_samples = len(X_scaled)

# (A) Temporal adjacency: link each row i with its successor i+1 in both
# directions (use only `forward_edges` if you prefer a directed chain).
# Built with array operations instead of a Python loop. The columns keep the
# interleaved order [i, i+1], [i+1, i], ... and the result is an empty (2, 0)
# array when there are fewer than two rows, so the concatenation below still works.
chain_src = np.arange(max(num_samples - 1, 0), dtype=np.int64)
chain_dst = chain_src + 1
forward_edges = np.stack([chain_src, chain_dst])   # i   -> i+1
backward_edges = np.stack([chain_dst, chain_src])  # i+1 -> i
temporal_edges = np.stack([forward_edges, backward_edges], axis=2).reshape(
    2, 2 * len(chain_src)
)  # shape (2, 2*(num_samples-1))

# (B) KNN adjacency: connect every row to its nearest rows in feature space.
k_neighbors = 5  # Adjust based on dataset size
# A row can have at most num_samples - 1 neighbours other than itself; clamp so a
# tiny dataset does not make kneighbors_graph reject the request.
effective_k = min(k_neighbors, num_samples - 1)
if effective_k > 0:
    knn_graph = kneighbors_graph(
        X_scaled, effective_k, mode="connectivity", include_self=False
    )
    knn_coo = knn_graph.tocoo()
    knn_edges = np.vstack([knn_coo.row, knn_coo.col])  # shape (2, number_of_knn_edges)
else:
    # Not enough rows to have any neighbour: contribute no KNN edges.
    knn_edges = np.empty((2, 0), dtype=np.int64)

# Combine both edge sets column-wise into a single (2, num_edges) index tensor.
all_edges = np.concatenate([temporal_edges, knn_edges], axis=1)
edge_index = torch.tensor(all_edges, dtype=torch.long)

# ------------------------------
# 7. Wrap everything in a PyTorch Geometric Data object
# ------------------------------
# Node features as float32 and labels as int64, allocated directly on `device`.
x_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=device)
y_tensor = torch.tensor(y, dtype=torch.long, device=device)

# edge_index was created on the CPU; moving the assembled object places it on
# `device` together with the features and labels.
data = Data(x=x_tensor, edge_index=edge_index, y=y_tensor)
data = data.to(device)
print("✅ Graph dataset created with temporal + KNN edges.")

# ------------------------------
# 8. Train/Test Split
# ------------------------------
# Randomly assign ~80% of nodes to training, 20% to testing.
# The uniform draws come from a dedicated, seeded generator so that the split
# (and therefore every reported metric) is reproducible from run to run, and
# so that it does not depend on how much global RNG state was consumed earlier.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)
mask = torch.rand(num_samples, generator=split_generator).to(device)
train_mask = mask < 0.8
test_mask = ~train_mask

print(f"\n✅ Training samples: {train_mask.sum().item()}")
print(f"✅ Testing samples: {test_mask.sum().item()}")

# ------------------------------
# 9. Define GNN Model
# ------------------------------
class GNN(nn.Module):
    """Two stacked GCNConv layers that emit one score vector per node.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Output width of the first GCNConv layer.
        output_dim: Output width of the second GCNConv layer (scores per node).
    """

    def __init__(self, input_dim, hidden_dim=256, output_dim=2):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the raw second-layer output for every node in ``data``.

        ``data`` must expose ``x`` and ``edge_index``; no softmax is applied here.
        """
        hidden = torch.relu(self.conv1(data.x, data.edge_index))
        return self.conv2(hidden, data.edge_index)

# Build the network, its optimizer and the loss function.
num_features = X.shape[1]  # one input unit per feature column
model = GNN(input_dim=num_features, hidden_dim=256, output_dim=2)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
print("✅ GNN model initialized.")

# ------------------------------
# 10. Train the Model
# ------------------------------
epochs = 2000
log_every = 100  # print progress once per this many epochs
print("\n✅ Training Started...")

# The training targets never change, so index them once instead of every epoch.
train_targets = data.y[train_mask]

# Nothing inside the loop switches the model to eval mode, so set train mode once.
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()

    out = model(data)  # shape: [num_samples, 2]
    train_out = out[train_mask]
    loss = criterion(train_out, train_targets)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % log_every == 0:
        # Accuracy is only needed for the log line. Computing it here (rather than
        # every epoch) skips the argmax and the .item() host transfer on all the
        # epochs that print nothing. It reflects the pre-step forward pass.
        train_pred = train_out.detach().argmax(dim=1)
        train_acc = (train_pred == train_targets).float().mean().item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Train Accuracy: {train_acc:.2%}")

print("\n✅ Training completed.")

# ------------------------------
# 11. Persist the trained weights
# ------------------------------
# Only the state dict is written, not the full module object.
trained_state = model.state_dict()
torch.save(trained_state, "trojan_gnn_model.pth")
print("\n✅ Model saved as 'trojan_gnn_model.pth'.")

# ------------------------------
# 12. Evaluate on the held-out nodes
# ------------------------------
# Single forward pass over the whole graph in eval mode, without gradient tracking.
model.eval()
with torch.no_grad():
    out = model(data)

# Predicted class per node, restricted to the test nodes.
test_pred = out.argmax(dim=1)[test_mask]

# scikit-learn metrics operate on host-side NumPy arrays.
y_pred = test_pred.cpu().numpy()
y_true = data.y[test_mask].cpu().numpy()

# Metrics expressed as percentages; zero_division=0 reports 0 instead of warning
# when a metric's denominator is empty.
accuracy = 100 * accuracy_score(y_true, y_pred)
precision = 100 * precision_score(y_true, y_pred, zero_division=0)
recall = 100 * recall_score(y_true, y_pred, zero_division=0)
f1 = 100 * f1_score(y_true, y_pred, zero_division=0)

# Report
print("\n🎯 Model Performance on Test Nodes:")
print(f"✅ Accuracy:  {accuracy:.2f}%")
print(f"✅ Precision: {precision:.2f}%")
print(f"✅ Recall:    {recall:.2f}%")
print(f"✅ F1-score:  {f1:.2f}%")



✅ Using device: cuda
Missing values per column before handling:
 Mean                  0
RMS                   0
Variance              0
Standard Deviation    0
Peak-to-Peak          0
Crest Factor          0
Skewness              0
Kurtosis              0
Energy                0
Entropy               0
Max                   0
Min                   0
peak_magnitude        0
spectral_centroid     0
spectral_bandwidth    0
spectral_flatness     0
spectral_rolloff      0
spectral_entropy      0
spectral_contrast     0
label                 0
dtype: int64
Missing values per column after handling:
 Mean                  0
RMS                   0
Variance              0
Standard Deviation    0
Peak-to-Peak          0
Crest Factor          0
Skewness              0
Kurtosis              0
Energy                0
Entropy               0
Max                   0
Min                   0
peak_magnitude        0
spectral_centroid     0
spectral_bandwidth    0
spectral_flatness     0
spectral_rollo