In [36]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [37]:
data_path = 'stage3_data_cleaning/v2/type1_label_merged_final_decoded_clean3.xlsx'
data = pd.read_excel(data_path)

In [38]:
# Ensure the output directory exists
import os
output_dir = "can_graphs/v2"
os.makedirs(output_dir, exist_ok=True)

In [39]:
def create_graph(window_data):
    G = nx.DiGraph()
    for i in range(len(window_data) - 1):
        src = window_data.iloc[i]['can_id']
        dst = window_data.iloc[i + 1]['can_id']
        if src != dst:  # Avoid self-loops
            G.add_edge(src, dst, weight=window_data.iloc[i + 1]['timestamp'] - window_data.iloc[i]['timestamp'])
    return G


In [52]:
def create_graph(window_df):
    G = nx.DiGraph()
    index_tracker = {}
    
    for i in range(len(window_df) - 1):
        node1 = window_df.iloc[i]['can_id']
        node2 = window_df.iloc[i + 1]['can_id']
        timestamp_diff = window_df.iloc[i + 1]['timestamp'] - window_df.iloc[i]['timestamp']
        label = window_df.iloc[i]['label']
        
        if node1 != node2:  # Avoid self-loops
            if G.has_edge(node1, node2):
                G[node1][node2]['weight'] += timestamp_diff
            else:
                G.add_edge(node1, node2, weight=timestamp_diff)
        
        if node1 not in index_tracker:
            index_tracker[node1] = []
        index_tracker[node1].append((i, label))
    print(index_tracker)
    return G, index_tracker

In [41]:
def calculate_metrics(G):
    pagerank = nx.pagerank(G)
    indegree = dict(G.in_degree())
    nx.set_node_attributes(G, pagerank, 'pagerank')
    nx.set_node_attributes(G, indegree, 'indegree')
    return pagerank, indegree

In [42]:
# def visualize_graph(G, pagerank, indegree, window_index):
#     pos = nx.spring_layout(G)
#     plt.figure(figsize=(10, 8))
#     nx.draw(G, pos, with_labels=True, node_size=[v * 10000 for v in pagerank.values()], node_color='skyblue', edge_color='gray')
#     nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): f"{d['weight']:.2f}" for u, v, d in G.edges(data=True)}, font_color='red')
#     labels = {n: f"{n}\nPR: {pagerank[n]:.2f}\nInDeg: {indegree[n]}" for n in G.nodes()}
#     nx.draw_networkx_labels(G, pos, labels, font_size=12)
#     plt.title(f"Graph for Window {window_index}")
#     plt.show()

def visualize_graph(G, window_index):
    pos = nx.spring_layout(G)
    pagerank = nx.get_node_attributes(G, 'pagerank')
    indegree = nx.get_node_attributes(G, 'indegree')
    labels = {node: f'{node}\nPR: {pagerank[node]:.2f}\nInDeg: {indegree[node]}' for node in G.nodes()}
    
    plt.figure(figsize=(12, 8))
    nx.draw(G, pos, with_labels=True, labels=labels, node_size=7000, node_color='skyblue', font_size=10, edge_color='gray')
    plt.title(f"Graph for Window {window_index}")
    output_path = os.path.join(output_dir, f"can_graph_window_{window_index}.png")
    plt.savefig(output_path)
    plt.close()

In [43]:
window_size = 100
index_tracker_all_windows = []

In [44]:
for window_start in tqdm(range(0, len(data), window_size)):
    window_end = min(window_start + window_size, len(data))
    window_data = data.iloc[window_start:window_end]
    G,index_tracker  = create_graph(window_data)
    calculate_metrics(G)
    visualize_graph(G, window_start // window_size)
    index_tracker_all_windows.append(index_tracker)

index_tracker_df = pd.DataFrame(index_tracker_all_windows)
index_tracker_df.to_csv(os.path.join(output_dir, "index_tracker.csv"), index=False)
print(f"Graphs saved to {output_dir}")

100%|██████████| 2079/2079 [02:17<00:00, 15.11it/s]

Graphs saved to can_graphs/v2





In [45]:
data.head()

Unnamed: 0,label,timestamp,can_id,data_length,source_node_id_decimal,service_flag,priority,message_type_decimal,destination_node_id_decimal,request_or_response,...,end_of_message,single_message_frame,transfer_ID,effective_data_0,effective_data_1,effective_data_2,effective_data_3,effective_data_4,effective_data_5,effective_data_6
0,0,0.0,10015501,8,1,0,16,341,-99,-99,...,1,1,0,0,0,0,0,8,0,0
1,0,0.192053,104E2001,2,1,0,16,20000,-99,-99,...,1,1,0,0,-199,-199,-199,-199,-199,-199
2,0,0.192335,1F043901,8,1,0,31,1081,-99,-99,...,0,0,0,0,0,0,246,0,-199,-199
3,0,0.192504,1F043901,8,1,0,31,1081,-99,-99,...,0,0,0,0,248,0,0,247,255,223
4,0,0.192637,1F043901,4,1,0,31,1081,-99,-99,...,1,0,0,254,0,0,-199,-199,-199,-199


In [46]:
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GraphSAGE
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.utils import from_networkx
from sklearn.metrics import confusion_matrix, classification_report
import joblib

In [56]:
# Preprocess data into a list of Data objects for PyG
def preprocess_data(data, window_size=100):
    pyg_data_list = []
    for window_start in tqdm(range(0, len(data), window_size)):
        window_end = min(window_start + window_size, len(data))
        window_data = data.iloc[window_start:window_end]
        G, index_tracker = create_graph(window_data)
        
        # Convert networkx graph to PyG data object
        pyg_data = from_networkx(G)
        print(pyg_data)
        # Add labels to PyG data object
        pyg_data.y = torch.tensor([index_tracker[node][0][1] for node in pyg_data.x.tolist()], dtype=torch.long)
        pyg_data_list.append(pyg_data)
        break
    
    return pyg_data_list

In [48]:
class EGraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(EGraphSAGE, self).__init__()
        self.conv1 = GraphSAGE(in_channels, hidden_channels)
        self.conv2 = GraphSAGE(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = self.lin(x)
        return F.log_softmax(x, dim=-1)

In [49]:
def train_model(model, train_loader, optimizer, criterion, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')

# Evaluate the model
def evaluate_model(model, test_loader):
    model.eval()
    y_true = []
    y_pred = []
    for data in test_loader:
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        y_true.extend(data.y.tolist())
        y_pred.extend(pred.tolist())
    
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return cm, report

# Save the model
def save_model(model, path):
    torch.save(model.state_dict(), path)


In [51]:
data.head()

Unnamed: 0,label,timestamp,can_id,data_length,source_node_id_decimal,service_flag,priority,message_type_decimal,destination_node_id_decimal,request_or_response,...,end_of_message,single_message_frame,transfer_ID,effective_data_0,effective_data_1,effective_data_2,effective_data_3,effective_data_4,effective_data_5,effective_data_6
0,0,0.0,10015501,8,1,0,16,341,-99,-99,...,1,1,0,0,0,0,0,8,0,0
1,0,0.192053,104E2001,2,1,0,16,20000,-99,-99,...,1,1,0,0,-199,-199,-199,-199,-199,-199
2,0,0.192335,1F043901,8,1,0,31,1081,-99,-99,...,0,0,0,0,0,0,246,0,-199,-199
3,0,0.192504,1F043901,8,1,0,31,1081,-99,-99,...,0,0,0,0,248,0,0,247,255,223
4,0,0.192637,1F043901,4,1,0,31,1081,-99,-99,...,1,0,0,254,0,0,-199,-199,-199,-199


In [57]:
pyg_data_list = preprocess_data(data)
train_size = int(0.7 * len(pyg_data_list))
train_data = pyg_data_list[:train_size]
test_data = pyg_data_list[train_size:]

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


  0%|          | 0/2079 [00:00<?, ?it/s]

{'10015501': [(0, 0)], '104E2001': [(1, 0)], '1F043901': [(2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (18, 0), (33, 0), (47, 0), (62, 0), (76, 0), (91, 0)], '05040601': [(7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (24, 0), (25, 0), (26, 0), (27, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (34, 0), (35, 0), (36, 0), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (42, 0), (43, 0), (44, 0), (45, 0), (46, 0), (48, 0), (49, 0), (50, 0), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (57, 0), (58, 0), (59, 0), (60, 0), (61, 0), (63, 0), (64, 0), (65, 0), (66, 0), (67, 0), (68, 0), (69, 0), (70, 0), (71, 0), (72, 0), (73, 0), (74, 0), (75, 0), (77, 0), (78, 0), (79, 0), (80, 0), (81, 0), (82, 0), (83, 0), (84, 0), (85, 0), (86, 0), (87, 0), (88, 0), (89, 0), (90, 0), (92, 0), (93, 0), (95, 0), (96, 0), (97, 0), (98, 0)], '18043801': [(56, 0), (94, 0)]}
Data(edge_index=[2, 6], weight=[6], num_nodes=5)




AttributeError: 'NoneType' object has no attribute 'tolist'

In [None]:
# for window_start in range(0, len(data), window_size):
#     window_end = min(window_start + window_size, len(data))
#     window_data = data.iloc[window_start:window_end]
#     G = create_graph(window_data)
#     pagerank, indegree = calculate_metrics(G)
#     visualize_graph(G, pagerank, indegree, window_start // window_size)
#     break

In [None]:
# def generate_all_graphs(df, window_size=200):
#     graphs = []
#     step_size = int(window_size * 0.75)
#     max_timestamp = df['timestamp'].max()
#     end_time = 0.75 * max_timestamp
    
#     for start in tqdm(range(0, len(df) - window_size, step_size)):
#         window_df = df.iloc[start:start+window_size]
#         if window_df['timestamp'].iloc[-1] > end_time:
#             break
        
#         G = create_graph(df, start, window_size)
#         G = calculate_metrics(G)
#         graphs.append(G)
    
#     return graphs

In [None]:
from tqdm import tqdm

In [None]:
all_graphs = generate_all_graphs(data)

In [None]:
visualize_graph(all_graphs[1])