In [1]:
import pandas as pd
import networkx as nx
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import TransformerConv
from torch_geometric.nn import GATConv

from torch_geometric.nn import GCNConv
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.utils import from_networkx
from sklearn.metrics import confusion_matrix, classification_report
import os
from tqdm import tqdm
import joblib
import matplotlib.pyplot as plt
import glob
import time

from torch.nn import LSTM


import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

In [2]:
def visualize_graph(G, window_index,visualization_dir):
    pos = nx.spring_layout(G)
    pagerank = nx.get_node_attributes(G, 'pagerank')
    
    # Ensure all nodes have a pagerank value, set default if missing
    for node in G.nodes():
        if node not in pagerank:
            pagerank[node] = 0.0  # Default PageRank value
    
    labels = {node: f'{node}\nPR: {pagerank[node]:.2f}' for node in G.nodes()}
    
    plt.figure(figsize=(12, 8))
    nx.draw(G, pos, with_labels=True, labels=labels, node_size=7000, node_color='skyblue', font_size=10, edge_color='gray')
    plt.title(f"Graph for Window {window_index}")
    output_path = os.path.join(visualization_dir, f'graph_window_{window_index}.png')
    plt.savefig(output_path)
    plt.close()

def plot_roc_curve(y_true, y_prob, title,result_dir):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {title}')
    plt.legend(loc='lower right')
    plt.savefig(f'{result_dir}/roc_curve_{title}.png')
    plt.show()

def train_model(model, train_loader, optimizer, criterion,title,result_dir, epochs=100):
    model.train()
    loss_values = []
    for epoch in range(epochs):
        total_loss = 0
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        loss_values.append(avg_loss)
        print(f'Epoch {epoch + 1}, Loss: {avg_loss}')
    
    # Plot training loss
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, epochs + 1), loss_values, label='Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title(f'Training Loss over Epochs {title}')
    plt.legend()
    plt.savefig(f'{result_dir}/traing_loss_over_epochs_{title}.png')
    plt.show()

def evaluate_model(model, test_loader, title,result_dir):
    model.eval()
    y_true = []
    y_pred = []
    y_prob = []
    for data in test_loader:
        out = model(data.x, data.edge_index)
        prob = out.softmax(dim=1)[:, 1]  # Get probability of class 1
        pred = out.argmax(dim=1)
        y_true.extend(data.y.tolist())
        y_pred.extend(pred.tolist())
        y_prob.extend(prob.tolist())
    
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred,output_dict=True)
    
    # Plot confusion matrix
    plt.figure(figsize=(10, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {title}')
    plt.savefig(f'{result_dir}/confusion_matrix_{title}.png')
    plt.show()
    
    # Plot ROC curve
    plot_roc_curve(y_true, y_prob, title,result_dir)
    
    return cm, report

# Save the model
def save_model(model, path):
    torch.save(model.state_dict(), path)



In [3]:
def load_saved_graphs(graph_dir):
    pyg_data_list = []
    for graph_file in sorted(os.listdir(graph_dir)):
        if graph_file.endswith('.graphml'):
            graph_path = os.path.join(graph_dir, graph_file)
            G = nx.read_graphml(graph_path)
            
            # Convert networkx graph to PyG data object
            pyg_data = from_networkx(G, group_node_attrs=['pagerank'])
            pyg_data.x = pyg_data.x.float()  # Ensure x is Float
            
            # Assuming labels are stored in the graph attributes
            labels = [G.nodes[node]['label'] for node in G.nodes]
            pyg_data.y = torch.tensor(labels, dtype=torch.long)
            
            pyg_data_list.append(pyg_data)
    
    return pyg_data_list

In [4]:

class GCNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = self.lin(x)
        return F.log_softmax(x, dim=-1)
    

def run_GCnn(train_loader,test_loader,title,result_dir):

    model = GCNN(in_channels=1, hidden_channels=128, out_channels=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    model_dir =  os.path.join(result_dir, "saved_models")
    title_dir = os.path.join(result_dir, title)
    classification_report_dir = os.path.join(result_dir, "classification_report")
    os.makedirs(title_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(classification_report_dir, exist_ok=True)

    train_model(model, train_loader, optimizer, criterion,title,title_dir)
    cm, report = evaluate_model(model, test_loader,title,title_dir)


    report_df = pd.DataFrame(report).transpose()
    time_str = str(time.time()).replace('.','_')
    report_df.to_csv(f'{classification_report_dir}/classification_report_{title}_{time_str}.csv', index=True)
    

    save_model(model, os.path.join(model_dir, f'graph_{title}_model.pth'))
    return cm,report,report['accuracy']

In [5]:
# class LSTMAggregator(torch.nn.Module):
#     def __init__(self, in_channels, out_channels):
#         super(LSTMAggregator, self).__init__()
#         self.lstm = LSTM(in_channels, out_channels, batch_first=True)

#     def forward(self, x, edge_index):
#         # Assuming x is of shape [num_nodes, num_features]
#         # Reshape x to [num_nodes, 1, num_features] for LSTM
#         x = x.unsqueeze(1)
#         x, (hn, cn) = self.lstm(x)
#         # Flatten the output to [num_nodes, out_channels]
#         x = x.squeeze(1)
#         return x
    


# class EGraphSAGE_with_LSTM(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels):
#         super(EGraphSAGE_with_LSTM, self).__init__()
#         self.lstm_agg = LSTMAggregator(in_channels, hidden_channels)
#         self.conv1 = SAGEConv(in_channels, hidden_channels, aggr=self.lstm_agg)
#         self.conv2 = SAGEConv(hidden_channels, hidden_channels, aggr=self.lstm_agg)
#         self.lin = Linear(hidden_channels, out_channels)

#     def forward(self, x, edge_index):
#         x = self.conv1(x, edge_index)
#         x = F.relu(x)
#         x = F.dropout(x, p=0.5, training=self.training)
#         x = self.conv2(x, edge_index)
#         x = self.lin(x)
#         return F.log_softmax(x, dim=-1)
    


# def run_SageConv_lstm(train_loader,test_loader,title,result_dir):

#     model = EGraphSAGE_with_LSTM(in_channels=1, hidden_channels=128, out_channels=2)
#     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#     criterion = torch.nn.CrossEntropyLoss()

#     model_dir =  os.path.join(result_dir, "saved_models")
#     title_dir = os.path.join(result_dir, title)
#     classification_report_dir = os.path.join(result_dir, "classification_report")
#     os.makedirs(title_dir, exist_ok=True)
#     os.makedirs(model_dir, exist_ok=True)
#     os.makedirs(classification_report_dir, exist_ok=True)

#     train_model(model, train_loader, optimizer, criterion,title,title_dir)
#     cm, report = evaluate_model(model, test_loader,title,title_dir)


#     report_df = pd.DataFrame(report).transpose()
#     time_str = str(time.time()).replace('.','_')
#     report_df.to_csv(f'{classification_report_dir}/classification_report_{title}_{time_str}.csv', index=True)
    

#     save_model(model, os.path.join(model_dir, f'graph_{title}_model.pth'))
#     return cm,report,report['accuracy']

In [6]:
import torch
from torch.nn import LSTM, Linear
from torch_geometric.nn import SAGEConv
from torch_geometric.data import DataLoader

class LSTMAggregator(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(LSTMAggregator, self).__init__()
        self.lstm = LSTM(in_channels, out_channels, batch_first=True)

    def forward(self, x, edge_index):
        # Implement the LSTM aggregation logic here
        return x  # Placeholder implementation

class EGraphSAGE_with_LSTM(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(EGraphSAGE_with_LSTM, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels, aggr='mean')  # Specify aggregation method here
        self.lstm_agg = LSTMAggregator(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels, aggr='mean')
        self.linear = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.lstm_agg(x, edge_index)
        x = self.conv2(x, edge_index)
        return self.linear(x)
    

def run_SageConv_lstm(train_loader,test_loader,title,result_dir):

    model = EGraphSAGE_with_LSTM(in_channels=1, hidden_channels=128, out_channels=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    model_dir =  os.path.join(result_dir, "saved_models")
    title_dir = os.path.join(result_dir, title)
    classification_report_dir = os.path.join(result_dir, "classification_report")
    os.makedirs(title_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(classification_report_dir, exist_ok=True)

    train_model(model, train_loader, optimizer, criterion,title,title_dir)
    cm, report = evaluate_model(model, test_loader,title,title_dir)


    report_df = pd.DataFrame(report).transpose()
    time_str = str(time.time()).replace('.','_')
    report_df.to_csv(f'{classification_report_dir}/classification_report_{title}_{time_str}.csv', index=True)
    

    save_model(model, os.path.join(model_dir, f'graph_{title}_model.pth'))
    return cm,report,report['accuracy']

In [7]:
# new_execution_flag = input("Do u want new execution")
# new_execution_flag
def assign_value():
    user_input = input("Please enter 'yes' or 'no': ").strip().lower()
    
    if user_input == '1':
        return 1
    elif user_input == '0':
        return 0
    else:
        print("Invalid input. Please enter '1' or '0'.")
        return assign_value()
    
# assigned_value = assign_value()

In [8]:
new_execution_flag = assign_value()
if new_execution_flag == 1:
    current_time = time.localtime()
    folder_name = time.strftime("%Y-%m-%d_%H-%M-%S", current_time)
else:
    # folder_name = input("Input folder name from all_execution_data")
    folder_name = '2024-07-10_15-26-40'
    

In [9]:
folder_name


'2024-07-10_15-26-40'

In [10]:
def preprocess_data(data,output_dir,visualization_dir):
    print("inside preprocess")

data_clean_folder = "stage3_data_cleaning/v2"
folder_path = data_clean_folder
xlsx_files = glob.glob(os.path.join(folder_path, '*.xlsx'))
xlsx_files

['stage3_data_cleaning/v2\\type10_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type1_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type2_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type3_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type4_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type5_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type6_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type7_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type8_label_merged_final_decoded_clean3.xlsx',
 'stage3_data_cleaning/v2\\type9_label_merged_final_decoded_clean3.xlsx']

In [11]:
result_dict = {}
for file_path in xlsx_files:
    

    if new_execution_flag == 1:
        data = pd.read_excel(file_path)
        data['can_id'] = data['can_id'].astype(str)
    
    label_key = os.path.basename(file_path).split('.')[0][0:12]
    print(label_key)

    output_dir = f"all_execution_data/{folder_name}/{label_key}/graphs"
    visualization_dir = os.path.join(f"all_execution_data/{folder_name}/{label_key}", "visualizations")
    result_dir =  os.path.join(f"all_execution_data/{folder_name}/{label_key}", "results")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(visualization_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)
    
    if new_execution_flag == 1:
        pyg_data_list = preprocess_data(data,output_dir,visualization_dir)
    else:
        pyg_data_list = load_saved_graphs(output_dir)

    print(len(pyg_data_list))
    # break
    train_size = int(0.7 * len(pyg_data_list))
    train_data = pyg_data_list[:train_size]
    test_data = pyg_data_list[train_size:]

    train_loader = DataLoader(train_data, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

    # _,_,acc = run_SageConv(train_loader,test_loader,'SageConv',result_dir)
    # _,_,gat_acc = run_GAT(train_loader,test_loader,'GAT',result_dir)
    # _,_,transformer_acc = run_GTransformer(train_loader,test_loader,'Transformer',result_dir)
    _,_,sageConv_lstm_acc =run_SageConv_lstm(train_loader,test_loader,'sageConv_lstm',result_dir)
    # _,_,gcnn_acc =run_GCnn(train_loader,test_loader,'gcnn__graph_loaded',result_dir)

    result_dict[label_key] = {
        # 'GSageConv': acc,
        # 'GAT': gat_acc,
        # 'GTransformer': transformer_acc,
        'GSage_conv_lstm': sageConv_lstm_acc,
        # 'GCNN': gcnn_acc
    }

    # print(result_dict)

type10_label
2074




RuntimeError: mat1 and mat2 shapes cannot be multiplied (108x2 and 128x2)

In [None]:
print(file_path)
label_key = os.path.basename(file_path).split('.')[0][0:12]
print(label_key)

output_dir = f"all_execution_data/{folder_name}/{label_key}/graphs"
visualization_dir = os.path.join(f"all_execution_data/{folder_name}/{label_key}", "visualizations")
result_dir =  os.path.join(f"all_execution_data/{folder_name}/{label_key}", "results")

pyg_data_list = load_saved_graphs(output_dir)
print(len(pyg_data_list))

train_size = int(0.7 * len(pyg_data_list))
train_data = pyg_data_list[:train_size]
test_data = pyg_data_list[train_size:]

train_loader = DataLoader(train_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)



In [None]:

    
model = EGraphSAGE_with_LSTM(in_channels=1, hidden_channels=128, out_channels=2)


In [None]:

# Define a sample train_loader and test_loader
# train_data = [Data(x=torch.randn(10, 1), edge_index=torch.tensor([[0, 1], [1, 0]]), y=torch.tensor([0, 1]))]
# test_data = [Data(x=torch.randn(10, 1), edge_index=torch.tensor([[0, 1], [1, 0]]), y=torch.tensor([0, 1]))]

# train_loader = DataLoader(train_data, batch_size=1, shuffle=False)
# test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

# Define the run_SageConv_lstm function
# def run_SageConv_lstm(train_loader, test_loader, title, result_dir):
#     model = EGraphSAGE_with_LSTM(in_channels=1, hidden_channels=128, out_channels=2)
#     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#     criterion = torch.nn.CrossEntropyLoss()

#     model.train()
#     for epoch in range(2):  # Training for 2 epochs for demonstration
#         for data in train_loader:
#             optimizer.zero_grad()
#             out = model(data.x, data.edge_index)
#             loss = criterion(out, data.y)
#             loss.backward()
#             optimizer.step()

#     model.eval()
#     correct = 0
#     for data in test_loader:
#         out = model(data.x, data.edge_index)
#         pred = out.argmax(dim=1)
#         correct += pred.eq(data.y).sum().item()
#     accuracy = correct / len(test_loader.dataset)

#     return model, optimizer, accuracy

# Run the function to ensure it works
# model, optimizer, accuracy = run_SageConv_lstm(train_loader, test_loader, 'sageConv_lstm', './')
# accuracy
