In [None]:
import pandas as pd
import numpy as np
import torch 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split
import glob, os, pickle
from matplotlib import pyplot as plt
from torch.utils.data import DataLoader
from torch import nn
from torchvision import transforms
from torch_geometric.data import Data, Dataset
import dgl

In [1]:
import torch.nn.functional as F 
from torch.nn import Linear, BatchNorm1d, ModuleList
from torch_geometric.nn import TransformerConv, TopKPooling, GATConv
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

NameError: name 'torch' is not defined

# Define the dataset class
- The focus here is the `get()` method; the graphs are already built, so no processing step is needed.
- `get()` also returns train/validation/test node masks for each graph, which are used during training and evaluation.

In [None]:
class WSI_Graph_Class(Dataset):
    """Dataset of pre-built WSI graphs listed in a metadata CSV.

    The CSV at ``root`` is read with pandas. Its ``path`` column gives the file
    that ``get`` loads with ``torch.load`` and its ``sample_id`` column is what
    ``processed_file_names`` returns. Each item is the loaded graph together
    with boolean train/validation/test node masks.

    The masks for a graph are computed once and cached, so every epoch (and
    both the training and the validation pass) sees the same split.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None, seed=42):
        """
        Args:
            root: path of the metadata CSV (must contain ``path`` and ``sample_id`` columns).
            transform, pre_transform, pre_filter: forwarded to the base ``Dataset``.
            seed: base seed for the node split; graph ``idx`` is split with
                ``seed + idx``. Pass ``None`` for an unseeded (but still cached) split.
        """
        # The base class receives root=None; the CSV path is kept in root_dir instead.
        super().__init__(None, transform, pre_transform, pre_filter)
        self.root_dir = root
        self.WSI_df = pd.read_csv(root) #get the WSI metadata
        self.seed = seed
        self.masks = {} #map graph idx -> cached (train, val, test) mask tensors

    def create_mask(self, nodes, train_mask, val_mask, test_mask, random_state=None):
        """Fill the three mask lists in place from a random split of ``nodes``.

        ``train_test_split`` is applied twice with its default 75/25 ratio:
        first nodes -> train / holdout, then holdout -> test / val. The result
        is roughly 75% train, 18.75% test and 6.25% validation.

        Args:
            nodes: sequence of hashable node ids; position ``i`` in each mask
                corresponds to ``nodes[i]``.
            train_mask, val_mask, test_mask: lists of ``False`` with the same
                length as ``nodes``; selected positions are set to ``True``.
            random_state: forwarded to both ``train_test_split`` calls.
        """
        train, holdout = train_test_split(nodes, random_state=random_state)
        test, val = train_test_split(holdout, random_state=random_state)

        # Sets give constant-time membership tests instead of scanning a list per node.
        train_ids, val_ids, test_ids = set(train), set(val), set(test)

        for position, node in enumerate(nodes):
            if node in train_ids:
                train_mask[position] = True
            if node in val_ids:
                val_mask[position] = True
            if node in test_ids:
                test_mask[position] = True

    #just pass here, we aren't going to return any raw file names
    def raw_file_names(self):
        pass

    #here we can return each of the WSI
    def processed_file_names(self):
        """Return the ``sample_id`` column of the metadata as a list."""
        return list(self.WSI_df["sample_id"])

    def len(self):
        """Number of graphs, i.e. number of rows in the metadata CSV."""
        return len(self.processed_file_names())

    def get(self, idx):
        """Load graph ``idx`` and return ``(data, train_mask, val_mask, test_mask)``.

        The masks are 1-D boolean tensors with one entry per row of ``data.x``.
        They are created on the first request for ``idx`` and reused afterwards;
        the cached tensors are returned as-is, so callers should not modify them in place.
        """
        path = self.WSI_df["path"].iloc[idx]
        # NOTE(review): torch.load unpickles the file, so only load graphs from trusted locations.
        data = torch.load(path)

        if idx not in self.masks:
            num_nodes = data.x.shape[0]
            nodes = list(range(num_nodes)) #node 0 is in 0th pos, 1 in 1, and so on
            train_mask = [False] * num_nodes
            val_mask = [False] * num_nodes
            test_mask = [False] * num_nodes

            random_state = None if self.seed is None else self.seed + int(idx)
            self.create_mask(nodes, train_mask, val_mask, test_mask, random_state=random_state)

            self.masks[idx] = (
                torch.tensor(train_mask, dtype=torch.bool),
                torch.tensor(val_mask, dtype=torch.bool),
                torch.tensor(test_mask, dtype=torch.bool),
            )

        train_mask, val_mask, test_mask = self.masks[idx]
        return (data, train_mask, val_mask, test_mask)

In [None]:
# Metadata CSV listing the graphs. Set the WSI_METADATA_CSV environment variable to
# point at a different location; the cluster path below is used when it is unset.
root = os.environ.get(
    "WSI_METADATA_CSV",
    "/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Students/users/Gokul_Srinivasan/SCC-Tumor-Detection/Gokul_files/graph_data/metadata.csv",
)

dataset = WSI_Graph_Class(root = root, transform = None, pre_transform = None, pre_filter = None)

In [None]:
# Smoke test: load the first graph together with its train/val/test masks.
dataset.get(0)

# Define Model 
- This mainly draws upon HIV project code 

In [None]:
# Seed PyTorch's RNG; this is the only statement in this cell that executes.
torch.manual_seed(42)

# NOTE(review): the GNN class below is entirely commented out, so this cell defines no model.
# It looks like an earlier variant that used TopKPooling — confirm it is no longer needed before deleting.
# class GNN(torch.nn.Module):
#     def __init__(self, feature_size):
#         super(GNN, self).__init__()
#         num_classes = 2
#         embedding_size = 2048 # from resnet  

#         #define the GNN layers 

#         #layer 1
#         #the first graph attention layer which will create 3*embed size embeddings for each node. This will also take care of all the message passing and aggregation
#         self.conv1 = GATConv(feature_size, embedding_size, heads=3, dropout = 0.3)
#         #reduce the dimensionality back
#         self.head_transform1 = Linear(embedding_size*3, embedding_size)
#         self.pool1 = TopKPooling(embedding_size, ratio=0.8)

#         #layer 2
#         self.conv2 = GATConv(embedding_size, embedding_size, heads=3, dropout = 0.3)
#         self.head_transform2 = Linear(embedding_size*3, embedding_size)
#         self.pool2 = TopKPooling(embedding_size, ratio=0.5)

#         #layer 3
#         self.conv3 = GATConv(embedding_size, embedding_size, heads=3, dropout = 0.3)
#         self.head_transform3 = Linear(embedding_size*3, embedding_size)
#         self.pool3 = TopKPooling(embedding_size, ratio=0.2)


#         #linear layers - these need to be modified to match the output size? Or maybe not
#         self.linear1 = Linear(embedding_size*2, embedding_size)
#         self.linear2 = Linear(embedding_size, 2)

#     def forward(self, x, edge_attr, edge_index, batch_index):
#         #block 1 
#         x = self.conv1(x, edge_index)
#         x = self.head_transform1(x)

#         x, edge_index, edge_attr, batch_index, _, _ = self.pool1(x, edge_index, None, batch_index)
#         #graph rep. 
#         x1 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
#         #block 2 
#         x = self.conv2(x, edge_index)
#         x = self.head_transform2(x)

#         x, edge_index, edge_attr, batch_index, _, _ = self.pool2(x, edge_index, None, batch_index)
#         #graph rep. 
#         x2 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
#         #block 3
#         x = self.conv3(x, edge_index)
#         x = self.head_transform3(x)

#         x, edge_index, edge_attr, batch_index, _, _ = self.pool3(x, edge_index, None, batch_index)
#         #graph rep. 
#         x3 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
#         #element wise addition , and each is 2048 
#         x = x1 + x2 + x3
#         #output block 
#         x = self.linear1(x).relu()
#         x = F.dropout(x, p=0.5)
#         x = self.linear2(x)

#         return x

In [None]:
torch.manual_seed(42)

class simple_GNN(torch.nn.Module):
    """Three stacked graph-attention blocks followed by a two-layer classifier.

    Each block is a multi-head ``GATConv`` whose concatenated head outputs
    (``heads * embedding_size`` wide) are projected back to ``embedding_size``
    by a ``Linear`` layer. No pooling is applied, so the output has one row of
    class logits per input node.

    Args:
        feature_size: width of the input node features.
        embedding_size: width of the node embeddings between blocks.
        num_classes: number of output classes (2: scc or normal).
        heads: number of attention heads in every ``GATConv``.
    """

    def __init__(self, feature_size, embedding_size=2048, num_classes=2, heads=3):
        super(simple_GNN, self).__init__()

        self.num_classes = num_classes #scc or normal
        self.embedding_size = embedding_size # this is what we want the embedding to be
        self.heads = heads

        #layer 1
        #graph attention layer producing heads*embedding_size features per node; it performs the message passing and aggregation
        self.conv1 = GATConv(feature_size, self.embedding_size, heads=self.heads, dropout = 0.3)
        #reduce the dimensionality back
        self.head_transform1 = Linear(self.embedding_size*self.heads, self.embedding_size)

        #layer 2
        self.conv2 = GATConv(self.embedding_size, self.embedding_size, heads=self.heads, dropout = 0.3)
        self.head_transform2 = Linear(self.embedding_size*self.heads, self.embedding_size)

        #layer 3
        self.conv3 = GATConv(self.embedding_size, self.embedding_size, heads=self.heads, dropout = 0.3)
        self.head_transform3 = Linear(self.embedding_size*self.heads, self.embedding_size)

        #output block
        self.linear1 = Linear(self.embedding_size, self.embedding_size)
        self.linear2 = Linear(self.embedding_size, self.num_classes) #prediction for each class

    def forward(self, data):
        """Return per-node class logits for the graph ``data``.

        Reads ``data.x`` and ``data.edge_index`` and moves both to the device
        the model's parameters live on before running the layers.
        """
        # Follow the model's own device rather than a notebook-level global.
        target_device = next(self.parameters()).device
        x = data.x.to(target_device)
        edge_index = data.edge_index.to(target_device)

        #block 1
        x = self.conv1(x, edge_index) #this does all the aggregation and message passing
        x = self.head_transform1(x)

        #block 2
        x = self.conv2(x, edge_index)
        x = self.head_transform2(x)

        #block 3
        x = self.conv3(x, edge_index)
        x = self.head_transform3(x)

        #output block
        x = self.linear1(x).relu()
        # F.dropout defaults to training=True; tie it to the module mode so model.eval() disables it.
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.linear2(x)

        return x

# Model, Optimizer, and Dataloader Setup

In [None]:
# Width of the input node features; build the network and place it on the selected device in one step.
num_features = 2048
model = simple_GNN(feature_size=num_features).to(device)

In [None]:
# Optimisation setup: Adam over every model parameter, cross-entropy as the objective.
import torch.optim as optim

opt = optim.Adam(params=model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
#prepare training
# PyG 2.x moved the graph-aware loader to torch_geometric.loader; the old
# torch_geometric.data location is deprecated, so it is only used as a fallback.
# Either import replaces the torch.utils.data DataLoader name imported at the top.
try:
    from torch_geometric.loader import DataLoader
except ImportError:
    from torch_geometric.data import DataLoader

# One graph per batch.
data_loader = DataLoader(dataset, batch_size=1)
num_epochs = 10

# Actual Model Training 

In [None]:
# Each batch from data_loader is (graph, train_mask, val_mask, test_mask). With
# batch_size=1 every mask arrives with a leading batch dimension, so it is
# flattened to one entry per node before being used to index the logits.
# NOTE(review): the same loader feeds both passes, and the masks come from the dataset
# on every fetch — make sure they are stable between fetches, otherwise validation
# nodes may already have been trained on.
for epoch in range(num_epochs):
    #training portion
    epoch_loss = []
    model.train()
    for data in tqdm(data_loader):
        #get graph
        graph = data[0]
        y = graph.y.to(device)
        #get masks (kept on the same device as the logits and labels)
        train_mask = data[1].reshape(-1).to(device)

        #get predictions
        logits = model(graph)

        loss = loss_fn(logits[train_mask], y[train_mask])
        epoch_loss.append(loss.item())

        opt.zero_grad()
        loss.backward()
        opt.step()
    #now find the average training loss for this epoch (nan if the loader was empty)
    epoch_loss = sum(epoch_loss)/len(epoch_loss) if epoch_loss else float("nan")
    print("Epoch :%d. Epoch loss: %f" %(epoch, epoch_loss))

    #validation portion
    validation_correct = 0
    validation_total = 0
    model.eval()
    with torch.no_grad():
        for data in tqdm(data_loader):
            #get graph
            graph = data[0]
            y = graph.y.to(device)
            #get masks
            val_mask = data[2].reshape(-1).to(device)

            #get predictions
            logits = model(graph)
            # torch.max(logits, dim=1) returns a (values, indices) pair with no .detach();
            # argmax gives the predicted class index per node directly.
            predictions = logits.argmax(dim=1)

            validation_correct += (predictions[val_mask] == y[val_mask]).sum().item()
            validation_total += val_mask.sum().item()

    # Avoid dividing by zero when no node was selected for validation.
    validation_accuracy = validation_correct/validation_total if validation_total else float("nan")
    print("Epoch :%d. Validation accuracy: %f" %(epoch, validation_accuracy))

In [None]:
!nvidia-smi