In [14]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path


import torch
import torch_geometric

from sklearn.metrics import confusion_matrix, classification_report
from src.graphs import Graph

from src.utils import batcher

# SGNet

In [2]:
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.nn import SGConv


class SGNet(torch.nn.Module):
    def __init__(self, data, K=1, num_classes=None):
        super().__init__()
        if num_classes == None:
            num_classes = 2
        self.conv = SGConv(in_channels=data.num_features, out_channels=num_classes, K=K, cached=False)
    
    def forward(self, data):
        x = self.conv(data.x, data.edge_index)
        
        return F.log_softmax(x, dim=1)
    

# Training and Testing Routines

In [3]:
def train(model, data, optimizer, mask):
    """
    Single iteration of training
    """
    # set training mode to True (enabling dropout, etc)
    model.train()
    
#     # make sure format of weights is correct
#     model.double()
    
    # reset gradients
    optimizer.zero_grad()
    
    # get output of model, which is log-probability (log of softmax)
    # note mask is not applied because message passing needs all nodes
    log_softmax = model(data)
    
    labels = data.y # labels of each node
    
    # apply training mask
    nll_loss = F.nll_loss(log_softmax[mask], labels[mask])
    
    # backprop- compute gradients
    nll_loss.backward()
    
    # backprop- update parameters
    optimizer.step()
    

def compute_accuracy(model, data, mask):
    # set eval mode to True (disable dropout, etc)
    model.eval()
    
    #model.double()
    
    # get output of model
    log_softmax = model(data)
    
    # get index of max value from softmax, equivalent to y pred
    yp = log_softmax[mask].argmax(dim=1) 
    
    
    
    return yp == data.y[mask]

# run without gradient (faster)
@torch.no_grad() 
def test(model, data):
    return compute_accuracy(model, data, data.mask)

# Determine label using criteria for AGG

In [4]:
def detect_agg(g):
    if g.graph_attr['candidate_growth_ratio'] > 10 and g.graph_attr['candidate_rgr'] > 2.5:
        return True
    else:
        return False

# Method to normalize features for neural network input

In [5]:
def normalize_features(d, mask=None):
    if mask == None:
        mask = np.ones(len(d.x), np.bool)
    d.x = (d.x - d.x[mask].mean(dim=0))/d.x[mask].std(dim=0)
    d.edge_attr = (d.edge_attr - d.edge_attr.mean(dim=0)/d.edge_attr.std(dim=0))

# Loading data

In [6]:
data = Path('..','data','candidate-grains-processed')
data.exists()

True

In [7]:
runs_all = [list(x.glob('*.json')) for x in data.glob('*') if x.is_dir() and len(list(x.glob('*.json'))) > 500] 
temp = []
[temp.extend(r) for r in runs_all]
runs_all = sorted(temp)
rs = np.random.RandomState(seed=3346665170)
rs.shuffle(runs_all)
from multiprocessing import get_context, Pool
def load_wrapper(x):
    from src.graphs import Graph
    g = Graph.from_json(x)
    d = g.to_pyg_dataset()
    y = np.zeros(len(g.nodes), np.int)
    y[d.mask] = int(detect_agg(g))
    d.y = torch.tensor(y, dtype=torch.long)
    return d

#with Pool(processes=8) as p:
#    datasets_large = p.map(load_wrapper, runs_all[:1000])

datasets_large = list(map(load_wrapper, runs_all[:1000]))

# Combine graphs into batches 
Combine individual runs into disconnected graphs containing sets of 100 runs.

In [8]:
batches = batcher(datasets_large, batch_size=100, min_size=30)
batches = [torch_geometric.data.Batch().from_data_list(b) for b in batches]
for b in batches:
    normalize_features(b)
    

# Metrics to be reported during training

In [9]:
def loss_per_node(model, batches):
    # total number of test nodes per batch
    n_test_nodes = torch.tensor([b.mask.sum() for b in batches])
    # average loss per test node per batch
    avg_loss_batch = torch.tensor([F.nll_loss(model(b)[b.mask], b.y[b.mask]).detach() for b in batches])
    
    # total loss
    total_loss = (n_test_nodes * avg_loss_batch).sum()
    
    # avg loss per node
    avg_loss = total_loss / n_test_nodes.sum()
    return avg_loss

def mean_acc(model, batches):
    # tensor of predictions on test nodes in each batch, concatenated into single array
    predictions = torch.cat([test(model, b) for b in batches], dim=0)
    acc = predictions.sum()/len(predictions) # total number correct (True) vs total number (all)
    return acc
    

# Create model

In [11]:
sgn = SGNet(batches[0], K=3)
sgn.double() # needed to prevent pytorch errors during training
model = sgn

# Train model

In [12]:
#optimizer = torch.optim.Adam(gat.parameters(), lr=0.005, weight_decay=1e-3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

log = 'Epoch: {:03d}, Train acc: {:.4f} Val acc: {:.4f}, Train Loss: {:.4f}'#', Val: {:.4f}'
for epoch in range(1, 201):
    for batch in batches[:-1]:
        train(model, batch, optimizer, batch.mask)
    if epoch % 5 == 0:
        
        tr_accs = mean_acc(model, batches[:-1])
        val_accs = mean_acc(model, batches[-1:])        
        losses = loss_per_node(model, batches)
        
        print(log.format(epoch, tr_accs, val_accs, losses))
        

Epoch: 005, Train acc: 0.4867 Val acc: 0.5200, Train Loss: 0.6990
Epoch: 010, Train acc: 0.4967 Val acc: 0.5700, Train Loss: 0.6923
Epoch: 015, Train acc: 0.5589 Val acc: 0.5600, Train Loss: 0.6864
Epoch: 020, Train acc: 0.5822 Val acc: 0.5800, Train Loss: 0.6811
Epoch: 025, Train acc: 0.5978 Val acc: 0.6200, Train Loss: 0.6765
Epoch: 030, Train acc: 0.5944 Val acc: 0.6000, Train Loss: 0.6724
Epoch: 035, Train acc: 0.6011 Val acc: 0.6100, Train Loss: 0.6687
Epoch: 040, Train acc: 0.6022 Val acc: 0.6000, Train Loss: 0.6656
Epoch: 045, Train acc: 0.5944 Val acc: 0.6000, Train Loss: 0.6628
Epoch: 050, Train acc: 0.6022 Val acc: 0.6200, Train Loss: 0.6604
Epoch: 055, Train acc: 0.5967 Val acc: 0.6300, Train Loss: 0.6583
Epoch: 060, Train acc: 0.6011 Val acc: 0.6300, Train Loss: 0.6564
Epoch: 065, Train acc: 0.6033 Val acc: 0.6300, Train Loss: 0.6548
Epoch: 070, Train acc: 0.6033 Val acc: 0.6200, Train Loss: 0.6535
Epoch: 075, Train acc: 0.6033 Val acc: 0.6200, Train Loss: 0.6523
Epoch: 080

# Consolidate predictions from all graphs and report confusion matrix

In [16]:
gt = []
pred = []
for b in batches[:-1]:
    gt.extend(b.y[b.mask].numpy().tolist())
    pred.extend(model(b)[b.mask].argmax(1).numpy().tolist())
cm_tr = confusion_matrix(gt, pred)

gtv = []
predv = []
for b in batches[-1:]:
    gtv.extend(b.y[b.mask].numpy().tolist())
    predv.extend(model(b)[b.mask].argmax(1).numpy().tolist())
cm_val = confusion_matrix(gtv, predv)

print('Train')
print(cm_tr)
print(classification_report(gt, pred))
print('Validation')
print(cm_val)
print(classification_report(gtv, predv))


Train
[[340 137]
 [210 213]]
              precision    recall  f1-score   support

           0       0.62      0.71      0.66       477
           1       0.61      0.50      0.55       423

    accuracy                           0.61       900
   macro avg       0.61      0.61      0.61       900
weighted avg       0.61      0.61      0.61       900

Validation
[[37 16]
 [21 26]]
              precision    recall  f1-score   support

           0       0.64      0.70      0.67        53
           1       0.62      0.55      0.58        47

    accuracy                           0.63       100
   macro avg       0.63      0.63      0.63       100
weighted avg       0.63      0.63      0.63       100

