In [1]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import torch
import torch_geometric

from src.graphs import Graph
from src.utils import batcher

# GATNET

In [223]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GATConv, SGConv

class GATNet(torch.nn.Module):
    def __init__(self, data, heads_layer1, 
               heads_layer2, dropout, dropout_alphas, num_classes=None):
        super().__init__()

        self.dropout = dropout
        num_features = data.num_features
        if num_classes == None:
            num_classes = 2  # hardcoded for now

        self.conv1 = GATConv(in_channels=num_features, out_channels=8,
                             heads=heads_layer1, concat=True, negative_slope=0.2, 
                             dropout=dropout_alphas)

        self.conv2 = GATConv(in_channels=8*heads_layer1, out_channels=num_classes, 
                             heads=heads_layer2, concat=False, negative_slope=0.2,
                             dropout=dropout_alphas)

    def forward(self, data):
        x=data.x
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv1(x, data.edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, data.edge_index)
      
        return F.log_softmax(x, dim=1)

    
class SGNet(torch.nn.Module):
    def __init__(self, data, K=1, num_classes=None):
        super().__init__()
        if num_classes == None:
            num_classes = 2
        self.conv = SGConv(in_channels=data.num_features, out_channels=num_classes, K=K, cached=False)
    
    def forward(self, data):
        x = self.conv(data.x, data.edge_index)
        
        return F.log_softmax(x, dim=1)
    

In [259]:
def train(model, data, optimizer, mask):
    """
    Single iteration of training
    """
    # set training mode to True (enabling dropout, etc)
    model.train()
    
#     # make sure format of weights is correct
#     model.double()
    
    # reset gradients
    optimizer.zero_grad()
    
    # get output of model, which is log-probability (log of softmax)
    # note mask is not applied because message passing needs all nodes
    log_softmax = model(data)
    
    labels = data.y # labels of each node
    
    # apply training mask
    nll_loss = F.nll_loss(log_softmax[mask], labels[mask])
    
    # backprop- compute gradients
    nll_loss.backward()
    
    # backprop- update parameters
    optimizer.step()
    

def compute_accuracy(model, data, mask):
    # set eval mode to True (disable dropout, etc)
    model.eval()
    
    #model.double()
    
    # get output of model
    log_softmax = model(data)
    
    # get index of max value from softmax, equivalent to y pred
    yp = log_softmax[mask].argmax(dim=1) 
    
    
    
    return yp == data.y[mask]

# run without gradient (faster)
@torch.no_grad() 
def test(model, data):
    return compute_accuracy(model, data, data.mask)

In [4]:
def detect_agg(g):
    if g.graph_attr['candidate_growth_ratio'] > 10 and g.graph_attr['candidate_rgr'] > 2.5:
        return True
    else:
        return False

In [5]:
data = Path('..','data','candidate-grains-processed')
data.exists()

False

In [7]:
json_paths = list(sorted(data.glob('*'))[-1].glob('*.json'))

In [287]:
def normalize_features(d, mask=None):
    if mask == None:
        mask = np.ones(len(d.x), np.bool)
    d.x = (d.x - d.x[mask].mean(dim=0))/d.x[mask].std(dim=0)
    d.edge_attr = (d.edge_attr - d.edge_attr.mean(dim=0)/d.edge_attr.std(dim=0))

In [288]:
graphs = [Graph.from_json(x) for x in json_paths[:200]]
datasets = [g.to_pyg_dataset() for g in graphs]
for g, d in zip(graphs, datasets):
    y = np.zeros(len(g.nodes), np.int)
    y[d.mask] = int(detect_agg(g))
    d.y = torch.tensor(y, dtype=torch.long)
    normalize_features(d)

In [289]:
batch = torch_geometric.data.Batch().from_data_list(datasets)

In [291]:
gat = GATNet(datasets[0], 4, 4, 0.5, 0.5)
gat.double()


#optimizer = torch.optim.Adam(gat.parameters(), lr=0.005, weight_decay=1e-3)
optimizer = torch.optim.Adam(gat.parameters(), lr=0.005, weight_decay=5e-4)

log = 'Epoch: {:03d}, Train: {:.4f}, Loss: {:.4f}'#', Val: {:.4f}'
for epoch in range(1, 51):
    train(gat, batch, optimizer, batch.mask)
    #for d in datasets:
    #    train(gat, d, optimizer, d.mask)
    if epoch % 5 == 0:
        tests = [test(gat, d) for d in datasets]
        losses = [F.nll_loss(gat(d)[d.mask], d.y[d.mask]).detach().numpy() for d in datasets]
        
        print(log.format(epoch, np.mean(tests), np.mean(losses)), )
        

Epoch: 005, Train: 0.4100, Loss: 0.7171
Epoch: 010, Train: 0.4500, Loss: 0.7032
Epoch: 015, Train: 0.5800, Loss: 0.6858
Epoch: 020, Train: 0.6150, Loss: 0.6785
Epoch: 025, Train: 0.6100, Loss: 0.6731
Epoch: 030, Train: 0.6200, Loss: 0.6703
Epoch: 035, Train: 0.6300, Loss: 0.6685
Epoch: 040, Train: 0.6300, Loss: 0.6669
Epoch: 045, Train: 0.6300, Loss: 0.6653
Epoch: 050, Train: 0.6300, Loss: 0.6633


In [12]:
runs_all = [list(x.glob('*.json')) for x in data.glob('*') if x.is_dir() and len(list(x.glob('*.json'))) > 500] 
temp = []
[temp.extend(r) for r in runs_all]
runs_all = sorted(temp)
rs = np.random.RandomState(seed=3346665170)
rs.shuffle(runs_all)
from multiprocessing import get_context, Pool
def load_wrapper(x):
    from src.graphs import Graph
    g = Graph.from_json(x)
    d = g.to_pyg_dataset()
    y = np.zeros(len(g.nodes), np.int)
    y[d.mask] = int(detect_agg(g))
    d.y = torch.tensor(y, dtype=torch.long)
    return d

#with Pool(processes=8) as p:
#    datasets_large = p.map(load_wrapper, runs_all[:1000])

datasets_large = list(map(load_wrapper, runs_all[:1000]))

In [292]:
batches = batcher(datasets_large, batch_size=100, min_size=30)
batches = [torch_geometric.data.Batch().from_data_list(b) for b in batches]
for b in batches:
    normalize_features(b, b.mask)
    

In [293]:
def loss_per_node(model, batches):
    # total number of test nodes per batch
    n_test_nodes = torch.tensor([b.mask.sum() for b in batches])
    # average loss per test node per batch
    avg_loss_batch = torch.tensor([F.nll_loss(model(b)[b.mask], b.y[b.mask]).detach() for b in batches])
    
    # total loss
    total_loss = (n_test_nodes * avg_loss_batch).sum()
    
    # avg loss per node
    avg_loss = total_loss / n_test_nodes.sum()
    return avg_loss

def mean_acc(model, batches):
    # tensor of predictions on test nodes in each batch, concatenated into single array
    predictions = torch.cat([test(model, b) for b in batches], dim=0)
    acc = predictions.sum()/len(predictions) # total number correct (True) vs total number (all)
    return acc
    

In [294]:
gat = GATNet(datasets[0], 4, 4, 0.5, 0.5)
gat.double()
sgn = SGNet(datasets[0], K=2)
sgn.double()
model = sgn

model(datasets[0]).argmax(1)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [295]:
# TRAINING DATA NOT NORMALIZED IN BATCHES
# TODO FIGURE OUT NORMALIZATION (pre or post mask????)

#optimizer = torch.optim.Adam(gat.parameters(), lr=0.005, weight_decay=1e-3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

log = 'Epoch: {:03d}, Train: {:.4f}, Loss: {:.4f}'#', Val: {:.4f}'
for epoch in range(1, 201):
    for batch in batches:
        train(model, batch, optimizer, batch.mask)
    if epoch % 5 == 0:
        
        accs = mean_acc(model, batches)
        losses = loss_per_node(model, batches)
        print(log.format(epoch, accs, losses))
        

Epoch: 005, Train: 0.5300, Loss: nan
Epoch: 010, Train: 0.5300, Loss: nan
Epoch: 015, Train: 0.5300, Loss: nan


KeyboardInterrupt: 

In [212]:
[model(b)[b.mask].argmax(1) for b in batches]

[tensor([0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
         1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
         0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
         0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
         0, 0, 0, 1]),
 tensor([0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
         1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         1, 1, 0, 0]),
 tensor([0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
         0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
         0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 

In [211]:
np.unique(model(datasets[0]).argmax(1), return_counts=True)

(array([0, 1]), array([292, 149]))

In [249]:
cora = Planetoid(root = './tmp', name='Cora', transform=T.NormalizeFeatures())

In [280]:
cdata = cora[0]
cdata.mask = cdata.train_mask
model_cora = GATNet(cdata, 4, 4, 0.5, 0.5, num_classes=len(cdata.y.unique()))

model_cora.float()
optimizer = torch.optim.Adam(model_cora.parameters(), lr=0.003, weight_decay=5e-4)

In [281]:
log = 'Epoch: {:03d}, Train: {:.4f}, Loss: {:.4f}'#', Val: {:.4f}'

for epoch in range(0, 201):
    train(model_cora, cdata, optimizer, cdata.train_mask)
    if epoch % 20  == 0:
        accs = mean_acc(model_cora, [cdata,])
        losses = loss_per_node(model_cora, [cdata,])
        print(log.format(epoch, accs, losses))


Epoch: 000, Train: 0.1500, Loss: 1.9448
Epoch: 020, Train: 0.9286, Loss: 1.9149
Epoch: 040, Train: 0.9214, Loss: 1.8505
Epoch: 060, Train: 0.9357, Loss: 1.7319
Epoch: 080, Train: 0.9286, Loss: 1.5594
Epoch: 100, Train: 0.9500, Loss: 1.3485
Epoch: 120, Train: 0.9643, Loss: 1.1335
Epoch: 140, Train: 0.9643, Loss: 0.9461
Epoch: 160, Train: 0.9786, Loss: 0.8035
Epoch: 180, Train: 0.9786, Loss: 0.7017
Epoch: 200, Train: 0.9857, Loss: 0.6321
