# How to represent a graph

In [None]:
import torch 
from torch_geometric.data import Data

A graph is used to model pairwise relations (edges) between objects (nodes). A single graph in PyG is described by an instance of torch_geometric.data.Data, which holds the following attributes by default:

data.x: Node feature matrix with shape [num_nodes, num_node_features]

data.edge_index: Graph connectivity in COO format with shape [2, num_edges] and type torch.long

data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]

data.y: Target to train against (may have arbitrary shape), e.g., node-level targets of shape [num_nodes, *] or graph-level targets of shape [1, *]

data.pos: Node position matrix with shape [num_nodes, num_dimensions]



None of these attributes are required. In fact, the Data object is not even restricted to these attributes. 

In [None]:
# COO connectivity: the first row lists the source node of each edge,
# the second row lists the matching target node.
edge_index = torch.tensor(
    [
        [0, 1, 1, 2],
        [1, 0, 2, 1],
    ],
    dtype=torch.long,
)

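Note that this is not a list of index tuples, but a pair of vectors: the first holds the source node of every edge and the second holds the matching target node. The same graph can also be defined in a more intuitive way from a list of (source, target) pairs (mind the .t().contiguous() call at the end, which makes sure we have the right shape and memory layout).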

In [None]:
# Each inner pair is one (source, target) edge. Transposing turns the
# [num_edges, 2] listing into the [2, num_edges] layout, and contiguous()
# materialises that layout in memory.
edge_index = (
    torch.tensor(
        [[0, 1], [1, 0], [1, 2], [2, 1]],
        dtype=torch.long,
    )
    .t()
    .contiguous()
)

Apparently, edges are directed by default; the graph becomes undirected when every edge also has its reverse edge (presumably with the same weight).

In [None]:
# Node feature matrix: one row per node, one scalar feature per row.
x = torch.tensor(
    [[-1], [0], [1]],
    dtype=torch.float,
)

In [None]:
# Bundle the node features and the connectivity into a single graph object.
data = Data(
    x=x,
    edge_index=edge_index,
)

In [None]:
data

Check that the data object is valid (with raise_on_error=True, validate() raises an error if it is not).

In [None]:
data.validate(raise_on_error=True)

utility functions

In [None]:
data.keys()

In [None]:
data['x']

In [None]:
for key, item in data: # mind that there is no `data.items()` call here
    print(f"{key}: {item}")

In [None]:
'edge_attr' in data

In [None]:
data.num_nodes 

In [None]:
data.num_edges

In [None]:
data.num_node_features

In [None]:
data.has_isolated_nodes()

In [None]:
data.has_self_loops()

In [None]:
data.is_directed()


put stuff onto gpu

In [None]:
# Fall back to the CPU when no CUDA device is present so that the cells
# below still run on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
data = data.to(device)

In [None]:
data

# Common benchmark datasets

In [None]:
from torch_geometric.datasets import TUDataset

In [None]:
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')

In [None]:
dataset

In [None]:
len(dataset)

In [None]:
dataset.num_classes

In [None]:
dataset.num_node_features

extract single graph in the dataset

In [None]:
data = dataset[0]

In [None]:
data

In [None]:
data.is_undirected()

In [None]:
train_dataset = dataset[:540]
test_dataset = dataset[540:]

In [None]:
train_dataset

In [None]:
test_dataset

In [None]:
dataset = dataset.shuffle()

Calling `dataset.shuffle()` is equivalent to permuting the dataset with a random index permutation from torch:

In [None]:
perm = torch.randperm(len(dataset))
dataset = dataset[perm]

In [None]:
from torch_geometric.datasets import Planetoid

In [None]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')


In [None]:
len(dataset)

In [None]:
dataset.num_classes

In [None]:
dataset.num_node_features

In [None]:
# Take the first graph of the dataset.
data = dataset[0]

In [None]:
data

In [None]:
data.is_undirected()

In [None]:
data.train_mask.sum().item()

In [None]:
data.val_mask.sum().item()

In [None]:
data.test_mask.sum().item()

This dataset uses a transductive approach and masks parts of a graph instead of providing millions of small graphs. It is a node-level dataset: the node-level masks tell us which nodes we train on and which nodes we validate and test on.

# Mini-batches

Parallelization over a mini-batch is achieved by creating a sparse block-diagonal adjacency matrix and concatenating the feature and target matrices in the node dimension. This is where graphneuralnetworks gets this idea from. It can be alright, but I am not sure whether this won't become a problem again.

In [None]:
from torch_geometric.datasets import TUDataset 
from torch_geometric.loader import DataLoader

In [None]:
dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)

In [None]:
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
for batch in loader: 
    print(batch)
    print(batch.num_graphs)

`batch` is an indicator vector that tells to which graph in the mini-batch each node belongs.

In [None]:
next(iter(loader)).batch

 this can, e.g., be used to average node features on each graph individually

In [None]:
from torch_geometric.utils import scatter

In [None]:
# Average the node features of every graph in each mini-batch.
# NOTE: this loop rebinds the notebook-level names `data` and `x`.
for data in loader: 
    # data.batch is used as the grouping index, so the mean-scatter along
    # dim 0 produces one averaged feature row per group (graph).
    x = scatter(data.x, data.batch, dim=0, reduce='mean')
    print(x.shape)

# Data transforms

Skipping the tutorial's original example here because the URL isn't working and everything is hard-coded.
See here: https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html#data-transforms

In [None]:
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

In [None]:
# Load Cora with a pre_transform (feature normalisation) and a transform
# (random jitter).
# NOTE(review): force_reload=True presumably forces the dataset to be
# re-processed so that the pre_transform takes effect — confirm.
# NOTE(review): RandomJitter perturbs node positions (data.pos); Cora does
# not appear to provide positions, so accessing a sample may fail — confirm,
# or swap in a transform that fits this dataset.
dataset = Planetoid(root='/tmp/Cora', name='Cora', force_reload=True,
                    pre_transform=T.NormalizeFeatures(),
                    transform=T.RandomJitter(0.01))

# TODO: learn how to use your own data

# Learning methods on graphs

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import Sequential


The code below is a somewhat modified version of the tutorial code here: 
https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html#learning-methods-on-graphs

In [None]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    The forward pass is conv1 -> ReLU -> dropout -> conv2 -> log-softmax,
    so the output can be fed directly into ``F.nll_loss``.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Number of channels produced by the first convolution.
        output_dim: Number of output channels (classes) per node.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        # Real modules instead of lambdas: they follow train()/eval() on
        # their own, show up in the module repr, and keep the model
        # picklable. p=0.5 matches the default of F.dropout.
        self.dropoutlayer = torch.nn.Dropout(p=0.5)
        self.layers = Sequential(
            'x, edge_index',
            [
                (self.conv1, 'x, edge_index -> x'),
                torch.nn.ReLU(),
                self.dropoutlayer,
                (self.conv2, 'x, edge_index -> x'),
                torch.nn.LogSoftmax(dim=1),
            ],
        )

    def forward(self, data):
        """Return the per-node log-probabilities for the graph in ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``
                (graph connectivity).
        """
        x, edge_index = data.x, data.edge_index
        return self.layers(x, edge_index)

 

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

In [None]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [None]:
model = GCN(dataset.num_node_features, 16, dataset.num_classes).to(device)

In [None]:
data = dataset[0].to(device)


In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

train mode

In [None]:
model.train()

In [None]:
# Full-batch training: every epoch runs the whole graph through the model.
for epoch in range(200):
    # Reset the gradients before computing new ones.
    optimizer.zero_grad()
    out = model(data)
    # Only the nodes selected by train_mask contribute to the loss.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    # Report the loss every tenth epoch.
    if epoch % 10 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}')

eval mode

In [None]:
model.eval()


In [None]:
# Inference only, so skip the autograd bookkeeping. The model is a
# classifier: argmax over dim=1 picks the highest-scoring class per node.
with torch.no_grad():
    pred = model(data).argmax(dim=1)


In [None]:
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()

In [None]:
acc = int(correct) / int(data.test_mask.sum())


In [None]:
print(f'Accuracy: {acc:.4f}')