In [1]:
import torch
import torch_geometric as pyg

print(torch.backends.mps.is_available())

device = "mps" if torch.backends.mps.is_available() else "cpu"


True


We show a simple example of an unweighted and undirected graph with three nodes and four edges. Each node contains exactly one feature:


In [2]:
from torch_geometric.data import Data

x = torch.tensor([[-1], [0], [1]], dtype=torch.float)
#                           src             tgt
edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=torch.long)
data = pyg.data.Data(x=x, edge_index=edge_index)
# OR
#                          src,tgt
edge_index = torch.tensor([[0, 1], [1, 0], [1, 2], [2, 1]], dtype=torch.long)


data: Data = Data(x=x, edge_index=edge_index.t().contiguous())
data.validate(raise_on_error=True)
print(data.keys)
print(data["x"])
print(data.num_nodes)
print(data.num_edges)
print(data.has_isolated_nodes())
print(data.has_self_loops())
data


['edge_index', 'x']
tensor([[-1.],
        [ 0.],
        [ 1.]])
3
4
False
False


Data(x=[3, 1], edge_index=[2, 4])

In [3]:
from torch_geometric.datasets import TUDataset, Planetoid

dataset = TUDataset(root="./data/ENZYMES", name="ENZYMES")
# dataset = Planetoid(root="data/Cora", name="Cora")

# dataset = dataset.shuffle()
print(len(dataset))
print(dataset.num_classes)
print(dataset.num_node_features)
data = dataset[0]
print(data.is_directed())
print(data.keys)
data


600
6
3
False
['edge_index', 'x', 'y']


Data(edge_index=[2, 168], x=[37, 3], y=[1])

In [4]:
train_dataset = dataset[:540]

test_dataset = dataset[540:]
train_dataset


ENZYMES(540)

## Mini batches

https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html#mini-batches


In [5]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

dataset = TUDataset(root="./data/ENZYMES", name="ENZYMES", use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in loader:
    print(batch, batch.num_graphs)
    break


DataBatch(edge_index=[2, 3388], x=[901, 21], y=[32], batch=[901], ptr=[33]) 32


## Data Transforms


In [6]:
from torch_geometric.datasets import ShapeNet
import torch_geometric.transforms as T

dataset = ShapeNet(root="data/ShapeNet", categories=["Airplane"])
print(dataset[0])

dataset = ShapeNet(
    root="data/ShapeNet", categories=["Airplane"], pre_transform=T.KNNGraph()
)

dataset[0]


Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])




Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

## Learning methods on graph

:)


In [7]:
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root="data/Cora", name="Cora")
num_train = sum(dataset[0]['train_mask'])
print(num_train)
num_test = sum(dataset[0]['test_mask'])
print(num_test)
num_val = sum(dataset[0]['val_mask'])
print(num_val)
dataset[0]


tensor(140)
tensor(1000)
tensor(500)


Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [8]:
import torch
import torch.nn.functional as F
import torch_geometric.nn as gnn


class GCN(torch.nn.Module):
    def __init__(self, in_feats, h_feats, outputs) -> None:
        super().__init__()
        self.c1 = gnn.GCNConv(in_feats, h_feats)
        self.c2 = gnn.GCNConv(h_feats, outputs)

    def forward(self, data):
        x, edge_idx = data.x, data.edge_index

        h = self.c1(x, edge_idx)
        h = F.relu(h)
        h = F.dropout(h, training=self.training)
        h = self.c2(h, edge_idx)

        return F.log_softmax(h, dim=1)


In [9]:
dataset[0].y[dataset[0].train_mask]

tensor([3, 4, 4, 0, 3, 2, 0, 3, 3, 2, 0, 0, 4, 3, 3, 3, 2, 3, 1, 3, 5, 3, 4, 6,
        3, 3, 6, 3, 2, 4, 3, 6, 0, 4, 2, 0, 1, 5, 4, 4, 3, 6, 6, 4, 3, 3, 2, 5,
        3, 4, 5, 3, 0, 2, 1, 4, 6, 3, 2, 2, 0, 0, 0, 4, 2, 0, 4, 5, 2, 6, 5, 2,
        2, 2, 0, 4, 5, 6, 4, 0, 0, 0, 4, 2, 4, 1, 4, 6, 0, 4, 2, 4, 6, 6, 0, 0,
        6, 5, 0, 6, 0, 2, 1, 1, 1, 2, 6, 5, 6, 1, 2, 2, 1, 5, 5, 5, 6, 5, 6, 5,
        5, 1, 6, 6, 1, 5, 1, 6, 5, 5, 5, 1, 5, 1, 1, 1, 1, 1, 1, 1])

In [10]:
from torchmetrics import F1Score

print("device:", device)
model = GCN(dataset.num_node_features, 16, dataset.num_classes).to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()


device: mps


In [11]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f"Accuracy: {acc:.4f}")
metric = F1Score(task="multiclass", num_classes=dataset.num_classes)
target = data.y[data.test_mask].to("cpu")

f"f1: {metric(pred[data.test_mask].to('cpu'), target)}"


Accuracy: 0.7950


'f1: 0.7950000166893005'

## Exercises

In [12]:
"""
What does edge_index.t().contiguous() do?
turns [[src],[tgt]] into [[src,tgt]]
"""

# Load the "IMDB-BINARY" dataset from the TUDataset benchmark suite and randomly split it into 80%/10%/10% training, validation and test graphs.
dataset = TUDataset("data", name="IMDB-BINARY").shuffle()

tr = int(len(dataset) * 0.8)
te = tr + int(len(dataset) * 0.1)
val = tr + te + int(len(dataset) * 0.1)
train = dataset[:tr]
test = dataset[tr:te]
eval = dataset[te:]
print(len(dataset))
print(len(train) / len(dataset))
print(len(test) / len(dataset))
print(len(eval) / len(dataset))

""" What does each number of the following output mean?
DataBatch(batch=[1082], edge_index=[2, 4066], x=[1082, 21], y=[32])

num graphs, len of edges, num nodes x num feats, num labels
"""


1000
0.8
0.1
0.1


' What does each number of the following output mean?\nDataBatch(batch=[1082], edge_index=[2, 4066], x=[1082, 21], y=[32])\n\nnum graphs, len of edges, num nodes x num feats, num labels\n'