In [None]:
import dgl
from dgl.data import FraudDataset

# Load the Yelp variant of DGL's fraud dataset and take its first graph.
dataset = FraudDataset("yelp")
g = dataset[0]

# Inspect the schema: edge types, node types, the node-data store and its keys
# (one item per line), followed by a blank line and the raw feature tensor.
print(g.etypes, g.ntypes, g.ndata, g.ndata.keys(), sep="\n")
print()
print(g.ndata["feature"])


In [None]:
def label_distribution(graph, mask_name):
    """Count the labels of the nodes selected by one split mask.

    Args:
        graph: Graph whose ``ndata`` holds a ``"label"`` tensor and the
            mask tensor named ``mask_name``.
        mask_name: Key of the split mask in ``graph.ndata``
            (e.g. ``"train_mask"``).

    Returns:
        ``(counts, frac_zero)`` where ``counts`` maps each of the labels 0
        and 1 to its number of occurrences inside the split, and
        ``frac_zero`` is the share of selected nodes labelled 0
        (``nan`` when the mask selects no node).
    """
    mask = graph.ndata[mask_name].to(bool)
    split_labels = graph.ndata["label"][mask]
    # Count on the tensor itself so this also works when the graph has already
    # been moved to the GPU (a plain ``.numpy()`` would raise there).
    counts = {cls: int((split_labels == cls).sum()) for cls in (0, 1)}
    total = int(mask.sum())
    frac_zero = counts[0] / total if total else float("nan")
    return counts, frac_zero


# Report the class balance of every split. The share of label 0 is the accuracy
# a constant "always predict 0" classifier would reach on that split.
for split_name in ("train_mask", "val_mask", "test_mask"):
    split_counts, split_frac_zero = label_distribution(g, split_name)
    print(split_name, split_counts, f"{split_frac_zero:.3f}")


In [None]:
import torch
from torch import nn
from torch.nn import functional as F
# import pytorch_lightning as pl
import dgl.nn.pytorch as gnn


# class GCN(pl.LightningModule):
class GCN(nn.Module):
    """Two-layer heterogeneous graph classifier for ``review`` nodes.

    Each layer is a ``HeteroGraphConv`` holding one ``SAGEConv`` (``'pool'``
    aggregator) per relation; the per-relation outputs are summed.

    Args:
        in_feats: Size of the input node features.
        h_feats: Size of the hidden representation between the two layers.
        num_classes: Size of the output produced by the second layer.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        relations = ("net_rsr", "net_rtr", "net_rur")

        def per_relation(make_layer):
            # Build one fresh layer per relation and sum their outputs.
            return gnn.HeteroGraphConv(
                {rel: make_layer() for rel in relations},
                aggregate="sum",
            )

        self.conv1 = per_relation(lambda: gnn.SAGEConv(in_feats, h_feats, "pool"))
        self.conv2 = per_relation(lambda: gnn.SAGEConv(h_feats, num_classes, "pool"))

        # Single-layer GraphConv variant. It is constructed (and therefore
        # registered as a sub-module) but forward() does not call it.
        self.conv0 = per_relation(lambda: gnn.GraphConv(in_feats, num_classes))

    def forward(self, g, in_feat):
        """Run both layers on ``g``.

        Args:
            g: Graph passed to both ``HeteroGraphConv`` layers.
            in_feat: Input features passed to the first layer; its output is
                read back under the ``'review'`` key.

        Returns:
            The output of the second layer, called with the ReLU-activated
            ``'review'`` representation.
        """
        hidden = self.conv1(g, in_feat)
        activated = F.relu(hidden["review"])
        return self.conv2(g, {"review": activated})


In [None]:
from torchmetrics import F1Score
import pandas as pd


def train(g, model, epochs=10_000, lr=0.01, log_every=5):
    """Train ``model`` full-batch on ``g`` and record per-epoch metrics.

    Every epoch runs one forward pass over the whole graph, computes the
    cross-entropy loss on the training nodes only, evaluates accuracy on the
    train/validation/test masks and binary F1 on the test mask from that same
    forward pass, and then takes one Adam step.

    Args:
        g: Graph whose ``ndata`` provides ``"feature"``, ``"label"``,
            ``"train_mask"``, ``"val_mask"`` and ``"test_mask"``.
        model: Module called as ``model(g, {"review": features})`` whose
            result is indexed with ``"review"`` to obtain the logits.
        epochs: Number of optimisation steps.
        lr: Learning rate handed to Adam.
        log_every: Print a progress line every ``log_every`` epochs; a falsy
            value disables printing.

    Returns:
        ``pandas.DataFrame`` with one row per epoch and the float columns
        ``acc`` (training accuracy), ``f1`` (test-set F1), ``val_acc`` and
        ``test_acc``.
    """
    history = {"acc": [], "f1": [], "val_acc": [], "test_acc": []}
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # TODO: consider adding a learning-rate scheduler.
    best_val_acc = 0.0
    best_test_acc = 0.0

    features = {"review": g.ndata["feature"]}
    labels = g.ndata["label"]
    train_mask = g.ndata["train_mask"].to(bool)
    val_mask = g.ndata["val_mask"].to(bool)
    test_mask = g.ndata["test_mask"].to(bool)

    # The metric keeps internal state tensors, so it has to live on the same
    # device as the predictions it is fed.
    f1 = F1Score(task="binary").to(labels.device)

    def masked_accuracy(pred, mask):
        # Plain Python float so the history can be plotted directly.
        return (pred[mask] == labels[mask]).float().mean().item()

    for e in range(epochs):
        # Forward
        logits = model(g, features)["review"]

        # Only the training nodes contribute to the loss.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Metrics are bookkeeping only; keep them out of the autograd graph.
        with torch.no_grad():
            pred = logits.argmax(1)
            train_acc = masked_accuracy(pred, train_mask)
            val_acc = masked_accuracy(pred, val_mask)
            test_acc = masked_accuracy(pred, test_mask)
            # Evaluate the metric once per epoch and reuse the value below.
            test_f1 = f1(pred[test_mask], labels[test_mask]).item()

        # Keep the best validation accuracy and the test accuracy observed at
        # that same epoch.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        history["acc"].append(train_acc)
        history["f1"].append(test_f1)
        history["val_acc"].append(val_acc)
        history["test_acc"].append(test_acc)

        if log_every and e % log_every == 0:
            print(
                "In epoch {}, acc: {:.3f} f1: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})".format(
                    e,
                    train_acc,
                    test_f1,
                    val_acc,
                    best_val_acc,
                    test_acc,
                    best_test_acc,
                )
            )
    return pd.DataFrame(history)

# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines.
gpu = torch.cuda.is_available()
device = torch.device("cuda" if gpu else "cpu")

g = g.to(device)
# The node features are stored under "feature" (there is no "feat" key).
model = GCN(g.ndata["feature"].shape[1], 16, dataset.num_classes).to(device)
history = train(g, model)

In [None]:
# train() stores the training accuracy under "acc" (there is no "train_acc"
# column). Coerce to float so plotting also works if the columns hold 0-d
# tensors instead of plain numbers.
ax = history[["acc", "val_acc", "test_acc"]].astype(float).plot(title="Accuracy per epoch")
ax.set(xlabel="epoch", ylabel="accuracy");

In [None]:
# Coerce to float so plotting also works if the column holds 0-d tensors
# instead of plain numbers.
ax = history["f1"].astype(float).plot(title="Test F1 per epoch")
ax.set(xlabel="epoch", ylabel="F1");

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Predict on the whole graph once; no gradients are needed for evaluation.
features = {"review": g.ndata["feature"]}
with torch.no_grad():
    logits = model(g, features)["review"]

# Keep only the test nodes.
test_mask = g.ndata["test_mask"].to(bool)
labels = g.ndata["label"][test_mask]
pred = logits.argmax(1)[test_mask]

# scikit-learn works on host arrays, so copy the tensors off the GPU (a no-op
# for CPU tensors) before building the matrix.
cm = confusion_matrix(labels.cpu().numpy(), pred.cpu().numpy())
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot(cmap="Blues");

