In [8]:
# torch_version = str(torch.__version__)
# scatter_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
# sparse_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
# !pip install torch-scatter -f $scatter_src
# !pip install torch-sparse -f $sparse_src
# !pip install torch-geometric
# !pip install ogb

In [4]:
import torch
import pandas as pd
import networkx as nx
import random
import torch.nn.functional as F
print(torch.__version__)

# The PyG built-in GCNConv
from torch_geometric.nn import GCNConv

import torch_geometric.transforms as T

from torch_geometric.data import DataLoader, Data, Dataset
from tqdm.notebook import tqdm
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch_geometric.nn import global_add_pool, global_mean_pool
from ogb.graphproppred import PygGraphPropPredDataset, Evaluator
from torch.nn import BatchNorm1d
from torch_geometric.nn import GCNConv
import copy

2.1.1


In [5]:
class AcyclicGraphDataset(Dataset):
    def __init__(self, pyg_dataset):
        super(AcyclicGraphDataset, self).__init__()
        self.pyg_dataset = pyg_dataset

    def len(self):
        return len(self.pyg_dataset)

    def get(self, idx):
        return self.pyg_dataset[idx]

class CyclicGraphDataset(Dataset):
    def __init__(self, data_list):
        super(CyclicGraphDataset, self).__init__()
        self.data_list = data_list

    def len(self):
        return len(self.data_list)

    def get(self, idx):
        return self.data_list[idx]

cyclic_dataset = torch.load("cyclic_dataset.pt")
acyclic_dataset = torch.load("acyclic_dataset.pt")
print('The {} dataset has {} graphs'.format("cyclic", len(cyclic_dataset)))
print('The {} dataset has {} graphs'.format("acyclic", len(acyclic_dataset)))
cyclic_data = cyclic_dataset[0]
acyclic_data = acyclic_dataset[0]

The cyclic dataset has 13081 graphs
The acyclic dataset has 13759 graphs


In [39]:
import random

class IsAcyclic(Dataset):
    def __init__(self, cyclic_data, acyclic_data):
        super(IsAcyclic, self).__init__()
        self.cyclic_data = cyclic_data
        self.acyclic_data = acyclic_data
        # Combine the two datasets
        
        self.data_list = [(data, 0) for data in cyclic_data] + [(data, 1) for data in acyclic_data]

    def len(self):
        return len(self.data_list)

    def get(self, idx):
        data, label = self.data_list[idx]
        # Ensure the label is a tensor and attach it to the data object
        data.y = torch.tensor([label], dtype=torch.float)
        return data

    def get_idx_split(self, train_ratio=0.7, val_ratio=0.15):
        def split_indices(data, train_ratio, val_ratio):
            dataset_size = len(data)
            indices = list(range(dataset_size))
            random.shuffle(indices)

            train_split = int(train_ratio * dataset_size)
            val_split = int(val_ratio * dataset_size) + train_split

            return indices[:train_split], indices[train_split:val_split], indices[val_split:]

        # Split cyclic and acyclic datasets separately
        cyclic_train, cyclic_val, cyclic_test = split_indices(self.cyclic_data, train_ratio, val_ratio)
        acyclic_train, acyclic_val, acyclic_test = split_indices(self.acyclic_data, train_ratio, val_ratio)

        # Offset acyclic indices by the size of cyclic dataset
        offset = len(self.cyclic_data)
        acyclic_train = [i + offset for i in acyclic_train]
        acyclic_val = [i + offset for i in acyclic_val]
        acyclic_test = [i + offset for i in acyclic_test]

        # Combine the splits from cyclic and acyclic datasets
        train_indices = cyclic_train + acyclic_train
        val_indices = cyclic_val + acyclic_val
        test_indices = cyclic_test + acyclic_test

        # Shuffle combined splits to mix cyclic and acyclic graphs
        random.shuffle(train_indices)
        random.shuffle(val_indices)
        random.shuffle(test_indices)

        return {
            'train': train_indices,
            'valid': val_indices,
            'test': test_indices
        }


# Assuming 'cyclic_dataset' and 'acyclic_dataset' are already created as per your provided code
dataset = IsAcyclic(cyclic_dataset, acyclic_dataset)

torch.save(dataset, 'is_acyclic.pt')

In [40]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: {}'.format(device))

Device: cuda


In [41]:
split_idx = dataset.get_idx_split()

train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True, num_workers=0)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=False, num_workers=0)



In [44]:
args = {
    'device': device,
    'input_dim' : 1,
    'gcn_output_dim' : [8, 16],
    'dropout': 0.5,
    'lr': 0.01,
    'weight_decay' : 0.00001,
    'epochs': 30,
}

In [None]:
# we use the node degrees as the initial features for all nodes.
# Then we apply two layers of GCNs with output dimensions
# equal to 8, 16 respectively and perform global averaging to obtain
# the graph representations. Finally, we employ one fully-connected
# layer as the classifier.

In [49]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, gcn_output_dims, dropout, return_embeds=False):
        super(GCN, self).__init__()

        # A list of GCNConv layers
        self.convs = None

        # A list of 1D batch normalization layers
        self.bns = None

        # The log softmax layer
        self.softmax = None

        self.convs = torch.nn.ModuleList([GCNConv(in_channels=input_dim, out_channels=gcn_output_dims[0])])
        self.convs.extend([GCNConv(in_channels=gcn_output_dims[i + 0], out_channels=gcn_output_dims[i + 1]) for i in range(len(gcn_output_dims) - 1)])

        self.bns = torch.nn.ModuleList([BatchNorm1d(num_features=gcn_output_dims[l]) for l in range(len(gcn_output_dims) - 1)])
        
        self.softmax = torch.nn.LogSoftmax()

        # Probability of an element getting zeroed
        self.dropout = dropout

        # Skip classification layer and return node embeddings
        self.return_embeds = return_embeds

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adj_t):
        out = None

        for i in range(len(self.convs)-1):
          x = F.relu(self.bns[i](self.convs[i](x, adj_t)))
          if self.training:
            x = F.dropout(x, p=self.dropout)
        x = self.convs[-1](x, adj_t)
        if self.return_embeds:
          out = x
        else:
          out = self.softmax(x)

        return out

### GCN to predict graph property
class GCN_Graph(torch.nn.Module):
    def __init__(self, input_dim, gcn_output_dims, output_dim, dropout):
        super(GCN_Graph, self).__init__()

        # self.node_encoder = AtomEncoder(hidden_dim)
        
        self.gnn_node = GCN(input_dim, gcn_output_dims, dropout, return_embeds=True)

        self.pool = global_mean_pool # global averaging to obtain graph representation

        # Output layer
        self.linear = torch.nn.Linear(gcn_output_dims[-1], output_dim) # One fully connected layer as a classifier


    def reset_parameters(self):
      self.gnn_node.reset_parameters()
      self.linear.reset_parameters()

    def forward(self, batched_data):
        # Extract important attributes of our mini-batch
        x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch
        
        device = edge_index.device
        degrees = torch.sum(edge_index[0] == torch.arange(edge_index.max() + 1, device=device)[:, None], dim=1, dtype=torch.float)
        x = degrees.unsqueeze(1)  # Add feature dimension
        embed = x.to(device)  # Ensure the embedding tensor is on the correct device

        out = None

        node_embeddings = self.gnn_node(embed, edge_index)
        agg_features = self.pool(node_embeddings, batch)
        out = self.linear(agg_features)

        return out

def train(model, device, data_loader, optimizer, loss_fn):
    model.train()
    loss = 0

    for step, batch in enumerate(tqdm(data_loader, desc="Iteration")):
      batch = batch.to(device)

      if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
          pass
      else:
        ## ignore nan targets (unlabeled) when computing training loss.
        is_labeled = batch.y == batch.y

        optimizer.zero_grad()
        out = model(batch)
        filtered_output = out[is_labeled]

        # Reshape the labels to match the output shape
        filtered_labels = batch.y[is_labeled].unsqueeze(1).type(torch.float32)

        loss = loss_fn(filtered_output, filtered_labels)

        loss.backward()
        optimizer.step()

    return loss.item()

def compute_accuracy(y_true, y_pred):
    # Assuming y_pred are logits; apply sigmoid and round off to get binary predictions
    preds = torch.sigmoid(y_pred) > 0.5
    correct = preds.eq(y_true.view_as(preds)).sum()
    accuracy = correct.float() / y_true.numel()
    return accuracy.item()

def eval(model, device, loader):
    model.eval()
    total_accuracy = 0
    total_samples = 0

    for batch in loader:
        batch = batch.to(device)
        with torch.no_grad():
            pred = model(batch)

        # Assuming binary classification and batch.y is your ground truth
        accuracy = compute_accuracy(batch.y, pred)
        total_accuracy += accuracy * batch.y.size(0)
        total_samples += batch.y.size(0)

    return total_accuracy / total_samples

# # The evaluation function
# def eval(model, device, loader, evaluator, save_model_results=False, save_file=None):
#     model.eval()
#     y_true = []
#     y_pred = []

#     for step, batch in enumerate(tqdm(loader, desc="Iteration")):
#         batch = batch.to(device)

#         if batch.x.shape[0] == 1:
#             pass
#         else:
#             with torch.no_grad():
#                 pred = model(batch)

#             y_true.append(batch.y.view(pred.shape).detach().cpu())
#             y_pred.append(pred.detach().cpu())

#     y_true = torch.cat(y_true, dim = 0).numpy()
#     y_pred = torch.cat(y_pred, dim = 0).numpy()

#     input_dict = {"y_true": y_true, "y_pred": y_pred}

#     if save_model_results:
#         print ("Saving Model Predictions")

#         # Create a pandas dataframe with a two columns
#         # y_pred | y_true
#         data = {}
#         data['y_pred'] = y_pred.reshape(-1)
#         data['y_true'] = y_true.reshape(-1)

#         df = pd.DataFrame(data=data)
#         # Save to csv
#         df.to_csv('ogbg-molhiv_graph_' + save_file + '.csv', sep=',', index=False)

#     return evaluator.eval(input_dict)


model = GCN_Graph(args['input_dim'], args['gcn_output_dim'],
            output_dim=1, dropout=args['dropout']).to(device)
# evaluator = Evaluator(name='ogbg-molhiv')

model.reset_parameters()

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])
loss_fn = torch.nn.BCEWithLogitsLoss()

best_model = None
best_valid_acc = 0

# Training loop remains the same...

# Evaluation in your main loop
for epoch in range(1, 1 + args["epochs"]):
    print('Training...')
    train_loss = train(model, device, train_loader, optimizer, loss_fn)

    print('Evaluating...')
    train_acc = eval(model, device, train_loader)
    val_acc = eval(model, device, valid_loader)
    test_acc = eval(model, device, test_loader)

    if val_acc > best_valid_acc:
        best_valid_acc = val_acc
        best_model = copy.deepcopy(model)

    print(f'Epoch: {epoch:02d}, '
          f'Loss: {train_loss:.4f}, '
          f'Train Acc: {100 * train_acc:.2f}%, '
          f'Valid Acc: {100 * val_acc:.2f}% '
          f'Test Acc: {100 * test_acc:.2f}%')

# Evaluate the best model
best_train_acc = eval(best_model, device, train_loader)
best_val_acc = eval(best_model, device, valid_loader)
best_test_acc = eval(best_model, device, test_loader)

print(f'Best model: '
      f'Train: {100 * best_train_acc:.2f}%, '
      f'Valid: {100 * best_val_acc:.2f}% '
      f'Test: {100 * best_test_acc:.2f}%')


# for epoch in range(1, 1 + args["epochs"]):
#   print('Training...')
#   loss = train(model, device, train_loader, optimizer, loss_fn)

#   print('Evaluating...')
#   train_result = eval(model, device, train_loader, evaluator)
#   val_result = eval(model, device, valid_loader, evaluator)
#   test_result = eval(model, device, test_loader, evaluator)

#   train_acc, valid_acc, test_acc = train_result[dataset.eval_metric], val_result[dataset.eval_metric], test_result[dataset.eval_metric]
#   if valid_acc > best_valid_acc:
#       best_valid_acc = valid_acc
#       best_model = copy.deepcopy(model)
#   print(f'Epoch: {epoch:02d}, '
#         f'Loss: {loss:.4f}, '
#         f'Train: {100 * train_acc:.2f}%, '
#         f'Valid: {100 * valid_acc:.2f}% '
#         f'Test: {100 * test_acc:.2f}%')


# train_auroc = eval(best_model, device, train_loader, evaluator)[dataset.eval_metric]
# valid_auroc = eval(best_model, device, valid_loader, evaluator, save_model_results=True, save_file="valid")[dataset.eval_metric]
# test_auroc  = eval(best_model, device, test_loader, evaluator, save_model_results=True, save_file="test")[dataset.eval_metric]

# print(f'Best model: '
#     f'Train: {100 * train_auroc:.2f}%, '
#     f'Valid: {100 * valid_auroc:.2f}% '
#     f'Test: {100 * test_auroc:.2f}%')

Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 01, Loss: 0.1767, Train Acc: 83.24%, Valid Acc: 83.93% Test Acc: 82.57%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 02, Loss: 0.2097, Train Acc: 82.96%, Valid Acc: 83.80% Test Acc: 82.27%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 03, Loss: 0.1271, Train Acc: 83.29%, Valid Acc: 83.93% Test Acc: 82.55%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 04, Loss: 0.8971, Train Acc: 83.29%, Valid Acc: 84.07% Test Acc: 82.55%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 05, Loss: 0.4581, Train Acc: 82.99%, Valid Acc: 83.73% Test Acc: 82.30%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 06, Loss: 0.5921, Train Acc: 82.02%, Valid Acc: 83.18% Test Acc: 81.06%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 07, Loss: 0.4054, Train Acc: 83.50%, Valid Acc: 84.15% Test Acc: 82.57%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 08, Loss: 0.1448, Train Acc: 82.95%, Valid Acc: 83.98% Test Acc: 82.27%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 09, Loss: 0.3150, Train Acc: 82.97%, Valid Acc: 84.00% Test Acc: 81.78%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 10, Loss: 0.2481, Train Acc: 83.59%, Valid Acc: 83.98% Test Acc: 82.60%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 11, Loss: 0.3956, Train Acc: 83.27%, Valid Acc: 84.10% Test Acc: 82.55%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 12, Loss: 0.4177, Train Acc: 78.95%, Valid Acc: 80.30% Test Acc: 78.60%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 13, Loss: 1.4654, Train Acc: 83.15%, Valid Acc: 83.88% Test Acc: 82.42%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 14, Loss: 0.8676, Train Acc: 83.28%, Valid Acc: 84.20% Test Acc: 82.52%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 15, Loss: 0.8861, Train Acc: 81.92%, Valid Acc: 82.14% Test Acc: 81.43%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 16, Loss: 0.0780, Train Acc: 83.45%, Valid Acc: 84.27% Test Acc: 82.77%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 17, Loss: 0.5209, Train Acc: 83.12%, Valid Acc: 83.73% Test Acc: 82.17%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 18, Loss: 0.7682, Train Acc: 82.84%, Valid Acc: 83.60% Test Acc: 82.13%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 19, Loss: 1.3003, Train Acc: 83.21%, Valid Acc: 84.07% Test Acc: 82.32%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 20, Loss: 0.3042, Train Acc: 83.46%, Valid Acc: 84.17% Test Acc: 82.57%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 21, Loss: 0.2948, Train Acc: 83.34%, Valid Acc: 83.98% Test Acc: 82.42%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 22, Loss: 0.2855, Train Acc: 83.31%, Valid Acc: 83.90% Test Acc: 82.40%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 23, Loss: 0.6934, Train Acc: 83.43%, Valid Acc: 84.17% Test Acc: 82.45%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 24, Loss: 0.9340, Train Acc: 83.15%, Valid Acc: 83.80% Test Acc: 82.40%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 25, Loss: 0.4872, Train Acc: 83.49%, Valid Acc: 84.20% Test Acc: 82.65%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 26, Loss: 1.0675, Train Acc: 83.03%, Valid Acc: 83.68% Test Acc: 82.10%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 27, Loss: 0.3447, Train Acc: 83.37%, Valid Acc: 84.32% Test Acc: 82.80%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 28, Loss: 0.5902, Train Acc: 83.30%, Valid Acc: 84.00% Test Acc: 82.35%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 29, Loss: 0.1217, Train Acc: 83.54%, Valid Acc: 84.20% Test Acc: 82.55%
Training...


Iteration:   0%|          | 0/588 [00:00<?, ?it/s]

Evaluating...
Epoch: 30, Loss: 0.2691, Train Acc: 83.21%, Valid Acc: 83.90% Test Acc: 82.37%
Best model: Train: 83.37%, Valid: 84.32% Test: 82.80%
