# Clone Repository and Download Datasets from Dwivedi et al.

In [None]:
# Fetch the benchmarking-gnns fork that ships the dataset loaders and download script.
!git clone https://github.com/shamim-hussain/benchmarking-gnns.git
# The download script is run from inside the repository's data/ directory.
%cd benchmarking-gnns/data/
!bash script_download_all_datasets.sh
# Step back up to the repository root; later cells import `data.data` from here.
%cd ..

# Install Libraries

In [None]:
# Install pinned versions of ogb and dgl.
!pip install ogb==1.1.1
!pip install dgl==0.4.2

import ogb, dgl

# Print the versions that were actually imported, to confirm the pins took effect.
print(ogb.__version__)
print(dgl.__version__)

# Convert Datasets

## SBM_PATTERN

In [None]:
import numpy as np
import h5py
import torch
from tqdm import tqdm
from pathlib import Path

from data.data import LoadData

class DotDict(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Only keyword arguments are accepted by the constructor.
    """

    def __init__(self, **kwds):
        super().__init__(**kwds)
        # Alias the instance namespace to the mapping itself so that
        # `d.key` and `d['key']` always refer to the same storage.
        self.__dict__ = self



# Load SBM_PATTERN through the repository's loader and bind each split to the
# name used by the export cell that follows.
DATASET_NAME = 'SBM_PATTERN'
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test

In [None]:
def save_dataset(ds, data):
    """Export every (graph, node-labels) sample of `data` into the group `ds`.

    Each sample gets its own sub-group named by its zero-padded 10-digit
    index, laid out as:

        <index>/data                          attrs: num_nodes, num_edges
        <index>/data/edges                    result of g.edges() stacked on the last axis
        <index>/data/features/nodes/<name>    one dataset per entry of g.ndata
        <index>/targets/node_labels           labels converted to an int array

    Args:
        ds: object exposing `create_group`; the caller passes h5py groups.
        data: iterable of (graph, labels) pairs. The graph is presumably a
            DGL graph -- confirm against `LoadData`.
    """
    for index, sample in enumerate(tqdm(data)):
        graph, labels = sample

        sample_grp = ds.create_group(f'{index:0>10d}')
        data_grp = sample_grp.create_group('data')

        data_grp.attrs['num_nodes'] = graph.number_of_nodes()
        data_grp.attrs['num_edges'] = graph.number_of_edges()

        endpoints = graph.edges()
        data_grp['edges'] = torch.stack(endpoints, axis=-1).numpy()

        node_feat_grp = data_grp.create_group('features/nodes')
        for feat_name, feat_tensor in graph.ndata.items():
            node_feat_grp[feat_name] = feat_tensor.numpy()

        target_grp = sample_grp.create_group('targets')
        target_grp['node_labels'] = np.array(labels, dtype=int)


# Output location; the parent directory is created on demand.
dest_file = r'../datasets/SBM_PATTERN/SBM_PATTERN.h5'
Path(dest_file).parent.mkdir(parents=True, exist_ok=True)

with h5py.File(dest_file, 'w') as file:
    ds = file.create_group('SBM_PATTERN')

    # Dataset-level metadata. These are hard-coded constants, not values
    # computed from the loaded data (presumably known dataset statistics --
    # confirm before reuse).
    ds.attrs['max_node_feat'] = 2
    ds.attrs['min_node_feat'] = 0
    ds.attrs['num_min_nodes'] = 44
    ds.attrs['num_max_nodes'] = 188

    # Create the three split groups first, then fill them in order.
    train_ds = ds.create_group('training')
    val_ds = ds.create_group('validation')
    test_ds = ds.create_group('test')
    save_dataset(train_ds, trainset)
    save_dataset(val_ds, valset)
    save_dataset(test_ds, testset)

In [None]:
%reset

## SBM_CLUSTER

In [None]:
import numpy as np
import h5py
import torch
from tqdm import tqdm
from pathlib import Path

from data.data import LoadData

class DotDict(dict):
    """A dict that exposes its keys as attributes (keyword-only construction)."""

    def __init__(self, **kwds):
        dict.__init__(self, kwds)
        # Make attribute access and item access share one underlying store.
        self.__dict__ = self

# Load SBM_CLUSTER through the repository's loader and bind each split to the
# name used by the export cell that follows.
DATASET_NAME = 'SBM_CLUSTER'
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test

In [None]:
def save_dataset(ds, data):
    """Write all (graph, node-labels) pairs from `data` beneath the group `ds`.

    One sub-group is created per sample, keyed by the sample index formatted
    as a zero-padded 10-digit string. Inside it:

        data/                     attrs `num_nodes` and `num_edges`
        data/edges                g.edges() stacked along the last axis
        data/features/nodes/*     every tensor found in g.ndata
        targets/node_labels       the labels as an integer array

    Args:
        ds: object exposing `create_group`; the caller passes h5py groups.
        data: iterable yielding (graph, labels) pairs. The graph type is
            presumably DGL's graph class -- confirm against `LoadData`.
    """
    for position, (graph, node_labels) in enumerate(tqdm(data)):
        entry = ds.create_group(f'{position:0>10d}')

        payload = entry.create_group('data')
        payload.attrs['num_nodes'] = graph.number_of_nodes()
        payload.attrs['num_edges'] = graph.number_of_edges()
        payload['edges'] = torch.stack(graph.edges(), axis=-1).numpy()

        node_features = payload.create_group('features/nodes')
        for key, value in graph.ndata.items():
            node_features[key] = value.numpy()

        targets = entry.create_group('targets')
        targets['node_labels'] = np.array(node_labels, dtype=int)

# Output location; the parent directory is created on demand.
dest_file = r'../datasets/SBM_CLUSTER/SBM_CLUSTER.h5'
Path(dest_file).parent.mkdir(parents=True, exist_ok=True)

with h5py.File(dest_file, 'w') as file:
    ds = file.create_group('SBM_CLUSTER')

    # Dataset-level metadata. These are hard-coded constants, not values
    # computed from the loaded data (presumably known dataset statistics --
    # confirm before reuse).
    ds.attrs['max_node_feat'] = 6
    ds.attrs['min_node_feat'] = 0
    ds.attrs['num_node_classes'] = 6
    ds.attrs['num_min_nodes'] = 41
    ds.attrs['num_max_nodes'] = 190

    # Create the three split groups first, then fill them in order.
    train_ds = ds.create_group('training')
    val_ds = ds.create_group('validation')
    test_ds = ds.create_group('test')
    save_dataset(train_ds, trainset)
    save_dataset(val_ds, valset)
    save_dataset(test_ds, testset)

In [None]:
%reset

## MNIST

In [None]:
import numpy as np
import h5py
import torch
from tqdm import tqdm
from pathlib import Path

from data.data import LoadData

# Load MNIST through the repository's loader and name each split.
DATASET_NAME = 'MNIST'
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test

In [None]:

def save_dataset(ds, data):
    """Export every (graph, label) sample of `data` into the group `ds`.

    Each sample gets its own sub-group named by its zero-padded 10-digit
    index, laid out as:

        <index>/data                          attrs: num_nodes, num_edges
        <index>/data/edges                    result of g.edges() stacked on the last axis
        <index>/data/features/nodes/<name>    one dataset per entry of g.ndata
        <index>/data/features/edges/<name>    one dataset per entry of g.edata
        <index>/targets/label                 the label via its `.numpy()` method

    Args:
        ds: object exposing `create_group`; the caller passes h5py groups.
        data: iterable of (graph, label) pairs. The graph is presumably a
            DGL graph and the label a torch tensor -- confirm against `LoadData`.
    """
    for index, sample in enumerate(tqdm(data)):
        graph, label = sample

        sample_grp = ds.create_group(f'{index:0>10d}')
        data_grp = sample_grp.create_group('data')

        data_grp.attrs['num_nodes'] = graph.number_of_nodes()
        data_grp.attrs['num_edges'] = graph.number_of_edges()

        endpoints = graph.edges()
        data_grp['edges'] = torch.stack(endpoints, axis=-1).numpy()

        # Both feature groups exist before any feature dataset is written.
        node_feat_grp = data_grp.create_group('features/nodes')
        edge_feat_grp = data_grp.create_group('features/edges')
        for feat_name, feat_tensor in graph.ndata.items():
            node_feat_grp[feat_name] = feat_tensor.numpy()
        for feat_name, feat_tensor in graph.edata.items():
            edge_feat_grp[feat_name] = feat_tensor.numpy()

        target_grp = sample_grp.create_group('targets')
        target_grp['label'] = label.numpy()


# Output location; the parent directory is created on demand.
dest_file = r'../datasets/MNIST/MNIST.h5'
Path(dest_file).parent.mkdir(parents=True, exist_ok=True)

with h5py.File(dest_file, 'w') as file:
    ds = file.create_group('MNIST')

    # Dataset-level metadata. These are hard-coded constants, not values
    # computed from the loaded data (presumably known dataset statistics --
    # confirm before reuse).
    ds.attrs['num_node_feat'] = 3
    ds.attrs['num_edge_feat'] = 1
    ds.attrs['num_classes'] = 10
    ds.attrs['num_min_nodes'] = 40
    ds.attrs['num_max_nodes'] = 75

    # Create the three split groups first, then fill them in order.
    train_ds = ds.create_group('training')
    val_ds = ds.create_group('validation')
    test_ds = ds.create_group('test')
    save_dataset(train_ds, dataset.train)
    save_dataset(val_ds, dataset.val)
    save_dataset(test_ds, dataset.test)

In [None]:
%reset

## CIFAR10

In [None]:
import numpy as np
import h5py
import torch
from tqdm import tqdm
from pathlib import Path

from data.data import LoadData

# Load CIFAR10 through the repository's loader and name each split.
DATASET_NAME = 'CIFAR10'
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test

In [None]:
def save_dataset(ds, data):
    """Write all (graph, label) pairs from `data` beneath the group `ds`.

    One sub-group is created per sample, keyed by the sample index formatted
    as a zero-padded 10-digit string. Inside it:

        data/                     attrs `num_nodes` and `num_edges`
        data/edges                g.edges() stacked along the last axis
        data/features/nodes/*     every tensor found in g.ndata
        data/features/edges/*     every tensor found in g.edata
        targets/label             the label via its `.numpy()` method

    Args:
        ds: object exposing `create_group`; the caller passes h5py groups.
        data: iterable yielding (graph, label) pairs. Presumably a DGL graph
            and a torch tensor respectively -- confirm against `LoadData`.
    """

    def _dump_tensors(group, tensors):
        # Store each named tensor as its own dataset inside `group`.
        for key, value in tensors.items():
            group[key] = value.numpy()

    for position, (graph, label) in enumerate(tqdm(data)):
        entry = ds.create_group(f'{position:0>10d}')

        payload = entry.create_group('data')
        payload.attrs['num_nodes'] = graph.number_of_nodes()
        payload.attrs['num_edges'] = graph.number_of_edges()
        payload['edges'] = torch.stack(graph.edges(), axis=-1).numpy()

        # Both feature groups are created before either one is populated.
        node_features = payload.create_group('features/nodes')
        edge_features = payload.create_group('features/edges')
        _dump_tensors(node_features, graph.ndata)
        _dump_tensors(edge_features, graph.edata)

        targets = entry.create_group('targets')
        targets['label'] = label.numpy()


# Output location; the parent directory is created on demand.
dest_file = r'../datasets/CIFAR10/CIFAR10.h5'
Path(dest_file).parent.mkdir(parents=True, exist_ok=True)

with h5py.File(dest_file, 'w') as file:
    ds = file.create_group('CIFAR10')

    # Dataset-level metadata. These are hard-coded constants, not values
    # computed from the loaded data (presumably known dataset statistics --
    # confirm before reuse).
    ds.attrs['num_node_feat'] = 5
    ds.attrs['num_edge_feat'] = 1
    ds.attrs['num_classes'] = 10
    ds.attrs['num_min_nodes'] = 85
    ds.attrs['num_max_nodes'] = 150

    # Create the three split groups first, then fill them in order.
    train_ds = ds.create_group('training')
    val_ds = ds.create_group('validation')
    test_ds = ds.create_group('test')
    save_dataset(train_ds, dataset.train)
    save_dataset(val_ds, dataset.val)
    save_dataset(test_ds, dataset.test)

In [None]:
%reset

## TSP

In [None]:
import numpy as np
import h5py
import torch
from tqdm import tqdm
from pathlib import Path

from data.data import LoadData

# Load TSP through the repository's loader and name each split.
DATASET_NAME = 'TSP'
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test

In [None]:

def save_dataset(ds, data):
    """Export every (graph, edge-labels) sample of `data` into the group `ds`.

    Each sample gets its own sub-group named by its zero-padded 10-digit
    index, laid out as:

        <index>/data                          attrs: num_nodes, num_edges
        <index>/data/edges                    result of g.edges() stacked on the last axis
        <index>/data/features/nodes/<name>    one dataset per entry of g.ndata
        <index>/data/features/edges/<name>    one dataset per entry of g.edata
        <index>/targets/edge_labels           labels converted to an int array

    Args:
        ds: object exposing `create_group`; the caller passes h5py groups.
        data: iterable of (graph, labels) pairs. The graph is presumably a
            DGL graph -- confirm against `LoadData`.
    """
    for index, sample in enumerate(tqdm(data)):
        graph, edge_labels = sample

        sample_grp = ds.create_group(f'{index:0>10d}')
        data_grp = sample_grp.create_group('data')

        data_grp.attrs['num_nodes'] = graph.number_of_nodes()
        data_grp.attrs['num_edges'] = graph.number_of_edges()

        endpoints = graph.edges()
        data_grp['edges'] = torch.stack(endpoints, axis=-1).numpy()

        # Both feature groups exist before any feature dataset is written.
        node_feat_grp = data_grp.create_group('features/nodes')
        edge_feat_grp = data_grp.create_group('features/edges')
        for feat_name, feat_tensor in graph.ndata.items():
            node_feat_grp[feat_name] = feat_tensor.numpy()
        for feat_name, feat_tensor in graph.edata.items():
            edge_feat_grp[feat_name] = feat_tensor.numpy()

        target_grp = sample_grp.create_group('targets')
        target_grp['edge_labels'] = np.array(edge_labels, dtype=int)

# Output location; the parent directory is created on demand.
dest_file = r'../datasets/TSP/TSP.h5'
Path(dest_file).parent.mkdir(parents=True, exist_ok=True)

with h5py.File(dest_file, 'w') as file:
    ds = file.create_group('TSP')

    # Dataset-level metadata. These are hard-coded constants, not values
    # computed from the loaded data (presumably known dataset statistics --
    # confirm before reuse).
    ds.attrs['num_node_feat'] = 2
    ds.attrs['num_edge_feat'] = 1
    ds.attrs['num_min_nodes'] = 50
    ds.attrs['num_max_nodes'] = 499

    # Create the three split groups first, then fill them in order.
    train_ds = ds.create_group('training')
    val_ds = ds.create_group('validation')
    test_ds = ds.create_group('test')
    save_dataset(train_ds, dataset.train)
    save_dataset(val_ds, dataset.val)
    save_dataset(test_ds, dataset.test)

In [None]:
%reset

## ZINC

In [None]:
import numpy as np
import h5py
import torch
from tqdm import tqdm
from pathlib import Path

from data.data import LoadData

# Load ZINC through the repository's loader and name each split.
DATASET_NAME = 'ZINC'
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test

In [None]:


def save_dataset(ds, data):
    """Export every (graph, value) sample of `data` into the group `ds`.

    Each sample gets its own sub-group named by its zero-padded 10-digit
    index, laid out as:

        <index>/data                          attrs: num_nodes, num_edges
        <index>/data/edges                    result of g.edges() stacked on the last axis
        <index>/data/features/nodes/<name>    one dataset per entry of g.ndata
        <index>/data/features/edges/<name>    one dataset per entry of g.edata
        <index>/targets/value                 the target via its `.numpy()` method

    Args:
        ds: object exposing `create_group`; the caller passes h5py groups.
        data: iterable of (graph, value) pairs. The graph is presumably a
            DGL graph and the value a torch tensor -- confirm against `LoadData`.
    """
    # tqdm wraps `data` itself (not the enumerate object) so it can read the
    # length of `data` and show a total/ETA, matching the other export cells.
    for i, (g, l) in enumerate(tqdm(data)):
        grp = ds.create_group(f'{i:0>10d}')
        dgrp = grp.create_group('data')

        dgrp.attrs['num_nodes'] = g.number_of_nodes()
        dgrp.attrs['num_edges'] = g.number_of_edges()

        dgrp['edges'] = torch.stack(g.edges(), axis=-1).numpy()

        # Both feature groups exist before any feature dataset is written.
        dnfgrp = dgrp.create_group('features/nodes')
        defgrp = dgrp.create_group('features/edges')
        for fname, fval in g.ndata.items():
            dnfgrp[fname] = fval.numpy()
        for fname, fval in g.edata.items():
            defgrp[fname] = fval.numpy()

        tgrp = grp.create_group('targets')
        tgrp['value'] = l.numpy()

# Output location; the parent directory is created on demand.
dest_file = r'../datasets/ZINC/ZINC.h5'
Path(dest_file).parent.mkdir(parents=True, exist_ok=True)

with h5py.File(dest_file, 'w') as file:
    ds = file.create_group('ZINC')

    # Dataset-level metadata. These are hard-coded constants, not values
    # computed from the loaded data (presumably known dataset statistics --
    # confirm before reuse).
    ds.attrs['num_atom_type'] = 28
    ds.attrs['num_bond_type'] = 4
    ds.attrs['num_min_atoms'] = 9
    ds.attrs['num_max_atoms'] = 37

    # Create the three split groups first, then fill them in order.
    train_ds = ds.create_group('training')
    val_ds = ds.create_group('validation')
    test_ds = ds.create_group('test')
    save_dataset(train_ds, dataset.train)
    save_dataset(val_ds, dataset.val)
    save_dataset(test_ds, dataset.test)

In [None]:
%reset

## ZINC-full

In [None]:
import numpy as np
import h5py
import torch
from tqdm import tqdm
from pathlib import Path

from data.data import LoadData

# Load ZINC-full through the repository's loader and name each split.
# The original cell carried the note "154s" (presumably its run time).
DATASET_NAME = 'ZINC-full'
dataset = LoadData(DATASET_NAME)
trainset = dataset.train
valset = dataset.val
testset = dataset.test

In [None]:
def save_dataset(ds, data):
    """Write all (graph, value) pairs from `data` beneath the group `ds`.

    One sub-group is created per sample, keyed by the sample index formatted
    as a zero-padded 10-digit string. Inside it:

        data/                     attrs `num_nodes` and `num_edges`
        data/edges                g.edges() stacked along the last axis
        data/features/nodes/*     every tensor found in g.ndata
        data/features/edges/*     every tensor found in g.edata
        targets/value             the target via its `.numpy()` method

    Args:
        ds: object exposing `create_group`; the caller passes h5py groups.
        data: iterable yielding (graph, value) pairs. Presumably a DGL graph
            and a torch tensor respectively -- confirm against `LoadData`.
    """

    def _dump_tensors(group, tensors):
        # Store each named tensor as its own dataset inside `group`.
        for key, value in tensors.items():
            group[key] = value.numpy()

    for position, (graph, target) in enumerate(tqdm(data)):
        entry = ds.create_group(f'{position:0>10d}')

        payload = entry.create_group('data')
        payload.attrs['num_nodes'] = graph.number_of_nodes()
        payload.attrs['num_edges'] = graph.number_of_edges()
        payload['edges'] = torch.stack(graph.edges(), axis=-1).numpy()

        # Both feature groups are created before either one is populated.
        node_features = payload.create_group('features/nodes')
        edge_features = payload.create_group('features/edges')
        _dump_tensors(node_features, graph.ndata)
        _dump_tensors(edge_features, graph.edata)

        targets = entry.create_group('targets')
        targets['value'] = target.numpy()

# Output location; the parent directory is created on demand. The file and
# group use 'ZINC_full' (underscore) while the loader name is 'ZINC-full'.
dest_file = r'../datasets/ZINC_full/ZINC_full.h5'
Path(dest_file).parent.mkdir(parents=True, exist_ok=True)

with h5py.File(dest_file, 'w') as file:
    ds = file.create_group('ZINC_full')

    # Dataset-level metadata. These are hard-coded constants, not values
    # computed from the loaded data (presumably known dataset statistics --
    # confirm before reuse).
    ds.attrs['num_atom_type'] = 28
    ds.attrs['num_bond_type'] = 4
    ds.attrs['num_min_atoms'] = 6
    ds.attrs['num_max_atoms'] = 38

    # Create the three split groups first, then fill them in order.
    train_ds = ds.create_group('training')
    val_ds = ds.create_group('validation')
    test_ds = ds.create_group('test')
    save_dataset(train_ds, dataset.train)
    save_dataset(val_ds, dataset.val)
    save_dataset(test_ds, dataset.test)

In [None]:
%reset

# Create a Parent HDF5 File with External Links (Optional)

In [None]:
%cd ../datasets/
%ls

In [None]:
import h5py as h5

def add_link(data_file, dataset_name):
    """Register `dataset_name` in `data_file` as an external link.

    The link is stored under the key `<dataset_name>` and points at the
    object `/<dataset_name>` inside the file `<dataset_name>/<dataset_name>.h5`.

    Args:
        data_file: writable mapping-like HDF5 file object; the caller passes
            an open `h5.File`.
        dataset_name: name used for the link key, the sub-directory, the
            file stem and the target group.
    """
    target_file = f"{dataset_name}/{dataset_name}.h5"
    target_group = f"/{dataset_name}"
    data_file[f'{dataset_name}'] = h5.ExternalLink(target_file, target_group)


# Build the parent file: one external link per converted dataset, added in
# the same order the datasets were exported above.
with h5.File("gnn_benchmark.h5", 'w') as data_file:
    for linked_name in (
        "SBM_PATTERN",
        "SBM_CLUSTER",
        "MNIST",
        "CIFAR10",
        "TSP",
        "ZINC",
        "ZINC_full",
    ):
        add_link(data_file, linked_name)

## Test the File

In [None]:
# Smoke test: open the parent file read-only, list the linked datasets and
# report how many samples each split contains.
with h5.File("gnn_benchmark.h5", 'r') as data_file:
    datasets = list(data_file.keys())
    print(datasets)
    for dset in datasets:
        print()
        print(dset)
        print('------------')
        n_train = len(data_file[dset]["training"])
        print(f'Training examples  : {n_train}')
        n_val = len(data_file[dset]["validation"])
        print(f'Validation examples: {n_val}')
        n_test = len(data_file[dset]["test"])
        print(f'Test examples      : {n_test}')

# Compress Files for Storage, Download etc. (Optional)

In [None]:
%cd ../

In [None]:
!tar -czf datasets.tar.gz datasets

In [None]:
%ls -alh

## (On Google Colab) Download File

In [None]:
from google.colab import files

# Colab only: hand the archive built above to the Colab download helper.
files.download('datasets.tar.gz')

## (On Google Colab) Or Transfer to Google Drive

In [None]:
from google.colab import drive
# Colab only: mount Google Drive under ./drive in the current directory.
drive.mount('./drive')
# Copy the archive into the top level of MyDrive (-v prints the copied file).
!cp -v datasets.tar.gz ./drive/MyDrive/
# Flush and unmount once the copy is done.
drive.flush_and_unmount()