# Load Dataset

In [1]:
from ogb.lsc.pcqm4m_dgl import DglPCQM4MDataset

# Build the DGL flavour of PCQM4M, using the current directory as the
# dataset root. Every later cell reads this `dataset` object.
dataset = DglPCQM4MDataset(root='./')

Using backend: pytorch


# Explore Dataset

In [2]:
import numpy as np
from tqdm import tqdm

print('Total length:',len(dataset))
splitted_idx = dataset.get_idx_split()
print(dict((k,len(v)) for k,v in splitted_idx.items()))
print()

# Running extrema over the whole dataset.
# Each accumulator starts unset and is seeded by the first *valid* observation
# rather than unconditionally by dataset[0]:
#   * a NaN target in the first sample would otherwise poison max_t / min_t,
#     because np.maximum / np.minimum propagate NaN;
#   * a first graph without edges would otherwise crash on `.max(0)` of an
#     empty array (the loop below already has to guard that case).
max_t = min_t = None            # target value
max_nodes = min_nodes = None    # number of nodes per graph
n_maximum = n_minimum = None    # column-wise extrema of node features
e_maximum = e_minimum = None    # column-wise extrema of edge features

# Passing `total` lets tqdm render a real progress bar with an ETA.
for g, t in tqdm(dataset, total=len(dataset)):
    t_np = t.numpy()
    # Samples with a NaN target are excluded from the target statistics.
    if not np.any(np.isnan(t_np)):
        # copy() when seeding: Tensor.numpy() shares memory with the tensor,
        # so the accumulator must not alias data owned by the dataset.
        max_t = t_np.copy() if max_t is None else np.maximum(max_t, t_np)
        min_t = t_np.copy() if min_t is None else np.minimum(min_t, t_np)

    nn = g.num_nodes()
    max_nodes = nn if max_nodes is None else np.maximum(max_nodes, nn)
    min_nodes = nn if min_nodes is None else np.minimum(min_nodes, nn)

    n_feat = g.ndata['feat'].numpy()
    if n_feat.shape[0] > 0:
        n_hi, n_lo = n_feat.max(0), n_feat.min(0)
        n_maximum = n_hi if n_maximum is None else np.maximum(n_maximum, n_hi)
        n_minimum = n_lo if n_minimum is None else np.minimum(n_minimum, n_lo)

    e_feat = g.edata['feat'].numpy()
    # Graphs without edges have nothing to reduce over.
    if e_feat.shape[0] > 0:
        e_hi, e_lo = e_feat.max(0), e_feat.min(0)
        e_maximum = e_hi if e_maximum is None else np.maximum(e_maximum, e_hi)
        e_minimum = e_lo if e_minimum is None else np.minimum(e_minimum, e_lo)

# Later cells store these statistics as HDF5 attributes; fail here, with a
# readable message, if any of them was never observed.
_unset = [name for name, val in (('max_t', max_t),
                                 ('max_nodes', max_nodes),
                                 ('n_maximum', n_maximum),
                                 ('e_maximum', e_maximum)) if val is None]
assert not _unset, f'no valid samples seen for: {_unset}'

print()
print(f'max_t = {max_t}')
print(f'min_t = {min_t}')

print()
print(f'max_nodes = {max_nodes}')
print(f'min_nodes = {min_nodes}')

print()
print(f'n_maximum = {n_maximum}')
print(f'n_minimum = {n_minimum}')

print()
print(f'e_maximum = {e_maximum}')
print(f'e_minimum = {e_minimum}')

Total length: 3803453
{'train': 3045360, 'valid': 380670, 'test': 377423}



3803453it [07:22, 8596.61it/s]


max_t = 47.02399444580078
min_t = 0.3755171000957489

max_nodes = 51
min_nodes = 1

n_maximum = [52  2  6  9  4  4  5  1  1]
n_minimum = [0 0 0 4 0 0 0 0 0]

e_maximum = [3 2 1]
e_minimum = [0 0 0]





# Save HDF5 Parts

In [None]:
import tqdm
import torch
import h5py
from pathlib import Path

DATASET_NAME = 'PCQM4M'

# One subset per split, selected with the index arrays returned by
# dataset.get_idx_split().
trainset = dataset[splitted_idx['train']]
valset = dataset[splitted_idx['valid']]
testset = dataset[splitted_idx['test']]

# Progress-bar wrapper applied to the per-sample loop in save_dataset.
iter_tracker = tqdm.tqdm
# dtype that edge indices and node/edge features are cast to before writing.
int_type = 'uint8'

def save_dataset(ds, data, start, chunk_size,
                 num_node_feats=9, num_edge_feats=3, nan_fill=-10.):
    """Write samples ``data[start : start + chunk_size]`` into the group ``ds``.

    Each sample ``i`` becomes a sub-group named by its zero-padded
    10-digit index, laid out as::

        <i>/data            attrs: num_nodes, num_edges
        <i>/data/edges      edge endpoints stacked on the last axis
        <i>/data/features/nodes/<name>
        <i>/data/features/edges/<name>
        <i>/targets/value

    Parameters
    ----------
    ds : group-like object
        Destination; must provide ``create_group`` and ``file.flush()``.
    data : indexable
        ``data[i]`` must return a ``(graph, target)`` pair.
    start : int
        Index of the first sample to write.
    chunk_size : int
        Maximum number of samples to write; the range is clipped to
        ``len(data)``.
    num_node_feats, num_edge_feats : int, optional
        Column count every node / edge feature array is reshaped to.
    nan_fill : float, optional
        Sentinel stored in place of NaN target values.

    Notes
    -----
    Edge indices and features are cast to the module-level ``int_type``.
    ``astype`` does not range-check, so values outside that dtype's range
    would wrap silently.
    """
    stop = min(start+chunk_size, len(data))
    for i in iter_tracker(range(start, stop)):
        g, t = data[i]
        grp = ds.create_group(f'{i:0>10d}')
        dgrp = grp.create_group('data')

        dgrp.attrs['num_nodes'] = g.num_nodes()
        dgrp.attrs['num_edges'] = g.num_edges()

        # g.edges() yields the two endpoint tensors; stacking on the last
        # dimension gives one row per edge.
        dgrp['edges'] = torch.stack(g.edges(), dim=-1).numpy().astype(int_type)

        dnfgrp = dgrp.create_group('features/nodes')
        defgrp = dgrp.create_group('features/edges')
        for fname, fval in g.ndata.items():
            dnfgrp[fname] = fval.numpy().reshape([-1,num_node_feats]).astype(int_type)
        for fname, fval in g.edata.items():
            defgrp[fname] = fval.numpy().reshape([-1,num_edge_feats]).astype(int_type)

        tgrp = grp.create_group('targets')
        # Tensor.numpy() shares memory with the tensor, which may itself be a
        # view into the dataset's label storage. Copy before patching NaNs so
        # the sentinel is only written to the file, never back into `data`.
        t = t.numpy().copy()
        t[np.isnan(t)] = nan_fill
        tgrp['value'] = t

        # Flush periodically so a crash mid-chunk loses little work.
        if not (i%10000):
            ds.file.flush()


# Number of samples stored per part file.
chunk_size = 400000

def save_data_chunks(split, dat):
    """Write the subset ``dat`` as a series of HDF5 part files.

    Samples are cut into consecutive runs of ``chunk_size``; run number
    ``part_no`` goes to
    ``datasets/<DATASET_NAME>/parts/<DATASET_NAME>_<split>_<part_no>.h5``.
    Every part file carries the dataset-level statistics as attributes on
    its root group and a single ``<split>_<part_no>`` sub-group that is
    filled by ``save_dataset``.
    """
    for part_no, first_idx in enumerate(range(0, len(dat), chunk_size)):
        dest_file = f'datasets/{DATASET_NAME}/parts/{DATASET_NAME}_{split}_{part_no:0>3d}.h5'
        Path(dest_file).parent.mkdir(exist_ok=True, parents=True)

        print(f'Saving to {dest_file}...')

        # Dataset-wide statistics gathered by the exploration cell.
        root_attrs = {
            'num_atom_type': (n_maximum+1),
            'num_bond_type': (e_maximum+1),
            'num_atom_feats': n_maximum.shape[0],
            'num_bond_feats': e_maximum.shape[0],
            'num_min_atoms': min_nodes,
            'num_max_atoms': max_nodes,
            'min_targ_val': min_t,
            'max_targ_val': max_t,
        }

        with h5py.File(dest_file, 'w') as file:
            ds = file.create_group(f'{DATASET_NAME}')
            for attr_name, attr_val in root_attrs.items():
                ds.attrs[attr_name] = attr_val

            part_ds = ds.create_group(f'{split}_{part_no:0>3d}')
            save_dataset(part_ds, dat, first_idx, chunk_size)


# Export every split under the name used in the part-file and group names.
for split_name, subset in (('training', trainset),
                           ('validation', valset),
                           ('test', testset)):
    save_data_chunks(split_name, subset)

# Create Parent Dataset

In [14]:
import h5py as h5
from pathlib import Path
from collections import defaultdict

DATASET_NAME = 'PCQM4M'
parts_p = Path(f'datasets/{DATASET_NAME}/parts')

# Index every part file: split name -> [(chunk name, file link, group path)].
groups = defaultdict(list)
for part_path in parts_p.rglob('*.h5'):
    # Link target expressed relative to the directory that holds the parent
    # file (two levels above the part file).
    link_target = part_path.relative_to(part_path.parents[1]).as_posix()
    with h5.File(str(part_path),'r') as file:
        for name, grp in file[DATASET_NAME].items():
            # Chunk groups are named '<split>_<part_no>'.
            split_key = name.split('_')[0]
            print(name, split_key)
            groups[split_key].append((name, link_target, grp.name))

# Parent file: same root attributes as the parts, plus one group per split
# whose members are external links to the chunk groups in the part files.
with h5.File(f'datasets/{DATASET_NAME}/{DATASET_NAME}.h5','w') as file:
    ds = file.create_group(f'{DATASET_NAME}')

    root_attrs = {
        'num_atom_type': (n_maximum+1),
        'num_bond_type': (e_maximum+1),
        'num_atom_feats': n_maximum.shape[0],
        'num_bond_feats': e_maximum.shape[0],
        'num_min_atoms': min_nodes,
        'num_max_atoms': max_nodes,
        'min_targ_val': min_t,
        'max_targ_val': max_t,
    }
    for attr_name, attr_val in root_attrs.items():
        ds.attrs[attr_name] = attr_val

    for grp_name, grp_links in groups.items():
        grp_ds = ds.create_group(grp_name)
        for chunk_name, fl_link, item_link in grp_links:
            grp_ds[chunk_name] = h5.ExternalLink(fl_link, item_link)

test_000 test
training_000 training
training_001 training
training_002 training
training_003 training
training_004 training
training_005 training
training_006 training
training_007 training
validation_000 validation


# Check File

In [16]:
# Spot-check the parent file: sizes of a few groups reached through the
# external links, then one stored target value.
with h5.File(f'datasets/{DATASET_NAME}/{DATASET_NAME}.h5','r') as file:
    ds = file[f'{DATASET_NAME}']
    for group_path in ('training/training_007', 'validation', 'test'):
        print(len(ds[group_path]))
    sample_target = ds[f'test/test_000/{300000:0>10d}/targets/value']
    print(sample_target[()])

245360
1
1
-10.0
