In [1]:
import torch
from torch_geometric.datasets import QM9
from torch_geometric.loader import DataLoader

# 1) Download + process QM9 automatically
dataset = QM9(root="data/QM9")

print(dataset)            # -> QM9(130831)
print(dataset[0])         # -> Data(...)

# 2) Select a target property index (0..18)
target = 0  # e.g. dipole moment Î¼

# 3) Optionally normalize the target
y = dataset.data.y[:, target]
mean = y.mean().item()
std = y.std().item()
print("Target mean/std:", mean, std)

# 4) Create splits and loaders
torch.manual_seed(42)
dataset = dataset.shuffle()
train_dataset = dataset[:110000]
val_dataset   = dataset[110000:120000]
test_dataset  = dataset[120000:]

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64)
test_loader  = DataLoader(test_dataset, batch_size=64)


  from .autonotebook import tqdm as notebook_tqdm
Downloading https://data.pyg.org/datasets/qm9_v3.zip
Extracting data/QM9/raw/qm9_v3.zip
Processing...
Using a pre-processed version of the dataset. Please install 'rdkit' to alternatively process the raw data.
Done!


QM9(130831)
Data(x=[5, 11], edge_index=[2, 8], edge_attr=[8, 4], y=[1, 19], pos=[5, 3], idx=[1], name='gdb_1', z=[5])
Target mean/std: 2.6729531288146973 1.5034793615341187


  y = dataset.data.y[:, target]


In [2]:
from torch_geometric.data import Data

data: Data = dataset[0]

print(data)
print("x:", data.x.shape)           # [num_atoms, num_node_features]
print("z:", data.z.shape)           # [num_atoms] atomic numbers
print("pos:", data.pos.shape)       # [num_atoms, 3] 3D coords
print("edge_index:", data.edge_index.shape)  # [2, num_edges]
print("edge_attr:", data.edge_attr.shape)    # [num_edges, num_edge_features]
print("y:", data.y.shape)           # [1, 19] all QM9 targets


Data(x=[21, 11], edge_index=[2, 46], edge_attr=[46, 4], y=[1, 19], pos=[21, 3], idx=[1], name='gdb_72082', z=[21])
x: torch.Size([21, 11])
z: torch.Size([21])
pos: torch.Size([21, 3])
edge_index: torch.Size([2, 46])
edge_attr: torch.Size([46, 4])
y: torch.Size([1, 19])


In [3]:
batch = next(iter(train_loader))
print(batch)
# batch.x, batch.pos, batch.edge_index, batch.edge_attr, batch.y
# batch.batch indicates which node belongs to which molecule


DataBatch(x=[1153, 11], edge_index=[2, 2382], edge_attr=[2382, 4], y=[64, 19], pos=[1153, 3], idx=[64], name=[64], z=[1153], batch=[1153], ptr=[65])
