In [15]:
import glob
import os.path as osp
import json
from pathlib  import Path
import pickle

import numpy as np

In [16]:
# Root folder of the QM9 data; every other path in this notebook is derived from it.
DATA_PATH = Path('data/QM9')
# Folder whose *.pickle files are listed and indexed further down.
DATASET_UNPROCESSED = DATA_PATH.joinpath('qm9')

In [17]:
# Parse the preprocessed summary: a mapping from SMILES string to a metadata dict.
with (DATA_PATH / 'summary_preprocessed.json').open() as summary_file:
    smiles_all = json.load(summary_file)

# Keep only the entries whose optional 'subsets' list contains the 'full' tag.
smiles_small = {
    smiles: entry
    for smiles, entry in smiles_all.items()
    if 'full' in entry.get('subsets', [])
}

In [18]:
# Report how many SMILES the summary holds versus how many carry the 'full' tag.
print(
    f"Number of SMILES in the original dataset: {len(smiles_all)}\n"
    f"Number of SMILES in the COSMIC subsampled dataset: {len(smiles_small)}"
)

Number of SMILES in the original dataset: 133258
Number of SMILES in the COSMIC subsampled dataset: 133008


In [19]:
# List the pickle files in sorted order; create_new_index() below sorts the same
# way and addresses the files by their position in this ordering.
pickles = np.array(sorted(glob.glob(osp.join(DATASET_UNPROCESSED, '*.pickle'))))

# State the actual relation between the two counts instead of asserting equality,
# and label len(smiles_all) the same way as the size report above ("original dataset").
print(
    f"Number of pickle files: {len(pickles)} "
    f"{'matches' if len(pickles) == len(smiles_all) else 'differs from'} "
    f"the number of SMILES in the original dataset: {len(smiles_all)}"
)

Number of pickle files: 133232 is the same as the number of SMILES in COSMIC: 133258


In [20]:
# The old split file unpacks into three index collections (train / val / test).
# NOTE(review): allow_pickle=True unpickles file content, so only load a trusted split.npy.
train, val, test = np.load(
    DATA_PATH / 'split.npy',
    allow_pickle=True,
)
print(
    f"Split of the torsional-diffusion dataset:\ntrain: {len(train)} validation: {len(val)} test: {len(test)}"
)

Split of the torsional-diffusion dataset:
train: 106586 validation: 13323 test: 13323


In [23]:
# Entries of the summary that are tagged as belonging to the 'full' subset.
smiles_small = dict(
    (smiles, entry)
    for smiles, entry in smiles_all.items()
    if 'full' in entry.get('subsets', [])
)

# Partition that subset by the 'split' label stored with every entry.
small_train_old = dict(
    (smiles, entry) for smiles, entry in smiles_small.items() if entry['split'] == 'train'
)
small_val_old = dict(
    (smiles, entry) for smiles, entry in smiles_small.items() if entry['split'] == 'val'
)
small_test_old = dict(
    (smiles, entry) for smiles, entry in smiles_small.items() if entry['split'] == 'test'
)
print(
    f"Split of the COSMIC dataset:\ntrain: {len(small_train_old)} validation: {len(small_val_old)} test: {len(small_test_old)}"
)

Split of the COSMIC dataset:
train: 109318 validation: 7134 test: 16556


In [24]:
def create_new_index(dataset_root, split_idx_file_old, smiles_use_list):
    """Re-partition the previously split pickle files according to new SMILES groups.

    The ``*.pickle`` files under ``dataset_root`` are sorted by path and addressed
    by their position in that ordering. Only positions listed in the old split
    file are considered; each of those files is loaded and its ``'smiles'`` entry
    decides which of the new splits the position is assigned to. Files whose
    SMILES appears in none of the three groups are dropped.

    Args:
        dataset_root: Directory containing the ``*.pickle`` files.
        split_idx_file_old: Path of a ``.npy`` file that unpacks into three index
            arrays (train, val, test) referring to the sorted pickle list.
        smiles_use_list: Sequence of exactly three containers
            ``(train, val, test)`` that support ``in`` tests on SMILES strings.
            Membership is tested once per file, so dicts or sets are preferable
            to lists for large datasets.

    Returns:
        A list ``[train, val, test]`` of integer index arrays into the sorted
        pickle list. Groups are checked in the order train, val, test, so a
        SMILES present in several groups lands in the first one that matches.
        Empty splits are returned as empty integer arrays, so every returned
        array can be used for indexing.

    Raises:
        KeyError: If a loaded pickle has no ``'smiles'`` entry.
    """
    # NOTE(review): allow_pickle=True unpickles file content; use a trusted split file only.
    train, val, test = np.load(split_idx_file_old, allow_pickle=True)
    train_smiles, val_smiles, test_smiles = smiles_use_list

    pickles = np.array(sorted(glob.glob(osp.join(dataset_root, '*.pickle'))))
    index = np.arange(len(pickles))
    index_use = index[np.concatenate([train, val, test])]
    pickles_use = pickles[index_use]

    split_train = []
    split_val = []
    split_test = []
    for idx, pickle_file in zip(index_use, pickles_use):
        # Close every file right after reading it; the loop can touch a very
        # large number of files and must not leak one handle per file.
        # NOTE(review): pickle.load can execute arbitrary code; the dataset
        # files are assumed to be trusted.
        with open(pickle_file, 'rb') as handle:
            obj = pickle.load(handle)
        smiles = obj['smiles']
        if smiles in train_smiles:
            split_train.append(idx)
        elif smiles in val_smiles:
            split_val.append(idx)
        elif smiles in test_smiles:
            split_test.append(idx)
        # Anything else belongs to no requested group and is left out.

    # Pin the dtype to the index dtype: np.array([]) would otherwise be float64,
    # which cannot be used as an index array by later consumers of the split.
    return [
        np.array(members, dtype=index.dtype)
        for members in (split_train, split_val, split_test)
    ]

In [25]:
# Rebuild the train/val/test index arrays using the COSMIC split assignment.
new_splits = create_new_index(
    dataset_root=DATASET_UNPROCESSED,
    split_idx_file_old=DATA_PATH / 'split.npy',
    smiles_use_list=[small_train_old, small_val_old, small_test_old],
)

In [26]:
# Persist the three index arrays together as one object array.
np.save(
    file=DATA_PATH / 'full_split.npy',
    arr=np.array(new_splits, dtype=object),
    allow_pickle=True,
)

In [27]:
# Recompute the COSMIC subset and its per-split dictionaries so that their sizes
# are printed right before the sizes of the regenerated index.
smiles_small = {
    smi: meta
    for smi, meta in smiles_all.items()
    if 'full' in meta.get('subsets', [])
}
small_train_old = {
    smi: meta for smi, meta in smiles_small.items() if meta['split'] == 'train'
}
small_val_old = {
    smi: meta for smi, meta in smiles_small.items() if meta['split'] == 'val'
}
small_test_old = {
    smi: meta for smi, meta in smiles_small.items() if meta['split'] == 'test'
}
print(
    f"Split of the COSMIC dataset:\ntrain: {len(small_train_old)} validation: {len(small_val_old)} test: {len(small_test_old)}"
)

Split of the COSMIC dataset:
train: 109318 validation: 7134 test: 16556


In [28]:
# Read the saved index back and report the size of each split.
# NOTE(review): allow_pickle=True unpickles file content, so only load a trusted file.
small_train, small_val, small_test = np.load(
    DATA_PATH / 'full_split.npy',
    allow_pickle=True,
)
print(
    f"Split of the new index set:\ntrain: {len(small_train)} validation:  {len(small_val)} test: {len(small_test)}"
)

Split of the new index set:
train: 109318 validation:  7134 test: 16556
