In [10]:
import json
from pathlib import Path
import pickle
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from tqdm import tqdm

In [11]:
DATASET = 'DRUGS'
NUM_CONFS = 50

DATA_PATH = Path(f'data/{DATASET}')
DATASET_UNPROCESSED = DATA_PATH / DATASET
summary_path = DATA_PATH / 'summary_preprocessed.json'

smiles_all = json.loads((DATA_PATH / 'summary_preprocessed.json').read_text())
smiles_sampling = {k: v for k, v in smiles_all.items() if v.get('split') == 'test'}
smiles_sampling_small = {k: v for k, v in smiles_all.items() if v.get('split') == 'test' and 'small' in v.get('subsets', [])}

In [12]:
# Write smiles from test set to torsional diffusion format
output_csv = DATA_PATH / 'test_smiles_small.csv'
output_csv.write_text('smi, n_confs, smi_fixed')
output_csv.write_text('\n'.join(f'{k}, {NUM_CONFS}, {k}' for k in smiles_sampling_small.keys()))

269743

##### Write out validation set smiles and conformations

In [15]:
DATASET = 'DRUGS'
SPLIT = 'val'

DATA_PATH = Path(f'data/{DATASET}')
DATASET_UNPROCESSED = DATA_PATH / DATASET.lower()
summary_path = DATA_PATH / 'summary_preprocessed.json'

def geom_to_df(data_path: str, summary_path: str, split: str = 'val', subset: str = 'small'):
    print(f'Data path: {data_path}, summary path: {summary_path}')
    summary = json.loads(Path(summary_path).read_text())
    smiles = [k for k, v in summary.items() if v.get('split') == split and 'small' in v.get('subsets', [])]
    result = []
    n_skipped = 0
    for smi in tqdm(smiles, desc='Processing molecules'):
        mol_path = Path(data_path) / (smi + '.pickle')
        try:
            conf_dict = pickle.loads(Path(mol_path).read_bytes())
        except FileNotFoundError:
            # print(f'File {mol_path} not found')
            n_skipped += 1
            continue
        confs = [Chem.RemoveHs(elem['rd_mol']) for elem in conf_dict['conformers']]
        result.append(pd.DataFrame({'smiles': [Chem.CanonSmiles(smi),]*len(confs), 'ROMol': confs}))
    print(f'Skipped {n_skipped} molecules')
    return pd.concat(result)

def geom_to_torsional_diffusion_input(data_path: str, summary_path: str, split: str = 'val', subset: str = 'small'):
    df = geom_to_df(data_path, summary_path, split, subset)
    grouped_by_smiles = df.groupby('smiles')
    num_confs_by_smiles = grouped_by_smiles['ROMol'].aggregate('count')
    num_confs_by_smiles.name = 'num_confs'
    num_confs_by_smiles = num_confs_by_smiles // 2
    res = num_confs_by_smiles.reset_index()
    res.assign(smiles_copy=res['smiles']).to_csv(Path(summary_path).with_name(f'{split}_smiles_{subset}.csv'), index=False)

In [16]:
SPLIT = 'val'
data = geom_to_df(str(DATASET_UNPROCESSED), summary_path, SPLIT)
PandasTools.WriteSDF(data, f'data/DRUGS/{SPLIT}_small.sdf', molColName='ROMol', properties=['smiles'])

Data path: data/DRUGS/drugs, summary path: data/DRUGS/summary_preprocessed.json


Processing molecules: 100%|██████████| 1656/1656 [00:29<00:00, 56.84it/s] 


Skipped 167 molecules


In [17]:
geom_to_torsional_diffusion_input(str(DATASET_UNPROCESSED), summary_path, SPLIT)

Data path: data/DRUGS/drugs, summary path: data/DRUGS/summary_preprocessed.json


Processing molecules: 100%|██████████| 1656/1656 [00:33<00:00, 49.68it/s] 


Skipped 167 molecules


In [18]:
SPLIT = 'test'
data = geom_to_df(str(DATASET_UNPROCESSED), summary_path, SPLIT)
PandasTools.WriteSDF(data, f'data/DRUGS/{SPLIT}_small.sdf', molColName='ROMol', properties=['smiles'])

Data path: data/DRUGS/drugs, summary path: data/DRUGS/summary_preprocessed.json


Processing molecules: 100%|██████████| 2910/2910 [01:15<00:00, 38.57it/s] 


Skipped 415 molecules


In [19]:
geom_to_torsional_diffusion_input(str(DATASET_UNPROCESSED), summary_path, SPLIT)

Data path: data/DRUGS/drugs, summary path: data/DRUGS/summary_preprocessed.json


Processing molecules: 100%|██████████| 2910/2910 [00:49<00:00, 58.60it/s] 


Skipped 415 molecules


##### Assemble the samples from pieces

In [2]:
import pickle
from pathlib import Path
import pandas as pd
from typing import Any
from tqdm.auto import tqdm
from rdkit import Chem
from rdkit.Chem import PandasTools

In [3]:
data_pieces = ['data/PLATINUM/drugs_cosmic_weights_50confs_20steps_0.pkl',
               'data/PLATINUM/drugs_cosmic_weights_50confs_20steps_1.pkl',
               'data/PLATINUM/drugs_cosmic_weights_50confs_20steps_2.pkl',
]

def pkl_to_dataframe(input_pkl: str) -> pd.DataFrame:
    dict_data = pickle.load(Path(input_pkl).open('rb'))
    result = []
    for smi, confs in tqdm(dict_data.items(), desc='molecules'):
            canon_smi = Chem.CanonSmiles(smi)
            confs_nohs = [Chem.RemoveHs(cnf) for cnf in confs]
            data = pd.DataFrame({'smiles': [canon_smi]*len(confs_nohs), 'ROMol': confs_nohs})
            result.append(data)
    return pd.concat(result)

def assemble_sdf(pieces: list[str], out_sdf: str) -> None:
    result = []
    for piece in tqdm(pieces, desc='parts'):
        data = pkl_to_dataframe(piece)
        result.append(data)
    PandasTools.WriteSDF(pd.concat(result), out_sdf, molColName='ROMol', properties=['smiles'])

In [7]:
assemble_sdf(data_pieces, 'data/PLATINUM/drugs_cosmic_weights_50confs_20steps.sdf')

parts:   0%|          | 0/3 [00:00<?, ?it/s]

molecules:   0%|          | 0/1518 [00:00<?, ?it/s]

molecules:   0%|          | 0/1518 [00:00<?, ?it/s]

molecules:   0%|          | 0/1507 [00:00<?, ?it/s]

##### Transform sampled pickles into SDF

In [8]:
# Write sampled test set to SDF
data = pkl_to_dataframe('data/DRUGS/sampled_test_smiles_small_50confs_20steps.pkl')
PandasTools.WriteSDF(data, 'data/DRUGS/sampled_test_smiles_small_50confs_20steps.sdf', molColName='ROMol', properties=['smiles'])

molecules:   0%|          | 0/2902 [00:00<?, ?it/s]

In [9]:
# Write sampled val set to SDF
data = pkl_to_dataframe('data/DRUGS/sampled_val_smiles_small_50confs_20steps.pkl')
PandasTools.WriteSDF(data, 'data/DRUGS/sampled_val_smiles_small_50confs_20steps.sdf', molColName='ROMol', properties=['smiles'])

molecules:   0%|          | 0/1464 [00:00<?, ?it/s]