## Code to prepare the input molecules for the PACHQA dataset
## PubChemPCH subset

In [None]:
import collections
import copy
import os
import random
from subprocess import run

import mols2grid
import requests
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdchem

In [None]:
# Read every record of the PubChem download. Explicit hydrogens are kept and
# sanitization is skipped, so the molecules are used exactly as stored.
pch_supplier = Chem.SDMolSupplier('chcl.sdf', removeHs=False, sanitize=False)
mols = list(pch_supplier)
# Keep the display call as the last expression so the grid is rendered.
mols2grid.display(mols)

In [None]:
# Select the candidate molecules for the PubChemPCH subset. A molecule is kept
# only if it is closed-shell, all of its carbons are aromatic, it has at most
# six rings, no ring larger than six atoms and no triple bonds.
mols_filtered = []
carbon_pattern = Chem.MolFromSmarts("[#6]")
for mol in mols:
    if Descriptors.NumRadicalElectrons(mol) != 0:
        continue  # no radicals
    # The molecules were loaded with sanitize=False, so the property cache is
    # refreshed (non-strict) before aromaticity is perceived.
    mol.UpdatePropertyCache(False)
    # Author's note: with the simple model only benzene rings are considered
    # aromatic.
    Chem.SetAromaticity(mol, Chem.AromaticityModel.AROMATICITY_SIMPLE)
    n_aromatic = len(list(mol.GetAromaticAtoms()))
    n_carbon = len(mol.GetSubstructMatches(carbon_pattern))
    if n_aromatic != n_carbon:
        continue  # molecules must have only aromatic carbons
    if Descriptors.RingCount(mol) > 6:
        continue  # 6 cycles or less
    ring_sizes = [len(ring) for ring in Chem.GetSSSR(mol)]
    if not ring_sizes:
        # A ring-free record can only get here when it contains no carbon at
        # all; skip it instead of crashing on max() of an empty list.
        continue
    if max(ring_sizes) > 6:
        continue  # no flexible macrocycles
    if any(bond.GetBondType() == rdchem.BondType.TRIPLE for bond in mol.GetBonds()):
        continue  # no triple bonds
    mols_filtered.append(mol)

In [None]:
# Show the automatically filtered molecules in a grid for visual inspection;
# presumably this is where the indices removed by hand below were picked.
mols2grid.display(mols_filtered)

In [None]:
# Positions (in mols_filtered) of some unstable bridged molecules that are
# removed by hand.
ids_to_remove = [24, 252, 284]
# enumerate() gives each molecule's position directly; the previous
# mols_filtered.index(mol) lookup rescanned the list for every molecule.
mols_handfiltered_PCH = [
    mol for position, mol in enumerate(mols_filtered)
    if position not in ids_to_remove
]

In [None]:
# Show the final PubChemPCH selection (after the manual removals) in a grid.
mols2grid.display(mols_handfiltered_PCH)

In [None]:
# Checking for duplicate SMILES: print every SMILES string that is produced by
# more than one molecule of the hand-filtered PubChemPCH set.
smiles_list = [Chem.MolToSmiles(mol) for mol in mols_handfiltered_PCH]
smiles_counts = collections.Counter(smiles_list)
duplicated_smiles = [
    smiles for smiles, occurrences in smiles_counts.items() if occurrences > 1
]
print(duplicated_smiles)

In [None]:
# Write every selected PubChemPCH molecule to
# pubchemPCH/<InChIKey>/pubchem_conf.sdf.
# os.makedirs(..., exist_ok=True) replaces the external `mkdir` command: it
# works on any platform and tolerates re-running the cell.
os.makedirs('pubchemPCH', exist_ok=True)
for mol in mols_handfiltered_PCH:
    ikey = Chem.MolToInchiKey(mol)
    os.makedirs(f'pubchemPCH/{ikey}', exist_ok=True)
    writer = Chem.SDWriter(f'pubchemPCH/{ikey}/pubchem_conf.sdf')
    try:
        writer.write(mol)
    finally:
        # Close explicitly so the record is flushed to disk right away.
        writer.close()

## PubChemPAH subset

In [None]:
# Read every record of the PAH download, keeping explicit hydrogens and
# skipping sanitization. This rebinds `mols` used by the PCH section.
mols = list(Chem.SDMolSupplier('ArCH.sdf', removeHs=False, sanitize=False))

In [None]:
# Select the candidate molecules for the PubChemPAH subset. A molecule is kept
# only if it is closed-shell, all of its carbons are aromatic, it has at most
# six rings, no ring larger than six atoms, no triple bonds and none of the
# two bridged fragments listed below.
mols_filtered = []
carbon_pattern = Chem.MolFromSmarts("[#6]")
# Unstable bridged fragments; compiled once here instead of once per molecule.
bridged_patterns = [
    Chem.MolFromSmarts('c1cc2ccc1cc2'),
    Chem.MolFromSmarts('c1c2cccc(cc1)c2'),
]
for mol in mols:
    if Descriptors.NumRadicalElectrons(mol) != 0:
        continue  # no radicals
    # The molecules were loaded with sanitize=False, so the property cache is
    # refreshed (non-strict) before aromaticity is perceived.
    mol.UpdatePropertyCache(False)
    # Author's note: with the simple model only benzene rings are considered
    # aromatic.
    Chem.SetAromaticity(mol, Chem.AromaticityModel.AROMATICITY_SIMPLE)
    n_aromatic = len(list(mol.GetAromaticAtoms()))
    n_carbon = len(mol.GetSubstructMatches(carbon_pattern))
    if n_aromatic != n_carbon:
        continue  # molecules must have only aromatic carbons
    if Descriptors.RingCount(mol) > 6:
        continue  # 6 cycles or less
    ring_sizes = [len(ring) for ring in Chem.GetSSSR(mol)]
    if not ring_sizes:
        # A ring-free record can only get here when it contains no carbon at
        # all; skip it instead of crashing on max() of an empty list.
        continue
    if max(ring_sizes) > 6:
        continue  # no flexible macrocycles
    if any(bond.GetBondType() == rdchem.BondType.TRIPLE for bond in mol.GetBonds()):
        continue  # no triple bonds
    if any(mol.HasSubstructMatch(pattern) for pattern in bridged_patterns):
        continue  # no unstable bridged fragment
    mols_filtered.append(mol)

In [None]:
# Show the automatically filtered PAH candidates in a grid for visual
# inspection; presumably this is where the indices removed by hand below were
# picked.
mols2grid.display(mols_filtered)

In [None]:
# Positions (in mols_filtered) of more unstable molecules that are removed by
# hand.
ids_to_remove = [2, 5, 6, 9, 12, 13, 16, 17, 18, 19, 22, 23, 8, 11]
# enumerate() gives each molecule's position directly; the previous
# mols_filtered.index(mol) lookup rescanned the list for every molecule.
mols_handfiltered_PAH = [
    mol for position, mol in enumerate(mols_filtered)
    if position not in ids_to_remove
]

In [None]:
# Show the final PubChemPAH selection (after the manual removals) in a grid.
mols2grid.display(mols_handfiltered_PAH)

In [None]:
# Write every selected PubChemPAH molecule to
# pubchemPAH/<InChIKey>/pubchem_conf.sdf.
# os.makedirs(..., exist_ok=True) replaces the external `mkdir` command: it
# works on any platform and tolerates re-running the cell.
os.makedirs('pubchemPAH', exist_ok=True)
for mol in mols_handfiltered_PAH:
    ikey = Chem.MolToInchiKey(mol)
    os.makedirs(f'pubchemPAH/{ikey}', exist_ok=True)
    writer = Chem.SDWriter(f'pubchemPAH/{ikey}/pubchem_conf.sdf')
    try:
        writer.write(mol)
    finally:
        # Close explicitly so the record is flushed to disk right away.
        writer.close()

## monoCl subset

In [None]:
# Build the monoCl subset: for every hydrogen of every PubChemPAH molecule,
# replace that single hydrogen by chlorine and write the result to
# monoCl/<InChIKey>/pubchem_conf.sdf, unless the same InChIKey was already
# written or belongs to PubChemPCH.
keylist = [Chem.MolToInchiKey(mol) for mol in mols_handfiltered_PCH]
os.makedirs('monoCl', exist_ok=True)
# Work on copies so the PubChemPAH molecules themselves stay unmodified.
mols_chlorinated = copy.deepcopy(mols_handfiltered_PAH)
for mol in mols_chlorinated:
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() != 1:
            continue
        atom.SetAtomicNum(17)  # H -> Cl, in place
        ikey = Chem.MolToInchiKey(mol)
        if ikey not in keylist:  # not already added or present in PubChemPCH
            keylist.append(ikey)
            os.makedirs(f'monoCl/{ikey}', exist_ok=True)
            writer = Chem.SDWriter(f'monoCl/{ikey}/pubchem_conf.sdf')
            try:
                # Presumably restricts the written SD properties so the stale
                # PubChem data of the parent molecule is not copied - confirm.
                writer.SetProps([''])
                writer.write(mol)
            finally:
                # Close explicitly so the record is flushed to disk.
                writer.close()
        atom.SetAtomicNum(1)  # restore the hydrogen before the next position

## perCl subset

In [None]:
# Build the perCl subset: replace every hydrogen of every PubChemPAH molecule
# by chlorine and write the result to perCl/<InChIKey>/pubchem_conf.sdf,
# unless the InChIKey is already present in PubChemPCH.
keylist = [Chem.MolToInchiKey(mol) for mol in mols_handfiltered_PCH]
os.makedirs('perCl', exist_ok=True)
# Work on copies so the PubChemPAH molecules themselves stay unmodified.
mols_perchlorinated = copy.deepcopy(mols_handfiltered_PAH)
for mol in mols_perchlorinated:
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() == 1:
            atom.SetAtomicNum(17)  # H -> Cl, in place
    ikey = Chem.MolToInchiKey(mol)
    if ikey in keylist:
        continue  # already present in PubChemPCH
    os.makedirs(f'perCl/{ikey}', exist_ok=True)
    writer = Chem.SDWriter(f'perCl/{ikey}/pubchem_conf.sdf')
    try:
        # Presumably restricts the written SD properties so the stale PubChem
        # data of the parent molecule is not copied - confirm.
        writer.SetProps([''])
        writer.write(mol)
    finally:
        # Close explicitly so the record is flushed to disk.
        writer.close()

## polyCl subset (random isomers)

In [None]:
from rdkit.Chem import rdqueries

# Build the polyCl subset: for every PubChemPAH molecule and every chlorine
# count i from 2 up to (number of hydrogens - 1), draw random substitution
# patterns until one gives an InChIKey that is neither in PubChemPCH nor
# already written, and store it in polyCl/<InChIKey>/pubchem_conf.sdf.
# If no new isomer turns up within max_attempts draws, the distinct isomers
# that were tried are collected in `excluded` for manual inspection.
# NOTE: the random generator is not seeded, so the selection differs per run.
max_attempts = 1000
excluded = []
keylist = [Chem.MolToInchiKey(mol) for mol in mols_handfiltered_PCH]
os.makedirs('polyCl', exist_ok=True)
# Work on copies so the PubChemPAH molecules themselves stay unmodified.
mols_polychlorinated = copy.deepcopy(mols_handfiltered_PAH)
# Atoms with mass below 2; presumably this selects the hydrogens - confirm.
light_atom_query = rdqueries.MassLessQueryAtom(2)
for mol in mols_polychlorinated:
    atoms = list(mol.GetAtomsMatchingQuery(light_atom_query))
    for i in range(2, len(atoms)):
        tried_keys = set()
        tried_mols = []
        # for/else: the else branch runs only if no draw yielded a new isomer.
        for _ in range(max_attempts):
            chlorines = random.sample(atoms, i)  # 1 random isomer with i chlorine atoms
            for atom in chlorines:
                atom.SetAtomicNum(17)
            ikey = Chem.MolToInchiKey(mol)
            is_new = ikey not in keylist
            if is_new:
                keylist.append(ikey)
                os.makedirs(f'polyCl/{ikey}', exist_ok=True)
                writer = Chem.SDWriter(f'polyCl/{ikey}/pubchem_conf.sdf')
                try:
                    # Presumably restricts the written SD properties so the
                    # stale PubChem data of the parent is not copied - confirm.
                    writer.SetProps([''])
                    writer.write(mol)
                finally:
                    # Close explicitly so the record is flushed to disk.
                    writer.close()
            elif ikey not in tried_keys:
                # Keep one chlorinated copy per distinct rejected isomer.
                tried_keys.add(ikey)
                tried_mols.append(copy.deepcopy(mol))
            # Single place where the molecule is restored to its parent form.
            for atom in chlorines:
                atom.SetAtomicNum(1)
            if is_new:
                break
        else:
            # Every draw was already known: give up and add the isomers to the
            # special list.
            excluded.extend(tried_mols)

In [None]:
# Check the list of excluded molecules to ensure that all the possible isomers
# with a given number of chlorines were indeed found in PubChemPCH.
mols2grid.display(excluded)