In [None]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# MolMorganDataset to cover the chemical space of qmugs, patrick and CHEMBL with the fewest possible molecules

## Import necessary packages and data sets
| Name | Content |
| --- | --- |
| corrected |Data set from patrick with some faulty molecules removed |
| leadlike | Drug-like molecules from CHEMBL below 350 mw |
| solvents | Set of small organic solvents|
| qmugs500 | All molecules from the qmugs set below 500 mw (H's taken into account for the weight), with only one conformer per molecule |
| noH500 | All molecules from the qmugs set below 500 mw (H's NOT taken into account for the weight), with only one conformer per molecule |


In [None]:
from serenityff.charge.dataset_preperation.MolMorganDataset import MolMorganDataset
import pandas as pd

from rdkit import Chem

# Build one MolMorganDataset per input SDF file. The generator is consumed in
# order, so the datasets are constructed in the same sequence as the names on
# the left-hand side.
corrected, leadlike, solvents, qmugs500, noH500 = (
    MolMorganDataset(sdf_file)
    for sdf_file in (
        'path/to/corrected.sdf',
        'path/to/leadlike.sdf',
        'path/to/solvents.sdf',
        'path/to/qmugs500.sdf',
        'path/to/noH500.sdf',
    )
)

# Reduce initial set to remove redundant molecules
The starting point is the MolMorganDataset of molecules from qmugs that are below a molecular weight of 500 u (H's are included in the weight). The data set only contains one conformer of each molecule.
The `reduce` function uses a greedy approach to shrink the set to the minimum number of molecules needed so that every Morgan fingerprint present is still covered at least 5 times (default).

In [None]:
# Greedy reduction of the qmugs (< 500 u) set; cutoff=5 is the coverage
# threshold described in the section text above.
qreduced500 = qmugs500.reduce(cutoff=5, NewSetName='qreduced500')

# Add solvent molecules that we definitely want in the set
In a second step we add the solvent molecules that we know in advance we want in our final data set.

In [None]:
# Merge the solvent set into the reduced qmugs set.
q500solvents = qreduced500.add(NewSetName='q500solvents', otherset=solvents)

# Add molecules from corrected patrick set
Extend the MolMorganDataset with molecules from the corrected and leadlike sets. Only the minimal number of molecules needed to cover the fingerprints from all data sets at least 5 times is added. The same greedy approach is used.

In [None]:
# Extend the current set with molecules from `corrected`, then from `leadlike`.
# No cutoff is passed, so `reduce` uses its default.
qcorrected = q500solvents.reduce(otherset=corrected, NewSetName='qcorrected')
# NOTE(review): every other step names the new set after its variable, but this
# one is called 'leadlike' rather than 'qleadlike' -- confirm this is intended
# and cannot clash with the input leadlike set.
qleadlike = qcorrected.reduce(otherset=leadlike, NewSetName='leadlike')

# Reduce set to remove redundancies that occurred
Repeat procedure from first step and add solvents again to make sure none are missing in the final set

In [None]:
# Second reduction pass over the combined set (default cutoff).
qleadreduced = qleadlike.reduce(NewSetName='qleadreduced')

# Re-add the solvents in case the reduction dropped any of them.
final = qleadreduced.add(otherset=solvents, NewSetName='final')


# Compare final set with initial ones
Make sure none of the fingerprints are missing

In [None]:
# Re-create `final` from its SDF path; this rebinds the name and replaces the
# in-memory object produced by the previous cell.
final = MolMorganDataset('path/to/final.sdf')
# Presumably reports/plots the molecular-weight distribution of the set --
# TODO confirm against MolMorganDataset.weight_distribution.
final.weight_distribution()

In [None]:
# Compare the final set against the initial qmugs set; per the section text the
# goal is to confirm that no fingerprints went missing.
# NOTE(review): only qmugs500 is compared here -- corrected, leadlike and
# solvents are not checked in this cell.
final.compare(qmugs500)

# Write SMILES codes into a CSV file

In [None]:
def _first_seen_ids(mols, id_prop=None, required=False):
    """Map each distinct SMILES in ``mols`` to an identifier.

    Only the first molecule producing a given SMILES is recorded, so the
    insertion order of the returned dict is the order of first occurrence.

    Args:
        mols: iterable of RDKit molecules.
        id_prop: name of the molecule property holding the identifier, or
            ``None`` to store the placeholder ``0`` for every molecule.
        required: if True, ``GetProp`` is called unconditionally, so a missing
            property raises; if False, molecules without the property get the
            placeholder ``0``.

    Returns:
        dict mapping SMILES string -> identifier (or ``0``).
    """
    ids_by_smiles = {}
    for mol in mols:
        smiles = Chem.MolToSmiles(mol)  # computed once per molecule
        if smiles in ids_by_smiles:
            continue
        if id_prop is not None and (required or mol.HasProp(id_prop)):
            ids_by_smiles[smiles] = mol.GetProp(id_prop)
        else:
            ids_by_smiles[smiles] = 0
    return ids_by_smiles


wrongchembls = ['CHEMBL3590587',
 'CHEMBL3590586',
 'CHEMBL3590584',
 'CHEMBL3590585',
 'CHEMBL3617051',
 'CHEMBL3752539'] #got these manually

# Unique SMILES of the final set, in order of first occurrence. A set is used
# for the membership test so the loop stays linear in the number of molecules.
smiles_tot = []
_seen_final = set()
for mol in final._mols:
    smiles = Chem.MolToSmiles(mol)
    if smiles in _seen_final:
        print('redundant molecule')
    else:
        _seen_final.add(smiles)
        smiles_tot.append(smiles)

# SMILES -> ID lookups for every source set. The qmugs entries must carry a
# 'CHEMBL_ID' property; leadlike entries fall back to 0 without 'chembl_id'.
_qmugs_ids = _first_seen_ids(qmugs500._mols, id_prop='CHEMBL_ID', required=True)
_corrected_ids = _first_seen_ids(corrected._mols)
_solvent_ids = _first_seen_ids(solvents._mols)
_leadlike_ids = _first_seen_ids(leadlike._mols, id_prop='chembl_id')

# Keep the list-shaped names of the original cell available for later cells.
smiles_qmugs = list(_qmugs_ids)
ID_qmugs = list(_qmugs_ids.values())
smiles_corrected = list(_corrected_ids)
smiles_solvents = list(_solvent_ids)
smiles_leadlike = list(_leadlike_ids)
ID_leadlike = list(_leadlike_ids.values())

# Tag every final molecule with its source set:
#   0 = qmugs, 1 = solvents, 2 = corrected, 3 = leadlike,
#   10 = temporary marker for "not found in any source set".
# The order of the checks decides the tag for molecules present in several sets.
set_ID = []
ID_tot = []
wrongs = []
for position, sm in enumerate(smiles_tot):
    if sm in _qmugs_ids:
        set_ID.append(0)
        ID_tot.append(_qmugs_ids[sm])
    elif sm in _solvent_ids:
        set_ID.append(1)
        ID_tot.append(0)
    elif sm in _corrected_ids:
        set_ID.append(2)
        ID_tot.append(0)
    elif sm in _leadlike_ids:
        set_ID.append(3)
        ID_tot.append(_leadlike_ids[sm])
    else:
        print(position, ' is missing in others')
        set_ID.append(10)
        ID_tot.append('missing')
        wrongs.append(position)

# The unmatched molecules are assigned to the leadlike set (3) and receive the
# manually collected CHEMBL IDs. Pairing is positional, so there must be at
# least as many manual IDs as unmatched molecules.
if len(wrongs) > len(wrongchembls):
    raise ValueError(
        'found %d unmatched molecules but only %d manual CHEMBL IDs'
        % (len(wrongs), len(wrongchembls))
    )

for i, ind in enumerate(wrongs):
    if set_ID[ind] == 10:
        set_ID[ind] = 3
    else:
        print('mistake', ind)
    if ID_tot[ind] == 'missing':
        ID_tot[ind] = wrongchembls[i]
    else:
        print('mistake2', ind)

print(final._num_mol) #make sure that all list are same length and no molecules are missed
print(len(smiles_tot))
print(len(set_ID))
print(len(ID_tot))


In [None]:
# Assemble the three parallel lists as the columns of the output table.
printdata = {'Smiles': smiles_tot, "Set_ID": set_ID, "CHEMBL_ID": ID_tot}
pls = pd.DataFrame(data=printdata)

# Print the three column lengths on one line as a sanity check.
print(*(len(column) for column in (smiles_tot, set_ID, ID_tot)))

In [None]:
# Write the table to CSV, keeping the row index as the first column.
pls.to_csv(
    'final_smiles.csv',
    index=True,
)

In [None]:
# Hard-coded positions (into smiles_tot) of the molecules that could not be
# matched to any source set; note that this rebinds `wrongs`.
wrongs = [182035, 182207, 182208, 182209, 183744, 207865]

# SMILES strings of those molecules, in the same order as `wrongs`.
wrongsmiles = [smiles_tot[ind] for ind in wrongs]

In [None]:
# Assign the manually identified molecules to the leadlike set (3) and give
# them their manually collected CHEMBL IDs.
#
# The previous version called `ID_tot.insert(ind, )` without a value, which
# always raises TypeError, and inserting would also shift set_ID / ID_tot,
# which already hold one entry per molecule in smiles_tot. Overwriting in
# place keeps the lists aligned and makes the cell safe to re-run.
assert len(wrongs) == len(wrongchembls), 'wrongs and wrongchembls must pair up one-to-one'
for ind, chembl_id in zip(wrongs, wrongchembls):
    set_ID[ind] = 3
    ID_tot[ind] = chembl_id
    

In [None]:
# Find the unmatched molecules inside qleadlike: record their positions and
# their 'chembl_id' property.
qleadlikewrongs = []
chemblidwrongs = []
for position, candidate in enumerate(qleadlike._mols):
    if Chem.MolToSmiles(candidate) not in wrongsmiles:
        continue
    qleadlikewrongs.append(position)
    chemblidwrongs.append(qleadlike._mols[position].GetProp('chembl_id'))

In [None]:
# NOTE(review): `leadlikewrongs` is only built in the last cell of this
# notebook, so this cell fails on a fresh kernel run from top to bottom.
# Only the value of the final expression is shown as the cell output.
_first_wrong_mol = leadlike._mols[leadlikewrongs[0]]
Chem.MolToSmiles(_first_wrong_mol)
_first_wrong_mol
leadlike._mols[leadlikewrongs[5]].GetProp('chembl_id')

In [None]:
from rdkit.Chem.Draw import IPythonConsole

# Notebook drawing options: image size and atom-index labels.
IPythonConsole.molSize = (450, 400)
IPythonConsole.drawOptions.addAtomIndices = True

# Work on a copy so that stripping the conformers does not touch the molecule
# stored in the dataset; the bare `m` at the end renders it as the cell output.
m = Chem.Mol(leadlike._mols[leadlikewrongs[0]])
m.RemoveAllConformers()
m

In [None]:
# Call RDKit's Debug() on the first unmatched leadlike molecule for manual
# inspection (presumably dumps its atoms and bonds -- confirm in the RDKit docs).
leadlike._mols[leadlikewrongs[0]].Debug()

In [None]:
# Positions in `leadlike` of the molecules whose 'chembl_id' is one of the IDs
# collected in `chemblidwrongs`.
#
# Molecules without a 'chembl_id' property are skipped via an explicit HasProp
# check rather than a bare `except:`, which would also have hidden unrelated
# errors (including KeyboardInterrupt). `None` entries are skipped as well, so
# the scan stays best-effort.
_wrong_ids = set(chemblidwrongs)
leadlikewrongs = [
    i
    for i, mol in enumerate(leadlike._mols)
    if mol is not None
    and mol.HasProp('chembl_id')
    and mol.GetProp('chembl_id') in _wrong_ids
]