In [31]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from rdkit import rdBase
from tqdm.auto import tqdm
import seaborn as sns

In [15]:
tqdm.pandas()

In [11]:
class SillyWalks:
    def __init__(self, df):
        self.count_dict = {}
        for smi in df.SMILES:
            mol = Chem.MolFromSmiles(smi)
            if mol:
                with rdBase.BlockLogs():
                    fp = AllChem.GetMorganFingerprint(mol, 2)
                for k, v in fp.GetNonzeroElements().items():
                    self.count_dict[k] = self.count_dict.get(k, 0) + v

    def score(self, smiles_in):
        mol = Chem.MolFromSmiles(smiles_in)
        if mol:
            with rdBase.BlockLogs():
                fp = AllChem.GetMorganFingerprint(mol, 2)
            on_bits = fp.GetNonzeroElements().keys()
            silly_bits = [
                x for x in [self.count_dict.get(x) for x in on_bits] if x is None
            ]
            score = len(silly_bits) / len(on_bits)
        else:
            score = 1
        return score

In [12]:
df = pd.read_csv("/Users/pwalters/software/silly_walks/chembl_drugs.smi",sep=" ",names=["SMILES","Name"])

In [13]:
sw = SillyWalks(df)

In [16]:
chembl_df = pd.read_csv("/Users/pwalters/DATA/CReM/chembl_actives.txt",sep="\t",nrows=10000)
chembl_df['silly'] = chembl_df.smi.progress_apply(sw.score)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [17]:
chembl_df.silly.value_counts()

silly
0.333333    232
0.250000    186
0.285714    111
0.200000    110
0.300000     90
           ... 
0.460000      1
0.478873      1
0.448980      1
0.395833      1
0.119403      1
Name: count, Length: 697, dtype: int64

In [20]:
reinvent_df = pd.read_csv("/Users/pwalters/DATA/CReM/reinvent_all.txt",sep="\t",nrows=10000)
reinvent_df['silly'] = reinvent_df.smi.progress_apply(sw.score)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [21]:
reinvent_df.silly.value_counts()

silly
0.200000    213
0.250000    199
0.166667    152
0.222222    111
0.142857    110
           ... 
0.186667      1
0.121622      1
0.402778      1
0.270000      1
0.417910      1
Name: count, Length: 755, dtype: int64

In [22]:
chembl_df.query("silly > 0")

Unnamed: 0,id,protein,pdb_id,docking_score,smi,HBA,HBD,complexity,NumRings,RTB,...,FCsp3_BM,sa_score,plif_sim,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,act_class,silly
0,CHEMBL103006,BACE1,6UWP,-7.332,COc1ccc(/C=C/C(=O)C=C(O)/C=C/c2ccc(OC)cc2)cc1,4,1,5,2,7,...,0.000,2.31,0.0,IC50,'>',5000.0,nM,,0,0.225806
1,CHEMBL1080086,BACE1,6UWP,-9.932,O=C(CN1C(=O)/C(=N\NC(=O)c2cc([N+](=O)[O-])ccc2...,7,3,10,4,6,...,0.304,2.49,0.0,IC50,'=',4800.0,nM,5.32,0,0.171875
2,CHEMBL1080295,BACE1,6UWP,-10.146,Cc1ccc(NC(=O)CN2C(=O)/C(=N\NC(=O)c3cc([N+](=O)...,7,3,10,4,6,...,0.043,2.37,0.0,IC50,'=',2400.0,nM,5.62,0,0.142857
3,CHEMBL1083394,BACE1,6UWP,-7.733,Oc1cc(O)cc(Oc2c(O)cc(O)cc2Oc2c(O)cc(O)cc2O)c1,9,7,16,3,4,...,0.000,2.78,0.5,IC50,'=',11680.0,nM,4.93,0,0.250000
4,CHEMBL1089788,BACE1,6UWP,-8.007,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2Cc2cccc(N)n2)cc1,4,1,5,4,7,...,0.045,2.36,0.0,IC50,'=',1150.0,nM,5.94,0,0.240000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,CHEMBL4587925,DRD2,6CM4,-9.675,Cc1ccc(C2CCN(CC(=O)Nc3cccc(C)c3)CC2)nc1,3,1,4,3,4,...,0.333,2.02,0.0,IC50,'=',71065.0,nM,4.15,0,0.148936
9995,CHEMBL4588709,DRD2,6CM4,-7.458,CCCSc1nnc(-c2ccccc2)n1C,4,0,4,2,4,...,0.000,1.87,0.0,Ki,'=',180375.0,nM,3.74,0,0.193548
9996,CHEMBL4588808,DRD2,6CM4,-9.102,Cc1cccc(NC(=O)CCN2CCN(c3ncccn3)CC2)c1,5,1,6,3,5,...,0.353,1.95,1.0,Ki,'=',1490.0,nM,5.83,0,0.022222
9998,CHEMBL4589737,DRD2,6CM4,-8.152,COc1ccc2c(c1OC)-c1ccc(C)c3c1[C@@H](C2)N(C)CC3,3,0,3,4,2,...,0.250,2.93,0.0,IC50,'=',3629.0,nM,5.44,0,0.181818


In [23]:
reinvent_df.query("silly > 0")

Unnamed: 0,iteration,id,docking_score,smi,scaffold,HBA,HBD,complexity,NumRings,RTB,...,max_ring_size,ChiralCenters,ChiralCentersUndefined,FCsp3_BM,sa_score,run,plif_sim,protein,pdb_id,silly
0,0,44:0:0,-11.388,C=C1CNCCS(=O)(=O)n2nc3c(cc([C@H](C)c4ccccc4)c4...,C=C1CNCCS(=O)(=O)n2nc3c(cc(Cc4ccccc4)c4ccccc43...,5,1,6,5,2,...,8,1,0,0.174,3.94,1,0.0,BACE1,6UWP,0.327869
1,0,107:1:0,-9.441,CC(=O)N[C@H](C)c1ccc(O[C@H]2CCN(c3ccnc(N4CCCCC...,c1ccc(OC2CCN(c3ccnc(N4CCCCC4)n3)C2)cc1,6,1,7,4,6,...,6,2,0,0.474,3.11,1,0.0,BACE1,6UWP,0.178571
2,0,69:0:0,-8.936,CC(C)(C)c1ccc(C(=O)Nc2cc3c(-c4nc5ccccc5[nH]4)n...,O=C(Nc1cc2c(-c3nc4ccccc4[nH]3)n[nH]c2s1)c1ccccc1,4,3,7,5,3,...,6,0,0,0.000,2.78,1,0.0,BACE1,6UWP,0.218182
3,0,23:1:0,-9.750,CC(C)C1(n2ccc(-c3ncccn3)c2)CN([C@H]2CC[C@@H](C...,O=C(Nc1ccccc1)C1CCC(N2CC(n3ccc(-c4ncccn4)c3)C2...,5,1,6,5,6,...,6,0,0,0.375,3.14,1,0.0,BACE1,6UWP,0.241935
4,0,2:0:0,-9.305,CCCCCCCNc1nc2ccc(-c3nc(C)no3)cc2c2ccccc12,c1ccc2c(c1)cnc1ccc(-c3ncno3)cc12,5,1,6,4,8,...,6,0,0,0.000,2.29,1,0.0,BACE1,6UWP,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,85,59:0:0,-8.179,CCc1ccccc1N1O[C@H]2C(=O)N(c3ccccc3)C(=O)[C@H]2...,O=C1C2ON(c3ccccc3)C(c3ccco3)C2C(=O)N1c1ccccc1,5,0,5,5,4,...,6,3,0,0.143,3.40,1,0.0,BACE1,6UWP,0.333333
9996,85,85:1:0,-6.496,CN(C)[C@H]1CS[C@@](C(=S)N2CCOCC2)(c2ccccc2)SC1,S=C(N1CCOCC1)C1(c2ccccc2)SCCCS1,5,0,5,3,3,...,6,0,0,0.533,3.28,1,0.0,BACE1,6UWP,0.268293
9997,85,75:0:0,-8.427,CN(c1ccccc1)S(=O)(=O)c1ccc2c(c1)ncc(=O)n2-c1cc...,O=c1cnc2cc(S(=O)(=O)Nc3ccccc3)ccc2n1-c1ccccc1,5,0,5,4,4,...,6,0,0,0.000,2.18,1,0.0,BACE1,6UWP,0.186047
9998,85,31:0:0,-8.349,CN1C(=O)C(=O)N(C2c3ccccc3CCc3ccccc32)C1=O,O=C1NC(=O)N(C2c3ccccc3CCc3ccccc32)C1=O,3,0,3,4,1,...,7,0,0,0.167,2.36,1,0.0,BACE1,6UWP,0.225806


In [30]:
df_1 = chembl_df[["smi","silly"]].copy()
df_1["src"] = "chembl"
df_2 = reinvent_df[["smi"]].copy()
df_2["src"] = "reinvent"
combo_df = pd.concat([df_1,df_2])

In [32]:
sns.boxplot(x="src",y="silly",data=combo_df)

ValueError: Could not interpret value `silly` for `y`. An entry with this name does not appear in `data`.