# Toolbox

> In this module, we develop various toolbox

In [1]:
#| default_exp toolbox

In [2]:
#| hide
import sys
sys.path.append("/notebooks/tools")
from nbdev.showdoc import *
%matplotlib inline

In [3]:
#| export
from tools.dataset import Data
from fastbook import *
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs


In [4]:
#| export
def tanimoto(df, #pandas DataFrame with SMILES and ID columns.
                         smiles_col='SMILES', #name of the column containing the SMILES strings (default: 'SMILES').
                         id_col='ID', #name of the column containing the molecule IDs (default: 'ID').
                         target_col=None, #name of the column containing the target values (default: None).
                         radius=2, #radius of the Morgan fingerprint (default: 2).
                        ):
    """
    Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame.
    
    """

    # Convert SMILES to molecule objects
    df['Molecule'] = df[smiles_col].apply(lambda x: Chem.MolFromSmiles(x))

    # Calculate fingerprints
    df['Fingerprint'] = df['Molecule'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius))

    # Calculate similarity scores
    similarity_scores = []
    for i in range(len(df)):
        for j in range(i+1, len(df)):
            sim_score = DataStructs.TanimotoSimilarity(df['Fingerprint'][i], df['Fingerprint'][j])
            if target_col is not None:
                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score, df[target_col][i], df[target_col][j]))
            else:
                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score))

    # Create a new DataFrame with the similarity scores
    if target_col is not None:
        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore', 'Target1', 'Target2'])
    else:
        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore'])

    # Sort by similarity score in descending order
    result_df.sort_values('SimilarityScore', ascending=False, inplace=True)
    result_df = result_df.reset_index(drop=True)

    return result_df

In [5]:
show_doc(tanimoto)

---

### tanimoto

>      tanimoto (df, smiles_col='SMILES', id_col='ID', target_col=None,
>                radius=2)

Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df |  |  | pandas DataFrame with SMILES and ID columns. |
| smiles_col | str | SMILES | name of the column containing the SMILES strings (default: 'SMILES'). |
| id_col | str | ID | name of the column containing the molecule IDs (default: 'ID'). |
| target_col | NoneType | None | name of the column containing the target values (default: None). |
| radius | int | 2 | radius of the Morgan fingerprint (default: 2). |

In [39]:
df = Data.get_g12d()

cols = ['ID','SMILES','IC50']

df = df[cols]

df = df.dropna(subset= 'IC50').reset_index(drop=True)

In [43]:
df

Unnamed: 0,ID,SMILES,IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)-c1cc(O)cc2ccccc12,124.7
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,2.7
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,9.5
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,496.2
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,722.9
...,...,...,...
700,paper_34,FC1=C(C2=C(C=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,27.0
701,paper_35,FC1=C(C2=C(C(F)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,7.0
702,paper_36,FC1=C(C2=C(C(C#C)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,2.0
703,paper_37,FC1=C(C2=C(C(Cl)=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,2.0


In [26]:
result = tanimoto(df, target_col = 'IC50')

In [27]:
result

Unnamed: 0,ID1,ID2,SMILES1,SMILES2,SimilarityScore,Target1,Target2
0,US_26,US_461,Oc1cc(-c2ncc3c(nc(OCCN4C[C@@H]5CC4CO5)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,[C@@H]12OC[C@@H](N(C1)CCOC=1N=C(C3=C(N1)C(=C(N=C3)C3=CC(=CC1=CC=CC=C31)O)F)N3C[C@H]1CC[C@@H](C3)N1)C2,1.000000,42.1,62.0
1,US_18,paper_17,CN(C)CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCCN(C)C)N=C4N5C[C@@H](CC6)N[C@@H]6C5,1.000000,76.8,70.0
2,646_5,646_18,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OCC23CCCN3C(CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3[C@H](CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,1.000000,1.6,2.0
3,US_219,US_225,CO[C@@H]1CN2CCC[C@]2(COc2nc(N3CC4CCC(C3)N4)c3cnc(c(F)c3n2)-c2cccc3cccc(Cl)c23)C1,CO[C@H]1CN2CCC[C@]2(COc2nc(N3CC4CCC(C3)N4)c3cnc(c(F)c3n2)-c2cccc3cccc(Cl)c23)C1,1.000000,8.1,9.2
4,646_89,646_90,ClC=1C=CC=C2C=CC=C(C12)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C/C(/[C@@H](C1)N2)=C\C#N)OC[C@H]2N(CCC2)C,ClC=1C=CC=C2C=CC=C(C12)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C/C(/[C@@H](C1)N2)=C/C#N)OC[C@H]2N(CCC2)C,1.000000,10000.0,1462.9
...,...,...,...,...,...,...,...
248155,31678_13,646_22,ClC1=C(C(=C2C=NNC2=C1)C1=C(C=2N=C(N=C(C2C=N1)NCC1(CCC1)N(C)C)OC[C@]12CCCN2C[C@@H](C1)F)F)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.113821,165.0,7862.6
248156,31678_57,646_22,CN(C1(CCC1)CNC=1C2=C(N=C(N1)OC[C@]13CCCN3C[C@@H](C1)F)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)CC)F)O)F)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.112000,1.1,7862.6
248157,31678_46,646_22,CN(C(OC[C@H]1CC[C@]2(CCCN12)COC=1N=C(C2=C(N1)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)CC)F)O)F)NCC2(CCC2)N(C)C)=O)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.105263,0.7,7862.6
248158,31678_45,646_22,CN(C(OC[C@H]1CC[C@]2(CCCN12)COC=1N=C(C2=C(N1)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)C#C)F)O)F)NCC2(CCC2)N(C)C)=O)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.104478,0.4,7862.6


In [28]:
result = result.query("SimilarityScore ==1")

In [30]:
result = result.copy()

In [31]:
result['var'] = result.apply(lambda x: abs(np.log10(x['Target1']/x['Target2'])), axis=1)

In [34]:
result['mean'] = result.apply(lambda x: np.nan if x['var'] > 1.5 else (x['Target1'] + x['Target2']) / 2, axis=1)

In [36]:
result.to_csv('tanimoto.csv',index=False)

Modify the tanimoto.csv, take average of 2/3/4 duplicates, if its is from paper, then ignore it when averaging

In [57]:
dup = pd.read_csv('IC50_duplicate.csv')

In [61]:
dup.rename(columns={'SMILES2':'SMILES'},inplace=True)

In [62]:
dup

Unnamed: 0,ID1,ID2,SMILES,mean
0,US_65,US_66,Fc1c(ncc2c(nc(OC[C@@]34CCCN3C(CCl)CC4)nc12)N1CC2CCC(C1)N2)-c1cccc2cccc(Cl)c12,
1,US_55,US_56,OC[C@@H](O)COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cccc2cccc(Cl)c12,6024.000000
2,US_252,US_482,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3C[C@H](C2)F)C(=C(N=C1)C1=CC(=CC2=CC=C(C(=C12)C#C)F)O)F,1.200000
3,US_259,US_468,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3C[C@H](C2)F)C(=C(N=C1)C1=CC(=CC2=CC=CC(=C12)CC)O)F,1.400000
4,US_451,US_467,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@]23CCCN3C[C@@H](C2)F)C(=C(N=C1)C=1C=C(C=C(C1[C@@H]1[C@@H](C1)C)Cl)O)F,1.333333
...,...,...,...,...
69,646_4,646_54,OC=1C=C(C2=CC=CC=C2C1)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C[C@H]([C@@H](C1)N2)O)OC[C@H]2N(CCC2)C,
70,646_5,646_18,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3[C@H](CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,1.800000
71,646_121,646_122,C(C)C2=C1C(=CC=CC1=CC=C2)N8CC3=C(C(=NC(=N3)OC[C@@]45CCCN4C[C@@H](C5)F)[C@@H]6C[C@@H]7CC[C@H](C6)N7)CC8,121.450000
72,646_49,646_117,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@]23CCCN3C[C@@H](C2)F)CN(CC1)C1=CC(=CC2=CC=CC(=C12)Cl)O,0.600000


In [54]:
unique_ids = set(dup['ID1']).union(set(dup['ID2']))

In [56]:
df[~df['ID'].isin(unique_ids)]

Unnamed: 0,ID,SMILES,IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)-c1cc(O)cc2ccccc12,124.7
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,496.2
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,722.9
5,US_6,Cc1cccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,434.1
6,US_7,Oc1cc(-c2ncc3c(nc(OCCc4ncccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,1867.3
...,...,...,...
675,paper_9,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OC[C@H]5N(C)CCC5)N=C4N6CCOCC6,12000.0
676,paper_10,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OC[C@H]5N(C)CCC5)N=C4N6CC(C)NCC6,2100.0
677,paper_11,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OC[C@H]5N(C)CCC5)N=C4N6C(C)CNCC6,1500.0
678,paper_12,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OC[C@H]5N(C)CCC5)N=C4N6CC7NCC6C7,650.0


In [64]:
df

Unnamed: 0,ID,SMILES,IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)-c1cc(O)cc2ccccc12,124.7
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,2.7
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,9.5
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,496.2
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,722.9
...,...,...,...
700,paper_34,FC1=C(C2=C(C=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,27.0
701,paper_35,FC1=C(C2=C(C(F)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,7.0
702,paper_36,FC1=C(C2=C(C(C#C)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,2.0
703,paper_37,FC1=C(C2=C(C(Cl)=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,2.0


In [63]:
pd.concat([df, dup.rename(columns={'ID2': 'ID', 'mean': 'IC50'})])


Unnamed: 0,ID,SMILES,IC50,ID1
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)-c1cc(O)cc2ccccc12,124.70,
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,2.70,
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,9.50,
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,496.20,
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,722.90,
...,...,...,...,...
69,646_54,OC=1C=C(C2=CC=CC=C2C1)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C[C@H]([C@@H](C1)N2)O)OC[C@H]2N(CCC2)C,,646_4
70,646_18,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3[C@H](CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,1.80,646_5
71,646_122,C(C)C2=C1C(=CC=CC1=CC=C2)N8CC3=C(C(=NC(=N3)OC[C@@]45CCCN4C[C@@H](C5)F)[C@@H]6C[C@@H]7CC[C@H](C6)N7)CC8,121.45,646_121
72,646_117,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@]23CCCN3C[C@@H](C2)F)CN(CC1)C1=CC(=CC2=CC=CC(=C12)Cl)O,0.60,646_49


In [22]:
#| hide
import nbdev; nbdev.nbdev_export()