# Feature engineering

> In this module, we develop tools to extract features from compounds, proteins, etc.

In [1]:
#| default_exp toolbox

In [2]:
#| hide
import sys
sys.path.append("/notebooks/tools")
from nbdev.showdoc import *
%matplotlib inline

In [12]:
#| export
from tools.dataset import Data
from fastbook import *
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs


In [4]:
df = Data.get_g12d()

In [5]:
cols = ['ID','SMILES','IC50']

In [6]:
df = df[cols]

In [7]:
df = df.dropna(subset= 'IC50').reset_index(drop=True)

In [8]:
df

Unnamed: 0,ID,SMILES,IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)-c1cc(O)cc2ccccc12,124.7
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,2.7
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,9.5
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,496.2
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,722.9
...,...,...,...
700,paper_34,FC1=C(C2=C(C=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,27.0
701,paper_35,FC1=C(C2=C(C(F)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,7.0
702,paper_36,FC1=C(C2=C(C(C#C)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,2.0
703,paper_37,FC1=C(C2=C(C(Cl)=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,2.0


In [17]:
#| export
def calculate_similarity(df, #pandas DataFrame with SMILES and ID columns.
                         smiles_col='SMILES', #name of the column containing the SMILES strings (default: 'SMILES').
                         id_col='ID', #name of the column containing the molecule IDs (default: 'ID').
                         target_col=None, #name of the column containing the target values (default: None).
                         radius=2, #radius of the Morgan fingerprint (default: 2).
                        ):
    """
    Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame.
    
    """

    # Convert SMILES to molecule objects
    df['Molecule'] = df[smiles_col].apply(lambda x: Chem.MolFromSmiles(x))

    # Calculate fingerprints
    df['Fingerprint'] = df['Molecule'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius))

    # Calculate similarity scores
    similarity_scores = []
    for i in range(len(df)):
        for j in range(i+1, len(df)):
            sim_score = DataStructs.TanimotoSimilarity(df['Fingerprint'][i], df['Fingerprint'][j])
            if target_col is not None:
                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score, df[target_col][i], df[target_col][j]))
            else:
                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score))

    # Create a new DataFrame with the similarity scores
    if target_col is not None:
        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore', 'Target1', 'Target2'])
    else:
        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore'])

    # Sort by similarity score in descending order
    result_df.sort_values('SimilarityScore', ascending=False, inplace=True)

    return result_df

In [18]:
show_doc(calculate_similarity)

---

### calculate_similarity

>      calculate_similarity (df, smiles_col='SMILES', id_col='ID',
>                            target_col=None, radius=2)

Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df |  |  | pandas DataFrame with SMILES and ID columns. |
| smiles_col | str | SMILES | name of the column containing the SMILES strings (default: 'SMILES'). |
| id_col | str | ID | name of the column containing the molecule IDs (default: 'ID'). |
| target_col | NoneType | None | name of the column containing the target values (default: None). |
| radius | int | 2 | radius of the Morgan fingerprint (default: 2). |

In [15]:
result = calculate_similarity(df, target_col = 'IC50')

In [19]:
result

Unnamed: 0,ID1,ID2,SMILES1,SMILES2,SimilarityScore,Target1,Target2
14319,US_26,US_461,Oc1cc(-c2ncc3c(nc(OCCN4C[C@@H]5CC4CO5)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,[C@@H]12OC[C@@H](N(C1)CCOC=1N=C(C3=C(N1)C(=C(N=C3)C3=CC(=CC1=CC=CC=C31)O)F)N3C[C@H]1CC[C@@H](C3)N1)C2,1.000000,42.1,0.062
9743,US_18,paper_17,CN(C)CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCCN(C)C)N=C4N5C[C@@H](CC6)N[C@@H]6C5,1.000000,76.8,70.000
232596,646_5,646_18,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OCC23CCCN3C(CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3[C@H](CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,1.000000,1.6,2.000
124909,US_219,US_225,CO[C@@H]1CN2CCC[C@]2(COc2nc(N3CC4CCC(C3)N4)c3cnc(c(F)c3n2)-c2cccc3cccc(Cl)c23)C1,CO[C@H]1CN2CCC[C@]2(COc2nc(N3CC4CCC(C3)N4)c3cnc(c(F)c3n2)-c2cccc3cccc(Cl)c23)C1,1.000000,8.1,9.200
243407,646_89,646_90,ClC=1C=CC=C2C=CC=C(C12)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C/C(/[C@@H](C1)N2)=C\C#N)OC[C@H]2N(CCC2)C,ClC=1C=CC=C2C=CC=C(C12)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C/C(/[C@@H](C1)N2)=C/C#N)OC[C@H]2N(CCC2)C,1.000000,10000.0,1462.900
...,...,...,...,...,...,...,...
221665,31678_13,646_22,ClC1=C(C(=C2C=NNC2=C1)C1=C(C=2N=C(N=C(C2C=N1)NCC1(CCC1)N(C)C)OC[C@]12CCCN2C[C@@H](C1)F)F)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.113821,165.0,7862.600
230795,31678_57,646_22,CN(C1(CCC1)CNC=1C2=C(N=C(N1)OC[C@]13CCCN3C[C@@H](C1)F)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)CC)F)O)F)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.112000,1.1,7862.600
228694,31678_46,646_22,CN(C(OC[C@H]1CC[C@]2(CCCN12)COC=1N=C(C2=C(N1)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)CC)F)O)F)NCC2(CCC2)N(C)C)=O)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.105263,0.7,7862.600
228497,31678_45,646_22,CN(C(OC[C@H]1CC[C@]2(CCCN12)COC=1N=C(C2=C(N1)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)C#C)F)O)F)NCC2(CCC2)N(C)C)=O)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.104478,0.4,7862.600


In [21]:
result.to_csv('tanimoto.csv',index=False)

In [22]:
#| hide
import nbdev; nbdev.nbdev_export()