# Utils

> Common utility functions for standardizing SMILES strings with RDKit

In [1]:
#|default_exp utils

## Setup

In [2]:
# !pip install -U pyarrow

In [33]:
#| export
import pandas as pd, numpy as np

from rdkit import Chem
from rdkit.Chem import SaltRemover, MolStandardize
from rdkit.Chem.MolStandardize import rdMolStandardize

In [30]:
#| export
def preprocess_smiles(smiles):
    """Standardize a SMILES string: largest fragment, normalize, neutralize.

    Steps, in order:
      1. parse the SMILES into an RDKit molecule,
      2. keep only the largest fragment,
      3. normalize the molecule,
      4. neutralize (uncharge) it,
      5. write it back out as an isomeric SMILES.

    Parameters
    ----------
    smiles : str
        Input SMILES. Missing values (``None``, ``NaN``), any other
        non-string object, and empty / whitespace-only strings are treated
        as invalid input.

    Returns
    -------
    str or None
        The standardized SMILES, or ``None`` when the input is invalid,
        cannot be parsed, or any standardization step raises. Failures
        inside RDKit are reported with ``print`` rather than propagated.
    """
    # Missing or blank entries (e.g. NaN cells coming from a DataFrame) are
    # not molecules; reject them up front instead of sending them to RDKit
    # and printing a spurious error line for each one.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        # Convert to a molecule object; RDKit returns None for unparsable input
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        # Use rdMolStandardize rather than the legacy pure-Python
        # `MolStandardize.fragment` / `.normalize` / `.charge` submodules,
        # which are not available in newer RDKit releases. With the broad
        # `except` below, a missing submodule would otherwise turn every
        # input into None without an obvious failure.

        # Largest fragment. A separate SaltRemover pass is intentionally not
        # applied: it would duplicate what this step already does.
        mol = rdMolStandardize.LargestFragmentChooser().choose(mol)

        # Normalize
        mol = rdMolStandardize.Normalizer().normalize(mol)

        # Neutralization
        mol = rdMolStandardize.Uncharger().uncharge(mol)

        # Convert back to SMILES
        return Chem.MolToSmiles(mol, isomericSmiles=True)

    except Exception as e:
        # Deliberate best-effort: one bad molecule must not abort a whole
        # batch, so report it and signal failure with None.
        print(f"An error occurred for SMILES {smiles}: {e}")
        return None

In [31]:
#| export
def preprocess_smi_df(df, smi_col):
    """Standardize the SMILES column of a DataFrame and drop rows that fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a column of SMILES strings. It is not modified;
        the work is done on a copy.
    smi_col : str
        Name of the column holding the SMILES strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``smi_col`` holds the output of
        ``preprocess_smiles``. Rows for which that function returned
        ``None`` are removed and the index is reset to ``0..n-1``.

    The shape before and after processing is printed so dropped rows are
    visible at a glance.
    """
    processed = df.copy()
    print('before processing:', processed.shape)

    processed[smi_col] = processed[smi_col].apply(preprocess_smiles)

    # Pass the column as a list: this form is accepted by every pandas
    # version, whereas a bare label is only accepted by newer releases.
    processed = processed.dropna(subset=[smi_col]).reset_index(drop=True)

    print('after processing:', processed.shape)
    return processed

In [13]:
# Load the example table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('dataset/all2.csv')

In [14]:
# Preview the first rows of the table as loaded, before any SMILES processing.
df.head()

Unnamed: 0,ID,SMILES,Kd,IC50,erk_IC50
0,US_284,Oc1cc(Cl)c(C(F)(F)F)c(-c2ncc3c(N4CC5CCC(C4)N5)...,,0.1,17.4
1,31678_45,C#Cc1c(F)ccc2cc(O)cc(-c3ncc4c(NCC5(N(C)C)CCC5)...,,0.4,38.3
2,US_243,C#Cc1cccc2cc(O)cc(-c3ncc4c(N5CC6CCC(C5)N6)nc(O...,,0.4,0.8
3,US_340,Oc1ccc(OC(F)(F)F)c(-c2ncc3c(N4CC5CCC(C4)N5)nc(...,,0.4,51.5
4,US_439,C#Cc1c(F)ccc2cccc(-c3ncc4c(N5CC6CCC(C5)N6)nc(O...,,0.4,12.8


In [9]:
# Run the SMILES preprocessing on the 'SMILES' column; rows whose SMILES
# come back as None are dropped, and `df` itself is left untouched.
df2 = preprocess_smi_df(df,'SMILES')

before: (1410, 5)
after: (1410, 5)


In [11]:
# Preview the processed table to compare against the raw preview.
df2.head()

Unnamed: 0,ID,SMILES,Kd,IC50,erk_IC50
0,US_284,Oc1cc(Cl)c(C(F)(F)F)c(-c2ncc3c(N4CC5CCC(C4)N5)...,,0.1,17.4
1,31678_45,C#Cc1c(F)ccc2cc(O)cc(-c3ncc4c(NCC5(N(C)C)CCC5)...,,0.4,38.3
2,US_243,C#Cc1cccc2cc(O)cc(-c3ncc4c(N5CC6CCC(C5)N6)nc(O...,,0.4,0.8
3,US_340,Oc1ccc(OC(F)(F)F)c(-c2ncc3c(N4CC5CCC(C4)N5)nc(...,,0.4,51.5
4,US_439,C#Cc1c(F)ccc2cccc(-c3ncc4c(N5CC6CCC(C5)N6)nc(O...,,0.4,12.8


In [32]:
#| hide
# Run nbdev's export step, which writes the cells marked `#| export` to the
# library module (presumably the one named by `#|default_exp` — confirm
# against the project's nbdev settings).
import nbdev; nbdev.nbdev_export()