# Core

## Setup

In [None]:
#| default_exp core

In [61]:
#| export
# basics
import pandas as pd, numpy as np
from functools import lru_cache

# rdkit
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Draw,Descriptors,Descriptors3D, AllChem,rdFingerprintGenerator

from sklearn.preprocessing import StandardScaler
from sklearn import set_config
set_config(transform_output="pandas")

## Data

In [8]:
#| export
class Data:
    "A class for fetching various datasets."

    ANTIBIOTICS_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/antibiotics_2335.csv"
    G12D_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/KRASi_g12d.csv"
    G12D_DEDUP_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/KRASi_g12d_dedup.csv"
    KRAS_SEQ_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/kras_seq.csv"

    @staticmethod
    @lru_cache(maxsize=None)
    def fetch_csv(url): return pd.read_csv(url)
    
    @staticmethod
    def get_antibiotics():
        "Deduplicated screening antibiotics dataset; Table S1B from Cell: A Deep Learning Approach to Antibiotic Discovery."
        return Data.fetch_csv(Data.ANTIBIOTICS_URL)
    
    @staticmethod
    def get_mirati_g12d():
        "Get the deduplicated G12D dataset from the mirati paper and patents."
        return Data.fetch_csv(Data.G12D_DEDUP_URL)
    
    @staticmethod
    def get_mirati_g12d_raw():
        "Get the raw combined G12D dataset from the paper and patents."
        return Data.fetch_csv(Data.G12D_URL)
    
    @staticmethod
    def get_kras_seq():
        "Get the sequence of KRAS and its mutations G12D and G12C."
        return Data.fetch_csv(Data.KRAS_SEQ_URL)

In [10]:
Data.get_antibiotics().head()

Unnamed: 0,name,SMILES,inhibition,activity
0,CEFPIRAMIDE,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,0.041572,1
1,GEMIFLOXACIN MESYLATE,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,0.041876,1
2,POLYMYXIN B SULFATE,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,0.041916,1
3,PRAXADINE HYDROCHLORIDE,Cl.N=C(N)n1cccn1,0.041964,1
4,CHLORHEXIDINE DIHYDROCHLORIDE,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,0.042295,1


In [11]:
Data.get_mirati_g12d().head()

Unnamed: 0,ID,SMILES,Kd,IC50,erk_IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...,97.7,124.7,3159.1
1,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...,155.7,496.2,8530.0
2,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-...,294.8,722.9,8193.8
3,US_6,Cc1cccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1...,442.2,434.1,11518.2
4,US_7,Oc1cc(-c2ncc3c(nc(OCCc4ncccn4)nc3c2F)N2CC3CCC(...,463.5,1867.3,


In [12]:
Data.get_mirati_g12d_raw().head()

Unnamed: 0,ID,SMILES,group,with_3F,racemic_trans,mixture_isomer,trans,Kd,IC50,erk_IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...,US,0,0,0,0,97.7,124.7,3159.1
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2...,US,1,0,0,0,2.4,2.7,721.4
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)...,US,0,0,0,0,8.3,9.5,10283.1
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...,US,0,0,0,0,155.7,496.2,8530.0
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-...,US,0,0,0,0,294.8,722.9,8193.8


In [13]:
Data.get_kras_seq().head()

Unnamed: 0,ID,WT_sequence,g12d_seq,g12c_seq
0,kras_human,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGACGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...
1,kras_human_isoform2b,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGACGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...


## Rdkit

In [23]:
#| export
def get_rdkit(SMILES:str):
    """
    Extract chemical features from SMILES
    Reference: https://greglandrum.github.io/rdkit-blog/posts/2022-12-23-descriptor-tutorial.html
    """
    mol = Chem.MolFromSmiles(SMILES)
    return Descriptors.CalcMolDescriptors(mol)

In [24]:
#| export
def get_rdkit_3d(SMILES:str):
    "Extract 3d features from SMILES"
    mol = Chem.MolFromSmiles(SMILES)
    mol = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol, AllChem.ETKDG())
    AllChem.UFFOptimizeMolecule(mol)
    return Descriptors3D.CalcMolDescriptors3D(mol)

In [54]:
#| export
def get_rdkit_all(SMILES:str):
    "Extract chemical features and 3d features from SMILES"
    feat = get_rdkit(SMILES)
    feat_3d = get_rdkit_3d(SMILES)
    return feat|feat_3d

In [81]:
#| export
def remove_hi_corr(df: pd.DataFrame, 
                   thr=0.99 # threshold
                   ):
    "Remove highly correlated features in a dataframe given a pearson threshold"
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > thr)]
    return df.drop(to_drop, axis=1), to_drop

def preprocess(df: pd.DataFrame, thr=0.99):
    "Remove features with no variance, and highly correlated features based on threshold."
    col_ori = df.columns

    # Remove columns with std == 0
    std_zero_cols = df.columns[df.std() == 0].tolist()
    
    if std_zero_cols:
        n=len(std_zero_cols)
        print(f"\n {n} Columns with zero std: {std_zero_cols}")
    df = df.loc[:, df.std() != 0].copy()

    # Remove highly correlated columns
    df, high_corr_cols = remove_hi_corr(df, thr)
    if high_corr_cols:
        n=len(high_corr_cols)
        print(f"\n {n} Columns removed due to high similarity (pearson>{thr}): {high_corr_cols}")

    dropping_col = set(col_ori) - set(df.columns)
    n = len(dropping_col)
    print(f"\n Total removed columns: {n}")
    
    return df

In [82]:
#| export
def get_rdkit_df(df: pd.DataFrame,
                 col='SMILES', # column of SMILES
                 postprocess=False, # remove redundant columns and standardize features for dimension reduction
                 ):
    "Extract rdkit features (including 3d) from SMILES in a df"
    out = df[col].apply(get_rdkit_all).apply(pd.Series)
    if postprocess:
        out = StandardScaler().fit_transform(out)
        out = preprocess(out) # remove redundant
    return out

In [83]:
df=Data.get_antibiotics().head()

In [84]:
df

Unnamed: 0,name,SMILES,inhibition,activity
0,CEFPIRAMIDE,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,0.041572,1
1,GEMIFLOXACIN MESYLATE,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,0.041876,1
2,POLYMYXIN B SULFATE,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,0.041916,1
3,PRAXADINE HYDROCHLORIDE,Cl.N=C(N)n1cccn1,0.041964,1
4,CHLORHEXIDINE DIHYDROCHLORIDE,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,0.042295,1


In [86]:
get_rdkit_df(df)

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,PMI2,PMI3,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,Eccentricity,Asphericity,SpherocityIndex,PBF
0,13.503995,13.503995,0.064423,-1.32343,0.162954,18.619048,612.65,588.458,612.120937,218.0,...,18319.759568,19439.98279,0.168114,0.942375,5.786531,0.000288,0.985768,0.581404,0.169658,1.234907
1,14.874521,14.874521,0.033885,-3.666667,0.39822,19.484848,485.494,461.302,485.138047,180.0,...,6089.061322,7268.888907,0.214013,0.837688,3.91908,0.000538,0.976831,0.49084,0.120142,0.616989
2,14.537562,14.537562,0.017282,-4.666667,0.026232,22.266667,1301.578,1200.778,1300.717307,512.0,...,84983.376436,90945.037867,0.168343,0.934448,8.571117,6.1e-05,0.985729,0.580256,0.157205,1.741487
3,6.830278,6.830278,0.0,-0.060185,0.401182,8.0,146.581,139.525,146.035924,50.0,...,323.619182,422.191895,0.246892,0.766522,1.702816,0.007354,0.969043,0.439928,0.004004,0.047039
4,7.856055,7.856055,0.0,-0.00148,0.113966,9.5,578.38,546.124,576.156552,198.0,...,38354.915952,38613.156328,0.024521,0.993312,8.207079,0.001049,0.999699,0.92845,0.05817,0.782104


In [87]:
feat = get_rdkit_df(df,postprocess=True)


 63 Columns with zero std: ['NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'EState_VSA11', 'EState_VSA7', 'NumBridgeheadAtoms', 'NumSpiroAtoms', 'fr_Ar_NH', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amidine', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzodiazepine', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_para_hydroxylation', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_quatN', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', '

In [88]:
feat

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,FpDensityMorgan3,...,fr_ArN,fr_Ar_N,fr_NH0,fr_NH1,fr_halogen,fr_pyridine,fr_unbrch_alkane,NPR1,Eccentricity,SpherocityIndex
0,0.573967,1.707753,0.326109,-0.376858,0.532289,-0.032641,0.663662,-1.438309,0.290024,0.836838,...,-0.5,1.745743,1.543487,-0.594803,-0.816497,0.5,-0.685994,0.843266,-0.976373,0.92431
1,0.970554,0.445177,-0.905885,1.163557,0.683641,-0.370452,0.518408,-0.93938,0.750858,0.804658,...,-0.5,0.109109,0.685994,-1.019662,-0.136083,1.75,-0.685994,0.40659,-0.279346,-0.251512
2,0.873049,-0.241294,-1.431652,-1.272044,1.169935,1.797608,1.199637,0.481615,-1.107045,-0.935665,...,-0.5,-0.981981,-1.028992,1.317064,-0.816497,-0.75,0.171499,-0.250679,0.568328,1.352292
3,-1.357198,-0.955818,0.990281,1.182948,-1.324041,-1.270829,-1.103063,0.849722,1.282954,0.771287,...,2.0,0.109109,-0.171499,-0.807233,-0.136083,-0.75,-0.685994,0.834872,-0.961949,-1.331294
4,-1.060371,-0.955818,1.021147,-0.697604,-1.061824,-0.123685,-1.278643,1.046352,-1.21679,-1.477117,...,-0.5,-0.981981,-1.028992,1.104634,1.905159,-0.75,1.886484,-1.834049,1.649339,-0.693797


## Morgan fingerprints

In [89]:
#| export
def get_morgan_df(df: pd.DataFrame, # a dataframe that contains smiles
               col="SMILES", # colname of smile
               radius=3 , # morgan radius
              ):
    "Get 2048 morgan fingerprint (binary feature) from smiles in a dataframe"
    mols = [Chem.MolFromSmiles(smi) for smi in df[col]]

    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius,fpSize=2048)
    morgan_fps = [mfpgen.GetFingerprint(mol) for mol in mols]
    
    fp_df = pd.DataFrame(np.array(morgan_fps), index=df.index)
    fp_df.columns = "morgan_" + fp_df.columns.astype(str)
    return fp_df

In [90]:
get_morgan(df)

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Tanimoto similarity
> to deduplicate compounds with similar SMILES

In [20]:
#| export
def tanimoto(df, # df with SMILES and ID columns
             smiles_col='SMILES', # colname of SMILES
             id_col='ID', # colname of compound ID
             target_col=None, # colname of compound values (e.g., IC50)
             radius=2, # radius of the Morgan fingerprint.
             ):
    "Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame."
    
    df = df.copy()
    # Convert SMILES to molecule objects
    df['Molecule'] = df[smiles_col].apply(lambda x: Chem.MolFromSmiles(x))

    # Calculate fingerprints
    df['Fingerprint'] = df['Molecule'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius))

    # Calculate similarity scores
    similarity_scores = []
    for i in range(len(df)):
        for j in range(i+1, len(df)):
            sim_score = DataStructs.TanimotoSimilarity(df['Fingerprint'][i], df['Fingerprint'][j])
            if target_col is not None:
                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score, df[target_col][i], df[target_col][j]))
            else:
                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score))

    # Create a new DataFrame with the similarity scores
    if target_col is not None:
        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore', 'Target1', 'Target2'])
    else:
        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore'])

    # Sort by similarity score in descending order
    result_df.sort_values('SimilarityScore', ascending=False, inplace=True)
    result_df = result_df.reset_index(drop=True)

    return result_df

In [17]:
df = Data.get_mirati_g12d_raw()[['ID','SMILES','IC50']]
df = df.dropna(subset= 'IC50').reset_index(drop=True)

In [18]:
df.head()

Unnamed: 0,ID,SMILES,IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...,124.7
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2...,2.7
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)...,9.5
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...,496.2
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-...,722.9


In [21]:
result = tanimoto(df, target_col = 'IC50')



NameError: name 'DataStructs' is not defined

In [9]:
result

Unnamed: 0,ID1,ID2,SMILES1,SMILES2,SimilarityScore,Target1,Target2
0,US_26,US_461,Oc1cc(-c2ncc3c(nc(OCCN4C[C@@H]5CC4CO5)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,[C@@H]12OC[C@@H](N(C1)CCOC=1N=C(C3=C(N1)C(=C(N=C3)C3=CC(=CC1=CC=CC=C31)O)F)N3C[C@H]1CC[C@@H](C3)N1)C2,1.000000,42.1,62.0
1,US_18,paper_17,CN(C)CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCCN(C)C)N=C4N5C[C@@H](CC6)N[C@@H]6C5,1.000000,76.8,70.0
2,646_5,646_18,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OCC23CCCN3C(CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3[C@H](CC2)COC(NC)=O)CN(CC1)C1=CC=CC2=CC=CC(=C12)CC,1.000000,1.6,2.0
3,US_219,US_225,CO[C@@H]1CN2CCC[C@]2(COc2nc(N3CC4CCC(C3)N4)c3cnc(c(F)c3n2)-c2cccc3cccc(Cl)c23)C1,CO[C@H]1CN2CCC[C@]2(COc2nc(N3CC4CCC(C3)N4)c3cnc(c(F)c3n2)-c2cccc3cccc(Cl)c23)C1,1.000000,8.1,9.2
4,646_89,646_90,ClC=1C=CC=C2C=CC=C(C12)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C/C(/[C@@H](C1)N2)=C\C#N)OC[C@H]2N(CCC2)C,ClC=1C=CC=C2C=CC=C(C12)N1CC=2N=C(N=C(C2CC1)N1C[C@H]2C/C(/[C@@H](C1)N2)=C/C#N)OC[C@H]2N(CCC2)C,1.000000,10000.0,1462.9
...,...,...,...,...,...,...,...
248155,31678_13,646_22,ClC1=C(C(=C2C=NNC2=C1)C1=C(C=2N=C(N=C(C2C=N1)NCC1(CCC1)N(C)C)OC[C@]12CCCN2C[C@@H](C1)F)F)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.113821,165.0,7862.6
248156,31678_57,646_22,CN(C1(CCC1)CNC=1C2=C(N=C(N1)OC[C@]13CCCN3C[C@@H](C1)F)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)CC)F)O)F)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.112000,1.1,7862.6
248157,31678_46,646_22,CN(C(OC[C@H]1CC[C@]2(CCCN12)COC=1N=C(C2=C(N1)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)CC)F)O)F)NCC2(CCC2)N(C)C)=O)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.105263,0.7,7862.6
248158,31678_45,646_22,CN(C(OC[C@H]1CC[C@]2(CCCN12)COC=1N=C(C2=C(N1)C(=C(N=C2)C2=CC(=CC1=CC=C(C(=C21)C#C)F)O)F)NCC2(CCC2)N(C)C)=O)C,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC)CN(CC1)C1=CC=CC2=CC=CC(=C12)C,0.104478,0.4,7862.6


In [91]:
#| hide
import nbdev; nbdev.nbdev_export()