# Core

## Setup

In [None]:
#| default_exp core

In [None]:
#| export
# basics
import pandas as pd, numpy as np
import subprocess,shutil,zipfile
from functools import lru_cache
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()

# rdkit
from rdkit import Chem,DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Draw,Descriptors,Descriptors3D, AllChem,rdFingerprintGenerator

from sklearn.preprocessing import StandardScaler
from sklearn import set_config
set_config(transform_output="pandas")

## Data

In [None]:
#| export
class Data:
    """A class for fetching various datasets.

    Each `get_*` helper reads one CSV through `fetch_csv`, which memoises the
    parsed frame per URL so a file is only read once per process. The helpers
    hand back a *copy* of the cached frame, so a caller can add or modify
    columns without silently changing what the next caller receives.
    """

    ANTIBIOTICS_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/antibiotics_2335.csv"
    G12D_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/KRASi_g12d.csv"
    G12D_DEDUP_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/KRASi_g12d_dedup.csv"
    KRAS_SEQ_URL = "https://github.com/sky1ove/kdock/raw/main/dataset/kras_seq.csv"

    @staticmethod
    @lru_cache(maxsize=None)
    def fetch_csv(url):
        "Read the CSV at `url` with pandas; the returned frame is cached per URL and shared between calls."
        return pd.read_csv(url)

    @staticmethod
    def get_antibiotics():
        "Deduplicated screening antibiotics dataset; Table S1B from Cell: A Deep Learning Approach to Antibiotic Discovery."
        # .copy() keeps the lru_cache'd frame pristine for later callers.
        return Data.fetch_csv(Data.ANTIBIOTICS_URL).copy()

    @staticmethod
    def get_mirati_g12d():
        "Get the deduplicated G12D dataset from the mirati paper and patents."
        return Data.fetch_csv(Data.G12D_DEDUP_URL).copy()

    @staticmethod
    def get_mirati_g12d_raw():
        "Get the raw combined G12D dataset from the paper and patents."
        return Data.fetch_csv(Data.G12D_URL).copy()

    @staticmethod
    def get_kras_seq():
        "Get the sequence of KRAS and its mutations G12D and G12C."
        return Data.fetch_csv(Data.KRAS_SEQ_URL).copy()

In [None]:
Data.get_antibiotics().head()

Unnamed: 0,name,SMILES,inhibition,activity
0,CEFPIRAMIDE,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,0.041572,1
1,GEMIFLOXACIN MESYLATE,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,0.041876,1
2,POLYMYXIN B SULFATE,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,0.041916,1
3,PRAXADINE HYDROCHLORIDE,Cl.N=C(N)n1cccn1,0.041964,1
4,CHLORHEXIDINE DIHYDROCHLORIDE,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,0.042295,1


In [None]:
Data.get_mirati_g12d().head()

Unnamed: 0,ID,SMILES,Kd,IC50,erk_IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...,97.7,124.7,3159.1
1,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...,155.7,496.2,8530.0
2,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-...,294.8,722.9,8193.8
3,US_6,Cc1cccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1...,442.2,434.1,11518.2
4,US_7,Oc1cc(-c2ncc3c(nc(OCCc4ncccn4)nc3c2F)N2CC3CCC(...,463.5,1867.3,


In [None]:
Data.get_mirati_g12d_raw().head()

Unnamed: 0,ID,SMILES,group,with_3F,racemic_trans,mixture_isomer,trans,Kd,IC50,erk_IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...,US,0,0,0,0,97.7,124.7,3159.1
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2...,US,1,0,0,0,2.4,2.7,721.4
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)...,US,0,0,0,0,8.3,9.5,10283.1
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...,US,0,0,0,0,155.7,496.2,8530.0
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-...,US,0,0,0,0,294.8,722.9,8193.8


In [None]:
Data.get_kras_seq().head()

Unnamed: 0,ID,WT_sequence,g12d_seq,g12c_seq
0,kras_human,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGACGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...
1,kras_human_isoform2b,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGACGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...


## Copy files

In [None]:
#| export
def copy_files(file_list, dest_dir):
    """Copy a list of files to the destination directory, or zip them if dest_dir ends with .zip.

    `file_list` may be any iterable of paths (list, generator, ...). Files are
    stored flat under their base name, so two inputs sharing a file name will
    collide. Missing parent directories of the destination are created.
    """
    dest_path = Path(dest_dir)
    # Materialise once: a generator can be iterated only a single time and has
    # no len(), yet we need both the loop and the count for the summary line.
    files = [Path(file_path) for file_path in file_list]

    if dest_path.suffix == ".zip":
        # Mirror the directory branch: make sure the archive's folder exists.
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(dest_path, 'w') as zipf:
            for file_path in files:
                zipf.write(file_path, arcname=file_path.name)
        print(f'Zipped {len(files)} files to {dest_path}')
    else:
        dest_path.mkdir(parents=True, exist_ok=True)
        for file_path in files:
            shutil.copy2(file_path, dest_path / file_path.name)
        print(f'Copied {len(files)} files to {dest_path}')

In [None]:
# file_list = list(Path('output_folder').rglob('*data.json'))
# copy_files(file_list,'protein.zip')
# copy_files(file_list,'protein')

In [None]:
#| export
def rglob(path, pattern, max_depth):
    "Recursively glob `pattern` under `path`, yielding only matches that lie at most `max_depth` path components below it."
    root = Path(path).resolve()
    # A match directly inside `root` has one component, one folder down has two, etc.
    yield from (
        match for match in root.rglob(pattern)
        if len(match.relative_to(root).parts) <= max_depth
    )

In [None]:
# file_list = list(rglob('folder','*model.cif',2))
# file_list

## Conformer generation

In [None]:
#| export
def rdkit_conformer(SMILES, # SMILES string
                    output, # file ".sdf" to be saved
                    method='ETKDG', # Optimization method, can be 'UFF', 'MMFF' or 'ETKDG'
                    visualize=True, # currently unused; kept so existing callers do not break
                    seed = 3,# random seed passed to the RDKit embedding
                    ):

    """Generate a 3D conformer from SMILES and write it to `output` as SDF.

    'UFF' and 'MMFF' embed one conformer and relax it with the matching force
    field; 'ETKDG' embeds with the explicit ETKDG options below and relaxes
    with UFF. Raises ValueError for an unknown `method`, an unparsable SMILES
    string, or when RDKit cannot embed the molecule.
    """

    # Validate first so a typo does not cost an embedding or create directories.
    if method not in ('UFF', 'MMFF', 'ETKDG'):
        raise ValueError('Invalid method specified')

    # Kept from the original implementation so callers see the same global
    # NumPy RNG side effect; the embedding itself is seeded via `randomSeed`.
    np.random.seed(seed)
    mol = Chem.MolFromSmiles(SMILES)
    if mol is None:
        raise ValueError(f'Invalid SMILES: {SMILES!r}')
    # NOTE(review): hydrogens are not added before embedding; RDKit generally
    # recommends Chem.AddHs for 3D work — confirm whether heavy-atom-only
    # output is intended before changing it.

    # Generate a single 3D conformation of the molecule
    if method == 'ETKDG':
        conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=1, useExpTorsionAnglePrefs=True,
                                              useBasicKnowledge=True, enforceChirality=True,
                                              randomSeed=seed)
        embedded = len(conf_ids) > 0
    else:
        # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
        embedded = AllChem.EmbedMolecule(mol, randomSeed=seed) != -1
    if not embedded:
        raise ValueError(f'Failed to embed a 3D conformer for SMILES: {SMILES!r}')

    # Optimize the 3D conformation using the specified force field method
    if method == 'MMFF':
        AllChem.MMFFOptimizeMolecule(mol)
    else:  # 'UFF' and 'ETKDG' both relax with UFF
        AllChem.UFFOptimizeMolecule(mol)

    Path(output).parent.mkdir(parents=True,exist_ok=True)

    w = Chem.SDWriter(str(output))
    try:
        w.write(mol)
    finally:
        # Always flush/close the file, even if writing the molecule fails.
        w.close()

## Get receptor and ligand from pdb

In [None]:
#| export
def get_rec_lig(pdb_id: str, # pdb id for download
                            lig_id: str, # ligand id shown on the protein page
                            out_dir = '.', # directory path to save pdb files
                            ):
    """Download pdb and extract receptor and ligand from a PDB ID.

    Writes `<pdb_id>_receptor.pdb` (ATOM records not belonging to the ligand),
    `<pdb_id>_lig.pdb` (ATOM/HETATM records of the ligand) and
    `<pdb_id>_lig.sdf` into `out_dir`, and returns the receptor and SDF paths
    as strings. The ligand is recognised by the residue-name field of each
    record rather than by searching the whole line, so an id such as "CA" does
    not swallow every alpha-carbon atom. Raises ValueError when RDKit cannot
    parse the extracted ligand, and re-raises download errors.
    """
    out_dir = Path(out_dir).expanduser().resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    pdb_file = out_dir / f"{pdb_id}.pdb"
    rec_file = out_dir / f"{pdb_id}_receptor.pdb"
    lig_pdb_file = out_dir / f"{pdb_id}_lig.pdb"
    lig_sdf_file = out_dir / f"{pdb_id}_lig.sdf"

    # Download if missing; an empty file is a leftover of a failed download.
    if not pdb_file.exists() or pdb_file.stat().st_size == 0:
        url = f"http://files.rcsb.org/download/{pdb_id}.pdb"
        print(f'Downloading pdb: {pdb_id}')
        try:
            subprocess.run(["wget", url, "-O", str(pdb_file)], check=True,stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except BaseException:
            # `wget -O` creates the target even when it fails; remove it so the
            # next call retries instead of treating the stub as a valid file.
            pdb_file.unlink(missing_ok=True)
            raise
    print(f'{pdb_id}.pdb is detected!')

    # Split coordinate records in one pass. Residue name lives in columns
    # 18-20 of ATOM/HETATM records (0-based slice 17:20).
    wanted = lig_id.strip()
    with open(pdb_file) as infile, open(rec_file, 'w') as out_rec, open(lig_pdb_file, 'w') as out_lig:
        for line in infile:
            if not line.startswith(("ATOM", "HETATM")):
                continue
            if line[17:20].strip() == wanted:
                out_lig.write(line)
            elif line.startswith("ATOM"):
                # Receptor keeps only ATOM records (no waters, ions, other hetero groups).
                out_rec.write(line)

    # Convert ligand PDB to SDF using RDKit
    mol = Chem.MolFromPDBFile(str(lig_pdb_file), removeHs=False)
    if mol is None:
        raise ValueError("Failed to parse ligand from PDB.")

    writer = Chem.SDWriter(str(lig_sdf_file))
    try:
        writer.write(mol)
    finally:
        writer.close()

    return str(rec_file), str(lig_sdf_file)

In [None]:
# On the first run this downloads 7OFF.pdb with `wget` (network required);
# later runs reuse the file already present in ./core_test.
rec_path,lig_path = get_rec_lig('7OFF','VCB','core_test')
rec_path,lig_path

7OFF.pdb is detected!


('/teamspace/studios/this_studio/kdock/nbs/core_test/7OFF_receptor.pdb',
 '/teamspace/studios/this_studio/kdock/nbs/core_test/7OFF_lig.sdf')

## Get ligand box

In [None]:
#| export
def get_box(sdf_file, autobox_add=4.0,tolist=False):
    """Get the box coordinates of ligand.sdf; mimic GNINA's --autobox_ligand behavior.

    Reads the first molecule of `sdf_file`, takes the axis-aligned bounding box
    of its atom coordinates and returns the box centre plus the box size
    enlarged by `autobox_add`, each rounded to 3 decimals. The result is a dict
    keyed center_x/y/z and size_x/y/z, or the same six numbers as a list when
    `tolist` is true. Raises ValueError if the molecule cannot be read.
    """
    supplier = Chem.SDMolSupplier(str(sdf_file), removeHs=False)
    mol = supplier[0]
    if mol is None:
        raise ValueError(f"Failed to read molecule from {sdf_file}")

    conformer = mol.GetConformer()
    xyz = np.array([list(conformer.GetAtomPosition(idx)) for idx in range(mol.GetNumAtoms())])

    lower = xyz.min(axis=0)
    upper = xyz.max(axis=0)

    # NOTE(review): `autobox_add` is added once per axis here; GNINA's docs
    # describe its buffer as applied on every side of the box — confirm this
    # is the intended equivalence.
    centre_and_size = np.concatenate([(lower + upper) / 2, (upper - lower) + autobox_add])

    keys = ("center_x", "center_y", "center_z", "size_x", "size_y", "size_z")
    box_dict = {key: round(float(value), 3) for key, value in zip(keys, centre_and_size)}
    if tolist:
        return list(box_dict.values())
    return box_dict

In [None]:
box = get_box(lig_path)
box

{'center_x': 38.848,
 'center_y': -26.77,
 'center_z': 10.419,
 'size_x': 14.652,
 'size_y': 8.942,
 'size_z': 12.509}

In [None]:
box_list = get_box(lig_path,tolist=True)
box_list

[38.848, -26.77, 10.419, 14.652, 8.942, 12.509]

## Rdkit feature

In [None]:
#| export
def get_rdkit(SMILES:str):
    """
    Parse one SMILES string and return RDKit's `CalcMolDescriptors` result for it.
    Reference: https://greglandrum.github.io/rdkit-blog/posts/2022-12-23-descriptor-tutorial.html
    """
    return Descriptors.CalcMolDescriptors(Chem.MolFromSmiles(SMILES))

In [None]:
#| export
def get_rdkit_3d(SMILES:str):
    """Extract 3d features from SMILES.

    The molecule is parsed, hydrogens are added, one conformer is embedded with
    ETKDG and relaxed with UFF, and the result of `CalcMolDescriptors3D` is
    returned. Raises ValueError when the SMILES cannot be parsed or no 3D
    conformer can be embedded, instead of failing later with an opaque error.
    """
    mol = Chem.MolFromSmiles(SMILES)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {SMILES!r}")
    mol = Chem.AddHs(mol)
    # NOTE(review): no seed is set on the embedding parameters, so coordinates
    # (and therefore the descriptors) are presumably not reproducible between
    # runs — confirm whether that matters for downstream use.
    # EmbedMolecule returns -1 when it could not produce a conformer.
    if AllChem.EmbedMolecule(mol, AllChem.ETKDG()) == -1:
        raise ValueError(f"Could not embed a 3D conformer for SMILES: {SMILES!r}")
    AllChem.UFFOptimizeMolecule(mol)
    return Descriptors3D.CalcMolDescriptors3D(mol)

In [None]:
#| export
def get_rdkit_all(SMILES:str):
    "Return the `get_rdkit` features merged with the `get_rdkit_3d` features for one SMILES; on a shared key the 3d value wins."
    merged = dict(get_rdkit(SMILES))
    merged.update(get_rdkit_3d(SMILES))
    return merged

In [None]:
#| export
def remove_hi_corr(df: pd.DataFrame, 
                   thr=0.99 # threshold
                   ):
    "Drop every column whose absolute pearson correlation with an earlier column exceeds `thr`; returns (reduced df, dropped column names)."
    abs_corr = df.corr().abs()
    # Keep only the strictly-upper triangle so each pair is looked at once and
    # a column is only ever compared against the columns that precede it.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diagonal)
    exceeds = (upper > thr).any(axis=0)
    to_drop = exceeds.index[exceeds].tolist()
    return df.drop(columns=to_drop), to_drop

In [None]:
#| export
def preprocess(df: pd.DataFrame, thr=0.99):
    "Drop zero-std columns, then columns flagged by `remove_hi_corr(df, thr)`; prints what was removed and returns the reduced frame."
    col_ori = set(df.columns)

    # Step 1: constant columns (std exactly 0)
    is_constant = df.std() == 0
    std_zero_cols = df.columns[is_constant].tolist()
    if std_zero_cols:
        n = len(std_zero_cols)
        print(f"\n {n} Columns with zero std: {std_zero_cols}")
    df = df.loc[:, ~is_constant].copy()

    # Step 2: near-duplicate columns
    df, high_corr_cols = remove_hi_corr(df, thr)
    if high_corr_cols:
        n = len(high_corr_cols)
        print(f"\n {n} Columns removed due to high similarity (pearson>{thr}): {high_corr_cols}")

    # Summary: everything that was in the input but is no longer present
    n = len(col_ori - set(df.columns))
    print(f"\n Total removed columns: {n}")

    return df

In [None]:
#| export
def get_rdkit_df(df: pd.DataFrame,
                 include_3d=False,
                 col='SMILES', # column of SMILES
                 postprocess=False, # remove redundant columns and standardize features for dimension reduction
                 ):
    "Featurize the SMILES in `df[col]` (with `get_rdkit_all` when `include_3d`, else `get_rdkit`) into one row per molecule; optionally standardize and prune."
    featurize = get_rdkit_all if include_3d else get_rdkit
    # One dict per molecule, expanded into columns; progress bar comes from tqdm.pandas().
    out = df[col].progress_apply(featurize).apply(pd.Series)
    if not postprocess:
        return out
    # Relies on the notebook-level set_config(transform_output="pandas") so the
    # scaler hands back a DataFrame that preprocess() can work on.
    scaled = StandardScaler().fit_transform(out)
    return preprocess(scaled) # remove redundant

In [None]:
df=Data.get_antibiotics().head()

In [None]:
df

Unnamed: 0,name,SMILES,inhibition,activity
0,CEFPIRAMIDE,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,0.041572,1
1,GEMIFLOXACIN MESYLATE,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,0.041876,1
2,POLYMYXIN B SULFATE,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,0.041916,1
3,PRAXADINE HYDROCHLORIDE,Cl.N=C(N)n1cccn1,0.041964,1
4,CHLORHEXIDINE DIHYDROCHLORIDE,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,0.042295,1


In [None]:
get_rdkit_df(df)

100%|██████████| 5/5 [00:00<00:00, 44.09it/s]


Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,13.503995,13.503995,0.064423,-1.32343,0.162954,18.619048,612.65,588.458,612.120937,218.0,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,14.874521,14.874521,0.033885,-3.666667,0.39822,19.484848,485.494,461.302,485.138047,180.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14.537562,14.537562,0.017282,-4.666667,0.026232,22.266667,1301.578,1200.778,1300.717307,512.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,6.830278,6.830278,0.0,-0.060185,0.401182,8.0,146.581,139.525,146.035924,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.856055,7.856055,0.0,-0.00148,0.113966,9.5,578.38,546.124,576.156552,198.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [None]:
feat = get_rdkit_df(df,postprocess=True)

100%|██████████| 5/5 [00:00<00:00, 44.12it/s]


 63 Columns with zero std: ['NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'EState_VSA11', 'EState_VSA7', 'NumBridgeheadAtoms', 'NumSpiroAtoms', 'fr_Ar_NH', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amidine', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzodiazepine', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_para_hydroxylation', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_quatN', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', '




In [None]:
feat

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,MaxPartialCharge,MinPartialCharge,FpDensityMorgan1,FpDensityMorgan3,...,NumSaturatedRings,RingCount,MolLogP,fr_ArN,fr_Ar_N,fr_NH0,fr_NH1,fr_halogen,fr_pyridine,fr_unbrch_alkane
0,0.573967,1.707753,0.326109,-0.376858,0.532289,-0.032641,0.663662,-1.438309,0.290024,0.836838,...,0.267261,1.49691,0.096127,-0.5,1.745743,1.543487,-0.594803,-0.816497,0.5,-0.685994
1,0.970554,0.445177,-0.905885,1.163557,0.683641,-0.370452,0.518408,-0.93938,0.750858,0.804658,...,1.603567,0.816497,0.137385,-0.5,0.109109,0.685994,-1.019662,-0.136083,1.75,-0.685994
2,0.873049,-0.241294,-1.431652,-1.272044,1.169935,1.797608,1.199637,0.481615,-1.107045,-0.935665,...,0.267261,-0.544331,-1.696198,-0.5,-0.981981,-1.028992,1.317064,-0.816497,-0.75,0.171499
3,-1.357198,-0.955818,0.990281,1.182948,-1.324041,-1.270829,-1.103063,0.849722,1.282954,0.771287,...,-1.069045,-1.224745,0.015427,2.0,0.109109,-0.171499,-0.807233,-0.136083,-0.75,-0.685994
4,-1.060371,-0.955818,1.021147,-0.697604,-1.061824,-0.123685,-1.278643,1.046352,-1.21679,-1.477117,...,-1.069045,-0.544331,1.447259,-0.5,-0.981981,-1.028992,1.104634,1.905159,-0.75,1.886484


## Morgan fingerprints

In [None]:
# #| export
# def get_morgan_df(df: pd.DataFrame, # a dataframe that contains smiles
#                col="SMILES", # colname of smile
#                radius=3 , # morgan radius
#               ):
#     "Get 2048 morgan fingerprint (binary feature) from smiles in a dataframe"
#     mols = [Chem.MolFromSmiles(smi) for smi in df[col]]

#     mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius,fpSize=2048)
#     morgan_fps = [mfpgen.GetFingerprint(mol) for mol in mols]
    
#     fp_df = pd.DataFrame(np.array(morgan_fps), index=df.index)
#     fp_df.columns = "morgan_" + fp_df.columns.astype(str)
#     return fp_df

TODO

## Tanimoto similarity
> to deduplicate compounds with similar SMILES

In [None]:
#| export
def tanimoto(df, # df with SMILES and ID columns
             smiles_col='SMILES', # colname of SMILES
             id_col='ID', # colname of compound ID
             target_col=None, # colname of compound values (e.g., IC50)
             radius=2, # radius of the Morgan fingerprint.
             ):
    """Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame.

    Returns one row per unordered pair with columns ID1, ID2, SMILES1, SMILES2,
    SimilarityScore (plus Target1, Target2 when `target_col` is given), sorted
    by descending score. Rows are paired by position, so the input may carry
    any index (e.g. after `dropna` without `reset_index`). The input frame is
    not modified. Raises ValueError if any SMILES cannot be parsed.
    """
    # Pull plain lists once: positional access is independent of df.index.
    smiles = df[smiles_col].tolist()
    ids = df[id_col].tolist()
    targets = df[target_col].tolist() if target_col is not None else None

    # Convert SMILES to molecule objects
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    invalid = [smi for smi, mol in zip(smiles, mols) if mol is None]
    if invalid:
        raise ValueError(f"Could not parse SMILES: {invalid}")

    # Calculate fingerprints
    # NOTE(review): this RDKit entry point is reported as deprecated in favour
    # of rdFingerprintGenerator; kept here so scores stay unchanged — confirm
    # before migrating.
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius) for mol in mols]

    # Calculate similarity scores for every pair i < j
    columns = ['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore']
    if targets is not None:
        columns += ['Target1', 'Target2']

    similarity_scores = []
    n_mols = len(fps)
    for i in range(n_mols):
        for j in range(i + 1, n_mols):
            row = (ids[i], ids[j], smiles[i], smiles[j],
                   DataStructs.TanimotoSimilarity(fps[i], fps[j]))
            if targets is not None:
                row += (targets[i], targets[j])
            similarity_scores.append(row)

    # Create a new DataFrame with the similarity scores, most similar pairs first
    result_df = pd.DataFrame(similarity_scores, columns=columns)
    return result_df.sort_values('SimilarityScore', ascending=False).reset_index(drop=True)

In [None]:
# Keep only the identifier, structure and potency columns, then drop compounds
# without an IC50 and renumber the rows 0..n-1.
df = Data.get_mirati_g12d_raw()[['ID','SMILES','IC50']]
df = df.dropna(subset= 'IC50').reset_index(drop=True)

In [None]:
df.head()

Unnamed: 0,ID,SMILES,IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)...,124.7
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2...,2.7
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)...,9.5
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(...,496.2
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-...,722.9


In [None]:
# result = tanimoto(df.head(), target_col = 'IC50')

TODO: modify morgan/ecfp

TODO: add more fingerprints

## End

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()