## This script demonstrates how to use the `polaris.curation` module to perform data curation
- DMPK datasets published in Fang et al. 2023 (DOI:10.1021/acs.jcim.3c00160).

- Curate the chemistry information on the molecules.
  - Clean the molecules by performing molecule fixing and sanitization, standardization, and salt/solvent removal.
  - Remove stereochemistry information if `ignore_stereo` is set to `True`. This is recommended if the downstream molecule representation is not able to differentiate the stereoisomers. 

- Curate the measured endpoint values in the datasets
  - Merge measurements of repeated molecules in the dataset. The identification of the repeated molecules is defined by `dm.hash_mol` with or without stereochemistry information.
  - Classify the measured values based on provided threshold values for classification tasks.
  - Detect activity cliffs between stereoisomers. When `mask_stereo_cliff` is set to `True`, the target activity values of those molecule pairs are set to `None`. This is recommended if the downstream molecule representation is not able to differentiate the stereoisomers.


In [29]:
%load_ext autoreload
%autoreload 2
import datamol as dm 
import pandas as pd
from polaris import curation

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data curation for DMPK datasets

In [30]:
# Root location of the DMPK datasets; the tables produced by this notebook are written here.
OUTDIR = "gs://polaris-private/dataset/DMPK"
# The per-endpoint input SDF files are read from a sub-folder of the root.
INDIR = f"{OUTDIR}/Fang2023"

In [31]:
# Short endpoint name -> name of the measurement column in the data table.
endpoints = {
    "HLM": "LOG HLM_CLint (mL/min/kg)",
    "RLM": "LOG RLM_CLint (mL/min/kg)",
    "hPPB": "LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound)",
    "rPPB": "LOG PLASMA PROTEIN BINDING (RAT) (% unbound)",
    "MDR1_ER": "LOG MDR1-MDCK ER (B-A/A-B)",
    "Sol": "LOG SOLUBILITY PH 6.8 (ug/mL)",
}

# Class-conversion settings per endpoint short name (HLM and RLM have none).
# Both plasma-protein-binding endpoints use the same settings; the comprehension
# builds a separate dict (and threshold list) for each of them.
# NOTE(review): the meaning of "label_order" is defined by polaris.curation — confirm there.
class_thresholds = {
    **{
        ppb: {"thresholds": [0.3, 1], "label_order": "descending"}
        for ppb in ("hPPB", "rPPB")
    },
    "MDR1_ER": {"thresholds": [1, 2]},
    "Sol": {"thresholds": [0, 1]},
}

### Perform curation which takes stereochemistry information into account. 

It's important to detect and analyze activity cliffs between stereoisomers.

In [32]:
# Load the public ADME table.
# NOTE(review): the path is relative (working directory), not under INDIR — confirm this is intended.
data = dm.read_csv("ADME_public_set_3521.csv")

In [33]:
# Quick look at the summary statistics of the loaded table.
data.describe()

Unnamed: 0,LOG HLM_CLint (mL/min/kg),LOG MDR1-MDCK ER (B-A/A-B),LOG SOLUBILITY PH 6.8 (ug/mL),LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound),LOG PLASMA PROTEIN BINDING (RAT) (% unbound),LOG RLM_CLint (mL/min/kg)
count,3087.0,2642.0,2173.0,194.0,168.0,3054.0
mean,1.320019,0.397829,1.259943,0.765722,0.764177,2.256207
std,0.623952,0.688465,0.683416,0.847902,0.798988,0.750422
min,0.675687,-1.162425,-1.0,-1.59346,-1.638272,1.02792
25%,0.675687,-0.162356,1.15351,0.168067,0.226564,1.688291
50%,1.205313,0.153291,1.542825,0.867555,0.776427,2.311068
75%,1.803115,0.905013,1.687351,1.501953,1.375962,2.835274
max,3.372714,2.725057,2.179264,2.0,2.0,3.969622


In [34]:
# Measurement columns to curate, in the order the endpoints are declared.
data_cols = [*endpoints.values()]
# Name of the column passed to the curator as the molecule column.
mol_col = "SMILES"

In [35]:
# Curate the loaded table.
# The threshold settings are keyed by endpoint short name, so they are re-keyed
# by the matching data column name before being handed to the curator.
curator_with_stereo = curation.MolecularCurator(
    data=data,
    data_cols=data_cols,
    mol_col=mol_col,
    # NOTE(review): the exact effect of this flag is defined in polaris.curation — confirm there.
    mask_stereo_undefined_mols=True,
    class_thresholds={
        endpoints[name]: settings for name, settings in class_thresholds.items()
    },
)
df_full = curator_with_stereo.run()

In [8]:
# Earlier (v0) curated file, left commented out for reference:
# df_full_v0  = dm.read_csv(f"{OUTDIR}/ADME_public_set_3521_curated.csv")
# Write the curated table as version v1, without the row index.
df_full.to_csv(f"{OUTDIR}/ADME_public_set_3521_curated_v1.csv", index=False)



In [9]:
# Display the curated-file path without the `_v1` suffix.
# NOTE(review): the save cell above writes `..._curated_v1.csv`, not this path — confirm which one is meant.
f"{OUTDIR}/ADME_public_set_3521_curated.csv"

'gs://polaris-private/dataset/DMPK/ADME_public_set_3521_curated.csv'

In [10]:
### Extend the dataset with the ChEMBL hPPB/rPPB records

In [11]:
# Endpoints whose per-endpoint SDF files are loaded in the next cell.
_endpoint = ["hPPB", "rPPB"]

In [12]:
# Read one SDF file per endpoint from INDIR into a DataFrame, keyed by endpoint name.
data_dict = {
    name: dm.read_sdf(f"{INDIR}/ADME_{name}.sdf", as_df=True)
    for name in _endpoint
}
    

In [13]:
# Number of hPPB records from the SDF file per source collection.
data_dict["hPPB"]["CollectionName"].value_counts()

CollectionName
chembl           1614
emolecules        187
mcule               3
labnetworkBB        2
enamineBB_pmc       1
enamineHTS          1
Name: count, dtype: int64

In [15]:
# Reload the public ADME table so the extension starts from the raw data.
# NOTE(review): the path is relative (working directory), not under INDIR — confirm this is intended.
data = dm.read_csv("ADME_public_set_3521.csv")

In [16]:
# Source collections of the rows in the public table that have an hPPB value.
data.dropna(subset=endpoints["hPPB"])["CollectionName"].value_counts()

CollectionName
emolecules       187
mcule              3
labnetworkBB       2
enamineBB_pmc      1
enamineHTS         1
Name: count, dtype: int64

In [17]:
# Keep only the records whose collection is 'chembl' for each PPB endpoint.
chembl_dict = {
    name: data_dict[name].query("CollectionName == 'chembl'")
    for name in ("hPPB", "rPPB")
}

In [18]:
# Columns taken from the rat table: the rat measurement plus the identifying columns.
cols = [
    "LOG PLASMA PROTEIN BINDING (RAT) (% unbound)",
    "SMILES",
    "Internal ID",
    "Source",
    "CollectionName",
]
# Outer-join human and rat ChEMBL records on the identifying columns (everything
# in `cols` after the measurement), so a molecule measured in only one species is kept.
pbb_df = pd.merge(
    chembl_dict["hPPB"],
    chembl_dict["rPPB"][cols],
    on=cols[1:],
    how="outer",
)
# Remove the lower-case "smiles" column; "SMILES" is the column used as the key above.
# NOTE(review): "smiles" is presumably added by the SDF reader — confirm.
pbb_df = pbb_df.drop(columns="smiles")

In [21]:
# Append the ChEMBL PPB rows to the public table (row-wise concatenation).
# The row labels of both frames are kept as they are at this point.
extended_data = pd.concat([data, pbb_df], axis=0)

In [22]:
# Save the extended table. The row index is not written (index=False), so the
# saved file does not depend on the frame's current row labels.
extended_data.to_csv(f"{OUTDIR}/ADME_public_set_extended.csv", index=False)

In [23]:
# For every endpoint column, report how many measured rows each source collection contributes.
for col in data_cols:
    print(
        col,
        extended_data.dropna(subset=[col])["CollectionName"].value_counts(),
        "------------------",
        sep="\n",
    )

LOG HLM_CLint (mL/min/kg)
CollectionName
emolecules       3027
enamineHTS         20
labnetworkBB       17
mcule              17
enamineBB_pmc       6
Name: count, dtype: int64
------------------
LOG RLM_CLint (mL/min/kg)
CollectionName
emolecules       2997
enamineHTS         19
labnetworkBB       17
mcule              15
enamineBB_pmc       6
Name: count, dtype: int64
------------------
LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound)
CollectionName
chembl           1614
emolecules        187
mcule               3
labnetworkBB        2
enamineBB_pmc       1
enamineHTS          1
Name: count, dtype: int64
------------------
LOG PLASMA PROTEIN BINDING (RAT) (% unbound)
CollectionName
chembl           717
emolecules       162
labnetworkBB       3
enamineBB_pmc      2
mcule              1
Name: count, dtype: int64
------------------
LOG MDR1-MDCK ER (B-A/A-B)
CollectionName
emolecules       2594
labnetworkBB       16
enamineHTS         14
mcule              13
enamineBB_pmc       5
Name: c

In [24]:
# Path of the extended table under OUTDIR (the same path the extended table was saved to).
file = f"{OUTDIR}/ADME_public_set_extended.csv"

In [25]:
# Display the resolved path.
file

'gs://polaris-private/dataset/DMPK/ADME_public_set_extended.csv'

In [27]:
# Reload the extended table from the saved CSV and curate it.
data = pd.read_csv(file)
# The threshold settings are keyed by endpoint short name, so they are re-keyed
# by the matching data column name before being handed to the curator.
curator_with_stereo = curation.MolecularCurator(
    data=data,
    data_cols=data_cols,
    mol_col="SMILES",
    # NOTE(review): the exact effect of this flag is defined in polaris.curation — confirm there.
    mask_stereo_undefined_mols=True,
    class_thresholds={
        endpoints[name]: settings for name, settings in class_thresholds.items()
    },
)
df_full = curator_with_stereo.run()

In [28]:
# Write the curated extended table (version v1) without the row index.
# The path is built from OUTDIR, like the other outputs of this notebook, so
# changing OUTDIR moves this file too instead of silently writing to the old bucket.
file_out = f"{OUTDIR}/ADME_public_set_extended_curated_v1.csv"
df_full.to_csv(file_out, index=False)