## Background
This is part of a release of experimental data determined at AstraZeneca on a set of compounds in the following assays: pKa, lipophilicity (LogD7.4), aqueous solubility, plasma protein binding (human, rat, dog, mouse and guinea pig), intrinsic clearance (human liver microsomes, human and rat hepatocytes).

## Description of readout:
- **LOGD_74**: Octan-1-ol/water (pH7.4) distribution coefficient measured by a shake flask method described in J. Biomol. Screen. 2011, 16, 348-355.
- **SOLUBILITY_74**: Solubility in pH7.4 buffer using solid starting material using the method described in J. Assoc. Lab. Autom. 2011, 16, 276-284.
- **PPB**: Percent plasma-bound. % bound to plasma by equilibrium dialysis. Compound is incubated with whole human plasma at 37C for >5hrs. Method described in B. Testa et al (Eds.), Pharmacokinetic Profiling in Drug Research: Biological, Physicochemical, and Computational Strategies, Wiley-VCH, Weinheim, 2006, pp.119-141. Experimental range 10% to 99.95% bound.
- **HLM_CLEARANCE**: Intrinsic clearance measured in human liver microsomes following incubation at 37C. Experimental range <3 to >150 microL/min/mg. Rapid Commun. Mass Spectrom. 2010, 24, 1730-1736.

## Data resource

**Reference**: https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/

**Raw data**: https://www.ebi.ac.uk/chembl/assay_report_card/CHEMBL3301363/, https://www.ebi.ac.uk/chembl/assay_report_card/CHEMBL3301364/, https://www.ebi.ac.uk/chembl/assay_report_card/CHEMBL3301365/, https://www.ebi.ac.uk/chembl/assay_report_card/CHEMBL3301370/

In [46]:
import datamol as dm

from polaris import (
    curation,
    load_dataset,
)
from polaris.curation.viz_utils import (
    visualize_distribution,
    verify_stereoisomers,
    check_undefined_stereocenters,
)
from polaris.dataset import Dataset, ColumnAnnotation
from polaris.utils.types import HubOwner, License
from polaris.hub.client import PolarisHubClient

import tempfile

# For downloading ChEMBL datasets
from chembl_webresource_client.new_client import new_client as client
from tqdm.auto import tqdm
import warnings
import pandas as pd

In [47]:
# Load the four individually curated assay tables, one parquet file per ChEMBL assay.
# All files live in the same folder; to work from a local copy, point RAW_DATA_DIR
# at the directory holding the files (e.g. "./data").
RAW_DATA_DIR = "gs://polaris-public/data/raw/AstraZeneca"

# LogD at pH 7.4 (CHEMBL3301363)
logd = pd.read_parquet(f"{RAW_DATA_DIR}/CHEMBL3301363_curated.parquet")
print(len(logd))

# Solubility at pH 7.4 (CHEMBL3301364)
sol = pd.read_parquet(f"{RAW_DATA_DIR}/CHEMBL3301364_curated.parquet")
print(len(sol))

# Plasma protein binding (CHEMBL3301365)
ppb = pd.read_parquet(f"{RAW_DATA_DIR}/CHEMBL3301365_curated.parquet")
print(len(ppb))

# Human liver microsome clearance (CHEMBL3301370)
hlm = pd.read_parquet(f"{RAW_DATA_DIR}/CHEMBL3301370_curated.parquet")
print(len(hlm))

4193
1699
1598
1098


Combine all of the datasets

In [48]:
# Stack the four assay tables row-wise. Columns missing from a given table are
# filled with NaN, and each table's own index labels are kept as-is.
assay_tables = [logd, sol, ppb, hlm]
combined = pd.concat(assay_tables, axis=0)

Have a look at the columns

In [49]:
# Show the union of columns contributed by the four concatenated assay tables.
combined.columns

Index(['SMILES', 'LOGD_74', 'ORIGINAL_LOGD_74', 'ORIGINAL_SMILES', 'smiles',
       'molhash_id', 'molhash_id_no_stereo', 'num_stereo_center',
       'num_undefined_stereo_center', 'num_defined_stereo_center',
       'num_stereoisomers', 'num_undefined_stereoisomers', 'undefined_E_D',
       'undefined_E/Z', 'OUTLIER_LOGD_74', 'LOGD_74_zscore',
       'LOGD_74_stereo_cliff', 'SOLUBILITY_74', 'ORIGINAL_SOLUBILITY_74',
       'OUTLIER_SOLUBILITY_74', 'SOLUBILITY_74_stereo_cliff', 'PPB',
       'ORIGINAL_PPB', 'OUTLIER_PPB', 'PPB_zscore', 'PPB_stereo_cliff',
       'HLM_CLEARANCE', 'ORIGINAL_HLM_CLEARANCE', 'OUTLIER_HLM_CLEARANCE',
       'HLM_CLEARANCE_zscore', 'HLM_CLEARANCE_stereo_cliff'],
      dtype='object')

In [50]:
# Molecule column plus the four endpoint readouts; every other column
# (originals, outlier flags, z-scores, stereo bookkeeping) is dropped here.
desired_columns = ["smiles", "LOGD_74", "SOLUBILITY_74", "PPB", "HLM_CLEARANCE"]
combined = combined.loc[:, desired_columns]

Run curation again - the curator will average or combine values that have NaN in one sample and a value in another.

In [51]:
mol_col = "smiles"
endpoints = ["LOGD_74", "SOLUBILITY_74", "PPB", "HLM_CLEARANCE"]
data_cols = endpoints

# Run the molecular curator over the merged table. Stereo-undefined molecules
# are not masked and `keep_all_rows` is off.
# NOTE(review): presumably `keep_all_rows=False` is what collapses the repeated
# molecules coming from the different assay tables - confirm against polaris docs.
curator_options = {
    "data": combined,
    "data_cols": data_cols,
    "mol_col": mol_col,
    "mask_stereo_undefined_mols": False,
    "keep_all_rows": False,
}
curator = curation.MolecularCurator(**curator_options)
data_look = curator()

  data.loc[data[NO_STEREO_UNIQUE_ID].isin(mol_with_cliff), f"{data_col}_stereo_cliff"] = True
  data.loc[data[NO_STEREO_UNIQUE_ID].isin(mol_with_cliff), f"{data_col}_stereo_cliff"] = True
  data.loc[data[NO_STEREO_UNIQUE_ID].isin(mol_with_cliff), f"{data_col}_stereo_cliff"] = True
  data.loc[data[NO_STEREO_UNIQUE_ID].isin(mol_with_cliff), f"{data_col}_stereo_cliff"] = True


In [52]:
# Display the curator output (pandas truncates the rendering of long/wide frames).
data_look

Unnamed: 0,smiles,LOGD_74,SOLUBILITY_74,PPB,HLM_CLEARANCE,ORIGINAL_LOGD_74,ORIGINAL_SOLUBILITY_74,ORIGINAL_PPB,ORIGINAL_HLM_CLEARANCE,ORIGINAL_smiles,...,OUTLIER_SOLUBILITY_74,OUTLIER_PPB,OUTLIER_HLM_CLEARANCE,LOGD_74_zscore,LOGD_74_stereo_cliff,SOLUBILITY_74_stereo_cliff,PPB_zscore,PPB_stereo_cliff,HLM_CLEARANCE_zscore,HLM_CLEARANCE_stereo_cliff
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N[C@H]1CCCN(Cc2ccc...,2.01,,63.47,,2.01,,,,CN[C@@H](C)C(=O)N[C@H](C(=O)N[C@H]1CCCN(Cc2ccc...,...,False,False,False,-0.287892,,,-5.541574,,,
1,C[C@H]1O[C@@H](n2cnc3c(N)nc(OCC4CC(F)(F)C4(F)F...,1.42,,,,1.42,,,,C[C@H]1O[C@@H](n2cnc3c(N)nc(OCC4CC(F)(F)C4(F)F...,...,False,False,False,-0.773197,,,,,,
2,FC(F)(F)c1cc(COCC(c2cccc(Cl)c2)N2CCNCC2)cc(C(F...,4.20,,,,4.20,,,,FC(F)(F)c1cc(COCC(c2cccc(Cl)c2)N2CCNCC2)cc(C(F...,...,False,False,False,1.513491,,,,,,
3,COc1ccc(N2CCN(C(=O)[C@@H]3CCCC[C@H]3C(=O)NC3(C...,1.40,,,,1.40,,,,COc1ccc(N2CCN(C(=O)[C@@H]3CCCC[C@H]3C(=O)NC3(C...,...,False,False,False,-0.789648,,,,,,
4,CSCCC(NC(=O)c1sccc1Cl)c1nc2ccccc2[nH]1,3.69,25700.0,,63.0,3.69,,,,CSCCC(NC(=O)c1sccc1Cl)c1nc2ccccc2[nH]1,...,False,False,False,1.093991,,,,,3.545472,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5506,CC(=O)N1CCN(CCOc2ccc(C3CCN(c4ccc5nnc(C(F)(F)F)...,3.20,5400.0,,150.0,3.20,,,,CC(=O)N1CCN(CCOc2ccc(C3CCN(c4ccc5nnc(C(F)(F)F)...,...,,,,0.690942,,,,,9.664419,
5507,O=c1[nH]c2c(O)ccc([C@@H](O)CNCCCSCCNCCc3cccc(C...,1.06,,,20.0,1.06,,,,O=c1[nH]c2c(O)ccc([C@@H](O)CNCCCSCCNCCc3cccc(C...,...,,,,-1.069315,,,,,0.521165,
5508,Nc1ccc(OCc2ccccc2)cc1,2.20,467700.0,,,2.20,,,,Nc1ccc(OCc2ccccc2)cc1,...,,,,-0.131608,,,,,,
5509,NC1=NN(c2cccc(C(F)(F)F)c2)CC1,2.55,1445400.0,,,2.55,,,,NC1=NN(c2cccc(C(F)(F)F)c2)CC1,...,,,,0.156284,,,,,,


Having a look at overlaps between different assays:

In [53]:
# For every ordered pair of distinct endpoints, count the molecules that have a
# measured (non-NaN) value for both.
assay_cols = ["LOGD_74", "SOLUBILITY_74", "PPB", "HLM_CLEARANCE"]
for first in assay_cols:
    for second in assay_cols:
        if first != second:  # Don't print self-self comparisons
            has_both = data_look[first].notna() & data_look[second].notna()
            print(f"overlap between {first} and {second}: {len(data_look[has_both])}")

overlap between LOGD_74 and SOLUBILITY_74: 1047
overlap between LOGD_74 and PPB: 993
overlap between LOGD_74 and HLM_CLEARANCE: 883
overlap between SOLUBILITY_74 and LOGD_74: 1047
overlap between SOLUBILITY_74 and PPB: 148
overlap between SOLUBILITY_74 and HLM_CLEARANCE: 202
overlap between PPB and LOGD_74: 993
overlap between PPB and SOLUBILITY_74: 148
overlap between PPB and HLM_CLEARANCE: 533
overlap between HLM_CLEARANCE and LOGD_74: 883
overlap between HLM_CLEARANCE and SOLUBILITY_74: 202
overlap between HLM_CLEARANCE and PPB: 533


In [54]:
# For every ordered triple of distinct endpoints, count the molecules that have a
# measured (non-NaN) value for all three.
assay_cols = ["LOGD_74", "SOLUBILITY_74", "PPB", "HLM_CLEARANCE"]
for first in assay_cols:
    for second in assay_cols:
        for third in assay_cols:
            # A triple is only informative when all three endpoints differ.
            if len({first, second, third}) == 3:
                has_all = (
                    data_look[first].notna()
                    & data_look[second].notna()
                    & data_look[third].notna()
                )
                print(
                    f"overlap between {first}, {second}, {third}: {len(data_look[has_all])}"
                )

overlap between LOGD_74, SOLUBILITY_74, PPB: 128
overlap between LOGD_74, SOLUBILITY_74, HLM_CLEARANCE: 165
overlap between LOGD_74, PPB, SOLUBILITY_74: 128
overlap between LOGD_74, PPB, HLM_CLEARANCE: 425
overlap between LOGD_74, HLM_CLEARANCE, SOLUBILITY_74: 165
overlap between LOGD_74, HLM_CLEARANCE, PPB: 425
overlap between SOLUBILITY_74, LOGD_74, PPB: 128
overlap between SOLUBILITY_74, LOGD_74, HLM_CLEARANCE: 165
overlap between SOLUBILITY_74, PPB, LOGD_74: 128
overlap between SOLUBILITY_74, PPB, HLM_CLEARANCE: 87
overlap between SOLUBILITY_74, HLM_CLEARANCE, LOGD_74: 165
overlap between SOLUBILITY_74, HLM_CLEARANCE, PPB: 87
overlap between PPB, LOGD_74, SOLUBILITY_74: 128
overlap between PPB, LOGD_74, HLM_CLEARANCE: 425
overlap between PPB, SOLUBILITY_74, LOGD_74: 128
overlap between PPB, SOLUBILITY_74, HLM_CLEARANCE: 87
overlap between PPB, HLM_CLEARANCE, LOGD_74: 425
overlap between PPB, HLM_CLEARANCE, SOLUBILITY_74: 87
overlap between HLM_CLEARANCE, LOGD_74, SOLUBILITY_74: 165

In [55]:
# Drop the curator's bookkeeping columns again, keeping only the molecule
# column and the four endpoints, then display the result.
data_look = data_look.loc[:, desired_columns]
data_look

Unnamed: 0,smiles,LOGD_74,SOLUBILITY_74,PPB,HLM_CLEARANCE
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N[C@H]1CCCN(Cc2ccc...,2.01,,63.47,
1,C[C@H]1O[C@@H](n2cnc3c(N)nc(OCC4CC(F)(F)C4(F)F...,1.42,,,
2,FC(F)(F)c1cc(COCC(c2cccc(Cl)c2)N2CCNCC2)cc(C(F...,4.20,,,
3,COc1ccc(N2CCN(C(=O)[C@@H]3CCCC[C@H]3C(=O)NC3(C...,1.40,,,
4,CSCCC(NC(=O)c1sccc1Cl)c1nc2ccccc2[nH]1,3.69,25700.0,,63.0
...,...,...,...,...,...
5506,CC(=O)N1CCN(CCOc2ccc(C3CCN(c4ccc5nnc(C(F)(F)F)...,3.20,5400.0,,150.0
5507,O=c1[nH]c2c(O)ccc([C@@H](O)CNCCCSCCNCCc3cccc(C...,1.06,,,20.0
5508,Nc1ccc(OCc2ccccc2)cc1,2.20,467700.0,,
5509,NC1=NN(c2cccc(C(F)(F)F)c2)CC1,2.55,1445400.0,,


In [56]:
# Column-level metadata attached to the dataset: one ColumnAnnotation per column
# of the final table, keyed by column name.
annotations = {
    "smiles": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    "LOGD_74": ColumnAnnotation(
        # Spelling of "coefficient" corrected; this text is published with the dataset.
        description="Octan-1-ol/water (pH7.4) distribution coefficient.",
        user_attributes={
            "PH": "7.4",
        },
    ),
    "SOLUBILITY_74": ColumnAnnotation(
        description="Solubility in pH7.4 buffer using solid starting material.",
        user_attributes={"PH": "7.4"},
    ),
    "PPB": ColumnAnnotation(
        description="Percent bound to whole human plasma by equilibrium dialysis.",
        user_attributes={"unit": "percent", "organism": "human"},
    ),
    "HLM_CLEARANCE": ColumnAnnotation(
        description="Intrinsic clearance measured in human liver microsomes following incubation at 37C.",
        user_attributes={"unit": "uL/min/mg", "organism": "human"},
    ),
}

In [57]:
# Dataset-level metadata objects, built up front so the constructor call stays flat.
dataset_owner = HubOwner(user_id="Shawn Whitfield", slug="swhitfield")
dataset_license = License(id="MIT")

# Wrap the curated table and its column annotations into a polaris Dataset.
dataset = Dataset(
    table=data_look,
    annotations=annotations,
    name="AstraZeneca_ADME",
    description="A set of experiment data released by AstraZeneca",
    source="https://www.ebi.ac.uk/chembl/document_report_card/CHEMBL3301361/",
    tags=["ADME"],
    owner=dataset_owner,
    license=dataset_license,
    user_attributes={"year": "2016"},
)

In [58]:
# Create a scratch directory for the serialized dataset.
# `tempfile.mkdtemp()` is used instead of `tempfile.TemporaryDirectory().name`:
# the latter drops the only reference to the TemporaryDirectory object, whose
# finalizer then removes the directory again, leaving a path that no longer exists.
# The directory made here persists until it is deleted explicitly.
temp_dir = tempfile.mkdtemp()

save_dir = dm.fs.join(temp_dir, "dataset")

# Serialize the dataset into save_dir; `path` is kept so the dataset can be
# reloaded from it later.
path = dataset.to_json(save_dir)

# Look at the save destination
fs = dm.fs.get_mapper(save_dir).fs
fs.ls(save_dir)

  Expected `url` but got `str` - serialized value may not be as expected
  Expected `url` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


['/var/folders/kq/q3sv3jmd5ys443j8rfrgmrxm0000gq/T/tmpakomfyff/dataset/table.parquet',
 '/var/folders/kq/q3sv3jmd5ys443j8rfrgmrxm0000gq/T/tmpakomfyff/dataset/dataset.json']

In [59]:
# Persist the merged, curated table alongside the raw per-assay files.
# data_look.to_parquet("./data/AstraZeneca_curated.parquet")  # Local alternative; requires a ./data folder.
data_look.to_parquet(
    "gs://polaris-public/data/raw/AstraZeneca/AstraZeneca_curated.parquet"
)

In [18]:
# Load dataset through json file given
# dataset = load_dataset(path)

In [None]:
# NOTE: Commented out to not flood the DB
# NOTE(review): if re-enabled, `as client` rebinds the name `client`, which the
# import cell already uses for the ChEMBL web-resource client.
# with PolarisHubClient() as client:
#     client.upload_dataset(dataset=dataset)