Note: this notebook is set up to run in the environment named 'polaris_datasets', as defined in env.yml.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris)
The first step in creating a benchmark is to set up a standard dataset that provides access to the curated data (the curation is demonstrated in <01_ncats_cyp_data_curation.ipynb>), together with all necessary information about the dataset, such as the data source, descriptions of the endpoints, units, etc.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm
import numpy as np

from polaris.cli import PolarisHubClient

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation
from polaris.dataset._column import Modality

from polaris.utils.types import HubOwner


# Resolve the directory used as `root` (presumably the repository root — confirm).
# "__file__" is a string literal here, not the module attribute, so `.absolute()`
# yields <cwd>/__file__ and `parents[3]` is the directory three levels above the
# current working directory.
# NOTE(review): this depends on the cwd and the next line changes it, so re-running
# this cell without restarting the kernel resolves a different `root`.
root = pathlib.Path("__file__").absolute().parents[3]
# Work from `root` and put it first on sys.path, presumably so that the `utils`
# package imported below can be found.
os.chdir(root)
sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Organisation slug and the dataset's folder name
org = "polaris"
data_name = "ncats_adme/CYP"

# Both locations follow the same "org-<org>/<data_name>" layout:
# one path under `root` and one in the public GCS bucket.
dirname = dm.fs.join(root, "org-" + org, data_name)
gcp_root = "/".join(("gs://polaris-public/polaris-recipes", "org-" + org, data_name))

# The organisation is registered as the owner; display it as the cell output
owner = HubOwner(type="organization", slug=org.lower())
owner

HubOwner(slug='polaris', external_id=None, type='organization')

In [3]:
# Sub-folders of the GCS root for benchmarks, datasets and figures
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{subdir}" for subdir in ("benchmarks", "datasets", "figures")
)

## Load existing data

In [4]:
# Location of the curated CYP table. Built from `gcp_root` rather than repeating
# the bucket/org/dataset prefix, so the path follows the configuration defined
# earlier in the notebook (the resulting string is unchanged).
PATH = f"{gcp_root}/data/curation/CYP_curated.csv"
table = pd.read_csv(PATH)
# Show the available columns as the cell output
table.columns

Index(['PUBCHEM_SID', 'PUBCHEM_CID', 'SMILES', 'CYP2D6_OUTCOME',
       'CYP2D6_SCORE', 'CYP3A4_OUTCOME', 'CYP3A4_SCORE', 'CYP2C9_OUTCOME',
       'CYP2C9_SCORE', 'MOL_smiles', 'MOL_molhash_id',
       'MOL_molhash_id_no_stereo', 'MOL_num_stereoisomers',
       'MOL_num_undefined_stereoisomers', 'MOL_num_defined_stereo_center',
       'MOL_num_undefined_stereo_center', 'MOL_num_stereo_center',
       'MOL_undefined_E_D', 'MOL_undefined_E/Z', 'OUTLIER_CYP2D6_SCORE',
       'OUTLIER_CYP3A4_SCORE', 'OUTLIER_CYP2C9_SCORE', 'AC_CYP2D6_OUTCOME',
       'AC_CYP3A4_OUTCOME', 'AC_CYP2C9_OUTCOME'],
      dtype='object')

### Define the dataset name and version

In [5]:
# Version tag, and the dataset name that carries it as a suffix
version = "v1"
dataset_name = "-".join(("ncats_cyp", version))

### Specify the meta-information of the data columns

In [6]:
# Additional meta-data on the column level.
# The keys are column names of `table`; they also determine which columns are
# kept when the `Dataset` is built. Each `*_OUTCOME` column points to the
# `*_SCORE` column it was binarized from via the `ref_col` user attribute.
annotations = {
    "MOL_molhash_id": ColumnAnnotation(
        description="Molecular hash ID. See <datamol.mol.hash_mol>"
    ),
    "MOL_smiles": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    "CYP2D6_OUTCOME": ColumnAnnotation(
        description="Binarized label based on the phenotype observed, active antagonism (class = 1) if CYP2D6_SCORE between 40-100, Inactive (class = 0) if 0.",
        user_attributes={"ref_col": "CYP2D6_SCORE"},
    ),
    "CYP2D6_SCORE": ColumnAnnotation(
        description="Average activity score of 5 replicates for CYP2D6"
    ),
    "CYP3A4_OUTCOME": ColumnAnnotation(
        description="Binarized label based on the phenotype observed, active antagonism (class = 1) if CYP3A4_SCORE between 40-100, Inactive (class = 0) if 0.",
        user_attributes={"ref_col": "CYP3A4_SCORE"},
    ),
    "CYP3A4_SCORE": ColumnAnnotation(
        # Fixed copy-paste error: this column holds the CYP3A4 score, not CYP2D6
        description="Average activity score of 5 replicates for CYP3A4"
    ),
    "CYP2C9_OUTCOME": ColumnAnnotation(
        description="Binarized label based on the phenotype observed, active antagonism (class = 1) if CYP2C9_SCORE between 40-100, Inactive (class = 0) if 0.",
        # Fixed copy-paste error: the reference column is the CYP2C9 score
        user_attributes={"ref_col": "CYP2C9_SCORE"},
    ),
    "CYP2C9_SCORE": ColumnAnnotation(
        description="Average activity score of 5 replicates for CYP2C9"
    ),
}

### Define `Dataset` object

In [7]:
# NOTE(review): these are the same values as the earlier `version` /
# `dataset_name` cell; they are re-assigned here right before the `Dataset`
# object is built.
version = "v1"
dataset_name = f"ncats_cyp-{version}"

In [8]:
# Build the Polaris `Dataset` from the curated table and the column annotations.
dataset = Dataset(
    # The table is the core data-structure required to construct a dataset.
    # Only the annotated columns are kept, in the order of `annotations`.
    table=table[list(annotations)],
    # Additional meta-data on the dataset level.
    name=dataset_name,
    description="ADME Cytochrome P450 CYP2D6, CYP3A4, CYP2C9 antagonist experiment data released by the National Center for Advancing Translational Sciences",
    source="https://opendata.ncats.nih.gov/adme/data",
    annotations=annotations,
    tags=["ADME"],
    # Reuse the organisation owner defined earlier in the notebook instead of
    # building a second, differently-specified `HubOwner` inline.
    owner=owner,
    license="CC-BY-4.0",
    user_attributes={"year": "2021"},
    curation_reference="https://github.com/polaris-hub/polaris-recipes/org-Polaris/ncats_adme/CYP/01_ncats_cyp_data_curation.ipynb",
)

In [9]:
# Write the dataset to <DATASET_DIR>/<dataset_name> on GCP and display the
# path returned by `to_json`
SAVE_DIR = "/".join((DATASET_DIR, dataset_name))
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

'gs://polaris-public/polaris-recipes/org-polaris/ncats_adme/CYP/datasets/ncats_cyp-v1/dataset.json'

In [10]:
# upload to Polaris Hub
# NOTE(review): the call is left commented out, presumably to avoid re-uploading
# on every run; the success message recorded below this cell comes from an
# earlier execution. Uncomment to publish — confirm the intended `access` level.
# dataset.upload_to_hub(owner=owner, access="private")

2024-07-10 02:17:00.279 | SUCCESS | polaris.hub.client:upload_dataset:631 - Your dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/polaris/ncats_cyp-v1
