Note: this notebook is set up to run with the env.yml containing the name 'polaris_datasets'

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm
import numpy as np

from polaris.cli import PolarisHubClient

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation
from polaris.dataset._column import Modality

from polaris.utils.types import HubOwner


root = pathlib.Path("__file__").absolute().parents[2]
os.chdir(root)
sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
org = "novartis"
data_name = "CYP"
dirname = dm.fs.join(root, f"org-{org}", data_name)
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"

owner = HubOwner(slug=org, type="organization")
owner

HubOwner(slug='novartis', external_id=None, type='organization')

In [3]:
BENCHMARK_DIR = f"{gcp_root}/benchmarks"
DATASET_DIR = f"{gcp_root}/datasets"
FIGURE_DIR = f"{gcp_root}/figures"

## Load existing data


In [4]:
# Load the data
source_data_path = (
    "gs://polaris-public/polaris-recipes/org-novartis/CYP/data/curation/CYP_curated.csv"
)
data = pd.read_csv(source_data_path)

### Define `Dataset` object

In [5]:
version = "v1"
dataset_name = f"novartis_cyp3a4-{version}"

In [6]:
# Additional meta-data on the column level
annotations = {
    "MOL_smiles": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    "log_kobs": ColumnAnnotation(
        description="Log unit of TDI CYP3A4 inactivation.",
    ),
    "CLS_log_kobs": ColumnAnnotation(
        description="Three-class binning on TDI CYP3A4 inactivation  0.01 and 0.025.",
    ),
    "pIC50_CYP3A4": ColumnAnnotation(
        description="Reversible CYP3A4 inhibition pIC50 values.",
    ),
    "pIC50_CYP2C9": ColumnAnnotation(
        description="Reversible CYP2C9 inhibition pIC50 values."
    ),
    "pIC50_CYP2D6": ColumnAnnotation(
        description="Reversible CYP2D6 inhibition pIC50 values."
    ),
    "split": ColumnAnnotation(
        description="Train and test splitting from the original publication doi.org/10.1021/acs.chemrestox.3c00305",
    ),
}

In [7]:
dataset = Dataset(
    # The table is the core data-structure required to construct a dataset
    table=data[annotations.keys()],
    # Additional meta-data on the dataset level.
    name=dataset_name,
    description="CYP3A4 Time-Dependent Inhibition data released by Novartis",
    source="https://opendata.ncats.nih.gov/adme/data",
    annotations=annotations,
    tags=["ADME"],
    owner=owner,
    license="CC-BY-4.0",
    user_attributes={"year": "2024"},
    curation_reference="https://github.com/polaris-hub/polaris-recipes/org-Polaris/org-Novartis/CYP/00_CYP3A4_data_processing.ipynb",
)

In [8]:
# save the dataset to GCP
SAVE_DIR = f"{DATASET_DIR}/{dataset_name}"
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

'gs://polaris-public/polaris-recipes/org-novartis/CYP/datasets/novartis_cyp3a4-v1/dataset.json'

In [9]:
# from polaris.hub.client import PolarisHubClient
# client = PolarisHubClient()
# client.login()

# client.upload_dataset(dataset=dataset, access="private", owner=owner)

[32m2024-07-10 01:40:37.248[0m | [1mINFO    [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m285[0m - [1mYou are already logged in to the Polaris Hub as  (lu@valencediscovery.com). Set `overwrite=True` to force re-authentication.[0m
[32m2024-07-10 01:40:39.736[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m631[0m - [32m[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/novartis/novartis_cyp3a4-v1[0m


{'id': 'cYlWyrr7wFiTiTHKZt9v9',
 'createdAt': '2024-07-10T05:40:37.884Z',
 'deletedAt': None,
 'name': 'novartis-cyp3a4-v1',
 'slug': 'novartis-cyp3a4-v1',
 'description': 'CYP3A4 Time-Dependent Inhibition data released by Novartis',
 'tags': ['ADME'],
 'userAttributes': {'year': '2024'},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '',
 'ownerId': 'yLurE6WfGoFvJX8EHoCpA',
 'creatorId': 'NKnaHGybLqwSHcaMEHqfF',
 'state': 'upload_pending',
 'source': 'https://opendata.ncats.nih.gov/adme/data',
 'curationReference': 'https://github.com/polaris-hub/polaris-recipes/org-Polaris/org-Novartis/CYP/00_CYP3A4_data_processing.ipynb',
 'nRows': 16740,
 'nColumns': 7,
 'license': 'CC-BY-4.0',
 'md5Sum': 'dcefee6bdae2a0c8224ea14a54e8b7ad',
 'annotations': {'split': {'dtype': 'object',
   'modality': 'UNKNOWN',
   'isPointer': False,
   'description': 'Train and test splitting from the original publication doi.org/10.1021/acs.chemrestox.3c00305',
   'userAttribute