Note: this notebook is set up to run in the environment named `polaris_datasets`, which is defined in `env.yml`.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris) 

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# Locate the repository root, then make it both the working directory and an
# import root so that `utils.docs_utils` and repo-relative paths resolve.
#
# The previous form, pathlib.Path("__file__").absolute().parents[2], is simply
# "two levels above the current working directory": the quoted "__file__" is a
# plain string, not the module attribute. Because this cell then calls
# os.chdir(root), re-running it moved `root` two more levels up each time.
# Searching upwards for the `utils/docs_utils.py` module imported below is
# stable across re-runs. On a fresh kernel it yields the same directory as
# before, assuming the notebook sits two levels below the repository root as
# the original expression implied.
_start_dir = pathlib.Path.cwd()
for root in (_start_dir, *_start_dir.parents):
    if (root / "utils" / "docs_utils.py").is_file():
        break
else:
    # Marker not found: fall back to the original "two levels up" rule.
    root = _start_dir.parents[1]

os.chdir(root)
# Keep the repo root first on sys.path without stacking duplicates on re-run.
if not sys.path or sys.path[0] != str(root):
    sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

In [2]:
# Identify the owning organization and the dataset, then derive the local
# directory and the remote (GCS) root used by the rest of the notebook.
org = "Graphium"
data_name = "tox21"

org_folder = f"org-{org}"
dirname = dm.fs.join(root, org_folder, data_name)
gcp_root = "/".join(["gs://polaris-public/polaris-recipes", org_folder, data_name])

# The Hub owner uses the lowercased organization name as its slug.
owner = HubOwner(type="organization", slug=org.lower())
owner

HubOwner(slug='graphium', external_id=None, type='organization')

In [3]:
# Remote output locations, each a sub-folder of `gcp_root`.
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{subdir}" for subdir in ("benchmarks", "datasets", "figures")
)

## Load existing data

In [4]:
# Read the raw Tox21 CSV from the remote data folder and show its columns.
PATH = "/".join([gcp_root, "data", "raw", "Tox21-7k-12-labels.csv"])
table = pd.read_csv(PATH)
table.columns

Index(['smiles', 'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
       'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP',
       'SR-p53'],
      dtype='object')

### Specify the metadata for the data columns

In [5]:
# Column-level metadata: one ColumnAnnotation per column of the raw table.
# The key order matters: the dataset cell selects table columns with
# `annotations.keys()`, so this order becomes the column order of the dataset.
#
# Naming convention of the assay columns:
#   NR-*  -> nuclear receptor pathway assays
#   SR-*  -> stress response pathway assays
#   *-LBD -> assay on the receptor's ligand-binding domain
annotations = {
    # Input column: the molecule itself.
    "smiles": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    # --- Nuclear receptor (NR) assays ---
    "NR-AR": ColumnAnnotation(
        description="Nuclear receptors pathway assay androgen receptor."
    ),
    "NR-AR-LBD": ColumnAnnotation(
        description="Nuclear receptors pathway assay androgen receptor ligand-binding domain."
    ),
    "NR-AhR": ColumnAnnotation(
        description="Nuclear receptors pathway assay nuclear receptor aryl hydrocarbon receptor."
    ),
    "NR-Aromatase": ColumnAnnotation(
        description="Nuclear receptors pathway assay aromatase"
    ),
    "NR-ER": ColumnAnnotation(
        description="Nuclear receptors pathway assay estrogen receptor."
    ),
    # Fixed: the description now names the ligand-binding domain, matching
    # the column suffix and the wording used for NR-AR-LBD.
    "NR-ER-LBD": ColumnAnnotation(
        description="Nuclear receptors pathway assay estrogen receptor ligand-binding domain."
    ),
    # Fixed: the description now names the gamma subtype given in the column name.
    "NR-PPAR-gamma": ColumnAnnotation(
        description="Nuclear receptors pathway assay peroxisome proliferator-activated receptor gamma."
    ),
    # --- Stress response (SR) assays ---
    "SR-ARE": ColumnAnnotation(
        description="Stress response assay antioxidant response element"
    ),
    "SR-ATAD5": ColumnAnnotation(
        description="Stress response assay ATPase Family AAA Domain Containing 5."
    ),
    "SR-HSE": ColumnAnnotation(
        description="Stress response assay heat shock factor response element."
    ),
    "SR-MMP": ColumnAnnotation(
        description="Stress response assay mitochondrial membrane potential."
    ),
    "SR-p53": ColumnAnnotation(description="Stress response assay p53."),
}

### Define `Dataset` object

In [6]:
# Versioned dataset identifier of the form "<data_name>-<version>".
version = "v1"
dataset_name = "-".join((data_name, version))

In [9]:
# Assemble the Polaris dataset: the data table restricted to the annotated
# columns, plus dataset-level metadata.
annotated_columns = list(annotations)  # column names, in annotation order

dataset = Dataset(
    # Dataset-level metadata.
    name=dataset_name,
    description="The Tox21 compound structures and activity measurements for 12 different qHTS assays were extracted from the Tox21 Data Challenge",
    source="https://europepmc.org/article/MED/23603828",
    tags=["Toxicity"],
    license="CC-BY-SA-4.0",
    owner=owner,
    user_attributes={"year": "2013"},
    readme=load_readme("org-Graphium/tox21/tox21_readme.md"),
    # The table is the core data structure required to construct a dataset;
    # only columns that have an annotation are kept.
    table=table[annotated_columns],
    annotations=annotations,
)

In [10]:
# Write the dataset definition to its versioned folder on GCP and show the
# path returned by `to_json`.
SAVE_DIR = "/".join((DATASET_DIR, dataset_name))
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

'gs://polaris-public/polaris-recipes/org-Graphium/tox21/datasets/tox21-v1/dataset.json'

In [12]:
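# Upload to the Polaris Hub. The code is left commented out (presumably so
# that re-running the notebook does not upload the dataset again); the output
# shown below this cell is from the run that performed the upload.
# from polaris.hub.client import PolarisHubClient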

# client = PolarisHubClient()
# client.login()

# client.upload_dataset(dataset=dataset, access="public", owner=owner)

[32m2024-07-19 00:31:36.703[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m224[0m - [32m[1mYou are successfully logged in to the Polaris Hub.[0m
[32m2024-07-19 00:31:38.660[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m569[0m - [32m[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/graphium/tox21-v1[0m


{'id': 'J2evWy7RceAJ9GRZsP8iK',
 'createdAt': '2024-07-19T04:31:37.747Z',
 'deletedAt': None,
 'name': 'tox21-v1',
 'slug': 'tox21-v1',
 'description': 'The Tox21 compound structures and activity measurements for 12 different qHTS assays were extracted from the Tox21 Data Challenge',
 'tags': ['Toxicity'],
 'userAttributes': {'year': '2013'},
 'access': 'public',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '## Background\nTox21 is a well-known dataset for researchers in machine learning for drug discovery. The data set provided by the Tox21 Data Challenge included approximately 12 000 compounds. It consists of a multi-label classification task with 12 labels, with most labels missing and a strong imbalance towards the negative class. Each subchallenge required the prediction of a different type of toxicity. The sub-challenges were split between two panels: Seven of the twelve sub-challenges dealt with Nuclear Receptor (NR) signaling pathways, the remaining five with the