Note: this notebook is set up to run in the environment named 'polaris_datasets', as defined in env.yml.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris) 

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# Locate the repository root so that `utils` is importable and relative paths resolve.
# Walk up from the working directory to the nearest folder holding `utils/docs_utils`;
# unlike a fixed "two levels up" hop, this yields the same root when the cell is re-run
# after the `os.chdir` below has already moved the working directory.
_cwd = pathlib.Path.cwd()
for root in (_cwd, *_cwd.parents):
    if (root / "utils" / "docs_utils.py").is_file() or (root / "utils" / "docs_utils").is_dir():
        break
else:
    # Marker not found: fall back to the original behaviour (two levels above the cwd).
    root = _cwd.parents[1]

os.chdir(root)
# Keep the root at the front of sys.path without stacking duplicates on re-runs.
if sys.path[:1] != [str(root)]:
    sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

In [2]:
# Organization and dataset identifiers shared by every path built in this notebook.
org = "Graphium"
data_name = "pcba_1328_1564k"

# Local recipe folder, and the matching location under the gs://polaris-public bucket.
dirname = dm.fs.join(root, f"org-{org}", data_name)
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"

# Hub owner: the lower-cased organization name is used as the slug.
owner = HubOwner(type="organization", slug=org.lower())
owner

HubOwner(slug='graphium', external_id=None, type='organization')

In [3]:
# Output locations below the GCS root: one sub-folder each for benchmarks, datasets and figures.
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{subdir}" for subdir in ("benchmarks", "datasets", "figures")
)

## Load existing data

In [4]:
# Raw PCBA table stored in the public bucket. To read a local copy instead, set the
# PCBA_1328_PATH environment variable rather than hardcoding a user-specific absolute
# path, which breaks the notebook on every other machine.
PATH = os.environ.get("PCBA_1328_PATH", f"{gcp_root}/data/raw/PCBA_1328_1564k.parquet")
table = pd.read_parquet(PATH)
table.columns

Index(['Unnamed: 0', 'SMILES', 'assayID-1', 'assayID-101', 'assayID-103',
       'assayID-105', 'assayID-107', 'assayID-109', 'assayID-11',
       'assayID-113',
       ...
       'assayID-1645856', 'assayID-1645857', 'assayID-1645858',
       'assayID-1645859', 'assayID-1645860', 'assayID-1671188',
       'assayID-1671193', 'assayID-1671194', 'CID', 'SID'],
      dtype='object', length=1332)

### Specify the metadata of the data columns

In [6]:
# Column-level metadata attached to the dataset.
annotations = {
    # The SMILES column is the only molecular input of the table.
    "SMILES": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    "CID": ColumnAnnotation(
        description="PubChem Compound ID",
    ),
    "SID": ColumnAnnotation(
        description="PubChem Substance ID",
    ),
    # One annotation per assay column, described by its own column name.
    # These columns are assay readouts rather than molecules, so no molecule
    # modality is declared and the annotation's default modality applies.
    **{
        col: ColumnAnnotation(description=col)
        for col in table.columns
        if col.startswith("assayID")
    },
}

### Define `Dataset` object

In [7]:
# Versioned dataset name of the form "<data_name>-<version>".
version = "v1"
dataset_name = "-".join((data_name, version))

In [9]:
dataset = Dataset(
    # The table is the core data-structure required to construct a dataset.
    # Only annotated columns are kept (this drops e.g. the 'Unnamed: 0' column);
    # an explicit list is passed because a dict view is not a supported column indexer type.
    table=table[list(annotations)],
    # Additional meta-data on the dataset level.
    name=dataset_name,
    description="A subset of PubChem BioAssay, containing 1328 bioassays measured over 1564k compounds used by previous work to benchmark machine learning methods.",
    source="https://pubmed.ncbi.nlm.nih.gov/26400175/",
    annotations=annotations,
    tags=["LargeMix", "BioAssay"],
    owner=owner,
    license="CC-BY-4.0",
    user_attributes={"year": "2024"},
    # Build the readme path from `org`, consistent with `dirname` and `gcp_root`,
    # instead of repeating the organization name as a literal.
    readme=load_readme(f"org-{org}/{data_name}/readme.md"),
)

In [10]:
# Save the dataset to GCP, in a folder named after the versioned dataset.
SAVE_DIR = "/".join((DATASET_DIR, dataset_name))
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

[32m2024-07-17 01:12:09.117[0m | [1mINFO    [0m | [36mpolaris._mixins[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m


In [None]:
from polaris.hub.client import PolarisHubClient

# Upload the dataset to the Polaris Hub as a private dataset owned by `owner`.
# Using the client as a context manager closes it once the upload is done,
# instead of leaving it open for the lifetime of the kernel.
with PolarisHubClient() as client:
    client.login()
    upload_response = client.upload_dataset(dataset=dataset, access="private", owner=owner)

# Show the Hub's response as the cell output.
upload_response

[32m2024-07-17 00:51:02.092[0m | [1mINFO    [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m285[0m - [1mYou are already logged in to the Polaris Hub as  (lu@valencediscovery.com). Set `overwrite=True` to force re-authentication.[0m
[32m2024-07-17 00:52:06.230[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m631[0m - [32m[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/graphium/pcba-v1[0m


{'id': 'bMHENhJEMteTAiEBnEASq',
 'createdAt': '2024-07-17T04:51:03.292Z',
 'deletedAt': None,
 'name': 'pcba-v1',
 'slug': 'pcba-v1',
 'description': 'A subset of PubChem BioAssay, containing 1328 bioassays measured over 1564k compounds used by previous work to benchmark machine learning methods.',
 'tags': ['LargeMix', 'BioAssay'],
 'userAttributes': {'year': '2024'},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '## Background\nThe LINCS L1000 is a database of high-throughput transcriptomics that screened more than 30,000 perturbations on a set of 978 landmark genes [4] from multiple cell lines. VCAP and MCF7 are, respectively, prostate cancer and human breast cancer cell lines. In L1000, most of the perturbagens are chemical, meaning that small drug-like molecules are added to the cell lines to observe how the gene expressions change. This allows to generate biological signatures of the molecules, which are known to correlate with drug activity an