# Benchmarks for the PCBA 1328 1564K dataset

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import datamol as dm

# polaris benchmark
from polaris.benchmark import MultiTaskBenchmarkSpecification

# polaris hub
from polaris.utils.types import HubOwner

# utils
# Resolve the repository root so the project-local `utils` package can be imported.
# NOTE: "__file__" is a string literal here (not the `__file__` variable), so the path
# is resolved against the current working directory; `parents[2]` is therefore the
# directory two levels above the current working directory.
root = pathlib.Path("__file__").absolute().parents[2]
# NOTE(review): this changes the working directory, so re-running the cell in the same
# kernel resolves `root` two levels higher again -- run it once per kernel.
os.chdir(root)
sys.path.insert(0, str(root))
# Presumably resolved through the sys.path entry inserted above -- keep this import last.
from utils.docs_utils import load_readme

In [2]:
# Organization that owns the benchmark on the Polaris Hub
org = "Graphium"
owner = HubOwner(slug=org.lower(), type="organization")

# Dataset identifiers, plus the GCS and local locations of this recipe
data_name = "pcba_1328_1564k"
dataset_name = f"{data_name}-v1"
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"
dirname = dm.fs.join(root, f"org-{org}", data_name)

# Show the owner object as the cell output
owner

HubOwner(slug='graphium', external_id=None, type='organization')

In [3]:
# Remote (GCS) locations, all derived from `gcp_root`.
BENCHMARK_DIR = f"{gcp_root}/benchmarks"  # parent folder for benchmark specifications
DATASET_JSON = f"{gcp_root}/datasets/{dataset_name}/dataset.json"  # dataset definition to load

FIGURE_DIR = f"{gcp_root}/figures"  # not referenced in the visible part of this notebook

### Load existing data

In [4]:
# Load the saved Dataset
# The dataset definition is read from the `DATASET_JSON` path under `gcp_root`.
from polaris.dataset import Dataset

dataset = Dataset.from_json(DATASET_JSON)

In [8]:
# Save a local copy of the dataset under the recipe directory. The folder name comes
# from `dataset_name` rather than repeating the "pcba_1328_1564k-v1" literal, so it
# stays in sync with the identifiers defined at the top of the notebook.
dataset.to_json(f"{dirname}/{dataset_name}")

'/Users/lu.zhu/Documents/Codebase/ValenceLab/polaris-recipes/org-Graphium/pcba_1328_1564k/pcba_1328_1564k-v1/dataset.json'

In [9]:
# Keep only the first 10 rows of the in-memory table.
# NOTE(review): every later cell that uses `dataset` sees this truncated table, while
# the split indices are loaded from a separate file -- confirm this shortcut is
# intentional and not leftover debugging.
dataset.table = dataset.table.head(10)

<a id="benchmark"></a>
## Benchmark creation with `Polaris`
Creating a benchmark involves setting up a standard dataset, designing the train/test split, and defining the evaluation metrics, which are used to establish a baseline performance level.

In [12]:
# Column holding the molecular input passed to the benchmark
mol_col = "SMILES"

# Every column whose name starts with "assayID" is used as a benchmark target
data_cols = list(filter(lambda name: name.startswith("assayID"), dataset.columns))

### Get the train/test splits

In [13]:
import torch

# Location of the pre-computed random split, stored under the recipe's raw data folder
split_path = f"{gcp_root}/data/raw/pcba_1328_random_splits.pt"
with dm.fs.fsspec.open(split_path) as split_file:
    # NOTE(review): torch.load unpickles the file -- only load splits from a trusted location.
    split_dict = torch.load(split_file)

# Ordered as [train, val, test]
splits = [split_dict[part] for part in ("train", "val", "test")]

## Define the multitask benchmark with the split defined above

In [43]:
# Polaris only supports a train/test split, so the validation set is merged into the
# training set here.
# NOTE(review): `+` concatenates Python lists but adds element-wise on tensors/arrays --
# confirm the loaded split entries are plain lists.
train_part, val_part, test_part = splits
benchmark_splits = (train_part + val_part, test_part)

In [40]:
# Benchmark identity and the readme that documents it.
benchmark_version = "v1"
benchmark_name = f"{data_name}-{benchmark_version}"
# Built from `org` instead of repeating the organization name as a literal.
readme_name = f"org-{org}/{data_name}/benchmark_readme.md"
# The benchmark is written to the local recipe directory. The previous assignment to
# f"{BENCHMARK_DIR}/{benchmark_name}" was immediately overwritten and never took
# effect; use that value here instead if the benchmark should be written to GCS.
BENCHMARK_SAVE_DIR = f"{dirname}/benchmark_{benchmark_name}"

benchmark = MultiTaskBenchmarkSpecification(
    name=benchmark_name,
    dataset=dataset,
    target_cols=data_cols,
    # NOTE(review): only the first target column gets an explicit type here, while
    # `target_cols` lists all of them -- confirm this is intentional.
    target_types={col: "classification" for col in data_cols[:1]},
    input_cols=mol_col,
    split=benchmark_splits,
    metrics=["f1"],
    tags=["multitask"],
    description="A multitask classification benchmark for binding predictions.",
    owner=owner,
    readme=load_readme(readme_name),
)
# Serialize the benchmark and print the path returned by `to_json`.
path = benchmark.to_json(BENCHMARK_SAVE_DIR)
print(path)

[32m2024-07-19 01:20:28.185[0m | [1mINFO    [0m | [36mpolaris._mixins[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m


/Users/lu.zhu/Documents/Codebase/ValenceLab/polaris-recipes/org-Graphium/pcba_1328_1564k/benchmark_pcba_1328_1564k-v1/benchmark.json


In [48]:
# Reload the benchmark from the JSON file inside `BENCHMARK_SAVE_DIR`. The path is
# derived from the notebook's own variables rather than a user-specific absolute path,
# so the cell works on any machine.
benchmark = MultiTaskBenchmarkSpecification.from_json(
    f"{BENCHMARK_SAVE_DIR}/benchmark.json"
)

In [51]:
# Upload to hub

from polaris.hub.client import PolarisHubClient

client = PolarisHubClient()
# Authenticate before uploading.
client.login()

# Upload the benchmark with public access under `owner`; the call's return value is
# displayed as the cell output.
client.upload_benchmark(benchmark, access="public", owner=owner)

[32m2024-07-19 01:27:02.594[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m224[0m - [32m[1mYou are successfully logged in to the Polaris Hub.[0m
[32m2024-07-19 01:27:04.269[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_benchmark[0m:[36m613[0m - [32m[1mYour benchmark has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/graphium/pcba_1328_1564k-v1[0m


{'id': 'iHVUErMSyYdRfB1qKdK4M',
 'createdAt': '2024-07-19T05:27:03.910Z',
 'deletedAt': None,
 'name': 'pcba_1328_1564k-v1',
 'slug': 'pcba_1328_1564k-v1',
 'description': 'A multitask classification benchmark for binding predictions.',
 'tags': ['multitask'],
 'userAttributes': {},
 'access': 'public',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '## Background\n\n\n## Assay information\n\n\n## Description of readout:\n\n\n## Data resource\n\n',
 'state': 'ready',
 'ownerId': 'zMTB7lQiiukqEmLQF7EjT',
 'creatorId': 'NKnaHGybLqwSHcaMEHqfF',
 'datasetId': 'UzJWfPDb6WLUz4NxDzMe5',
 'targetCols': ['assayID-1',
  'assayID-101',
  'assayID-103',
  'assayID-105',
  'assayID-107',
  'assayID-109',
  'assayID-11',
  'assayID-113',
  'assayID-115',
  'assayID-119',
  'assayID-121',
  'assayID-123',
  'assayID-125',
  'assayID-129',
  'assayID-13',
  'assayID-131',
  'assayID-133',
  'assayID-135',
  'assayID-137',
  'assayID-139',
  'assayID-141',
  'assayID-143',
  'assayID-145',