# Dataset and Benchmark creation with `Polaris`
The first step in creating a benchmark is to set up a standardized `Dataset` object that provides access to the curated data.

In [3]:
%load_ext autoreload
%autoreload 2
import os
import pathlib
import warnings
import pandas as pd
import datamol as dm
import numpy as np

from sklearn.model_selection import ShuffleSplit
import polaris

# polaris curation module
from polaris.curation._chemistry_curator import SMILES_COL, UNIQUE_ID

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation
from polaris.dataset._column import Modality

# polaris benchmark
from polaris.benchmark import (
    SingleTaskBenchmarkSpecification,
    MultiTaskBenchmarkSpecification,
)
from polaris.utils.types import HubOwner, License

# Move the working directory one level up, to the parent of the current directory.
# "__file__" is a string literal here, not the `__file__` variable: the relative path is
# made absolute against the current working directory, so `parents[1]` is that
# directory's parent.
# NOTE(review): not idempotent; every re-run of this cell moves up one more level.
os.chdir(pathlib.Path("__file__").absolute().parents[1])


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Owner passed as `owner=` to both the dataset and the benchmark below.
# NOTE(review): the recorded repr of this cell shows no `organizationId` field;
# confirm that this keyword is still part of the `HubOwner` model.
owner = HubOwner(
    slug="polaristest",
    organizationId="PolarisTest",
)
owner

HubOwner(slug='polaristest', external_id=None, type=None)

In [5]:
# Remote root folders (gs:// URIs) under which the artifacts are saved.
# Each artifact gets its own sub-folder named after the dataset / benchmark.
DATASET_DIR = "gs://polaris-public/datasets/hello_world"
BENCHMARK_DIR = "gs://polaris-public/benchmarks/hello_world"

In [38]:
# Load data
from datamol.data import solubility

# Table returned by datamol's `solubility` loader; the cells below rely on its
# `mol`, `SOL` and `split` columns.
table = solubility()

# Derive a SMILES string for every molecule in the `mol` column.
table["smiles"] = table["mol"].apply(dm.to_smiles)

# Preview the first rows. This must be the last expression of the cell: Jupyter only
# renders the value of the final expression, so a `head()` call placed mid-cell is
# silently discarded.
table.head(5)

In [33]:
# Compute one fingerprint per molecule with datamol's `to_fp`.
# NOTE(review): "fp" is not one of the annotated columns, and the `Dataset` built
# further down only selects annotated columns, so this column never reaches the
# dataset. Confirm whether this cell is still needed.
table["fp"] = table["mol"].apply(dm.to_fp)

In [37]:
# Inspect `dm.to_fp`: a bare expression, so the notebook displays the function's repr
# (including its signature) as the cell output.
dm.to_fp

<function datamol.fp.to_fp(mol: Union[str, rdkit.Chem.rdchem.Mol], as_array: bool = True, fp_type: str = 'ecfp', fold_size: Optional[int] = None, **fp_args: Any) -> Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.SparseBitVect, rdkit.DataStructs.cDataStructs.ExplicitBitVect, NoneType]>

In [39]:
# Identifier column: the result of datamol's `hash_mol` for each molecule.
table["UNIQUE_ID"] = table["mol"].apply(dm.hash_mol)

### Specify the metadata (annotations) of the data columns

In [40]:
# Column name -> annotation. The keys also determine which columns of `table`
# are kept, and in which order, when the `Dataset` is built below.
annotations = dict(
    # Identifier computed with `dm.hash_mol`
    UNIQUE_ID=ColumnAnnotation(
        description="Molecular hash ID. See <datamol.mol.hash_mol>",
    ),
    # The only column flagged with the molecule modality
    smiles=ColumnAnnotation(
        description="Molecule SMILES string after cleaning and standardization.",
        modality=Modality.MOLECULE,
    ),
    # Used as the benchmark target column further down
    SOL=ColumnAnnotation(description="Experimental solubility"),
)

### Define the `Dataset` object

In [41]:
# Name of the dataset; also used as the sub-folder name when the dataset is saved.
dataset_name = "hello-world"

In [42]:
# Build the dataset from the annotated columns only. Iterating over `annotations`
# yields its keys in declaration order; the `.copy()` keeps the dataset's table
# independent from later changes to `table`.
dataset = Dataset(
    # Identity and metadata
    name=dataset_name,
    description="Hello-world dataset for testing purpose",
    owner=owner,
    tags=["test"],
    licence=License(id="CC-BY-4.0"),
    # Data and per-column annotations
    table=table[list(annotations)].copy(),
    annotations=annotations,
)

In [43]:
# save the dataset
SAVE_DIR = f"{DATASET_DIR}/{dataset_name}"
dataset.to_json(SAVE_DIR)

'gs://polaris-public/datasets/hello_world/hello-world/dataset.json'

## Benchmark creation with `Polaris`
Creating a benchmark involves setting up a standard dataset, designing the train-validation-test split, and defining the evaluation metrics, which are used to establish a baseline performance level.

In [44]:
benchmark_name = "hello-world-benchmark"

# Train/test split read from the `split` column of `table`:
# a (train, test) tuple holding the index labels of the matching rows.
split = tuple(
    table.loc[table["split"] == subset].index.values
    for subset in ("train", "test")
)

benchmark = SingleTaskBenchmarkSpecification(
    # Identity and metadata
    name=benchmark_name,
    description="Hello-world dataset for testing purpose",
    owner=owner,
    tags=["test"],
    # Data: SMILES as input, solubility as the single target
    dataset=dataset,
    input_cols="smiles",
    target_cols=["SOL"],
    split=split,
    # Evaluation: the main metric is also part of the reported metrics
    main_metric="mean_squared_error",
    metrics=[
        "mean_absolute_error",
        "mean_squared_error",
        "r2",
        "spearmanr",
        "pearsonr",
        "explained_var",
    ],
)

# Save under <BENCHMARK_DIR>/<benchmark_name> and print the value returned by `to_json`.
SAVE_DIR = "/".join((BENCHMARK_DIR, benchmark_name))
path = benchmark.to_json(SAVE_DIR)
print(path)

gs://polaris-public/benchmarks/hello_world/hello-world-benchmark/benchmark.json
