Note: this notebook is meant to be run in the `polaris_datasets` environment, i.e. the environment defined by the `env.yml` file with that name.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris) 

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# Locate the repository root so that the project-local `utils` package is importable.
# `__file__` is undefined inside a notebook, so the working directory is the anchor.
# Searching upwards for `utils/docs_utils.py` (instead of always taking the grandparent
# of the working directory) keeps this cell idempotent: after the `os.chdir` below,
# re-running it resolves to the same root rather than climbing two more levels.
_cwd = pathlib.Path.cwd()
root = next(
    (p for p in (_cwd, *_cwd.parents) if (p / "utils" / "docs_utils.py").is_file()),
    None,
)
if root is None:
    # Marker not found: fall back to the original behaviour, i.e. the grandparent
    # of the working directory.
    root = pathlib.Path("__file__").absolute().parents[2]

# Make relative paths and project imports resolve from the repository root.
os.chdir(root)
sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

In [2]:
# Organization that owns the dataset, and the dataset's short name
org, data_name = "Graphium", "qm9"

# Storage locations for this recipe: the folder under `root` and its GCP counterpart
dirname = dm.fs.join(root, f"org-{org}", data_name)
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"

# Hub owner handle; the slug is the lower-cased organization name
owner = HubOwner(slug=org.lower(), type="organization")
owner

HubOwner(slug='graphium', external_id=None, type='organization')

In [3]:
# One sub-folder of the GCP recipe root per kind of artefact
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{subdir}" for subdir in ("benchmarks", "datasets", "figures")
)

## Load existing data

In [4]:
# Location of the raw QM9 CSV file under the GCP recipe root
PATH = f"{gcp_root}/data/raw/qm9.csv"

# Read it into a DataFrame and display the available columns
table = pd.read_csv(filepath_or_buffer=PATH)
table.columns

Index(['mol_id', 'smiles', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap',
       'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv', 'u0_atom',
       'u298_atom', 'h298_atom', 'g298_atom'],
      dtype='object')

### Specify the metadata for each data column

In [5]:
# Column-level metadata: one `ColumnAnnotation` per column of the raw table.
# Each entry carries a human-readable description; measured properties also record
# their unit under `userAttributes`. The key order defines the column order of the
# table passed to the `Dataset` further down.
annotations = {
    # --- Identifiers ---
    "mol_id": ColumnAnnotation(description="Molecule identifier"),
    "smiles": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    # --- Rotational constants ---
    "A": ColumnAnnotation(
        description="Rotational constant A", userAttributes={"unit": "GHz"}
    ),
    "B": ColumnAnnotation(
        description="Rotational constant B", userAttributes={"unit": "GHz"}
    ),
    "C": ColumnAnnotation(
        description="Rotational constant C", userAttributes={"unit": "GHz"}
    ),
    # --- Electronic properties ---
    "mu": ColumnAnnotation(
        description="Dipole moment", userAttributes={"unit": "Debye"}
    ),
    "alpha": ColumnAnnotation(
        description="Isotropic polarizability", userAttributes={"unit": "Bohr^3"}
    ),
    "homo": ColumnAnnotation(
        description="Energy of Highest occupied molecular orbital (HOMO)",
        userAttributes={"unit": "Hartree"},
    ),
    # LUMO is the lowest *unoccupied* molecular orbital (the description previously
    # said "occupied", which is incorrect).
    "lumo": ColumnAnnotation(
        description="Energy of Lowest unoccupied molecular orbital (LUMO)",
        userAttributes={"unit": "Hartree"},
    ),
    "gap": ColumnAnnotation(
        description="Gap, difference between LUMO and HOMO",
        userAttributes={"unit": "Hartree"},
    ),
    "r2": ColumnAnnotation(
        description="Electronic spatial extent", userAttributes={"unit": "Bohr^2"}
    ),
    # --- Energies and thermodynamic properties ---
    "zpve": ColumnAnnotation(
        description="Zero point vibrational energy", userAttributes={"unit": "Hartree"}
    ),
    "u0": ColumnAnnotation(
        description="Internal energy at 0 K", userAttributes={"unit": "Hartree"}
    ),
    "u298": ColumnAnnotation(
        description="Internal energy at 298.15 K", userAttributes={"unit": "Hartree"}
    ),
    "h298": ColumnAnnotation(
        description="Enthalpy at 298.15 K", userAttributes={"unit": "Hartree"}
    ),
    "g298": ColumnAnnotation(
        description="Free energy at 298.15 K", userAttributes={"unit": "Hartree"}
    ),
    "cv": ColumnAnnotation(
        description="Heat capacity at 298.15 K", userAttributes={"unit": "cal/(mol K)"}
    ),
    # --- Atomization quantities ---
    "u0_atom": ColumnAnnotation(
        description="Atomization energy at 0 K atom",
        userAttributes={"unit": "kcal/mol"},
    ),
    "u298_atom": ColumnAnnotation(
        description="Atomization energy at 298.15 K atom",
        userAttributes={"unit": "kcal/mol"},
    ),
    "h298_atom": ColumnAnnotation(
        description="Atomization enthalpy at 298.15 K atom",
        userAttributes={"unit": "kcal/mol"},
    ),
    "g298_atom": ColumnAnnotation(
        description="Atomization free energy at 298.15 K atom",
        userAttributes={"unit": "kcal/mol"},
    ),
}

### Define `Dataset` object

In [6]:
# Versioned dataset name of the form "<data name>-<version>"
version = "v1"
dataset_name = "-".join([data_name, version])

In [7]:
# Assemble the Polaris `Dataset` from the raw table and its metadata.
dataset = Dataset(
    # The table is the core data-structure required to construct a dataset.
    # Keep only the annotated columns, in annotation order; an explicit list of
    # labels is the documented way to select several DataFrame columns.
    table=table[list(annotations)],
    # Additional meta-data on the dataset level.
    name=dataset_name,
    description="QM9 contains quantum chemical properties for a relevant, consistent, and comprehensive chemical space of small organic molecules",
    source="https://www.nature.com/articles/sdata201422",
    annotations=annotations,
    tags=["Graph", "Quantum chemistry"],
    owner=owner,
    license="CC-BY-4.0",
    user_attributes={"year": "2014"},
    # Build the readme path from `org`, like the other recipe paths in this notebook,
    # instead of hard-coding the organization name.
    readme=load_readme(f"org-{org}/{data_name}/{data_name}_readme.md"),
)

In [8]:
# Serialize the dataset into its versioned folder on GCP and display the value
# returned by `to_json`
SAVE_DIR = "/".join((DATASET_DIR, dataset_name))
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

'gs://polaris-public/polaris-recipes/org-Graphium/qm9/datasets/qm9-v1/dataset.json'

In [11]:
from polaris.hub.client import PolarisHubClient

# Use the client as a context manager so its HTTP session is closed once the upload
# finishes, even if the login or the upload raises. Note that `client` is closed
# after this block and must be re-created before any further Hub calls.
with PolarisHubClient() as client:
    client.login()
    upload_response = client.upload_dataset(
        dataset=dataset, access="private", owner=owner
    )

# Display the Hub's response to the upload
upload_response

[32m2024-07-18 23:40:14.185[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m224[0m - [32m[1mYou are successfully logged in to the Polaris Hub.[0m
[32m2024-07-18 23:40:36.507[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m569[0m - [32m[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/graphium/qm9-v1[0m


{'id': 'qqwqD8SblJfFrHhJHu170',
 'createdAt': '2024-07-19T03:40:15.334Z',
 'deletedAt': None,
 'name': 'qm9-v1',
 'slug': 'qm9-v1',
 'description': 'QM9 contains quantum chemical properties for a relevant, consistent, and comprehensive chemical space of small organic molecules',
 'tags': ['Graph', 'Quantum chemistry'],
 'userAttributes': {'year': '2014'},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '## Background\nQM9 is a well-known dataset in the field of 3D GNNs. It consists of 19 graph-level quantum properties associated to an energy-minimized 3D conformation of the molecules. It is considered a simple dataset since all the molecules have at most 9 heavy atoms. We chose QM9 in our ToyMix since it is very similar to the larger proposed quantum datasets, PCQM4M_multitask and PM6_83M, but with smaller molecules.\n\n## Assay information\nComputed geometric, energetic, electronic, and thermodynamic properties for 134k stable small organic molecules 