Note: this notebook is meant to be run in the environment named 'polaris_datasets', which is defined in env.yml.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris) 

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# Locate the repository root, make it the working directory and put it at the
# front of `sys.path` so that the repo-level `utils` package can be imported.
#
# `"__file__"` is a string literal here, not the `__file__` variable: the
# relative path is resolved against the current working directory, so
# `parents[2]` is the directory two levels above the working directory.
# Since this cell then changes the working directory, recomputing `root` on a
# re-run would climb two more levels; reuse the value from the first run.
if "root" not in globals():
    root = pathlib.Path("__file__").absolute().parents[2]
os.chdir(root)
# Keep `root` first on `sys.path` without stacking duplicates on re-execution.
if not sys.path or sys.path[0] != str(root):
    sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

In [2]:
# Identify the organization that owns this recipe and the dataset being built.
org = "Graphium"
data_name = "pm6_subset"
# Folder for this dataset under `root`: <root>/org-<org>/<data_name>.
dirname = dm.fs.join(root, f"org-{org}", data_name)
# Prefix on the public GCP bucket used for this recipe's files.
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"

# The owner slug is the lower-cased organization name.
owner = HubOwner(slug=org.lower(), type="organization")
owner

HubOwner(slug='graphium', external_id=None, type='organization')

In [3]:
# Sub-folders of `gcp_root` for benchmarks, datasets and figures.
BENCHMARK_DIR = f"{gcp_root}/benchmarks"
DATASET_DIR = f"{gcp_root}/datasets"
FIGURE_DIR = f"{gcp_root}/figures"

## Load existing data

In [4]:
# NOTE(review): this bucket path is overwritten by the next assignment and is
# therefore never read. Its file name (PCQM4M_..., ".parquet.csv") also does not
# match the PM6 file loaded below — confirm the intended remote location.
PATH = f"{gcp_root}/data/raw/PCQM4M_G25_N4.parquet.csv"
# NOTE(review): absolute path on one developer's machine; the notebook cannot be
# re-run elsewhere until this points at a shared location.
PATH = "/Users/lu.zhu/Downloads/pm6_86M_0.parquet"
table = pd.read_parquet(PATH)
# Display the column names of the loaded table.
table.columns

Index(['graph_ordered_smiles', 'graph_cation_plane_best_fit',
       'graph_S0_principal_length_b', 'graph_S0_principal_length_a',
       'graph_cation_energy_total', 'graph_anion_plane_best_fit',
       'graph_T0_plane_best_fit', 'graph_anion_beta_homo',
       'graph_cation_beta_homo', 'graph_T0_alpha_homo', 'graph_S0_beta_gap',
       'graph_T0_beta_homo', 'graph_cation_principal_length_b',
       'graph_T0_spherocity', 'graph_S0_plane_best_fit',
       'graph_T0_principal_length_b', 'graph_anion_principal_length_c',
       'graph_anion_alpha_gap', 'graph_T0_principal_length_c',
       'graph_cation_principal_length_a', 'graph_S0_alpha_homo',
       'graph_cation_beta_gap', 'graph_S0_energy_total', 'graph_S0_spherocity',
       'graph_anion_principal_length_a', 'graph_T0_principal_length_a',
       'graph_anion_alpha_homo', 'graph_cation_alpha_homo',
       'graph_T0_energy_total', 'graph_S0_principal_length_c',
       'graph_anion_spherocity', 'graph_anion_principal_length_b',
    

### Specify the metadata of the data columns

In [5]:
# Additional meta-data on the column level.
# The SMILES column name also starts with "graph", so it is excluded from the
# generic comprehension below; otherwise the later duplicate key would replace
# its molecule-specific annotation (in a dict display the last value given for
# a key wins) and the `modality="molecule"` setting would be lost.
smiles_column = "graph_ordered_smiles"
annotations = {
    smiles_column: ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
    **{
        col: ColumnAnnotation(description=f"Graph feature {col}")
        for col in table.columns
        if col.startswith("graph") and col != smiles_column
    },
}

### Define `Dataset` object

In [6]:
# The dataset name is the data name suffixed with a version tag.
version = "v1"
dataset_name = f"{data_name}-{version}"

In [7]:
# Assemble the Polaris `Dataset` from the annotated columns of `table` plus the
# dataset-level metadata.
dataset = Dataset(
    # Identity and ownership.
    name=dataset_name,
    owner=owner,
    license="CC-BY-4.0",
    # Descriptive metadata.
    description="Subset of quantum chemistry dataset which uses PM6 semi-empirical computation of the quantum properties.",
    source="https://arxiv.org/abs/1904.06046",
    tags=["Graph", "Quantum chemistry", "LargeMix"],
    user_attributes={"year": "2019"},
    # NOTE(review): the readme is loaded from the `pm6_83m` folder while this
    # dataset is `pm6_subset` — confirm the shared readme is intentional.
    readme=load_readme("org-Graphium/pm6_83m/readme.md"),
    # Data: only the columns that carry an annotation, in annotation order.
    table=table[list(annotations)],
    annotations=annotations,
)

In [8]:
# Save the dataset to GCP, in a folder named after the dataset under DATASET_DIR,
# and display the value returned by `to_json`.
SAVE_DIR = f"{DATASET_DIR}/{dataset_name}"
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

In [9]:
# NOTE(review): `Dataset` is already imported in the first cell; this repeated
# import is redundant when the notebook is run top to bottom.
from polaris.dataset import Dataset

# Reload the dataset from its JSON file on GCP.
# NOTE(review): this literal appears to spell out f"{SAVE_DIR}/dataset.json" from
# the save step — confirm, and consider reusing `dataset_path` so the two
# locations cannot drift apart.
dataset = Dataset.from_json(
    "gs://polaris-public/polaris-recipes/org-Graphium/pm6_subset/datasets/pm6_subset-v1/dataset.json"
)

from polaris.hub.client import PolarisHubClient

# Log in to the Polaris Hub before uploading.
client = PolarisHubClient()
client.login()

# Upload the dataset with public access under `owner` (defined in an earlier cell).
# NOTE(review): the unit of `timeout` is not visible here — presumably seconds; confirm.
client.upload_dataset(dataset=dataset, access="public", owner=owner, timeout=5000)

2024-07-19 12:06:28.430 | SUCCESS  | polaris.hub.client:login:224 - You are successfully logged in to the Polaris Hub.
2024-07-19 12:06:45.084 | INFO     | polaris.hub.client:upload_dataset:512 - Uploaded metadata
2024-07-19 12:06:45.087 | INFO     | polaris.hub.client:upload_dataset:516 - Uploading Parquet file: https://polarishub.io/storage/dataset/graphium/pm6-subset-0-v1/table.parquet
