Note: this notebook is meant to be run in the environment named 'polaris_datasets', which is defined in env.yml.

# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris) 

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# Resolve the repository root, i.e. the folder that holds the shared `utils` package.
# The search walks upwards from the working directory until it finds `utils/docs_utils`.
# This keeps the cell idempotent: `os.chdir(root)` below changes the working directory,
# so a fixed "two levels up" would drift further up the tree on every re-run of the cell.
_cwd = pathlib.Path.cwd()
root = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "utils" / "docs_utils.py").is_file()
        or (candidate / "utils" / "docs_utils").is_dir()
    ),
    None,
)
if root is None:
    # Fallback to the original lookup. "__file__" is a plain string here, so this
    # resolves to two levels above the current working directory.
    root = pathlib.Path("__file__").absolute().parents[2]
os.chdir(root)
# Keep the root first on `sys.path` without stacking duplicates on re-runs.
if sys.path[:1] != [str(root)]:
    sys.path.insert(0, str(root))
from utils.docs_utils import load_readme

In [3]:
# Identify the hub organization and the dataset this recipe belongs to
org = "Graphium"
data_name = "pm6_subset"

# The hub owner uses the lower-cased organization name as its slug
owner = HubOwner(type="organization", slug=org.lower())

# Recipe folder under the repository root and its counterpart in the bucket
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"
dirname = dm.fs.join(root, f"org-{org}", data_name)

owner

HubOwner(slug='graphium', external_id=None, type='organization')

In [4]:
# Sub-folders of the recipe's bucket location, one per artifact type
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/benchmarks",
    f"{gcp_root}/datasets",
    f"{gcp_root}/figures",
)

In [7]:
!gsutil cp -r /Users/lu.zhu/Downloads/pcqm4m_g25_n4 {gcp_root}/data/raw

Copying file:///Users/lu.zhu/Downloads/pcqm4m_g25_n4/pcqm4m_g25_n4_random_splits.pt [Content-Type=application/octet-stream]...
Copying file:///Users/lu.zhu/Downloads/pcqm4m_g25_n4/PCQM4M_G25_N4.parquet [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1 files][418.3 MiB/  2.7 GiB]  756.5 KiB/s                        

## Load existing data

In [None]:
# Location of the raw table. The local download is preferred when it exists, which keeps
# the behaviour on the original machine; otherwise the copy in the bucket is read.
# The file is a parquet file, so the remote name must not carry a ".csv" suffix.
# NOTE(review): the remote path assumes the upload placed the files directly under
# `data/raw` (not inside a nested `pcqm4m_g25_n4/` folder) - TODO confirm in the bucket.
REMOTE_PATH = f"{gcp_root}/data/raw/PCQM4M_G25_N4.parquet"
LOCAL_PATH = "/Users/lu.zhu/Downloads/pcqm4m_g25_n4/PCQM4M_G25_N4.parquet"
PATH = LOCAL_PATH if os.path.exists(LOCAL_PATH) else REMOTE_PATH
table = pd.read_parquet(PATH)
table.columns

Index(['ordered_cxsmiles_3d', 'ordered_smiles', 'graph_alpha_homo',
       'graph_alpha_gap', 'graph_beta_homo', 'graph_beta_gap',
       'graph_energy_total', 'graph_principal_length_a',
       'graph_principal_length_b', 'graph_principal_length_c',
       'graph_inertia_mass_a', 'graph_inertia_mass_b', 'graph_inertia_mass_c',
       'graph_inertia_valence_a', 'graph_inertia_valence_b',
       'graph_inertia_valence_c', 'graph_inertia_charges_mulliken_a',
       'graph_inertia_charges_mullkien_b', 'graph_inertia_charges_mulliken_c',
       'graph_inertia_charges_lowdin_a', 'graph_inertia_charges_lowdin_b',
       'graph_inertia_charges_lowdin_c', 'graph_spherocity',
       'graph_plane_best_fit', 'graph_tddft_alpha_gap',
       'graph_tddft_energy_beta_gap', 'graph_tddft_energy_total',
       'node_charges_mulliken', 'node_charges_lowdin',
       'node_tddft_charges_mulliken', 'node_tddft_charges_lowdin'],
      dtype='object')

### Specify the metadata for the data columns

In [10]:
# Column-level metadata: maps a column name to its `ColumnAnnotation`.
# NOTE(review): none of these keys (smiles, SA, logp, score) appear in the `table.columns`
# output printed for PCQM4M_G25_N4.parquet earlier in this notebook. They look carried over
# from another recipe - confirm and replace with the PM6 columns before building the dataset.
annotations = dict(
    smiles=ColumnAnnotation(modality="molecule", description="Molecule SMILES string"),
    SA=ColumnAnnotation(description="Synthetic accessibility score."),
    logp=ColumnAnnotation(description="Log P, octanol-water partition coefficient."),
    score=ColumnAnnotation(description="Score"),
)

### Define `Dataset` object

In [11]:
# Versioned dataset name of the form "<data_name>-<version>"
version = "v1"
dataset_name = "-".join((data_name, version))

In [12]:
# Assemble the `Dataset` from the annotated columns plus dataset-level metadata.
# NOTE(review): the description, source and readme below refer to ZINC / zinc12 although
# `data_name` is "pm6_subset" - presumably carried over from another recipe; confirm and
# update them before uploading.
annotated_columns = list(annotations)
dataset = Dataset(
    # Core data: only the columns that carry an annotation
    table=table[annotated_columns],
    annotations=annotations,
    # Dataset-level metadata
    name=dataset_name,
    owner=owner,
    description="A subset (12K) of ZINC molecular graphs (250K) dataset.",
    source="https://arxiv.org/abs/2003.00982",
    readme=load_readme("org-Graphium/zinc12/zinc12_readme.md"),
    license="CC-BY-4.0",
    tags=["Graph"],
    user_attributes={"year": "2022"},
)

In [13]:
# Write the dataset definition into its own folder under DATASET_DIR (a gs:// location)
SAVE_DIR = "/".join((DATASET_DIR, dataset_name))
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

[32m2024-07-16 23:05:28.217[0m | [1mINFO    [0m | [36mpolaris._mixins[0m:[36mmd5sum[0m:[36m27[0m - [1mComputing the checksum. This can be slow for large datasets.[0m


'gs://polaris-public/polaris-recipes/org-Graphium/zinc12k/datasets/zinc12k-v1/dataset.json'

In [14]:
from polaris.hub.client import PolarisHubClient

# Log in to the hub, then upload the dataset with private access for the owner
client = PolarisHubClient()
client.login()

upload_kwargs = dict(dataset=dataset, access="private", owner=owner)
client.upload_dataset(**upload_kwargs)

[32m2024-07-16 23:05:37.048[0m | [1mINFO    [0m | [36mpolaris.hub.client[0m:[36mlogin[0m:[36m285[0m - [1mYou are already logged in to the Polaris Hub as  (lu@valencediscovery.com). Set `overwrite=True` to force re-authentication.[0m
[32m2024-07-16 23:05:39.938[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_dataset[0m:[36m631[0m - [32m[1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/graphium/zinc12k-v1[0m


{'id': 'lyuMH2u2azzmsvdwtrMu3',
 'createdAt': '2024-07-17T03:05:37.875Z',
 'deletedAt': None,
 'name': 'zinc12k-v1',
 'slug': 'zinc12k-v1',
 'description': 'A subset (12K) of ZINC molecular graphs (250K) dataset.',
 'tags': ['Graph'],
 'userAttributes': {'year': '2022'},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': 'dev',
 'readme': '## Background\n\n\n## Assay information\n\n\n## Description of readout:\n\n\n## Data resource\n\n',
 'ownerId': 'zMTB7lQiiukqEmLQF7EjT',
 'creatorId': 'NKnaHGybLqwSHcaMEHqfF',
 'state': 'upload_pending',
 'source': 'https://arxiv.org/abs/2003.00982',
 'curationReference': None,
 'nRows': 12000,
 'nColumns': 4,
 'license': 'CC-BY-4.0',
 'md5Sum': '804d595f51d3841f329fdb4fec03cc60',
 'annotations': {'SA': {'dtype': 'float64',
   'modality': 'UNKNOWN',
   'isPointer': False,
   'description': 'Synthetic accessibility score.',
   'userAttributes': {}},
  'logp': {'dtype': 'float64',
   'modality': 'UNKNOWN',
   'isPointer': False,
   'descri