# Dataset creation with [Polaris](https://github.com/polaris-hub/polaris)

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation
from polaris.utils.types import HubOwner


# Repository root = two levels above the current working directory.
# A notebook has no `__file__`, so the location is derived from the cwd
# (the original quoted "__file__" resolved against the cwd in exactly the same way).
# NOTE: because of the chdir below, re-running this cell in the same kernel
# resolves `root` two levels higher again; restart the kernel before re-running.
root = pathlib.Path.cwd().parents[1]
os.chdir(root)
# Make the root importable without stacking duplicate entries on repeated runs
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

In [2]:
# Organization and dataset identifiers used to build every path below
org = "AdaptyvBio"
data_name = "IL7Ra_binders"

# Hub owner that is later attached to the dataset
owner = HubOwner(slug="adaptyv-bio", type="organization")

# Recipe folder under `root` and the matching prefix in the public bucket
dirname = dm.fs.join(root, f"org-{org}", data_name)
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"

owner

HubOwner(slug='adaptyv-bio', external_id=None, type='organization')

In [3]:
# Output locations in the public bucket, all derived from the shared `gcp_root`
# prefix so the bucket layout is spelled out in a single place.
BENCHMARK_DIR = f"{gcp_root}/benchmarks"
DATASET_DIR = f"{gcp_root}/datasets"
FIGURE_DIR = f"{gcp_root}/figures"

## Load existing data

In [4]:
# Read the curated binder table directly from the public bucket
PATH = "gs://polaris-public/polaris-recipes/org-AdaptyvBio/raw/IL7Ra_binders_data.csv"
table = pd.read_csv(filepath_or_buffer=PATH)

### Specify the metadata of the data columns

The key bioactivity columns, molecule structures and identifiers in the dataset must be described with `ColumnAnnotation`. Arbitrary key-value pairs, such as `unit`, `organism`, `scale` and the optimization `objective`, can be attached through `user_attributes` when needed.

In [5]:
# Shorten the affinity column name, then fold missing and "none" labels
# into the single "inactive" class.
table = table.rename(columns={"KD (M)": "KD"}).assign(
    Binder_class=lambda frame: (
        frame["Binder_class"].fillna("inactive").replace("none", "inactive").values
    )
)
# Show how many designs fall into each class
table["Binder_class"].value_counts()

Binder_class
inactive    61
strong      19
weak        15
Name: count, dtype: int64

In [6]:
# Column-level metadata keyed by column name; `user_attributes` holds free-form extras
annotations = {
    # Input: the designed protein
    "sequence": ColumnAnnotation(description="Protein sequence in fasta format"),
    # Continuous affinity readout
    "KD": ColumnAnnotation(
        description="Kd for the measure of binding affinity.",
        user_attributes=dict(unit="M", objective="Lower value"),
    ),
    # Categorical affinity readout
    "Binder_class": ColumnAnnotation(
        description="The binding affinity as three classes labels.",
        user_attributes=dict(objective="strong"),
    ),
    # Boolean affinity readout
    "Binder_bool": ColumnAnnotation(
        description="The binding affinity as boolean class label",
        user_attributes=dict(objective="True"),
    ),
}

### Define `Dataset` object

In [7]:
# Version suffix embedded in the dataset name
dataset_version: str = "v1"
# Full dataset name: base name plus version suffix
dataset_name: str = f"IL7Ra_binders-{dataset_version}"

In [8]:
# Build the dataset from the annotated columns only (copied, in annotation order)
dataset = Dataset(
    name=dataset_name,
    owner=owner,
    description="This dataset includes binding protein designs targeting the interleukin-7 receptor alpha chain (IL7RA), a drug target associated with various diseases.",
    source="https://www.adaptyvbio.com",
    license="CC-BY-4.0",
    tags=["protein-design"],
    annotations=annotations,
    table=table[list(annotations)].copy(),
)

### Dataset overview

In [9]:
# Display the dataset object as the cell output to review its metadata and annotations
dataset

2024-09-12 16:33:10.494 | INFO     | polaris.mixins._checksum:md5sum:27 - Computing the checksum. This can be slow for large datasets.


0,1
name,IL7Ra_binders-v1
description,"This dataset includes binding protein designs targeting the interleukin-7 receptor alpha chain (IL7RA), a drug target associated with various diseases."
tags,protein-design
user_attributes,
owner,adaptyv-bio
polaris_version,0.7.9
default_adapters,
zarr_root_path,
readme,
annotations,sequenceis_pointerFalsemodalityUNKNOWNdescriptionProtein sequence in fasta formatuser_attributesdtypeobjectKDis_pointerFalsemodalityUNKNOWNdescriptionKd for the measure of binding affinity.user_attributesunitMobjectiveLower valuedtypeobjectBinder_classis_pointerFalsemodalityUNKNOWNdescriptionThe binding affinity as three classes labels.user_attributesobjectivestrongdtypeobjectBinder_boolis_pointerFalsemodalityUNKNOWNdescriptionThe binding affinity as boolean class labeluser_attributesobjectiveTruedtypebool

0,1
sequence,is_pointerFalsemodalityUNKNOWNdescriptionProtein sequence in fasta formatuser_attributesdtypeobject
KD,is_pointerFalsemodalityUNKNOWNdescriptionKd for the measure of binding affinity.user_attributesunitMobjectiveLower valuedtypeobject
Binder_class,is_pointerFalsemodalityUNKNOWNdescriptionThe binding affinity as three classes labels.user_attributesobjectivestrongdtypeobject
Binder_bool,is_pointerFalsemodalityUNKNOWNdescriptionThe binding affinity as boolean class labeluser_attributesobjectiveTruedtypebool

0,1
is_pointer,False
modality,UNKNOWN
description,Protein sequence in fasta format
user_attributes,
dtype,object

0,1
is_pointer,False
modality,UNKNOWN
description,Kd for the measure of binding affinity.
user_attributes,unitMobjectiveLower value
dtype,object

0,1
unit,M
objective,Lower value

0,1
is_pointer,False
modality,UNKNOWN
description,The binding affinity as three classes labels.
user_attributes,objectivestrong
dtype,object

0,1
objective,strong

0,1
is_pointer,False
modality,UNKNOWN
description,The binding affinity as boolean class label
user_attributes,objectiveTrue
dtype,bool

0,1
objective,True


In [10]:
# Save the dataset to GCP: serialize it to JSON under a folder named after
# the versioned dataset inside DATASET_DIR.
SAVE_DIR = f"{DATASET_DIR}/{dataset_name}"
dataset_path = dataset.to_json(SAVE_DIR)
# Show the value returned by `to_json`
dataset_path

'gs://polaris-public/polaris-recipes/org-AdaptyvBio/IL7Ra_binders/datasets/IL7Ra_binders-v1/dataset.json'

### Upload the dataset to the hub

In [11]:
# Publish the dataset to the Polaris Hub under the `owner` set on the Dataset above
dataset.upload_to_hub()

✅ SUCCESS: Your dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/adaptyv-bio/IL7Ra_binders-v1
 


  self._color = self._set_color(value) if value else value
