# PKIS 2 Dataset creation

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pathlib

import pandas as pd
import datamol as dm

# polaris dataset
from polaris.dataset import Dataset, ColumnAnnotation

from polaris.utils.types import HubOwner


# The literal "__file__" is only a placeholder name under the current working
# directory (NOTE(review): presumably because notebooks do not define
# `__file__`). parents[2] is therefore two levels above the working directory.
root = (pathlib.Path.cwd() / "__file__").parents[2]
# Make `root` importable and use it as the working directory from here on.
sys.path.insert(0, str(root))
os.chdir(root)
from utils.docs_utils import load_readme

In [2]:
# Owner slug and dataset identifier used to build every path below
org, data_name = "polaris", "drewry2017_pkis2_subset"

# Bucket prefix under which this recipe's artifacts are stored
gcp_root = f"gs://polaris-public/polaris-recipes/org-{org}/{data_name}"
# Local counterpart of that folder, located under `root`
dirname = dm.fs.join(root, f"org-{org}", data_name)

# Hub owner attached to the dataset; the bare name displays it in the notebook
owner = HubOwner(type="organization", slug=org)
owner

HubOwner(slug='polaris', external_id=None, type='organization')

In [3]:
# One sub-folder of `gcp_root` per artifact type
BENCHMARK_DIR, DATASET_DIR, FIGURE_DIR = (
    f"{gcp_root}/{folder}" for folder in ("benchmarks", "datasets", "figures")
)

## Load existing data

In [5]:
# Location of the v3 curated CSV for this dataset under `gcp_root`
PATH = f"{gcp_root}/data/curation/{data_name}_curated_v3.csv"
# Read the CSV into a DataFrame, then display its column names
table = pd.read_csv(filepath_or_buffer=PATH)
table.columns

Index(['Smiles', 'EGFR', 'KIT', 'LOK', 'RET', 'SLK', 'MOL_smiles',
       'MOL_molhash_id', 'MOL_molhash_id_no_stereo', 'MOL_num_stereoisomers',
       'MOL_num_undefined_stereoisomers', 'MOL_num_defined_stereo_center',
       'MOL_num_undefined_stereo_center', 'MOL_num_stereo_center',
       'MOL_undefined_E_D', 'MOL_undefined_E/Z', 'CLS_EGFR', 'CLS_KIT',
       'CLS_RET', 'CLS_LOK', 'CLS_SLK', 'OUTLIER_EGFR', 'OUTLIER_KIT',
       'OUTLIER_LOK', 'OUTLIER_RET', 'OUTLIER_SLK', 'AC_CLS_EGFR',
       'AC_CLS_KIT', 'AC_CLS_LOK', 'AC_CLS_RET', 'AC_CLS_SLK'],
      dtype='object')

## Specify the metadata for the data columns

In [6]:
# Readout columns to annotate, and the cut-off quoted in the annotations of
# the binarized `CLS_*` labels (NOTE(review): presumably in % inhibition).
data_cols, threshold = ["EGFR", "KIT", "RET", "LOK", "SLK"], 80

In [7]:
import re

# A column name may carry a mutation suffix in parentheses, e.g.
# "<TARGET>_(<MUTATION>)"; a name without such a suffix denotes the wild type.
_MUTATION_RE = re.compile(r"_\((\S+)\)")

# Annotations keyed by readout column, and by the matching "CLS_<column>" label.
mutant_anno = {}
mutant_cls_anno = {}

for col in data_cols:
    # The target name is everything before the first underscore.
    tar = col.split("_")[0]

    # Parse the mutation from the *current* column. `mut_var` stays None when
    # there is no suffix, so the wild-type branch below is actually reachable.
    found = _MUTATION_RE.search(col)
    mut_var = found.group(1) if found else None

    if mut_var is None:
        mutation = "None"
        description = f"Percentage of inhibition on {tar} wild type"
        cls_description = (
            f"Binarized label based on the percentage of inhibition on {tar} wild type"
        )
    else:
        mutation = mut_var
        description = f"Percentage of inhibition on {tar} with mutation {mut_var}"
        cls_description = (
            f"Binarized label based on the percentage of inhibition on {tar} {mut_var}"
        )

    # Annotation of the continuous readout column.
    anno = ColumnAnnotation(
        description=description,
        user_attributes={
            "unit": "%",
            "concentration": "1uM",
            "organism": "Human",
            "objective": "Higher value",
            "source": "PKIS2",
            "mutation": mutation,
            "target": tar,
        },
    )

    # Annotation of the binarized label; `ref_col` names the readout column it
    # refers to. `description=` is used for both variants, consistent with
    # every other annotation in this notebook.
    cls_anno = ColumnAnnotation(
        description=cls_description,
        user_attributes={
            "thresholds": f"Greater than {threshold}",
            "label_order": "ascending",
            "ref_col": col,
        },
    )

    mutant_anno[col] = anno
    mutant_cls_anno[f"CLS_{col}"] = cls_anno

In [8]:
# Start with the molecule identifier columns ...
pkis2_annotations = {
    "MOL_molhash_id": ColumnAnnotation(
        description="Molecular hash ID. See <datamol.mol.hash_mol>"
    ),
    "MOL_smiles": ColumnAnnotation(
        description="Molecule SMILES string", modality="molecule"
    ),
}
# ... then append the readout annotations followed by their binarized labels.
pkis2_annotations.update(mutant_anno)
pkis2_annotations.update(mutant_cls_anno)

In [9]:
# Dataset version; it is appended to the dataset name.
version = "v3"
dataset_name = f"drewry2017_pkis2_subset-{version}"
dataset = Dataset(
    # Keep only the annotated columns, in annotation order. An explicit list
    # avoids relying on pandas accepting a dict view as a column indexer.
    table=table[list(pkis2_annotations)],
    name=dataset_name,
    description="A subset of PKIS 2 dataset only including EGFR, RET, KIT, LOK and SLK kinases. Profile of kinases PKIS2 which contains 640 small molecule for 468 kinases.",
    # No leading whitespace: this value is a URL.
    source="https://www.ncbi.nlm.nih.gov/pubmed/28767711",
    annotations=pkis2_annotations,
    owner=owner,
    tags=["Kinase", "HitDiscovery", "Selectivity"],
    user_attributes={"year": "2017"},
    readme=load_readme("org-Polaris/drewry2017_pkis2_subset/pkis2_subset_readme.md"),
    license="CC-BY-4.0",
    # NOTE(review): GitHub file links normally contain "/blob/<branch>/" after
    # the repository name - confirm that this URL resolves.
    curation_reference="https://github.com/polaris-hub/polaris-recipes/org-Polaris/drewry2017_pkis2_subset/01_pkis2_kinase_data_curation.ipynb",
)

In [10]:
# save the dataset to GCP
# The target folder is the datasets directory plus the versioned dataset name.
# The value returned by `to_json` is displayed below; in the recorded output
# it is the location of the written `dataset.json`.
SAVE_DIR = f"{DATASET_DIR}/{dataset_name}"
dataset_path = dataset.to_json(SAVE_DIR)
dataset_path

'gs://polaris-public/polaris-recipes/org-polaris/drewry2017_pkis2_subset/datasets/drewry2017_pkis2_subset-v3/dataset.json'

In [12]:
# upload to Polaris Hub
# The upload is made under `owner` and requests private access.
dataset.upload_to_hub(owner=owner, access="private")

✅ SUCCESS: [1mYour dataset has been successfully uploaded to the Hub. View it here: https://polarishub.io/datasets/polaris/drewry2017_pkis2_subset-v3[0m
 


  self._color = self._set_color(value) if value else value


## Disclaimers

<div style="background-color: lightyellow; padding: 10px; border: 1px solid black;">
        <span>Here are some additional details that may be of use when deciding whether or not to use these datasets.</span><br /><br />
        <strong>Some advantages include: </strong>
        <ul>
        <li>The dataset was generated by one group that carried out the assays under a consistent set of conditions.</li>
        <li>Dataset contains only a small number of molecules with unspecified stereocenters.</li>
        <li>There are no duplicate structures in the dataset.</li>
        <li>The data is based on well-defined biomedical endpoints.</li>
        </ul>
        <strong>Some limitations to consider: </strong>
        <ul>
        <li>The assay endpoint is % inhibition, which is less desirable than a dose-response but similar to what is commonly encountered with <a href="https://www.bmglabtech.com/en/blog/high-throughput-screening/#:~:text=What%20is%20high%2Dthroughput%20screening,for%20example%20through%20binding%20assays.">HTS</a> data.</li>
        <li>The compounds are highly clustered, with the largest cluster containing 50 compounds. Thus, generalizability of your model may be affected due to the limited diversity of compounds in this dataset.</li>
        <li>The dataset is relatively small, containing only 640 compounds. This, combined with the fact that the data is highly clustered, will make it difficult to see statistically significant differences between methods. This will be highly acute when the splits are based on clusters or scaffolds.</li>
        </ul>
        
</div>
