# PKIS 2 Dataset creation

In [1]:
%load_ext autoreload
%autoreload 2
import os
import datamol as dm
import pandas as pd
from polaris import curation
import numpy as np
import umap
import re
import seaborn as sns
from matplotlib import pyplot as plt
import seaborn as sns
import os
import pathlib

# NOTE(review): "__file__" is a string literal here, not the __file__ variable,
# so the path is resolved against the current working directory and
# parents[2] is the directory two levels above it. Presumably this moves to the
# repository root so that the local `utils` import below works -- confirm.
# Because the result depends on the cwd, re-running this cell moves up again.
os.chdir(pathlib.Path("__file__").absolute().parents[2])
from polaris.curation._chemistry_curator import UNIQUE_ID
from polaris.dataset import ColumnAnnotation, Dataset
from utils import load_readme

import warnings

warnings.filterwarnings("ignore")

In [3]:
from polaris.utils.types import HubOwner

# Owner passed to the Dataset created further down in this notebook.
# NOTE(review): the displayed repr shows no organization id, so the
# `organizationId` keyword may not be retained by HubOwner -- confirm.
owner = HubOwner(organizationId="PolarisTest", slug="polaristest")
# Bare expression so the notebook displays the resulting object.
owner

HubOwner(slug='polaristest', external_id=None, type=None)

In [4]:
# Root location under which the dataset is saved; the dataset name is appended
# to this path in the save cell at the end of the notebook.
DATASET_DIR = "gs://polaris-public/datasets/kinases/pkis2_subset_curated"

### Create Dataset

In [5]:
# Location of the curated PKIS2 subset produced by an earlier curation step.
path = "gs://polaris-public/data/Kinases/pkis2_subset_curated_v1.parquet"

# Load the table and expose the curator's molecule-ID column (whatever the
# UNIQUE_ID constant is set to) under the fixed name "UNIQUE_ID".
table = pd.read_parquet(path).rename(columns={UNIQUE_ID: "UNIQUE_ID"})

In [6]:
# Kinase readout columns that receive annotations in this dataset.
data_cols = ["EGFR", "KIT", "LOK", "RET", "SLK"]
# Cutoff recorded in the "thresholds" attribute of the CLASS_* annotations.
# NOTE(review): the binarized CLASS_* columns are not computed in this
# notebook; presumably they were produced upstream with this same cutoff -- confirm.
threshold = 80

In [7]:
# Build two annotations per kinase readout column:
#   mutant_anno[col]               -> the continuous % inhibition column
#   mutant_cls_anno[f"CLASS_{col}"] -> its binarized label column
mutant_anno = {}
mutant_cls_anno = {}

# A mutation is encoded in the column name as a "_(...)" suffix, e.g. "EGFR_(L858R)".
mutation_pattern = re.compile(r"_\((\S+)\)")

for col in data_cols:
    # Target name is everything before the first underscore.
    tar = col.split("_")[0]

    # Parse the mutation from the *current* column. The previous code always
    # inspected data_cols[0], so every column inherited the first column's
    # mutation status.
    mutation_match = mutation_pattern.search(col)
    mut_var = mutation_match.group(1) if mutation_match else None

    # Only the wording and the "mutation" attribute differ between the
    # wild-type and the mutant case; everything else is shared below.
    if mut_var is None:
        inhibition_subject = f"{tar} wild type"
        label_subject = f"{tar} wild type"
        mutation = "None"
    else:
        inhibition_subject = f"{tar} with mutation {mut_var}"
        label_subject = f"{tar} {mut_var}"
        mutation = mut_var

    anno = ColumnAnnotation(
        description=f"Percentage of inhibition on {inhibition_subject}",
        user_attributes={
            "unit": "%",
            "concentration": "1uM",
            "organism": "Human",
            "objective": "Higher value",
            "source": "PKIS2",
            "mutation": mutation,
            "target": tar,
        },
    )

    # Both cases now use `description=`; the mutant branch previously passed
    # `protocol=`, inconsistent with the wild-type branch.
    cls_anno = ColumnAnnotation(
        description=f"Binarized label based on the percentage of inhibition on {label_subject}",
        user_attributes={
            "thresholds": f"Greater than {threshold}",
            "label_order": "ascending",
            "ref_col": col,
        },
    )

    mutant_anno[col] = anno
    mutant_cls_anno[f"CLASS_{col}"] = cls_anno

In [8]:
# Annotations for the molecule identifier columns.
identifier_annotations = {
    "UNIQUE_ID": ColumnAnnotation(description="Molecular hash ID. See <datamol.mol.hash_mol>"),
    "smiles": ColumnAnnotation(description="Molecule SMILES string", modality="molecule"),
}

# Full annotation mapping. Insertion order (identifiers, continuous readouts,
# then class labels) is kept because the keys are later used to select the
# table columns.
pkis2_annotations = {
    **identifier_annotations,
    **mutant_anno,
    **mutant_cls_anno,
}

In [10]:
# Name of the dataset and the stem of its README file in the public bucket.
dataset_name = "kinase-pkis2-1"
readme_name = "pkis2_egfr_ret_kit_lok_slk_v1"

dataset = Dataset(
    # Keep only the annotated columns, in annotation order. An explicit list is
    # used instead of a dict_keys view for unambiguous DataFrame indexing.
    table=table[list(pkis2_annotations)],
    name=dataset_name,
    # Plain string: the text has no placeholders, so no f-string is needed.
    description="A subset of PKIS 2 dataset only including EGFR, RET, KIT, LOK and SLK kinases. Profile of kinases PKIS2 which contains 640 small molecule for 468 kinases.",
    # Stray leading space removed from the URL.
    source="https://www.ncbi.nlm.nih.gov/pubmed/28767711",
    annotations=pkis2_annotations,
    owner=owner,
    tags=["Kinase", "HitDiscovery", "Selectivity"],
    readme=load_readme(f"gs://polaris-public/readme/datasets/{readme_name}_readme.md"),
)

In [11]:
# Serialize the dataset under <DATASET_DIR>/<dataset_name>. The call is left as
# the cell's final expression so the notebook displays its return value.
SAVE_DIR = "/".join((DATASET_DIR, dataset_name))
dataset.to_json(SAVE_DIR)

'gs://polaris-public/datasets/kinases/pkis2_subset_curated/kinase-pkis2-1/dataset.json'