# PKIS 1 Dataset creation

In [1]:
%load_ext autoreload
%autoreload 2
import os
import datamol as dm
import pandas as pd
from polaris import curation
import numpy as np
import umap
import re
import seaborn as sns
from matplotlib import pyplot as plt
import seaborn as sns
import os
import pathlib
os.chdir(pathlib.Path("__file__").absolute().parents[2])
from polaris.curation._chemistry_curator import UNIQUE_ID
from polaris.dataset import ColumnAnnotation, Dataset
from utils import *


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
from polaris.utils.types import HubOwner
owner = HubOwner(organizationId="PolarisTest", slug="polaristest")
owner.owner

'PolarisTest'

In [3]:
DATASET_DIR = "gs://polaris-public/datasets/kinases/pkis1_subset_curated "

### Create Dataset

In [4]:
path = 'gs://polaris-public/data/Kinases/pkis1_subset_curated_v1.parquet'

table = pd.read_parquet(path)
table.rename(columns={UNIQUE_ID: "UNIQUE_ID"}, inplace=True)

In [9]:
data_cols = ['EGFR_(L858R_mutant)',  'EGFR',
       'KIT_(T6701_mutant)', 'KIT_(V560G_mutant)', 'KIT', 
       'RET_(V804L_mutant)', 'RET_(Y791F_mutant)', 'RET']

In [10]:
mutant_anno = {}
mutant_cls_anno = {}

for col in data_cols:
    tar = col.split("_")[0]
    mut_var = re.findall(r"_\((\S+)\)", data_cols[0])
    mut_var = None if len(mut_var) == 0 else mut_var[0]
    if mut_var is None:
        anno = ColumnAnnotation(
            protocol=f"Percentage of inhibition on {tar} wide type",
            user_attributes={
                "unit": "%",
                "concentration": "1uM",
                "organism": "Human",
                "objective": "Higher value",
                "source": "PKIS1",
                "mutation": None,
                "target": tar,
            },
        )
        cls_anno = ColumnAnnotation(
            protocol=f"Binarized label based on the percentage of inhibition on {tar} wide type",
            user_attributes={
                "thresholds": "Greather than 90 (stringent)",
                "label_order": "ascending",
                "ref_col": col,
            },
        )
    else:
        anno = ColumnAnnotation(
            protocol=f"Percentage of inhibition on {tar} with mutation {mut_var}",
            user_attributes={
                "unit": "%",
                "concentration": "1uM",
                "organism": "Human",
                "objective": "Higher value",
                "source": "PKIS1",
                "mutation": mut_var,
                "target": tar,
            },
        )

        cls_anno = ColumnAnnotation(
            protocol=f"Binarized label based on the percentage of inhibition on {tar} {mut_var}",
            user_attributes={
                "thresholds": "Greather than 90 (stringent)",
                "label_order": "ascending",
                "ref_col": col,
            },
        )
    mutant_anno[col] = anno
    mutant_cls_anno[f"CLASS_{col}"] = cls_anno

In [11]:
pkis_annotations = {
    # Molecule identifiers
    "UNIQUE_ID": ColumnAnnotation(
        protocol="Molecular hash ID. See <datamol.mol.hash_mol>"
    ),
    'smiles': ColumnAnnotation(protocol='Molecule SMILES string', modality="molecule"),
    **mutant_anno,
    **mutant_cls_anno,
}

In [12]:
dataset_name = "pkis1_egfr_ret_kit_lok_slk"
dataset = Dataset(
    table=table,
    name=dataset_name,
    description=f"A subset of PKIS dataset only including EGFR, RET, KIT, LOK and SLK kinases. PKIS is a data set of 367 small-molecule ATP-competitive kinase inhibitors that was screened by the set in activity assays with 224 recombinant kinases and 24 G protein-coupled receptors and in cellular assays of cancer cell proliferation and angiogenesis.",
    source="https://pubmed.ncbi.nlm.nih.gov/26501955",
    annotations=pkis_annotations,
    owner=owner, 
    tags=["Kinase","HitDiscovery", "Selectivity"],
)

In [13]:
# save the dataset
SAVE_DIR = f"{DATASET_DIR}/{dataset_name}"
dataset.to_json(SAVE_DIR)

'gs://polaris-public/datasets/kinases/pkis1_subset_curated /pkis1_egfr_ret_kit_lok_slk/dataset.json'