# Data

> Load various kinase-related datasets

## Setup

In [None]:
#| default_exp data

In [None]:
#| export
import pandas as pd
from functools import lru_cache
from fastcore.all import patch,patch_to

import gdown,zipfile,shutil,tempfile
from pathlib import Path

In [None]:
#| hide
# Keep rendered DataFrames compact in the notebook output.
pd.set_option('display.max_rows', 5) # truncate long tables to 5 rows
pd.set_option('display.max_columns', 100) # show up to 100 columns before truncating

```python
from katlas.data import *
```

## Kinase Dataset

We will go through how to load kinase information data and phosphorylation sites data.

Datasets used in this study can be accessed through `Data`

In [None]:
#| export
class Data:
    "Entry point for fetching the various datasets."
    # Folder holding the extracted dataset; defaults to `katlas_dataset` under the system temp directory.
    DATASET_DIR = Path(tempfile.gettempdir()).joinpath('katlas_dataset')

You can change `Data.DATASET_DIR` to a local path if you want:

```python
Data.DATASET_DIR=Path('katlas_dataset')
```

In [None]:
#| export
@patch_to(Data)
def download(download_dir=None, # dest directory of downloaded folder
             force=False, # if force, will overwrite the current dataset folder
             verbose=True, # print existing dataset folder
            ):
    """
    Download the dataset zip and extract it into `Data.DATASET_DIR`.

    If `download_dir` is given, `Data.DATASET_DIR` is first repointed to
    `download_dir/katlas_dataset`; otherwise its current value is used.
    An existing dataset folder is left untouched unless `force` is set.

    Raises `RuntimeError` if the download produces no file and
    `zipfile.BadZipFile` if the downloaded file is not a valid zip archive.
    In both cases an existing dataset folder is preserved.
    """
    path = 'https://drive.google.com/uc?id=17wIl0DbdoHV036Z3xgaT_0H3LlM_W47l'
    if download_dir is not None: Data.DATASET_DIR=Path(download_dir)/'katlas_dataset'
    dataset_dir = Path(Data.DATASET_DIR)  # tolerate a plain string assigned by the user

    if dataset_dir.exists() and not force:
        if verbose: print(f"✅ Dataset exists at: {dataset_dir}")
        return

    # Download into a throwaway directory so the zip never lands in the current
    # working directory and is cleaned up even if a later step fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("⬇️ Downloading katlas_dataset.zip ...")
        downloaded_file = gdown.download(path, output=str(Path(tmp_dir)/'katlas_dataset.zip'))
        if downloaded_file is None:
            raise RuntimeError(f"❌ Failed to download dataset from {path}")
        if not zipfile.is_zipfile(downloaded_file):
            raise zipfile.BadZipFile(f"❌ Downloaded file is not a valid zip archive: {downloaded_file}")

        # Only discard the old folder once a usable archive is in hand.
        if dataset_dir.exists():
            print(f"♻️ Removing existing folder: {dataset_dir}")
            shutil.rmtree(dataset_dir)

        print(f"📂 Extracting to {dataset_dir} ...")
        try:
            with zipfile.ZipFile(downloaded_file, 'r') as zip_ref:
                zip_ref.extractall(dataset_dir)
        except BaseException:
            # A half-extracted folder would be mistaken for a complete dataset
            # on the next call, so remove it before propagating the error.
            shutil.rmtree(dataset_dir, ignore_errors=True)
            raise

    print(f"✅ Done! Extracted dataset is at: {dataset_dir}")

To download the dataset for the first time, or to refresh an existing dataset folder:

```python
Data.download(force=True)
```

In [None]:
# Data.download(force=True)

In [None]:
#| export
@patch_to(Data)
def read_file(rel_path):
    """
    Read a dataset file, given by its path relative to `Data.DATASET_DIR`, into a DataFrame.

    The reader is chosen from the filename suffix (case-insensitive): `.csv` or
    `.parquet`; any other suffix raises `ValueError`. Parquet files are tried
    with the `fastparquet` engine first and `pyarrow` second; if both fail,
    the last error is printed and `None` is returned.
    A column named 'Unnamed: 0' is renamed to 'kinase' if present.
    """
    Data.download(verbose=False)
    path = Data.DATASET_DIR / rel_path
    ext = path.suffix.lower()

    if ext not in ('.csv', '.parquet'):
        raise ValueError(f"❌ Unsupported file type: {ext}")

    if ext == '.csv':
        df = pd.read_csv(path)
    else:
        last_err = None
        for engine in ("fastparquet", "pyarrow"):
            try:
                df = pd.read_parquet(path, engine=engine)
                break
            except Exception as err:
                last_err = err  # keep it: `err` is unbound once the except block ends
        else:
            # No engine succeeded.
            print(f"Failed to read parquet file {path}: {last_err}")
            return None

    if "Unnamed: 0" not in df.columns:
        return df
    return df.rename(columns={"Unnamed: 0": "kinase"})

In [None]:
Data.read_file('kinase_info.csv')

Unnamed: 0,kinase,ID_coral,uniprot,gene,modi_group,group,family,subfamily_coral,subfamily,in_pspa_st,in_pspa_tyr,in_pspa,in_cddm,kd_ID,active_D1_D2,active_kd_ID,pspa_ID,pseudo,pspa_category_small,pspa_category_big,cddm_big,cddm_small,length,human_uniprot_sequence,kinasecom_domain,nucleus,cytosol,cytoskeleton,plasma membrane,mitochondrion,Golgi apparatus,endoplasmic reticulum,vesicle,centrosome,aggresome,main_location
0,AAK1,AAK1,Q2M2I8,AAK1,Other,Other,NAK,,NAK,1,0,1,0,Q2M2I8_AAK1_HUMAN_KD1,1.0,Q2M2I8_AAK1_HUMAN_KD1,AAK1,0,Nak,Nak,,,961,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,VTVDEVLAEGGFAIVFLVRTSNGMKCALKRMFVNNEHDLQVCKREI...,,,,,,,,,,,
1,AATK,LMR1,Q6ZMQ8,AATK,TK,TK,Lmr,,Lmr,0,0,0,0,Q6ZMQ8_LMTK1_HUMAN_KD1,1.0,Q6ZMQ8_LMTK1_HUMAN_KD1,,0,,,,,1374,MSSSFFNPSFAFSSHFDPDGAPLSELSWPSSLAVVAVSFSGLFAVI...,LLYLKEIGRGWFGKVFLGEVNSGISSAQVVVKELQASASVQEQMQF...,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,ZAK,ZAK,Q9NYL2,MAP3K20,TKL,TKL,MLK,ZAK,ZAK,1,0,1,1,Q9NYL2_M3K20_HUMAN_KD1,1.0,Q9NYL2_M3K20_HUMAN_KD1,ZAK,0,Map3k,Map3k,2.0,9.0,800,MSSLGASFVQIKFDDLQFFENCGGGSFGSVYRAKWISQDKEVAVKK...,LQFFENCGGGSFGSVYRAKWISQDKEVAVKKLLKIEKEAEILSVLS...,5.0,5.0,,,,,,,,,nucleus
522,ZAP70,ZAP70,P43403,ZAP70,TK,TK,Syk,,Syk,0,1,1,1,P43403_ZAP70_HUMAN_KD1,1.0,P43403_ZAP70_HUMAN_KD1,ZAP70,0,Syk and fak,Syk and fak,1.0,3.0,619,MPDPAAHLPFFYGSISRAEAEEHLKLAGMADGLFLLRQCLRSLGGY...,LIADIELGCGNFGSVRQGVYRMRKKQIDVAIKVLKQGTEKADTEEM...,3.0,5.0,,2.0,,,,,,,cytosol


### Kinase info

In [None]:
#| export
@patch_to(Data)
def get_kinase_info():
    """
    Load the information table of the 523 human kinases on the kinome tree.

    Data sources: group, family, and subfamily classifications come from Coral;
    full protein sequences are retrieved using UniProt IDs;
    kinase domain sequences are obtained from KinaseDomain.com;
    cellular localization data is extracted from published literature.
    """
    path = "kinase_info.csv"
    return Data.read_file(path)

In [None]:
Data.get_kinase_info()

Unnamed: 0,kinase,ID_coral,uniprot,gene,modi_group,group,family,subfamily_coral,subfamily,in_pspa_st,in_pspa_tyr,in_pspa,in_cddm,kd_ID,active_D1_D2,active_kd_ID,pspa_ID,pseudo,pspa_category_small,pspa_category_big,cddm_big,cddm_small,length,human_uniprot_sequence,kinasecom_domain,nucleus,cytosol,cytoskeleton,plasma membrane,mitochondrion,Golgi apparatus,endoplasmic reticulum,vesicle,centrosome,aggresome,main_location
0,AAK1,AAK1,Q2M2I8,AAK1,Other,Other,NAK,,NAK,1,0,1,0,Q2M2I8_AAK1_HUMAN_KD1,1.0,Q2M2I8_AAK1_HUMAN_KD1,AAK1,0,Nak,Nak,,,961,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,VTVDEVLAEGGFAIVFLVRTSNGMKCALKRMFVNNEHDLQVCKREI...,,,,,,,,,,,
1,AATK,LMR1,Q6ZMQ8,AATK,TK,TK,Lmr,,Lmr,0,0,0,0,Q6ZMQ8_LMTK1_HUMAN_KD1,1.0,Q6ZMQ8_LMTK1_HUMAN_KD1,,0,,,,,1374,MSSSFFNPSFAFSSHFDPDGAPLSELSWPSSLAVVAVSFSGLFAVI...,LLYLKEIGRGWFGKVFLGEVNSGISSAQVVVKELQASASVQEQMQF...,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,ZAK,ZAK,Q9NYL2,MAP3K20,TKL,TKL,MLK,ZAK,ZAK,1,0,1,1,Q9NYL2_M3K20_HUMAN_KD1,1.0,Q9NYL2_M3K20_HUMAN_KD1,ZAK,0,Map3k,Map3k,2.0,9.0,800,MSSLGASFVQIKFDDLQFFENCGGGSFGSVYRAKWISQDKEVAVKK...,LQFFENCGGGSFGSVYRAKWISQDKEVAVKKLLKIEKEAEILSVLS...,5.0,5.0,,,,,,,,,nucleus
522,ZAP70,ZAP70,P43403,ZAP70,TK,TK,Syk,,Syk,0,1,1,1,P43403_ZAP70_HUMAN_KD1,1.0,P43403_ZAP70_HUMAN_KD1,ZAP70,0,Syk and fak,Syk and fak,1.0,3.0,619,MPDPAAHLPFFYGSISRAEAEEHLKLAGMADGLFLLRQCLRSLGGY...,LIADIELGCGNFGSVRQGVYRMRKKQIDVAIKVLKQGTEKADTEEM...,3.0,5.0,,2.0,,,,,,,cytosol


In [None]:
#| export
@patch_to(Data)
def get_kinase_uniprot() -> pd.DataFrame:
    """
    Load the table of 672 UniProt human kinases.

    Entries were retrieved from UniProt by filtering all human protein entries
    with the keyword 'kinase', so the table also includes additional
    pseudokinases and lipid kinases.
    """
    return Data.read_file("uniprot_human_keyword_kinase.parquet")

In [None]:
Data.get_kinase_uniprot()

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,uniprot_keyword_kinase,on_tree,Organism,Keywords,Sequence
0,A2RU49,HYKK_HUMAN,Hydroxylysine kinase (5-hydroxy-L-lysine kinas...,HYKK AGPHD1,1,0,Homo sapiens (Human),Alternative splicing;Cytoplasm;Kinase;Proteomi...,MSSGNYQQSEALSKPTFSEEQASALVESVFGLKVSKVRPLPSYDDQ...
1,A4D2B8,PM2P1_HUMAN,Putative postmeiotic segregation increased 2-l...,PMS2P1 PMS2L1 PMS2L13 PMS2L6 PMS2L8 PMS3 PMS8 ...,1,0,Homo sapiens (Human),Alternative splicing;Kinase;Reference proteome...,MVTMCGGHRPENFLHQVLTEFGEELAGEGKSEVGGGAPRSYLQVAS...
...,...,...,...,...,...,...,...,...,...
670,Q8NCB2,CAMKV_HUMAN,CaM kinase-like vesicle-associated protein,CAMKV,0,1,Homo sapiens (Human),Alternative splicing;Calmodulin-binding;Cell m...,MPFGCVTLGDKKNYNQPSEVTDRYDLGQVIKTEEFCEIFRAKDKTT...
671,Q8IV63,VRK3_HUMAN,Serine/threonine-protein kinase VRK3 (EC 2.7.1...,VRK3,0,1,Homo sapiens (Human),3D-structure;Alternative splicing;Cytoplasm;Nu...,MISFCPDCGKSIQAAFKFCPYCGNSLPVEEHVGSQTFVNPHVSSFQ...


In [None]:
#| export
@patch_to(Data)
def get_kd_uniprot():
    "Load the table of kinase domains extracted from the UniProt database."
    return Data.read_file("uniprot_kd_labeled.parquet")

In [None]:
Data.get_kd_uniprot()

Unnamed: 0,kd_ID,Uniprot,Entry Name,Protein names,Gene Names,Gene Names (primary),Organism,kd_note,kd_evidence,kd_start,kd_end,kd_seq,Domain [FT],Domain [CC],Region,Motif,Protein families,Reactome,ComplexPortal,Subcellular location [CC],Gene Ontology (biological process),Tissue specificity,Interacts with,Subunit structure,Function [CC],Activity regulation,full_seq,D1,D2,D3,N1,active_D1_D2
0,A0A075F7E9_LERK1_ORYSI_KD1,A0A075F7E9,LERK1_ORYSI,G-type lectin S-receptor-like serine/threonine...,LECRK1 LECRK OsI_14840,LECRK1,Oryza sativa subsp. indica (Rice),Protein kinase,ECO:0000255|PROSITE-ProRule:PRU00159,523,797,AGFHEILGAGASGVVYKGQLEDELKTNIAVKTIHKLQPETEKEFMV...,"DOMAIN 22..149; /note=""Bulb-type lectin""; /evi...",,,,"Protein kinase superfamily, Ser/Thr protein ki...",,,SUBCELLULAR LOCATION: Membrane {ECO:0000255}; ...,defense response [GO:0006952]; response to oth...,"TISSUE SPECIFICITY: Expressed in plumules, rad...",,SUBUNIT: Interacts (via kinase domain) with AD...,FUNCTION: Involved in innate immunity. Require...,,MVALLLFPMLLQLLSPTCAQTQKNITLGSTLAPQGPASSWLSPSGD...,1,1,1,1,1
1,A0A078BQP2_GCY25_CAEEL_KD1,A0A078BQP2,GCY25_CAEEL,Receptor-type guanylate cyclase gcy-25 (EC 4.6...,gcy-25 Y105C5B.2,gcy-25,Caenorhabditis elegans,Protein kinase,ECO:0000255|PROSITE-ProRule:PRU00159,464,749,RVSTISTARASYSSIFSGNVAEHAIVNKQKVSVKRHVQRRAITFSR...,"DOMAIN 464..749; /note=""Protein kinase""; /evid...",DOMAIN: The protein kinase domain is predicted...,,,Adenylyl cyclase class-4/guanylyl cyclase family,R-CEL-2514859;,,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,cGMP biosynthetic process [GO:0006182]; intrac...,"TISSUE SPECIFICITY: Expressed in AQR, PQR and ...",,,FUNCTION: Guanylate cyclase involved in the pr...,,MLLLLLLLKISTFVDSFQIGHLEFENSNETRILEICMKNAGSWRDH...,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5534,X5M5N0_WNK_CAEEL_KD1,X5M5N0,WNK_CAEEL,Serine/threonine-protein kinase WNK (EC 2.7.11...,wnk-1 C46C2.1,wnk-1,Caenorhabditis elegans,Protein kinase,ECO:0000255|PROSITE-ProRule:PRU00159,334,596,LKFDEELGRGSFKTVFRGLDTETGVAVAWCELQESKLNKTERQRFR...,"DOMAIN 334..596; /note=""Protein kinase""; /evid...",DOMAIN: Disordered regions undergo liquid-liqu...,,,"Protein kinase superfamily, Ser/Thr protein ki...",,,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250|U...,cell volume homeostasis [GO:0006884]; cellular...,"TISSUE SPECIFICITY: Expressed in pharynx, nerv...",G5EEN4,SUBUNIT: Interacts with gck-3 (via C-terminus)...,FUNCTION: Serine/threonine-protein kinase comp...,ACTIVITY REGULATION: Activated in response to ...,MPDSITNGGRPPAPPSSVSSTTASTTGNFGTRRRLVNRIKKVDELH...,1,1,1,1,1
5535,X5M8U1_GCY17_CAEEL_KD1,X5M8U1,GCY17_CAEEL,Receptor-type guanylate cyclase gcy-17 (EC 4.6...,gcy-17 W03F11.2,gcy-17,Caenorhabditis elegans,Protein kinase,ECO:0000255|PROSITE-ProRule:PRU00159,535,824,EASQRSFASGPSTSTKLTVESRTETTRFIFYIYQVRNNEVVAANKH...,"DOMAIN 535..824; /note=""Protein kinase""; /evid...",DOMAIN: The protein kinase domain is predicted...,,,Adenylyl cyclase class-4/guanylyl cyclase family,R-CEL-2514859;,,SUBCELLULAR LOCATION: Cell membrane {ECO:00003...,cGMP biosynthetic process [GO:0006182]; intrac...,TISSUE SPECIFICITY: Expressed in PHA sensory n...,,,FUNCTION: Guanylate cyclase involved in the pr...,,MLFLRLFIFTPFLILANCQARRTIKVGLLFVQNVSSLQVGIGYRTS...,0,1,1,0,0


### PSPA data

In [None]:
#| export
@patch_to(Data)
def get_pspa_tyr_norm():
    "Load the PSPA normalized data for tyrosine kinases."
    return Data.read_file("PSPA/pspa_tyr_norm.parquet")

In [None]:
Data.get_pspa_tyr_norm()

Unnamed: 0_level_0,-5P,-5G,-5A,-5C,-5S,-5T,-5V,-5I,-5L,-5M,-5F,-5Y,-5W,-5H,-5K,-5R,-5Q,-5N,-5D,-5E,-5s,-5t,-5y,-4P,-4G,-4A,-4C,-4S,-4T,-4V,-4I,-4L,-4M,-4F,-4Y,-4W,-4H,-4K,-4R,-4Q,-4N,-4D,-4E,-4s,-4t,-4y,-3P,-3G,-3A,-3C,...,4A,4C,4S,4T,4V,4I,4L,4M,4F,4Y,4W,4H,4K,4R,4Q,4N,4D,4E,4s,4t,4y,5P,5G,5A,5C,5S,5T,5V,5I,5L,5M,5F,5Y,5W,5H,5K,5R,5Q,5N,5D,5E,5s,5t,5y,0S,0T,0Y,0s,0t,0y
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
ABL1,0.0668,0.0689,0.0646,0.0520,0.0564,0.0539,0.0485,0.0448,0.0520,0.0536,0.0454,0.0454,0.0283,0.0597,0.0600,0.0662,0.0594,0.0606,0.0575,0.0535,0.0507,0.0507,0.0658,0.0680,0.0758,0.0687,0.0518,0.0478,0.0442,0.0561,0.0443,0.0547,0.0491,0.0390,0.0390,0.0345,0.0496,0.0485,0.0530,0.0655,0.0502,0.0667,0.0843,0.0618,0.0618,0.0925,0.0634,0.0642,0.0568,0.0527,...,0.0526,0.0584,0.0563,0.0522,0.0514,0.0429,0.0510,0.0664,0.0645,0.0645,0.0600,0.0647,0.0647,0.0757,0.0607,0.0499,0.0322,0.0342,0.0217,0.0217,0.0306,0.0769,0.0707,0.0624,0.0493,0.0591,0.0661,0.0553,0.0378,0.0548,0.0603,0.0392,0.0392,0.0413,0.0613,0.0652,0.0756,0.0526,0.0512,0.0362,0.0339,0.0254,0.0254,0.0337,0,0,1,0,0,1
TNK2,0.0679,0.0818,0.0627,0.0617,0.0529,0.0528,0.0419,0.0463,0.0437,0.0453,0.0539,0.0539,0.0598,0.0583,0.0624,0.0727,0.0537,0.0553,0.0451,0.0435,0.0430,0.0430,0.0555,0.0723,0.0682,0.0665,0.0567,0.0458,0.0423,0.0436,0.0426,0.0433,0.0495,0.0584,0.0584,0.0697,0.0556,0.0664,0.0757,0.0647,0.0532,0.0409,0.0413,0.0398,0.0398,0.0493,0.0755,0.0653,0.0572,0.0544,...,0.0580,0.0648,0.0670,0.0571,0.0470,0.0493,0.0412,0.0568,0.0516,0.0516,0.0499,0.0559,0.0430,0.0553,0.0485,0.0502,0.0416,0.0464,0.0452,0.0452,0.0533,0.0644,0.0599,0.0609,0.0629,0.0564,0.0634,0.0527,0.0502,0.0641,0.0539,0.0679,0.0679,0.0680,0.0499,0.0385,0.0302,0.0531,0.0465,0.0630,0.0572,0.0364,0.0364,0.0572,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YES1,0.0677,0.0571,0.0537,0.0530,0.0527,0.0505,0.0435,0.0375,0.0400,0.0463,0.0478,0.0478,0.0419,0.0564,0.0681,0.0647,0.0610,0.0752,0.0748,0.0610,0.0652,0.0652,0.0668,0.0625,0.0695,0.0524,0.0530,0.0470,0.0404,0.0476,0.0416,0.0528,0.0529,0.0406,0.0406,0.0424,0.0566,0.0549,0.0561,0.0607,0.0559,0.0786,0.0875,0.0596,0.0596,0.0705,0.0564,0.0580,0.0456,0.0572,...,0.0464,0.0747,0.0521,0.0561,0.0468,0.0452,0.0481,0.0716,0.0689,0.0689,0.0619,0.0623,0.0587,0.0757,0.0652,0.0499,0.0421,0.0492,0.0371,0.0371,0.0467,0.0762,0.0532,0.0533,0.0610,0.0596,0.0558,0.0416,0.0375,0.0467,0.0518,0.0627,0.0627,0.0456,0.0593,0.0662,0.0840,0.0559,0.0604,0.0422,0.0482,0.0374,0.0374,0.0411,0,0,1,0,0,1
ZAP70,0.0602,0.0880,0.0623,0.0496,0.0471,0.0514,0.0465,0.0380,0.0307,0.0526,0.0479,0.0479,0.0347,0.0641,0.0471,0.0452,0.0492,0.0703,0.0870,0.0777,0.1622,0.1622,0.1208,0.0977,0.0792,0.0865,0.0556,0.0487,0.0367,0.0375,0.0312,0.0317,0.0344,0.0274,0.0274,0.0347,0.0474,0.0338,0.0380,0.0571,0.0457,0.0929,0.1392,0.1432,0.1432,0.1685,0.0588,0.0610,0.0537,0.0581,...,0.0491,0.0520,0.0583,0.0530,0.0504,0.0304,0.0424,0.0554,0.0393,0.0393,0.0539,0.0671,0.0562,0.0557,0.0712,0.0406,0.0597,0.0558,0.0440,0.0440,0.0318,0.1269,0.0904,0.0737,0.0457,0.0638,0.0692,0.0343,0.0235,0.0305,0.0461,0.0343,0.0343,0.0344,0.0484,0.0477,0.0290,0.0520,0.0537,0.0709,0.0710,0.0862,0.0862,0.0605,0,0,1,0,0,1


In [None]:
#| export
@patch_to(Data)
def get_pspa_st_norm():
    "Load the PSPA normalized data for serine/threonine kinases."
    return Data.read_file("PSPA/pspa_st_norm.parquet")

In [None]:
Data.get_pspa_st_norm()

Unnamed: 0_level_0,-5P,-5G,-5A,-5C,-5S,-5T,-5V,-5I,-5L,-5M,-5F,-5Y,-5W,-5H,-5K,-5R,-5Q,-5N,-5D,-5E,-5s,-5t,-5y,-4P,-4G,-4A,-4C,-4S,-4T,-4V,-4I,-4L,-4M,-4F,-4Y,-4W,-4H,-4K,-4R,-4Q,-4N,-4D,-4E,-4s,-4t,-4y,-3P,-3G,-3A,-3C,...,3A,3C,3S,3T,3V,3I,3L,3M,3F,3Y,3W,3H,3K,3R,3Q,3N,3D,3E,3s,3t,3y,4P,4G,4A,4C,4S,4T,4V,4I,4L,4M,4F,4Y,4W,4H,4K,4R,4Q,4N,4D,4E,4s,4t,4y,0s,0t,0y,0S,0T,0Y
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
AAK1,0.0720,0.0245,0.0284,0.0456,0.0425,0.0425,0.0951,0.1554,0.0993,0.0864,0.0425,0.0952,0.0315,0.0331,0.0262,0.0956,0.0560,0.0275,0.0160,0.0153,0.0201,0.0201,0.0611,0.0534,0.0642,0.0706,0.0560,0.0619,0.0619,0.0619,0.0621,0.0742,0.0693,0.0520,0.0534,0.0403,0.0514,0.0809,0.0715,0.0627,0.0429,0.0332,0.0560,0.0332,0.0332,0.0339,0.1084,0.0512,0.1119,0.0655,...,0.0582,0.0742,0.0582,0.0582,0.0610,0.0388,0.0489,0.0437,0.0430,0.0533,0.0481,0.0674,0.0739,0.0901,0.0623,0.0735,0.0405,0.0371,0.0335,0.0335,0.0359,0.0628,0.0702,0.0646,0.0603,0.0560,0.0560,0.0422,0.0415,0.0461,0.0464,0.0523,0.0521,0.0826,0.0560,0.0831,0.0928,0.0635,0.0592,0.0389,0.0457,0.0251,0.0251,0.0270,0.1013,1.0,0.0,0.1013,1.0,0.0
ACVR2A,0.0415,0.0481,0.0584,0.0489,0.0578,0.0578,0.0598,0.0625,0.0596,0.0521,0.0600,0.0578,0.0803,0.0570,0.0510,0.0475,0.0430,0.0536,0.0888,0.0789,0.0783,0.0783,0.0760,0.0466,0.0549,0.0555,0.0551,0.0549,0.0549,0.0543,0.0526,0.0520,0.0576,0.0619,0.0586,0.0754,0.0551,0.0406,0.0457,0.0482,0.0501,0.1040,0.0869,0.0809,0.0809,0.0681,0.0502,0.0653,0.0537,0.0588,...,0.0529,0.0481,0.0590,0.0590,0.0567,0.0554,0.0612,0.0589,0.0644,0.0654,0.0635,0.0590,0.0462,0.0387,0.0499,0.0524,0.0622,0.0870,0.0519,0.0519,0.0815,0.0758,0.0544,0.0498,0.0517,0.0563,0.0563,0.0516,0.0563,0.0512,0.0662,0.0523,0.0579,0.0800,0.0573,0.0527,0.0491,0.0616,0.0556,0.0640,0.0640,0.0703,0.0703,0.0589,0.9833,1.0,0.0,0.9833,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YSK4,0.0593,0.0728,0.0744,0.0734,0.0597,0.0597,0.0517,0.0400,0.0433,0.0512,0.0636,0.0600,0.0755,0.0664,0.0525,0.0597,0.0481,0.0616,0.0692,0.0508,0.0703,0.0703,0.0474,0.0622,0.0683,0.0618,0.0652,0.0618,0.0618,0.0477,0.0412,0.0519,0.0570,0.0610,0.0620,0.0714,0.0626,0.0491,0.0523,0.0551,0.0649,0.0671,0.0645,0.0663,0.0663,0.0534,0.0561,0.0683,0.0571,0.0636,...,0.0531,0.0545,0.0578,0.0578,0.0540,0.0436,0.0481,0.0473,0.0573,0.0578,0.0625,0.0645,0.0774,0.0697,0.0580,0.0720,0.0471,0.0450,0.0642,0.0642,0.0397,0.0790,0.0721,0.0573,0.0557,0.0573,0.0573,0.0445,0.0471,0.0481,0.0507,0.0461,0.0493,0.0539,0.0657,0.0814,0.0618,0.0741,0.0620,0.0585,0.0484,0.0634,0.0634,0.0389,0.7907,1.0,0.0,0.7907,1.0,0.0
ZAK,0.0604,0.0641,0.0659,0.0631,0.0597,0.0597,0.0454,0.0431,0.0477,0.0484,0.0544,0.0597,0.0673,0.0650,0.0815,0.0669,0.0538,0.0653,0.0591,0.0520,0.0716,0.0716,0.0611,0.0627,0.0682,0.0537,0.0621,0.0627,0.0627,0.0447,0.0423,0.0433,0.0500,0.0526,0.0564,0.0630,0.0663,0.0634,0.0653,0.0548,0.0701,0.0759,0.0673,0.0703,0.0703,0.0469,0.0723,0.0694,0.0593,0.0680,...,0.0489,0.0588,0.0582,0.0582,0.0579,0.0636,0.0630,0.0572,0.0652,0.0710,0.0738,0.0664,0.0851,0.0833,0.0527,0.0484,0.0281,0.0319,0.0459,0.0459,0.0423,0.0684,0.0623,0.0561,0.0519,0.0556,0.0556,0.0406,0.0429,0.0395,0.0481,0.0424,0.0526,0.0698,0.0672,0.1207,0.1012,0.0614,0.0556,0.0342,0.0370,0.0390,0.0390,0.0408,0.6135,1.0,0.0,0.6135,1.0,0.0


In [None]:
#| export
@patch_to(Data)
def get_pspa_all_norm() -> pd.DataFrame:
    "Load the PSPA normalized data for both serine/threonine and tyrosine kinases."
    return Data.read_file("PSPA/pspa_all_norm.parquet")

In [None]:
Data.get_pspa_all_norm()

Unnamed: 0_level_0,-5P,-5G,-5A,-5C,-5S,-5T,-5V,-5I,-5L,-5M,-5F,-5Y,-5W,-5H,-5K,-5R,-5Q,-5N,-5D,-5E,-5s,-5t,-5y,-4P,-4G,-4A,-4C,-4S,-4T,-4V,-4I,-4L,-4M,-4F,-4Y,-4W,-4H,-4K,-4R,-4Q,-4N,-4D,-4E,-4s,-4t,-4y,-3P,-3G,-3A,-3C,...,4A,4C,4S,4T,4V,4I,4L,4M,4F,4Y,4W,4H,4K,4R,4Q,4N,4D,4E,4s,4t,4y,0s,0t,0y,0S,0T,0Y,5P,5G,5A,5C,5S,5T,5V,5I,5L,5M,5F,5Y,5W,5H,5K,5R,5Q,5N,5D,5E,5s,5t,5y
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
AAK1,0.0720,0.0245,0.0284,0.0456,0.0425,0.0425,0.0951,0.1554,0.0993,0.0864,0.0425,0.0952,0.0315,0.0331,0.0262,0.0956,0.0560,0.0275,0.0160,0.0153,0.0201,0.0201,0.0611,0.0534,0.0642,0.0706,0.0560,0.0619,0.0619,0.0619,0.0621,0.0742,0.0693,0.0520,0.0534,0.0403,0.0514,0.0809,0.0715,0.0627,0.0429,0.0332,0.0560,0.0332,0.0332,0.0339,0.1084,0.0512,0.1119,0.0655,...,0.0646,0.0603,0.0560,0.0560,0.0422,0.0415,0.0461,0.0464,0.0523,0.0521,0.0826,0.0560,0.0831,0.0928,0.0635,0.0592,0.0389,0.0457,0.0251,0.0251,0.0270,0.1013,1.0,0.0,0.1013,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,
ACVR2A,0.0415,0.0481,0.0584,0.0489,0.0578,0.0578,0.0598,0.0625,0.0596,0.0521,0.0600,0.0578,0.0803,0.0570,0.0510,0.0475,0.0430,0.0536,0.0888,0.0789,0.0783,0.0783,0.0760,0.0466,0.0549,0.0555,0.0551,0.0549,0.0549,0.0543,0.0526,0.0520,0.0576,0.0619,0.0586,0.0754,0.0551,0.0406,0.0457,0.0482,0.0501,0.1040,0.0869,0.0809,0.0809,0.0681,0.0502,0.0653,0.0537,0.0588,...,0.0498,0.0517,0.0563,0.0563,0.0516,0.0563,0.0512,0.0662,0.0523,0.0579,0.0800,0.0573,0.0527,0.0491,0.0616,0.0556,0.0640,0.0640,0.0703,0.0703,0.0589,0.9833,1.0,0.0,0.9833,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YES1,0.0677,0.0571,0.0537,0.0530,0.0527,0.0505,0.0435,0.0375,0.0400,0.0463,0.0478,0.0478,0.0419,0.0564,0.0681,0.0647,0.0610,0.0752,0.0748,0.0610,0.0652,0.0652,0.0668,0.0625,0.0695,0.0524,0.0530,0.0470,0.0404,0.0476,0.0416,0.0528,0.0529,0.0406,0.0406,0.0424,0.0566,0.0549,0.0561,0.0607,0.0559,0.0786,0.0875,0.0596,0.0596,0.0705,0.0564,0.0580,0.0456,0.0572,...,0.0464,0.0747,0.0521,0.0561,0.0468,0.0452,0.0481,0.0716,0.0689,0.0689,0.0619,0.0623,0.0587,0.0757,0.0652,0.0499,0.0421,0.0492,0.0371,0.0371,0.0467,0.0000,0.0,1.0,0.0000,0.0,1.0,0.0762,0.0532,0.0533,0.0610,0.0596,0.0558,0.0416,0.0375,0.0467,0.0518,0.0627,0.0627,0.0456,0.0593,0.0662,0.084,0.0559,0.0604,0.0422,0.0482,0.0374,0.0374,0.0411
ZAP70,0.0602,0.0880,0.0623,0.0496,0.0471,0.0514,0.0465,0.0380,0.0307,0.0526,0.0479,0.0479,0.0347,0.0641,0.0471,0.0452,0.0492,0.0703,0.0870,0.0777,0.1622,0.1622,0.1208,0.0977,0.0792,0.0865,0.0556,0.0487,0.0367,0.0375,0.0312,0.0317,0.0344,0.0274,0.0274,0.0347,0.0474,0.0338,0.0380,0.0571,0.0457,0.0929,0.1392,0.1432,0.1432,0.1685,0.0588,0.0610,0.0537,0.0581,...,0.0491,0.0520,0.0583,0.0530,0.0504,0.0304,0.0424,0.0554,0.0393,0.0393,0.0539,0.0671,0.0562,0.0557,0.0712,0.0406,0.0597,0.0558,0.0440,0.0440,0.0318,0.0000,0.0,1.0,0.0000,0.0,1.0,0.1269,0.0904,0.0737,0.0457,0.0638,0.0692,0.0343,0.0235,0.0305,0.0461,0.0343,0.0343,0.0344,0.0484,0.0477,0.029,0.0520,0.0537,0.0709,0.0710,0.0862,0.0862,0.0605


In [None]:
#| export
@patch_to(Data)
def get_pspa_all_scale():
    """
    Load the PSPA scaled data (positions -5 to +4), derived from the PSPA normalized data.

    Each position (including both pS/pT and pS=pT) is normalized to 1.
    """
    return Data.read_file("PSPA/pspa_all_scale.parquet")

In [None]:
Data.get_pspa_all_scale()

Unnamed: 0_level_0,-5P,-5G,-5A,-5C,-5S,-5T,-5V,-5I,-5L,-5M,-5F,-5Y,-5W,-5H,-5K,-5R,-5Q,-5N,-5D,-5E,-5pS,-5pT,-5pY,-4P,-4G,-4A,-4C,-4S,-4T,-4V,-4I,-4L,-4M,-4F,-4Y,-4W,-4H,-4K,-4R,-4Q,-4N,-4D,-4E,-4pS,-4pT,-4pY,-3P,-3G,-3A,-3C,...,2E,2pS,2pT,2pY,3P,3G,3A,3C,3S,3T,3V,3I,3L,3M,3F,3Y,3W,3H,3K,3R,3Q,3N,3D,3E,3pS,3pT,3pY,4P,4G,4A,4C,4S,4T,4V,4I,4L,4M,4F,4Y,4W,4H,4K,4R,4Q,4N,4D,4E,4pS,4pT,4pY
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
AAK1,0.05845,0.01989,0.02305,0.03702,0.03450,0.03450,0.07720,0.12615,0.08061,0.07014,0.03450,0.07728,0.02557,0.02687,0.02127,0.07760,0.04546,0.02232,0.01299,0.01242,0.01632,0.01632,0.04960,0.04172,0.05015,0.05515,0.04375,0.04836,0.04836,0.04836,0.04851,0.05796,0.05414,0.04062,0.04172,0.03148,0.04015,0.06320,0.05586,0.04898,0.03351,0.02594,0.04375,0.02594,0.02594,0.02648,0.08610,0.04067,0.08888,0.05203,...,0.04025,0.03142,0.03142,0.03149,0.05264,0.07135,0.04499,0.05735,0.04499,0.04499,0.04715,0.02999,0.03780,0.03378,0.03324,0.04120,0.03718,0.05210,0.05712,0.06965,0.04816,0.05681,0.03131,0.02868,0.02589,0.02589,0.02775,0.05026,0.05618,0.05170,0.04826,0.04482,0.04482,0.03377,0.03321,0.03689,0.03713,0.04186,0.04170,0.06611,0.04482,0.06651,0.07427,0.05082,0.04738,0.03113,0.03657,0.02009,0.02009,0.02161
ACVR2A,0.02971,0.03443,0.04180,0.03500,0.04137,0.04137,0.04281,0.04474,0.04266,0.03729,0.04295,0.04137,0.05748,0.04080,0.03651,0.03400,0.03078,0.03837,0.06356,0.05648,0.05605,0.05605,0.05440,0.03341,0.03936,0.03979,0.03950,0.03936,0.03936,0.03893,0.03771,0.03728,0.04130,0.04438,0.04201,0.05406,0.03950,0.02911,0.03276,0.03456,0.03592,0.07456,0.06230,0.05800,0.05800,0.04882,0.03345,0.04351,0.03578,0.03918,...,0.04447,0.04786,0.04786,0.03799,0.04958,0.04381,0.03914,0.03559,0.04366,0.04366,0.04196,0.04099,0.04529,0.04358,0.04765,0.04839,0.04699,0.04366,0.03419,0.02864,0.03692,0.03877,0.04603,0.06438,0.03840,0.03840,0.06031,0.05559,0.03989,0.03652,0.03791,0.04129,0.04129,0.03784,0.04129,0.03755,0.04855,0.03835,0.04246,0.05867,0.04202,0.03865,0.03601,0.04517,0.04077,0.04693,0.04693,0.05155,0.05155,0.04319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YES1,0.05216,0.04399,0.04137,0.04084,0.04060,0.03891,0.03352,0.02889,0.03082,0.03567,0.03683,0.03683,0.03228,0.04345,0.05247,0.04985,0.04700,0.05794,0.05763,0.04700,0.05023,0.05023,0.05147,0.04870,0.05416,0.04083,0.04130,0.03662,0.03148,0.03709,0.03242,0.04114,0.04122,0.03164,0.03164,0.03304,0.04411,0.04278,0.04372,0.04730,0.04356,0.06125,0.06818,0.04644,0.04644,0.05494,0.04363,0.04486,0.03527,0.04425,...,0.05324,0.05535,0.05535,0.05174,0.02228,0.02870,0.02669,0.03458,0.02886,0.03899,0.05880,0.06545,0.06259,0.04333,0.12155,0.12155,0.07172,0.04309,0.02375,0.03923,0.02576,0.02824,0.02120,0.02352,0.02135,0.02135,0.02739,0.03449,0.04429,0.03670,0.05908,0.04121,0.04437,0.03702,0.03575,0.03804,0.05663,0.05450,0.05450,0.04896,0.04928,0.04643,0.05988,0.05157,0.03947,0.03330,0.03891,0.02934,0.02934,0.03694
ZAP70,0.03902,0.05704,0.04038,0.03215,0.03053,0.03332,0.03014,0.02463,0.01990,0.03410,0.03105,0.03105,0.02249,0.04155,0.03053,0.02930,0.03189,0.04557,0.05639,0.05037,0.10514,0.10514,0.07830,0.06354,0.05151,0.05625,0.03616,0.03167,0.02387,0.02439,0.02029,0.02062,0.02237,0.01782,0.01782,0.02257,0.03083,0.02198,0.02471,0.03713,0.02972,0.06041,0.09052,0.09313,0.09313,0.10958,0.03211,0.03331,0.02933,0.03173,...,0.04035,0.08151,0.08151,0.09746,0.12355,0.03982,0.04552,0.04723,0.05148,0.03701,0.07326,0.05267,0.04518,0.09377,0.04374,0.04374,0.07471,0.04476,0.01838,0.02042,0.02961,0.02357,0.02051,0.01293,0.01719,0.01719,0.02374,0.08125,0.05210,0.04054,0.04294,0.04814,0.04376,0.04162,0.02510,0.03501,0.04574,0.03245,0.03245,0.04450,0.05540,0.04640,0.04599,0.05879,0.03352,0.04929,0.04607,0.03633,0.03633,0.02626


In [None]:
#| export
@patch_to(Data)
@lru_cache
def get_pspa_st_pct():
    """
    Load the PSPA reference scores used to compute percentiles for serine/threonine kinases.

    Reads `PSPA/pspa_pct_st.parquet` through `Data.read_file`.
    The result is memoized with `lru_cache`, so every call returns the same
    cached object; copy it before modifying it in place.
    """
    return Data.read_file("PSPA/pspa_pct_st.parquet")

In [None]:
Data.get_pspa_st_pct()

kinase,AAK1,ACVR2A,ACVR2B,AKT1,AKT2,AKT3,ALK2,ALK4,ALPHAK3,AMPKA1,AMPKA2,ANKRD3,ASK1,ATM,ATR,AURA,AURB,AURC,BCKDK,BIKE,BMPR1A,BMPR1B,BMPR2,BRAF,BRSK1,BRSK2,BUB1,CAMK1A,CAMK1B,CAMK1D,CAMK1G,CAMK2A,CAMK2B,CAMK2D,CAMK2G,CAMK4,CAMKK1,CAMKK2,CAMLCK,CDC7,CDK1,CDK10,CDK12,CDK13,CDK14,CDK16,CDK17,CDK18,CDK19,CDK2,...,RIPK1,RIPK2,RIPK3,ROCK1,ROCK2,RSK2,RSK3,RSK4,SBK,SGK1,SGK3,SIK,SKMLCK,SLK,SMG1,SMMLCK,SNRK,SRPK1,SRPK2,SRPK3,SSTK,STK33,STLK3,TAK1,TAO1,TAO2,TAO3,TBK1,TGFBR1,TGFBR2,TLK1,TLK2,TNIK,TSSK1,TSSK2,TTBK1,TTBK2,TTK,ULK1,ULK2,VRK1,VRK2,WNK1,WNK3,WNK4,YANK2,YANK3,YSK1,YSK4,ZAK
0,-10.960,-0.581,0.329,-3.891,-3.591,-5.312,0.814,-0.559,-0.933,-2.607,-3.167,-0.764,-6.366,2.533,0.010,-1.164,-4.296,-4.302,1.695,-7.684,1.713,1.965,0.183,0.018,-0.379,-3.219,-9.801,-3.750,0.149,-0.610,-2.746,4.377,4.940,0.902,2.957,-2.297,-3.196,-3.469,-1.716,-0.232,-1.378,-9.497,-5.727,-4.521,-8.698,-7.186,-5.666,-5.802,-4.029,-4.039,...,-3.605,-5.763,-5.047,-6.065,-4.406,-1.298,-3.001,-0.837,-2.256,-3.190,-3.763,-0.262,-0.226,-3.190,-1.971,-3.519,-4.495,-3.007,-2.348,-3.560,-4.580,-6.610,-6.038,-1.211,-4.498,-2.911,0.463,-1.287,0.680,-3.707,1.479,3.406,-3.950,-2.172,-2.109,-7.773,-4.765,-5.262,-3.549,-2.822,-4.682,-2.854,-1.669,-1.527,-2.965,-2.877,-1.792,-6.283,-1.715,-3.204
1,-6.788,-0.166,0.307,-5.886,-4.786,-6.576,1.561,-0.865,-3.399,-3.261,-3.464,-4.366,-7.176,0.019,-0.697,-1.581,-4.628,-4.533,-1.092,-5.161,1.202,1.525,-0.386,-3.377,-1.858,-3.819,-5.141,-5.537,-1.294,-3.311,-5.862,-0.315,0.517,-0.535,0.877,-3.692,-1.182,-1.239,-1.285,2.578,-4.304,-7.431,-4.146,-3.498,-6.062,-5.981,-5.925,-5.136,-4.122,-3.122,...,-2.358,-5.160,-3.075,-7.793,-6.160,-1.518,-2.279,-2.877,-2.437,-5.513,-6.566,-4.297,-0.846,-7.367,1.751,-4.481,-5.585,-4.549,-5.719,-5.101,-2.420,-6.698,-8.190,-4.619,-8.130,-4.908,-5.247,-1.650,1.537,-4.228,-5.881,-4.427,-7.145,-2.825,-0.935,-4.638,-3.624,-9.189,-4.698,-3.656,-5.670,-2.817,-4.071,-3.394,-5.097,-1.874,-1.480,-8.709,-3.708,-6.093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89782,-3.753,1.451,1.883,-5.583,-5.253,-7.164,1.226,-0.399,3.341,-5.932,-7.009,-1.346,-1.257,-1.930,-1.086,-5.300,-6.268,-6.415,-6.027,-2.674,2.403,3.097,0.315,0.331,-8.989,-9.727,-0.962,-6.574,-2.149,-5.753,-6.502,-3.379,-3.798,-4.577,-2.381,-5.040,-0.197,-0.772,-1.802,-0.784,-1.720,-6.366,-3.784,-2.870,-4.482,-2.392,-2.803,-2.889,-3.985,-2.087,...,-5.479,-5.107,-5.028,-4.316,-3.410,-4.957,-5.244,-5.152,-5.324,-5.636,-5.262,-5.684,-3.825,-1.824,-3.824,-4.159,-6.882,-4.014,-4.454,-3.755,-8.207,-5.581,-2.655,1.240,-2.303,-0.338,0.146,-3.978,0.138,-3.302,-3.075,-1.497,-1.209,-6.844,-5.322,-4.496,-2.852,-1.356,-3.828,-3.049,-1.930,-1.420,-5.949,-4.854,-5.401,-1.853,-2.068,-2.824,-0.340,-1.326
89783,-1.540,-2.180,-2.014,-2.416,-0.592,-1.364,-3.320,-0.826,-4.438,-1.393,-2.058,-2.101,-2.838,-3.597,-0.664,-4.098,-1.564,-2.006,-5.130,-2.428,-3.192,-2.013,-0.868,-2.795,-5.958,-6.211,-0.309,-1.495,0.020,-1.378,-1.652,-2.380,-3.275,-3.026,-2.647,-1.958,-5.024,-3.868,0.578,-2.915,-3.348,-4.227,-5.020,-5.209,-1.570,-2.480,-2.677,-2.574,-5.737,-5.213,...,-1.942,-6.158,-0.930,-1.559,-0.154,0.361,-0.754,-0.276,-3.035,-1.083,-0.965,-3.652,1.355,-0.824,-3.360,-0.030,-5.503,-2.102,-4.456,-3.284,-2.001,-3.563,-4.212,-4.696,-4.132,-2.436,-2.292,-3.454,-2.188,-2.436,-2.339,-1.988,-1.042,-0.358,-0.074,-2.837,-1.059,0.434,-5.814,-4.250,-1.979,-0.661,-2.586,-4.076,-2.832,-0.575,-0.859,-2.415,-2.999,-2.550


In [None]:
#| export
@patch_to(Data)
@lru_cache
def get_pspa_tyr_pct():
    """
    Load the PSPA reference scores used to compute percentiles for tyrosine kinases.

    Reads `PSPA/pspa_pct_tyr.parquet` through `Data.read_file`.
    The result is memoized with `lru_cache`, so every call returns the same
    cached object; copy it before modifying it in place.
    """
    return Data.read_file("PSPA/pspa_pct_tyr.parquet")

In [None]:
Data.get_pspa_tyr_pct()

kinase,ABL1,TNK2,ALK,ABL2,AXL,BLK,BMPR2_TYR,PTK6,BTK,CSF1R,CSK,MATK,DDR1,DDR2,EGFR,EPHA1,EPHA2,EPHA3,EPHA4,EPHA5,EPHA6,EPHA7,EPHA8,EPHB1,EPHB2,EPHB3,EPHB4,BMX,PTK2,FER,FES,FGFR1,FGFR2,FGFR3,FGFR4,FGR,FLT3,FRK,FYN,HCK,ERBB2,ERBB4,IGF1R,INSR,INSRR,ITK,JAK1,JAK2,JAK3,KIT,LCK,LIMK1_TYR,LIMK2_TYR,LTK,LYN,MERTK,MET,MAP2K4_TYR,MAP2K6_TYR,MAP2K7_TYR,MST1R,MUSK,PKMYT1_TYR,NEK10_TYR,PDGFRA,PDGFRB,PDHK1_TYR,PDHK3_TYR,PDHK4_TYR,PINK1_TYR,PTK2B,RET,ROS1,SRC,SRMS,SYK,TEC,TESK1_TYR,TEK,TNK1,TNNI3K_TYR,NTRK1,NTRK2,NTRK3,TXK,TYK2,TYRO3,FLT1,KDR,FLT4,WEE1_TYR,YES1,ZAP70
0,-0.709617,-3.624831,-2.136338,-0.022776,-0.737589,2.345905,0.504821,2.417165,-0.121611,-1.205218,1.576014,1.917812,-2.449589,-3.806540,1.005984,-2.925415,-0.609664,-1.256237,0.491093,0.494654,-1.348782,-1.938472,0.221994,-0.805070,1.130282,0.111910,-0.253774,0.221683,2.233854,2.295975,-0.025152,-1.738376,-0.986921,-0.375908,1.631896,2.154664,-1.214864,-1.185009,6.124421,1.940066,-1.020647,-0.292182,0.761946,-0.992736,-0.599703,0.298515,-4.146873,-2.339013,-2.479359,-1.082389,1.758755,-3.027934,-3.701494,-0.480120,1.346726,1.124192,-1.205155,-0.094138,1.031112,-0.287930,-2.589829,-3.379800,-1.341062,-3.607123,-4.385381,-1.978546,1.390759,0.259191,0.378106,-0.094980,0.908537,-0.618981,-2.562691,4.377146,1.855049,2.017083,-0.572871,-2.295149,-2.833113,-3.811383,-6.262204,0.683287,-0.626250,-0.368491,1.187208,-1.601712,-1.143748,-0.891566,-1.888643,-1.758264,-1.610344,4.545175,0.280174
1,0.986158,-1.645273,-1.183920,0.553010,-1.098784,-1.245678,-0.276461,-0.156496,-1.322652,-0.684989,0.447463,0.054841,-0.295641,-2.374194,0.261968,-0.444003,0.620075,-0.918899,-0.266802,-1.466189,0.181707,-0.884474,-0.829816,-1.039152,-1.332577,-1.553626,-1.557679,-1.041167,-1.465569,-0.573358,-1.402839,-1.119166,-0.006615,-0.690700,0.057757,-1.329587,-0.752020,-1.421231,-1.119607,-0.361730,-0.067540,-1.488971,-1.959423,-1.198227,-1.250944,-1.559043,-1.742221,-0.297813,-0.737524,0.101179,-0.785122,-0.803239,-0.227134,-0.226996,-1.061925,-0.749321,-0.324825,0.101380,-0.753605,-0.186826,-0.078232,-1.533730,-0.949200,1.335091,-0.953302,-1.471499,-0.039170,0.376709,-0.653393,-0.690604,-0.187336,0.372630,-1.753648,-0.822064,0.291210,-1.380433,-0.652058,-0.974885,-2.822036,-1.223497,-2.903235,-0.909811,-1.158577,-0.777541,-0.385554,-0.624216,-0.737089,-0.315447,-1.293708,-1.182827,-1.891533,-0.456570,-2.465316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7313,0.737694,-0.477689,-0.646850,0.928066,0.187149,-1.000041,-0.283551,-3.053869,-0.750475,0.132043,0.069439,0.778468,-0.572751,-0.877774,0.279656,-1.905299,-2.542936,-0.629488,-1.433432,-1.319311,-0.959149,-1.904233,-1.836303,-0.665941,-0.500875,-1.140260,-0.324883,-0.138589,-1.634647,-1.463951,-2.856428,1.338733,1.308337,1.210296,0.236992,-1.117447,0.508686,-1.152320,-0.680328,-1.311162,-1.278938,-0.529947,-0.707274,-0.296606,0.170371,-0.406315,0.053109,-0.184986,1.285550,-1.175831,-0.657466,-0.059045,-0.307151,-0.027580,-1.925899,0.415170,0.535199,-0.446664,0.591918,0.316993,0.457042,0.960955,-1.222173,0.783990,-0.043431,1.226039,0.035492,0.102904,0.600526,0.732808,-1.478030,1.261671,0.460373,-1.074805,-1.293450,-1.402257,-0.780591,-0.107376,-0.479018,1.326369,-1.912418,-0.017800,0.585871,-0.122134,-1.275022,-0.020350,0.483620,-0.060204,1.378042,0.573273,-2.383657,-0.246005,1.174693
7314,2.115113,0.153795,0.356357,1.846239,-0.856035,-0.422296,-0.985140,0.554181,0.381133,-1.666383,-0.728353,-0.254733,-0.999583,0.117162,-1.622541,-1.882815,-3.830351,-1.601071,-1.750483,-2.748548,-0.900263,-1.661417,-2.349142,-0.825704,-1.211989,-1.668258,-0.242232,0.727625,-2.099673,0.240011,-0.659603,-1.569382,-1.979445,-2.605414,-1.726131,0.175403,-2.030979,-0.537023,-0.303194,-0.134410,-1.497615,-2.364148,-3.279693,-1.919277,-2.157962,0.598978,-0.792918,-0.965011,-1.728287,-1.616595,-0.084054,-0.968384,-0.373329,-0.234884,-1.088861,-1.129566,-0.358743,0.154190,-0.787745,-0.482053,-1.581810,-1.865008,-0.707663,-0.049274,-1.190498,-1.403224,-0.849104,-0.212996,-0.159123,-0.885066,0.447393,-0.121666,-0.543588,0.170109,0.044321,-1.576300,1.325049,-1.122959,-2.962076,-0.786136,-1.090242,-1.402034,-1.365117,-1.670434,1.684176,-0.508297,-0.304215,-2.045909,-1.629804,-2.227050,-2.294855,0.428825,-1.789086


In [None]:
#| export
@patch_to(Data)
@lru_cache
def get_num_dict() -> dict:
    """
    Return a dict mapping each kinase to its number of random amino acids in PSPA.

    Reads `PSPA/pspa_divide_num.csv` through `Data.read_file` and maps the
    `kinase` column to the `num_random_aa` column.
    The dict is memoized with `lru_cache`, so the same dict object is returned
    on every call; copy it before modifying it.
    """
    divide_num = Data.read_file("PSPA/pspa_divide_num.csv")
    per_kinase = divide_num.set_index("kinase")["num_random_aa"]
    return per_kinase.to_dict()

In [None]:
list(Data.get_num_dict().items())[:5]

[('SYK', 18), ('PTK2', 18), ('ZAP70', 18), ('ERBB2', 18), ('CSK', 18)]

### CDDM data

In [None]:
#| export
@patch_to(Data)
def get_ks_unique():
    """
    Load the kinase–substrate table keyed by unique substrate site ID.

    Reads `CDDM/unique_ks_sites.parquet` through `Data.read_file`.
    Not memoized: each call goes through `Data.read_file` again.
    """
    return Data.read_file("CDDM/unique_ks_sites.parquet")

In [None]:
Data.get_ks_unique()

Unnamed: 0,sub_site,num_kin,bin,sub_genes,site_seq,source_combine,acceptor,O00141_SGK1,O00238_BMPR1B,O00311_CDC7,O00329_PIK3CD,O00418_EEF2K,O00443_PIK3C2A,O00444_PLK4,O00506_STK25,O14578_CIT,O14730_RIOK3,O14733_MAP2K7,O14757_CHEK1,O14874_BCKDK,O14920_IKBKB,O14936_CASK,O14965_AURKA,O14976_GAK,O15021_MAST4,O15075_DCLK1,O15111_CHUK,O15146_MUSK,O15264_MAPK13,O15530_PDPK1,O43283_MAP3K13,O43293_DAPK3,O43318_MAP3K7,O43353_RIPK2,O43683_BUB1,O43781_DYRK3,O60285_NUAK1,O60331_PIP5K1C,O60566_BUB1B,O60674_JAK2,O60885_BRD4,O75116_ROCK2,O75385_ULK1,O75460_ERN1,O75582_RPS6KA5,O75676_RPS6KA4,O75716_STK16,O75914_PAK3,O76039_CDKL5,O94768_STK17B,...,Q9HBY8_SGK2,Q9HC98_NEK6,Q9HCP0_CSNK1G1,Q9NQU5_PAK6,Q9NR20_DYRK4,Q9NRA0_SPHK2,Q9NRM7_LATS2,Q9NSY1_BMP2K,Q9NWZ3_IRAK4,Q9NYA1_SPHK1,Q9NYL2_MAP3K20,Q9NYV4_CDK12,Q9NYY3_PLK2,Q9NZJ5_EIF2AK3,Q9P0L2_MARK1,Q9P1W9_PIM2,Q9P286_PAK5,Q9P289_STK26,Q9P2K8_EIF2AK4,Q9UBE8_NLK,Q9UBS0_RPS6KB2,Q9UEE5_STK17A,Q9UEW8_STK39,Q9UF33_EPHA6,Q9UHD2_TBK1,Q9UIG0_BAZ1B,Q9UIK4_DAPK2,Q9UK32_RPS6KA6,Q9UKE5_TNIK,Q9UKI8_TLK1,Q9UL54_TAOK2,Q9UM73_ALK,Q9UPE1_SRPK3,Q9UPZ9_CILK1,Q9UQ07_MOK,Q9UQ88_CDK11A,Q9UQB9_AURKC,Q9UQM7_CAMK2A,Q9Y243_AKT3,Q9Y2H1_STK38L,Q9Y2K2_SIK3,Q9Y2U5_MAP3K2,Q9Y3S1_WNK2,Q9Y463_DYRK1B,Q9Y4K4_MAP4K5,Q9Y572_RIPK3,Q9Y5S2_CDC42BPB,Q9Y6E0_STK24,Q9Y6M4_CSNK1G3,Q9Y6R4_MAP3K4
0,A0A2R8Y4L2_S158,1,1,HNRNPA1L3 HNRNPA1P48,TDRGSGKKRGFAFVTFDDHDsVDKIVIQKYHTVNGHNCEVR,Sugiyama,S,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,A0A2R8Y4L2_S22,3,2~10,HNRNPA1L3 HNRNPA1P48,SKSEsPKEPEQLRKLFIGGLsFEtTDESLRSHFEQWGTLTD,Sugiyama,S,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29809,Q9Y6Y9_Y131,1,1,LY96 ESOP1 MD2,ETVNTTISFSFKGIKFSKGKyKCVVEAISGSPEEMLFCLEF,Non-Sugiyama,Y,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29810,Q9Y6Y9_Y22,1,1,LY96 ESOP1 MD2,LPFLFFSTLFSSIFTEAQKQyWVCNSSDASISYTYCDKMQY,Non-Sugiyama,Y,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
#| export
@patch_to(Data)
def get_ks_dataset(add_kinase_info=True):
    """
    Load the kinase–substrate dataset collected from public resources.

    Reads `CDDM/ks_datasets_20250407.parquet`, converts integer-like string
    column labels to `int`, and, when a `substrate_phosphoseq` column exists,
    adds `substrate_sequence` as its upper-cased copy.

    With `add_kinase_info=True` (default) each row is additionally annotated,
    keyed on the kinase UniProt ID with any isoform suffix (text after the
    first "-") removed:

    - `kinase_on_tree`: 1 if the ID is present in `Data.get_kinase_info()`, else 0
    - `kinase_genes`: gene names from `Data.get_kinase_uniprot()`
    - `kinase_group`, `kinase_family`, `kinase_subfamily`, `kinase_pspa_big`,
      `kinase_pspa_small`, `kinase_coral_ID`, `kinase_protein`: looked up from
      `Data.get_kinase_info()`
    - `num_kin`: looked up by `sub_site` from `Data.get_ks_unique()`

    IDs without a match yield missing values in the looked-up columns.
    With `add_kinase_info=False` the frame is returned right after the basic
    preprocessing.
    """

    def _as_int_label(label):
        # "5" / "-5" style labels become ints; everything else is kept untouched
        if isinstance(label, str) and label.lstrip("-").isdigit():
            return int(label)
        return label

    # Load and normalise the raw table
    ks = Data.read_file("CDDM/ks_datasets_20250407.parquet")
    ks.columns = [_as_int_label(label) for label in ks.columns]

    if "substrate_phosphoseq" in ks.columns:
        ks["substrate_sequence"] = ks["substrate_phosphoseq"].str.upper()

    if not add_kinase_info:
        return ks

    # One row per UniProt ID (first after sorting by kinase name), indexed by that ID
    kinase_info = Data.get_kinase_info().sort_values("kinase")
    kinase_info = kinase_info.drop_duplicates("uniprot").set_index("uniprot")

    # Temporary lookup key: UniProt ID without the isoform suffix
    ks["uniprot_clean"] = ks["kinase_uniprot"].str.split("-", n=1).str[0]
    base_ids = ks["uniprot_clean"]

    # Output column -> column of the kinase info table that feeds it
    annotation_sources = {
        "kinase_group": "modi_group",
        "kinase_family": "family",
        "kinase_subfamily": "subfamily",
        "kinase_pspa_big": "pspa_category_big",
        "kinase_pspa_small": "pspa_category_small",
        "kinase_coral_ID": "ID_coral",
        "kinase_protein": "kinase",
    }
    lookups = {
        target: kinase_info[source].to_dict()
        for target, source in annotation_sources.items()
    }
    gene_lookup = Data.get_kinase_uniprot().set_index("Entry")["Gene Names"].to_dict()

    # Annotate; the assignment order fixes the column order of the result
    ks["kinase_on_tree"] = base_ids.isin(kinase_info.index).astype(int)
    ks["kinase_genes"] = base_ids.map(gene_lookup)
    for target, lookup in lookups.items():
        ks[target] = base_ids.map(lookup)

    # Number of kinases per substrate site, taken from the unique-site table
    unique_sites = Data.get_ks_unique()[["sub_site", "num_kin"]]
    num_kin_by_site = unique_sites.set_index("sub_site")["num_kin"]
    ks["num_kin"] = ks["sub_site"].map(num_kin_by_site)

    # The helper key is not part of the returned dataset
    return ks.drop(columns="uniprot_clean")

In [None]:
Data.get_ks_dataset()

Unnamed: 0,kin_sub_site,kinase_uniprot,substrate_uniprot,site,source,substrate_genes,substrate_phosphoseq,position,site_seq,sub_site,substrate_sequence,kinase_on_tree,kinase_genes,kinase_group,kinase_family,kinase_subfamily,kinase_pspa_big,kinase_pspa_small,kinase_coral_ID,kinase_protein,num_kin
0,O00141_A4FU28_S140,O00141,A4FU28,S140,Sugiyama,CTAGE9,MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC...,140,AAAEEARSLEATCEKLSRsNsELEDEILCLEKDLKEEKSKH,A4FU28_S140,MEEPGATPQPYLGLVLEELGRVVAALPESMRPDENPYGFPSELVVC...,1,SGK1 SGK,AGC,SGK,SGK,Basophilic,Akt/rock,SGK1,SGK1,22
1,O00141_O00141_S252,O00141,O00141,S252,Sugiyama,SGK1 SGK,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,252,SQGHIVLTDFGLCKENIEHNsTtstFCGtPEyLAPEVLHKQ,O00141_S252,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,1,SGK1 SGK,AGC,SGK,SGK,Basophilic,Akt/rock,SGK1,SGK1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187064,Q9Y6R4_Q9Y6R4_T1494,Q9Y6R4,Q9Y6R4,T1494,SIGNOR|EPSD|PSP,MAP3K4 KIAA0213 MAPKKK4 MEKK4 MTK1,MREAAAALVPPPAFAVTPAAAMEEPPPPPPPPPPPPEPETESEPEC...,1494,SGLIKLGDFGCSVKLKNNAQtMPGEVNSTLGTAAYMAPEVI,Q9Y6R4_T1494,MREAAAALVPPPAFAVTPAAAMEEPPPPPPPPPPPPEPETESEPEC...,1,MAP3K4 KIAA0213 MAPKKK4 MEKK4 MTK1,STE,STE11,STE11,,,MAP3K4,MAP3K4,1
187065,Q9Y6R4_Q9Y6R4_Y1328,Q9Y6R4,Q9Y6R4,Y1328,Sugiyama,MAP3K4 KIAA0213 MAPKKK4 MEKK4 MTK1,MREAAAALVPPPAFAVTPAAAMEEPPPPPPPPPPPPEPETESEPEC...,1328,RYREMRRKNIIGQVCDtPKSyDNVMHVGLRKVTFKWQRGNK,Q9Y6R4_Y1328,MREAAAALVPPPAFAVTPAAAMEEPPPPPPPPPPPPEPETESEPEC...,1,MAP3K4 KIAA0213 MAPKKK4 MEKK4 MTK1,STE,STE11,STE11,,,MAP3K4,MAP3K4,1


In [None]:
#| export
@patch_to(Data)
@lru_cache
def get_ks_background():
    """
    Load the kinase–substrate background table.

    Reads `CDDM/ks_background.parquet` through `Data.read_file`.
    Judging from the notebook output, it holds one row per background set
    (e.g. `ks_S`, `ks_T`, `human_STY_upper`) and one column per
    position/residue combination (e.g. `-20P`).
    The result is memoized with `lru_cache`, so every call returns the same
    cached object; copy it before modifying it in place.
    """
    path = "CDDM/ks_background.parquet"
    return Data.read_file(path)

In [None]:
Data.get_ks_background()

Unnamed: 0_level_0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,-20F,-20Y,-20W,-20H,-20K,-20R,-20Q,-20N,-20D,-20E,-20s,-20t,-20y,-19P,-19G,-19A,-19C,-19S,-19T,-19V,-19I,-19L,-19M,-19F,-19Y,-19W,-19H,-19K,-19R,-19Q,-19N,-19D,-19E,-19s,-19t,-19y,-18P,-18G,-18A,-18C,...,18E,18s,18t,18y,19P,19G,19A,19C,19S,19T,19V,19I,19L,19M,19F,19Y,19W,19H,19K,19R,19Q,19N,19D,19E,19s,19t,19y,20P,20G,20A,20C,20S,20T,20V,20I,20L,20M,20F,20Y,20W,20H,20K,20R,20Q,20N,20D,20E,20s,20t,20y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
ks_S,0.074296,0.068105,0.075925,0.014664,0.053246,0.038517,0.051290,0.040342,0.078989,0.020269,0.028806,0.016228,0.008081,0.023266,0.068952,0.056765,0.046728,0.036301,0.053506,0.082573,0.041645,0.015511,0.005996,0.067780,0.069925,0.072069,0.012867,0.056343,0.036392,0.049454,0.038472,0.084546,0.022550,0.025994,0.017611,0.008773,0.020665,0.066220,0.065181,0.048739,0.038991,0.052963,0.079218,0.042566,0.016246,0.006434,0.065772,0.069725,0.072771,0.012960,...,0.082005,0.043003,0.014401,0.006134,0.068513,0.069249,0.073398,0.012980,0.056336,0.037803,0.050114,0.035996,0.080222,0.018801,0.030510,0.014720,0.009635,0.023752,0.071056,0.061087,0.046434,0.037535,0.053258,0.082363,0.044627,0.015656,0.005955,0.070061,0.067710,0.072412,0.015181,0.053469,0.036676,0.058910,0.037885,0.084906,0.018338,0.030093,0.017935,0.007926,0.021294,0.070330,0.059313,0.044200,0.036273,0.054880,0.077316,0.043461,0.014980,0.006449
ks_T,0.061634,0.065896,0.074266,0.016436,0.043829,0.034394,0.059656,0.040177,0.089484,0.024806,0.033024,0.019175,0.010501,0.020088,0.072896,0.056460,0.042764,0.035002,0.065287,0.076548,0.028915,0.018566,0.010196,0.060989,0.072395,0.072395,0.013840,0.046388,0.036502,0.062662,0.040913,0.081217,0.024183,0.031787,0.016730,0.007452,0.019468,0.080152,0.059468,0.044715,0.034981,0.055057,0.083498,0.033916,0.015665,0.005627,0.063562,0.070692,0.069326,0.015777,...,0.086112,0.037420,0.017222,0.011116,0.059683,0.067222,0.073033,0.012879,0.044605,0.034867,0.054343,0.043191,0.082142,0.017905,0.031255,0.018847,0.009267,0.021988,0.076174,0.065023,0.040364,0.043349,0.063295,0.081200,0.030470,0.019632,0.009267,0.063397,0.067497,0.066393,0.015140,0.047784,0.031698,0.053304,0.046838,0.080902,0.019871,0.035483,0.020659,0.010093,0.022394,0.075067,0.060558,0.045261,0.038007,0.058666,0.076644,0.035483,0.019871,0.008989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
human_ST_upper,0.082791,0.066656,0.072951,0.012749,0.111414,0.054709,0.049589,0.032810,0.080030,0.019268,0.025079,0.019063,0.007013,0.022010,0.067710,0.062114,0.047611,0.033510,0.052219,0.080702,0.000000,0.000000,0.000000,0.082364,0.066437,0.073391,0.012502,0.112851,0.054252,0.050835,0.031845,0.078539,0.019614,0.024361,0.017966,0.006609,0.022350,0.067405,0.063942,0.048713,0.033838,0.051655,0.080531,0.000000,0.000000,0.000000,0.083261,0.066199,0.074920,0.013477,...,0.081781,0.000000,0.000000,0.000000,0.081470,0.068046,0.070355,0.013078,0.113824,0.056426,0.049966,0.031747,0.078665,0.016032,0.025418,0.018332,0.006834,0.021884,0.066924,0.063147,0.048788,0.034962,0.052911,0.081189,0.000000,0.000000,0.000000,0.082942,0.067364,0.070982,0.012747,0.114370,0.054382,0.050436,0.032196,0.080711,0.016468,0.026638,0.018652,0.007058,0.021877,0.067252,0.064102,0.047521,0.032946,0.051795,0.079558,0.000000,0.000000,0.000000
human_STY_upper,0.082206,0.066688,0.072961,0.012965,0.110244,0.054656,0.049733,0.033379,0.080515,0.019334,0.025206,0.020031,0.007049,0.022131,0.067141,0.062035,0.047677,0.033640,0.052120,0.080289,0.000000,0.000000,0.000000,0.081832,0.066727,0.073458,0.012827,0.111061,0.053752,0.050734,0.032307,0.078615,0.019802,0.024967,0.018880,0.006679,0.022410,0.067327,0.063718,0.048352,0.034185,0.051743,0.080624,0.000000,0.000000,0.000000,0.082377,0.066193,0.074272,0.013311,...,0.081579,0.000000,0.000000,0.000000,0.080805,0.067923,0.070240,0.013196,0.112567,0.056073,0.050258,0.032331,0.079073,0.016362,0.025728,0.019064,0.006917,0.021705,0.066891,0.062991,0.048728,0.035155,0.052847,0.081146,0.000000,0.000000,0.000000,0.082350,0.067688,0.070582,0.012733,0.112743,0.054403,0.050360,0.032901,0.080631,0.016863,0.026965,0.019844,0.007094,0.021870,0.067153,0.063812,0.047677,0.032840,0.052027,0.079465,0.000000,0.000000,0.000000


In [None]:
#| export
@patch_to(Data)
def get_cddm():
    """
    Load the CDDM PSSM table.

    Reads `CDDM/pssms.parquet` through `Data.read_file`.
    Not memoized: each call goes through `Data.read_file` again.
    """
    return Data.read_file("CDDM/pssms.parquet")

In [None]:
Data.get_cddm()

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,-20F,-20Y,-20W,-20H,-20K,-20R,-20Q,-20N,-20D,-20E,-20pS,-20pT,-20pY,-19P,-19G,-19A,-19C,-19S,-19T,-19V,-19I,-19L,-19M,-19F,-19Y,-19W,-19H,-19K,-19R,-19Q,-19N,-19D,-19E,-19pS,-19pT,-19pY,-18P,-18G,-18A,-18C,...,18E,18pS,18pT,18pY,19P,19G,19A,19C,19S,19T,19V,19I,19L,19M,19F,19Y,19W,19H,19K,19R,19Q,19N,19D,19E,19pS,19pT,19pY,20P,20G,20A,20C,20S,20T,20V,20I,20L,20M,20F,20Y,20W,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
SRC,0.054538,0.081380,0.060077,0.012356,0.036216,0.032382,0.055816,0.052407,0.083511,0.023434,0.031530,0.023008,0.005965,0.018747,0.082659,0.058372,0.051555,0.039625,0.059651,0.071155,0.022582,0.020452,0.022582,0.048428,0.075616,0.085387,0.017417,0.035684,0.029737,0.056500,0.052676,0.079864,0.024214,0.032285,0.019966,0.011895,0.019541,0.081563,0.061597,0.044605,0.036534,0.064146,0.071793,0.025064,0.012744,0.012744,0.054968,0.069345,0.066808,0.015222,...,0.080938,0.030075,0.011057,0.010615,0.052375,0.071460,0.064359,0.017754,0.047048,0.035508,0.052375,0.050599,0.091434,0.024856,0.041278,0.015535,0.011096,0.015979,0.064802,0.066134,0.047048,0.035508,0.062583,0.074123,0.028407,0.011540,0.018198,0.05830,0.085447,0.064976,0.012906,0.041834,0.039163,0.043169,0.050734,0.074766,0.032488,0.040053,0.018692,0.008456,0.013351,0.076992,0.060970,0.037383,0.036938,0.052960,0.086337,0.025367,0.015576,0.023142
EPHA3,0.044276,0.088013,0.065335,0.008639,0.037797,0.036717,0.072354,0.048596,0.075594,0.026998,0.031317,0.022678,0.011339,0.021598,0.084233,0.059395,0.046436,0.036177,0.059935,0.064255,0.023758,0.018898,0.015659,0.047875,0.064551,0.087144,0.016138,0.031737,0.039806,0.051103,0.046799,0.080151,0.026358,0.035503,0.015600,0.010221,0.023669,0.090909,0.060785,0.042496,0.033889,0.067240,0.079613,0.020979,0.011296,0.016138,0.047696,0.060557,0.057878,0.018221,...,0.076362,0.039304,0.017967,0.006738,0.050197,0.070502,0.067682,0.020305,0.029329,0.039481,0.053017,0.049069,0.095319,0.027637,0.038917,0.016920,0.011280,0.016920,0.071066,0.061478,0.039481,0.037225,0.062606,0.083474,0.029893,0.011844,0.016356,0.04918,0.084228,0.064443,0.017524,0.031656,0.038440,0.047484,0.062747,0.070661,0.031656,0.042962,0.017524,0.007914,0.014132,0.083098,0.059356,0.031091,0.033917,0.056529,0.100622,0.025438,0.013567,0.015828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MAST2,0.046512,0.116279,0.069767,0.000000,0.023256,0.069767,0.069767,0.000000,0.046512,0.046512,0.023256,0.000000,0.000000,0.162791,0.023256,0.023256,0.023256,0.000000,0.046512,0.139535,0.023256,0.023256,0.023256,0.023256,0.162791,0.093023,0.000000,0.046512,0.046512,0.023256,0.046512,0.093023,0.000000,0.046512,0.046512,0.000000,0.093023,0.023256,0.046512,0.023256,0.000000,0.046512,0.116279,0.023256,0.000000,0.000000,0.023256,0.023256,0.093023,0.000000,...,0.146341,0.000000,0.000000,0.000000,0.000000,0.048780,0.121951,0.024390,0.073171,0.048780,0.048780,0.024390,0.073171,0.000000,0.146341,0.000000,0.000000,0.000000,0.097561,0.048780,0.000000,0.048780,0.073171,0.073171,0.048780,0.000000,0.000000,0.00000,0.024390,0.048780,0.024390,0.024390,0.097561,0.170732,0.024390,0.048780,0.048780,0.024390,0.000000,0.073171,0.000000,0.048780,0.219512,0.000000,0.048780,0.024390,0.024390,0.024390,0.000000,0.000000
BRAF,0.095238,0.071429,0.047619,0.000000,0.095238,0.047619,0.071429,0.000000,0.095238,0.047619,0.000000,0.000000,0.000000,0.023810,0.023810,0.071429,0.023810,0.023810,0.095238,0.095238,0.047619,0.000000,0.023810,0.071429,0.047619,0.023810,0.000000,0.000000,0.047619,0.000000,0.071429,0.047619,0.000000,0.023810,0.071429,0.000000,0.023810,0.095238,0.071429,0.000000,0.071429,0.023810,0.166667,0.119048,0.023810,0.000000,0.047619,0.071429,0.071429,0.000000,...,0.025000,0.025000,0.025000,0.000000,0.025000,0.075000,0.025000,0.000000,0.125000,0.050000,0.050000,0.025000,0.050000,0.050000,0.000000,0.025000,0.000000,0.000000,0.075000,0.100000,0.025000,0.050000,0.050000,0.050000,0.100000,0.050000,0.000000,0.02500,0.075000,0.100000,0.000000,0.075000,0.100000,0.100000,0.050000,0.050000,0.000000,0.000000,0.000000,0.025000,0.000000,0.075000,0.075000,0.025000,0.025000,0.075000,0.075000,0.050000,0.000000,0.000000


In [None]:
#| export
@patch_to(Data)
def get_cddm_upper():
    """
    Load the CDDM PSSM table built from all-uppercase sequences.

    Reads `CDDM/pssms_upper.parquet` through `Data.read_file`.
    Not memoized: each call goes through `Data.read_file` again.
    """
    return Data.read_file("CDDM/pssms_upper.parquet")

In [None]:
Data.get_cddm_upper()

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,-20F,-20Y,-20W,-20H,-20K,-20R,-20Q,-20N,-20D,-20E,-20pS,-20pT,-20pY,-19P,-19G,-19A,-19C,-19S,-19T,-19V,-19I,-19L,-19M,-19F,-19Y,-19W,-19H,-19K,-19R,-19Q,-19N,-19D,-19E,-19pS,-19pT,-19pY,-18P,-18G,-18A,-18C,...,18E,18pS,18pT,18pY,19P,19G,19A,19C,19S,19T,19V,19I,19L,19M,19F,19Y,19W,19H,19K,19R,19Q,19N,19D,19E,19pS,19pT,19pY,20P,20G,20A,20C,20S,20T,20V,20I,20L,20M,20F,20Y,20W,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
SRC,0.054538,0.081380,0.060077,0.012356,0.058798,0.052833,0.055816,0.052407,0.083511,0.023434,0.031530,0.045590,0.005965,0.018747,0.082659,0.058372,0.051555,0.039625,0.059651,0.071155,0.0,0.0,0.0,0.048428,0.075616,0.085387,0.017417,0.060748,0.042481,0.056500,0.052676,0.079864,0.024214,0.032285,0.032710,0.011895,0.019541,0.081563,0.061597,0.044605,0.036534,0.064146,0.071793,0.0,0.0,0.0,0.054968,0.069345,0.066808,0.015222,...,0.080938,0.0,0.0,0.0,0.052375,0.071460,0.064359,0.017754,0.075455,0.047048,0.052375,0.050599,0.091434,0.024856,0.041278,0.033733,0.011096,0.015979,0.064802,0.066134,0.047048,0.035508,0.062583,0.074123,0.0,0.0,0.0,0.05830,0.085447,0.064976,0.012906,0.067201,0.054740,0.043169,0.050734,0.074766,0.032488,0.040053,0.041834,0.008456,0.013351,0.076992,0.060970,0.037383,0.036938,0.052960,0.086337,0.0,0.0,0.0
EPHA3,0.044276,0.088013,0.065335,0.008639,0.061555,0.055616,0.072354,0.048596,0.075594,0.026998,0.031317,0.038337,0.011339,0.021598,0.084233,0.059395,0.046436,0.036177,0.059935,0.064255,0.0,0.0,0.0,0.047875,0.064551,0.087144,0.016138,0.052717,0.051103,0.051103,0.046799,0.080151,0.026358,0.035503,0.031737,0.010221,0.023669,0.090909,0.060785,0.042496,0.033889,0.067240,0.079613,0.0,0.0,0.0,0.047696,0.060557,0.057878,0.018221,...,0.076362,0.0,0.0,0.0,0.050197,0.070502,0.067682,0.020305,0.059222,0.051325,0.053017,0.049069,0.095319,0.027637,0.038917,0.033277,0.011280,0.016920,0.071066,0.061478,0.039481,0.037225,0.062606,0.083474,0.0,0.0,0.0,0.04918,0.084228,0.064443,0.017524,0.057094,0.052007,0.047484,0.062747,0.070661,0.031656,0.042962,0.033352,0.007914,0.014132,0.083098,0.059356,0.031091,0.033917,0.056529,0.100622,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MAST2,0.046512,0.116279,0.069767,0.000000,0.046512,0.093023,0.069767,0.000000,0.046512,0.046512,0.023256,0.023256,0.000000,0.162791,0.023256,0.023256,0.023256,0.000000,0.046512,0.139535,0.0,0.0,0.0,0.023256,0.162791,0.093023,0.000000,0.069767,0.046512,0.023256,0.046512,0.093023,0.000000,0.046512,0.046512,0.000000,0.093023,0.023256,0.046512,0.023256,0.000000,0.046512,0.116279,0.0,0.0,0.0,0.023256,0.023256,0.093023,0.000000,...,0.146341,0.0,0.0,0.0,0.000000,0.048780,0.121951,0.024390,0.121951,0.048780,0.048780,0.024390,0.073171,0.000000,0.146341,0.000000,0.000000,0.000000,0.097561,0.048780,0.000000,0.048780,0.073171,0.073171,0.0,0.0,0.0,0.00000,0.024390,0.048780,0.024390,0.048780,0.097561,0.170732,0.024390,0.048780,0.048780,0.024390,0.000000,0.073171,0.000000,0.048780,0.219512,0.000000,0.048780,0.024390,0.024390,0.0,0.0,0.0
BRAF,0.095238,0.071429,0.047619,0.000000,0.142857,0.047619,0.071429,0.000000,0.095238,0.047619,0.000000,0.023810,0.000000,0.023810,0.023810,0.071429,0.023810,0.023810,0.095238,0.095238,0.0,0.0,0.0,0.071429,0.047619,0.023810,0.000000,0.119048,0.071429,0.000000,0.071429,0.047619,0.000000,0.023810,0.071429,0.000000,0.023810,0.095238,0.071429,0.000000,0.071429,0.023810,0.166667,0.0,0.0,0.0,0.047619,0.071429,0.071429,0.000000,...,0.025000,0.0,0.0,0.0,0.025000,0.075000,0.025000,0.000000,0.225000,0.100000,0.050000,0.025000,0.050000,0.050000,0.000000,0.025000,0.000000,0.000000,0.075000,0.100000,0.025000,0.050000,0.050000,0.050000,0.0,0.0,0.0,0.02500,0.075000,0.100000,0.000000,0.125000,0.100000,0.100000,0.050000,0.050000,0.000000,0.000000,0.000000,0.025000,0.000000,0.075000,0.075000,0.025000,0.025000,0.075000,0.075000,0.0,0.0,0.0


In [None]:
#| export
@patch_to(Data)
def get_cddm_LO():
    """Get CDDM log-odds PSSMs computed against an 'STY' background.

    Loads ``CDDM/pssms_LO.parquet`` through `Data.read_file` and returns
    whatever that reader produces.
    """
    return Data.read_file("CDDM/pssms_LO.parquet")

In [None]:
Data.get_cddm_LO()

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,-20F,-20Y,-20W,-20H,-20K,-20R,-20Q,-20N,-20D,-20E,-20pS,-20pT,-20pY,-19P,-19G,-19A,-19C,-19S,-19T,-19V,-19I,-19L,-19M,-19F,-19Y,-19W,-19H,-19K,-19R,-19Q,-19N,-19D,-19E,-19pS,-19pT,-19pY,-18P,-18G,-18A,-18C,...,18E,18pS,18pT,18pY,19P,19G,19A,19C,19S,19T,19V,19I,19L,19M,19F,19Y,19W,19H,19K,19R,19Q,19N,19D,19E,19pS,19pT,19pY,20P,20G,20A,20C,20S,20T,20V,20I,20L,20M,20F,20Y,20W,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
SRC,-0.298965,0.245926,-0.306472,-0.315249,-0.436393,-0.216885,0.039840,0.331979,0.006587,0.074104,0.026464,0.300781,-0.484863,-0.234235,0.238737,0.035866,0.173786,0.112172,0.062717,-0.187118,-0.598455,0.418275,1.228850,-0.394990,0.116437,0.241674,0.270796,-0.536063,-0.295952,0.089650,0.359646,-0.082026,0.078067,0.157708,0.196729,0.443758,-0.096374,0.191619,-0.036922,-0.065461,-0.093194,0.198348,-0.149715,-0.521873,-0.217518,0.779107,-0.209860,0.005564,-0.061876,0.087346,...,-0.024870,-0.295180,-0.362786,0.382054,-0.265140,0.085462,-0.135241,0.341197,-0.144617,-0.081201,-0.016493,0.302052,0.137508,0.404226,0.299088,-0.066971,0.165626,-0.468907,-0.152143,0.097919,0.055576,-0.162821,0.125989,-0.139554,-0.371114,-0.369706,1.148641,-0.219858,0.305918,-0.086212,-0.217325,-0.300629,0.133833,-0.379744,0.266731,-0.143992,0.691717,0.259364,-0.031653,-0.018293,-0.661728,0.104673,0.041005,-0.236473,-0.001987,-0.082064,0.136414,-0.538512,0.006337,1.268341
EPHA3,-0.599679,0.358959,-0.185427,-0.831496,-0.374767,-0.035616,0.414240,0.223052,-0.137106,0.278334,0.016725,0.279945,0.441832,-0.030005,0.265960,0.060927,0.022921,-0.019164,0.069583,-0.334271,-0.525210,0.304329,0.700644,-0.411559,-0.111818,0.271060,0.160722,-0.705150,0.124806,-0.055189,0.188981,-0.076858,0.200475,0.294762,-0.159291,0.224919,0.180083,0.348124,-0.056065,-0.135338,-0.201591,0.266315,-0.000554,-0.778526,-0.391503,1.119694,-0.414603,-0.189918,-0.268878,0.346775,...,-0.108833,0.090914,0.337630,-0.273672,-0.326395,0.065984,-0.062604,0.534849,-0.826443,0.071809,0.001107,0.257761,0.197541,0.557236,0.214109,0.056291,0.189352,-0.386287,-0.019033,-0.007411,-0.197407,-0.094701,0.126509,0.031850,-0.297539,-0.332173,0.994725,-0.465271,0.285190,-0.098081,0.223956,-0.702797,0.106930,-0.242274,0.573323,-0.225459,0.654314,0.360504,-0.124708,-0.113799,-0.579696,0.214783,0.002284,-0.502364,-0.125070,0.012040,0.357301,-0.534483,-0.192918,0.720322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MAST2,-0.528628,0.760762,-0.090724,-20.552054,-1.075446,0.890487,0.361717,-21.989358,-0.837785,1.063081,-0.412652,-20.832929,-19.671043,2.884025,-1.590838,-1.291829,-0.974740,-21.805810,-0.296229,0.784477,-0.556038,0.603650,1.271267,-1.453246,1.222693,0.365256,-20.461283,-0.153747,0.349399,-1.190998,0.180083,0.138019,-21.129349,0.684414,1.416774,-19.738124,2.154697,-1.618709,-0.442197,-1.005072,-21.893985,-0.265423,0.545967,-0.629882,-20.498936,-19.502311,-1.450868,-1.570628,0.415703,-20.450380,...,0.829586,-21.815323,-20.439323,-19.635590,-22.585576,-0.465375,0.786858,0.799349,0.492504,0.376951,-0.119057,-0.750758,-0.183951,-20.840923,2.124971,-20.634048,-19.916025,-21.076626,0.438112,-0.341170,-22.110138,0.295330,0.351480,-0.158216,0.408965,-20.507940,-19.646705,-22.694922,-1.502811,-0.499805,0.700926,-1.078985,1.450633,1.603930,-0.789927,-0.760077,1.278125,-0.456257,-20.865612,3.094973,-21.010260,-0.553723,1.889130,-22.070431,0.399205,-1.200649,-1.687266,-0.595170,-20.564588,-19.873740
BRAF,0.505319,0.057744,-0.641739,-20.552054,0.958501,0.339472,0.395665,-21.989358,0.196162,1.097028,-21.561812,-20.832929,-19.671043,0.110618,-1.556891,0.327081,-0.940792,-0.622702,0.737718,0.233462,0.477909,-20.545511,1.305214,0.165663,-0.550715,-1.600796,-20.461283,-22.302907,0.383346,-22.340159,0.798993,-0.828033,-21.129349,-0.281638,2.035683,-19.738124,0.188645,0.415238,0.176712,-22.154233,0.874085,-1.231475,1.065341,1.725993,0.684172,-19.502311,-0.416921,0.048281,0.034613,-20.450380,...,-1.719752,-0.561825,0.814174,-19.635590,-1.332079,0.155211,-1.499446,-20.418525,1.265094,0.412575,-0.083433,-0.715134,-0.733289,1.412574,-21.677864,0.619449,-19.916025,-21.076626,0.058698,0.694454,-0.856641,0.330954,-0.197858,-0.707554,1.444589,1.745557,-19.646705,-1.441424,0.117775,0.535819,-20.516947,0.541601,1.486257,0.832199,0.245697,-0.724453,-20.939748,-21.674130,-20.865612,1.545635,-21.010260,0.066864,0.339792,-0.816934,-0.565170,0.419937,-0.066680,0.440454,-20.564588,-19.873740


In [None]:
#| export
@patch_to(Data)
def get_cddm_LO_upper():
    """Get CDDM log-odds PSSMs of all-uppercase sequences ('STY' background).

    Loads ``CDDM/pssms_LO_upper.parquet`` through `Data.read_file` and returns
    whatever that reader produces.
    """
    return Data.read_file("CDDM/pssms_LO_upper.parquet")

In [None]:
Data.get_cddm_LO_upper()

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,-20F,-20Y,-20W,-20H,-20K,-20R,-20Q,-20N,-20D,-20E,-20pS,-20pT,-20pY,-19P,-19G,-19A,-19C,-19S,-19T,-19V,-19I,-19L,-19M,-19F,-19Y,-19W,-19H,-19K,-19R,-19Q,-19N,-19D,-19E,-19pS,-19pT,-19pY,-18P,-18G,-18A,-18C,...,18E,18pS,18pT,18pY,19P,19G,19A,19C,19S,19T,19V,19I,19L,19M,19F,19Y,19W,19H,19K,19R,19Q,19N,19D,19E,19pS,19pT,19pY,20P,20G,20A,20C,20S,20T,20V,20I,20L,20M,20F,20Y,20W,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
SRC,-0.298965,0.245926,-0.306472,-0.315249,-0.500805,-0.002884,0.039840,0.331979,0.006587,0.074104,0.026464,0.687254,-0.484863,-0.234235,0.238737,0.035866,0.173786,0.112172,0.062717,-0.187118,0.0,0.0,0.0,-0.394990,0.116437,0.241674,0.270796,-0.530225,-0.272866,0.089650,0.359646,-0.082026,0.078067,0.157708,0.396654,0.443758,-0.096374,0.191619,-0.036922,-0.065461,-0.093194,0.198348,-0.149715,0.0,0.0,0.0,-0.209860,0.005564,-0.061876,0.087346,...,-0.024870,0.0,0.0,0.0,-0.265140,0.085462,-0.135241,0.341197,-0.234110,-0.157485,-0.016493,0.302052,0.137508,0.404226,0.299088,0.462480,0.165626,-0.468907,-0.152143,0.097919,0.055576,-0.162821,0.125989,-0.139554,0.0,0.0,0.0,-0.219858,0.305918,-0.086212,-0.217325,-0.395092,0.096392,-0.379744,0.266731,-0.143992,0.691717,0.259364,0.542941,-0.018293,-0.661728,0.104673,0.041005,-0.236473,-0.001987,-0.082064,0.136414,0.0,0.0,0.0
EPHA3,-0.599679,0.358959,-0.185427,-0.831496,-0.434706,0.071154,0.414240,0.223052,-0.137106,0.278334,0.016725,0.437268,0.441832,-0.030005,0.265960,0.060927,0.022921,-0.019164,0.069583,-0.334271,0.0,0.0,0.0,-0.411559,-0.111818,0.271060,0.160722,-0.734799,-0.006279,-0.055189,0.188981,-0.076858,0.200475,0.294762,0.353099,0.224919,0.180083,0.348124,-0.056065,-0.135338,-0.201591,0.266315,-0.000554,0.0,0.0,0.0,-0.414603,-0.189918,-0.268878,0.346775,...,-0.108833,0.0,0.0,0.0,-0.326395,0.065984,-0.062604,0.534849,-0.583601,-0.031956,0.001107,0.257761,0.197541,0.557236,0.214109,0.442850,0.189352,-0.386287,-0.019033,-0.007411,-0.197407,-0.094701,0.126509,0.031850,0.0,0.0,0.0,-0.465271,0.285190,-0.098081,0.223956,-0.630220,0.022506,-0.242274,0.573323,-0.225459,0.654314,0.360504,0.216062,-0.113799,-0.579696,0.214783,0.002284,-0.502364,-0.125070,0.012040,0.357301,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MAST2,-0.528628,0.760762,-0.090724,-20.552054,-0.838992,0.813257,0.361717,-21.989358,-0.837785,1.063081,-0.412652,-0.283875,-19.671043,2.884025,-1.590838,-1.291829,-0.974740,-21.805810,-0.296229,0.784477,0.0,0.0,0.0,-1.453246,1.222693,0.365256,-20.461283,-0.330500,-0.142089,-1.190998,0.180083,0.138019,-21.129349,0.684414,0.904502,-19.738124,2.154697,-1.618709,-0.442197,-1.005072,-21.893985,-0.265423,0.545967,0.0,0.0,0.0,-1.450868,-1.570628,0.415703,-20.450380,...,0.829586,0.0,0.0,0.0,-22.585576,-0.465375,0.786858,0.799349,0.458506,-0.105326,-0.119057,-0.750758,-0.183951,-20.840923,2.124971,-21.223241,-19.916025,-21.076626,0.438112,-0.341170,-22.110138,0.295330,0.351480,-0.158216,0.0,0.0,0.0,-22.694922,-1.502811,-0.499805,0.700926,-0.857265,0.930110,1.603930,-0.789927,-0.760077,1.278125,-0.456257,-21.453288,3.094973,-21.010260,-0.553723,1.889130,-22.070431,0.399205,-1.200649,-1.687266,0.0,0.0,0.0
BRAF,0.505319,0.057744,-0.641739,-20.552054,0.779918,-0.152796,0.395665,-21.989358,0.196162,1.097028,-21.561812,-0.249928,-19.671043,0.110618,-1.556891,0.327081,-0.940792,-0.622702,0.737718,0.233462,0.0,0.0,0.0,0.165663,-0.550715,-1.600796,-20.461283,0.440412,0.476821,-22.340159,0.798993,-0.828033,-21.129349,-0.281638,1.523411,-19.738124,0.188645,0.415238,0.176712,-22.154233,0.874085,-1.231475,1.065341,0.0,0.0,0.0,-0.416921,0.048281,0.034613,-20.450380,...,-1.719752,0.0,0.0,0.0,-1.332079,0.155211,-1.499446,-20.418525,1.342127,0.930298,-0.083433,-0.715134,-0.733289,1.412574,-21.677864,0.030256,-19.916025,-21.076626,0.058698,0.694454,-0.856641,0.330954,-0.197858,-0.707554,0.0,0.0,0.0,-1.441424,0.117775,0.535819,-20.516947,0.500287,0.965734,0.832199,0.245697,-0.724453,-20.939748,-21.674130,-21.453288,1.545635,-21.010260,0.066864,0.339792,-0.816934,-0.565170,0.419937,-0.066680,0.0,0.0,0.0


### Amino acid

In [None]:
#| export
@patch_to(Data)
def get_aa_info():
    """Get amino acid information.

    Loads ``amino_acids/aa_info.parquet`` through `Data.read_file` and returns
    whatever that reader produces.
    """
    # Plain string literal: the path has no placeholders, so an f-string
    # prefix would be superfluous (and inconsistent with the sibling loaders).
    path = "amino_acids/aa_info.parquet"
    return Data.read_file(path)

In [None]:
Data.get_aa_info()

Unnamed: 0_level_0,Name,SMILES,MW,pKa1,pKb2,pKx3,pl4,H,VSC,P1,P2,SASA,NCISC,phospho
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A,Alanine,C[C@@H](C(=O)O)N,89.10,2.34,9.69,,6.00,0.62,27.5,8.1,0.046,1.181,0.007187,0
C,Cysteine,C([C@@H](C(=O)O)N)S,121.16,1.96,10.28,8.18,5.07,0.29,44.6,5.5,0.128,1.461,-0.036610,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Kac,Acetyllysine,CC(=O)NCCCC[C@H](N)C(=O)O,188.23,,,,,,,,,,,0
Kme3,Trimethyllysine,C[N+](C)(C)CCCC[C@H](N)C(=O)O,189.28,,,,,,,,,,,0


In [None]:
#| export
@patch_to(Data)
def get_aa_rdkit():
    """Get RDKit descriptor representations of amino acids.

    Loads ``amino_acids/aa_rdkit.parquet`` through `Data.read_file` and
    returns whatever that reader produces.
    """
    return Data.read_file("amino_acids/aa_rdkit.parquet")

In [None]:
Data.get_aa_rdkit()

Unnamed: 0_level_0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,MinPartialCharge,MaxAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRLOW,AvgIpc,BalabanJ,BertzCT,Chi0n,Chi0v,Chi1,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA3,...,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState10,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NOCount,NumAliphaticHeterocycles,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,RingCount,MolLogP,fr_Al_COO,fr_Al_OH,fr_Ar_N,fr_C_O,fr_NH0,fr_NH1,fr_NH2,fr_SH,fr_imidazole,fr_priamide,fr_sulfide,fr_unbrch_alkane
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
A,9.574074,0.731481,-0.962963,0.451352,89.094,-0.480094,0.480094,2.000000,2.166667,2.166667,16.367425,10.385341,1.990289,-1.897069,1.676610,-2.250004,-0.137563,1.360964,3.257586,59.813538,3.510162,3.510162,2.642734,1.627090,1.126913,1.126913,0.389528,0.389528,0.000000,0.000000,-0.57,13.609640,5.43,1.767634,1.721545,10.840195,6.041841,0.0,0.00000,5.969305,4.794537,0.0,0.000000,0.0,6.923737,0.000000,0.000000,9.901065,5.969305,0.000000,...,17.117674,4.794537,0.0,6.923737,0.0,63.32,12.011146,4.794537,0.000000,0.000000,0.000000,6.923737,0.0,0.0,0.000000,10.840195,0.000000,9.574074,7.865741,4.835648,-0.962963,-0.731481,0.000000,1.418981,0.000000,0.666667,3.0,3.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,1.0,0.0,-0.5818,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
C,9.756435,0.189815,-1.004630,0.424382,121.161,-0.480064,0.480064,2.000000,2.428571,2.428571,32.116867,10.376545,2.056705,-1.960761,1.795249,-2.309520,-0.137419,1.645711,3.343417,75.335159,3.664483,4.558910,3.180739,2.406671,1.127305,1.492453,0.513894,0.907286,0.078093,0.234278,-0.22,27.977093,6.78,2.872925,2.472042,10.840195,6.041841,0.0,0.00000,5.969305,4.794537,0.0,12.628789,0.0,0.000000,5.752854,0.000000,9.901065,18.598094,0.000000,...,22.870527,4.794537,0.0,0.000000,0.0,63.32,12.011146,4.794537,5.752854,0.000000,0.000000,0.000000,0.0,0.0,12.628789,10.840195,3.649043,9.756435,8.008102,4.939815,-0.814815,-0.816358,0.000000,0.000000,0.000000,0.666667,3.0,3.0,0.0,0.0,0.0,0.0,3.0,3.0,4.0,2.0,0.0,-0.6719,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Kac,10.419084,0.067099,-0.972660,0.499039,188.227,-0.480086,0.480086,1.461538,2.076923,2.538462,16.367568,10.120530,2.091144,-2.051830,1.973906,-2.297932,-0.138286,2.022803,3.255326,182.227953,7.746838,7.746838,6.036581,4.265665,2.923476,2.923476,1.633048,1.633048,0.877145,0.877145,-1.10,519.860388,11.90,6.419740,7.266972,16.156983,6.041841,0.0,5.90718,5.969305,9.589074,0.0,0.000000,0.0,19.262465,13.468494,0.000000,14.695602,11.876485,5.316789,...,29.569610,9.589074,0.0,26.186202,0.0,92.42,12.011146,9.589074,5.907180,19.386400,6.420822,6.923737,0.0,0.0,5.316789,10.840195,0.000000,20.695587,11.053315,5.271623,-1.039759,-0.780252,1.933755,2.032398,0.000000,0.750000,4.0,5.0,0.0,0.0,0.0,0.0,3.0,3.0,5.0,6.0,0.0,-0.2953,1.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0
Kme3,10.361469,0.578704,-0.897918,0.465620,189.279,-0.480086,0.480086,1.384615,1.923077,2.307692,16.367548,10.113915,2.114811,-2.232682,2.021932,-2.354302,-0.870130,1.951693,3.414940,163.487862,8.785803,8.785803,5.827186,4.615856,4.454444,4.454444,1.911260,1.911260,1.073871,1.073871,-0.61,396.193584,12.39,5.191317,7.741185,15.323226,6.041841,0.0,0.00000,5.969305,4.794537,0.0,0.000000,0.0,19.262465,0.000000,27.687772,14.384095,5.969305,0.000000,...,49.288477,4.794537,0.0,19.262465,0.0,63.32,12.011146,4.794537,0.000000,6.420822,23.869431,0.000000,0.0,0.0,21.143016,10.840195,0.000000,10.361469,8.512288,5.361669,-0.897918,-0.687529,2.502778,1.061844,6.366441,0.888889,3.0,4.0,0.0,0.0,0.0,0.0,2.0,2.0,4.0,6.0,0.0,0.2748,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
#| export
@patch_to(Data)
def get_aa_morgan():
    """Get Morgan fingerprint representations of amino acids.

    Loads ``amino_acids/aa_morgan.parquet`` through `Data.read_file` and
    returns whatever that reader produces.
    """
    return Data.read_file("amino_acids/aa_morgan.parquet")

In [None]:
Data.get_aa_morgan()

Unnamed: 0_level_0,morgan_1,morgan_11,morgan_24,morgan_27,morgan_70,morgan_74,morgan_79,morgan_80,morgan_82,morgan_116,morgan_118,morgan_119,morgan_132,morgan_140,morgan_172,morgan_192,morgan_197,morgan_210,morgan_222,morgan_227,morgan_229,morgan_245,morgan_280,morgan_283,morgan_294,morgan_295,morgan_305,morgan_310,morgan_319,morgan_321,morgan_322,morgan_328,morgan_362,morgan_364,morgan_376,morgan_378,morgan_394,morgan_412,morgan_414,morgan_425,morgan_429,morgan_473,morgan_482,morgan_486,morgan_545,morgan_550,morgan_553,morgan_575,morgan_592,morgan_623,...,morgan_1431,morgan_1451,morgan_1452,morgan_1456,morgan_1459,morgan_1507,morgan_1517,morgan_1544,morgan_1558,morgan_1564,morgan_1573,morgan_1595,morgan_1602,morgan_1607,morgan_1633,morgan_1644,morgan_1685,morgan_1693,morgan_1716,morgan_1719,morgan_1736,morgan_1737,morgan_1750,morgan_1751,morgan_1752,morgan_1754,morgan_1758,morgan_1773,morgan_1778,morgan_1783,morgan_1785,morgan_1791,morgan_1819,morgan_1838,morgan_1840,morgan_1844,morgan_1847,morgan_1849,morgan_1873,morgan_1876,morgan_1879,morgan_1882,morgan_1898,morgan_1911,morgan_1912,morgan_1926,morgan_1937,morgan_1942,morgan_1946,morgan_1970
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
A,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
C,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Kac,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0
Kme3,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0


### Phosphoproteomics

In [None]:
#| export
@patch_to(Data)
def get_cptac_ensembl_site():
    """Get the CPTAC dataset with unique EnsemblProteinID+site entries.

    Loads ``phosphosites/linkedOmicsKB_ref_pan.parquet`` (the LinkedOmicsKB
    reference file) through `Data.read_file`.
    """
    return Data.read_file("phosphosites/linkedOmicsKB_ref_pan.parquet")

In [None]:
Data.get_cptac_ensembl_site()

Unnamed: 0,gene,site,site_seq,protein,gene_name,gene_site,protein_site
0,ENSG00000003056.8,S267,DDQLGEESEERDDHL,ENSP00000000412.3,M6PR,M6PR_S267,ENSP00000000412_S267
1,ENSG00000003056.8,S267,DDQLGEESEERDDHL,ENSP00000440488.2,M6PR,M6PR_S267,ENSP00000440488_S267
...,...,...,...,...,...,...,...
488584,ENSG00000143631.11,S648,ASRNHHGSAQEQSRD,ENSP00000357789.1,FLG,FLG_S648,ENSP00000357789_S648
488585,ENSG00000143520.6,S2310,DTTRHGHSGYGQSTQ,ENSP00000373370.4,FLG2,FLG2_S2310,ENSP00000373370_S2310


In [None]:
#| export
@patch_to(Data)
def get_cptac_unique_site():
    """Get the CPTAC dataset reduced to unique site sequences.

    Loads ``phosphosites/cptac_unique_site.parquet`` through `Data.read_file`.
    """
    return Data.read_file("phosphosites/cptac_unique_site.parquet")

In [None]:
Data.get_cptac_unique_site()

Unnamed: 0,site_seq,gene_site,num_site,acceptor
0,AAAAAAASFPWSAFG,ZBTB7A_S182,1,S
1,AAAAAAASGAAGGGG,INTS3_S16,1,S
...,...,...,...,...
125474,______MYPAGPPAG,TIGD5_Y2,1,Y
125475,_______SPASLPLA,RFLNB_S1,1,S


In [None]:
#| export
@patch_to(Data)
def get_cptac_gene_site():
    """Get the CPTAC dataset with unique Gene+site entries.

    Loads ``phosphosites/linkedOmics_ref_pan.parquet`` (the LinkedOmics
    reference file, not the LinkedOmicsKB one) through `Data.read_file`.
    """
    return Data.read_file("phosphosites/linkedOmics_ref_pan.parquet")

In [None]:
Data.get_cptac_gene_site()

Unnamed: 0,gene,site,site_seq,protein,gene_name,gene_site,protein_site
0,ENSG00000003056.8,S267,DDQLGEESEERDDHL,ENSP00000000412.3,M6PR,M6PR_S267,ENSP00000000412_S267
1,ENSG00000048028.11,S1053,PPTIRPNSPYDLCSR,ENSP00000003302.4,USP28,USP28_S1053,ENSP00000003302_S1053
...,...,...,...,...,...,...,...
126223,ENSG00000143631.11,S648,ASRNHHGSAQEQSRD,ENSP00000357789.1,FLG,FLG_S648,ENSP00000357789_S648
126224,ENSG00000143520.6,S2310,DTTRHGHSGYGQSTQ,ENSP00000373370.4,FLG2,FLG2_S2310,ENSP00000373370_S2310


In [None]:
#| export
@patch_to(Data)
def get_psp_human_site():
    """Get the PhosphoSitePlus human phosphosite dataset (Gene+site).

    Loads ``phosphosites/psp_human.parquet`` through `Data.read_file`.
    """
    return Data.read_file("phosphosites/psp_human.parquet")

In [None]:
Data.get_psp_human_site()

Unnamed: 0,gene,protein,uniprot,site,gene_site,SITE_GRP_ID,species,site_seq,LT_LIT,MS_LIT,MS_CST,CST_CAT#,Ambiguous_Site
0,YWHAB,14-3-3 beta,P31946,T2,YWHAB_T2,15718712,human,______MtMDksELV,,3.0,1.0,,0
1,YWHAB,14-3-3 beta,P31946,S6,YWHAB_S6,15718709,human,__MtMDksELVQkAk,,8.0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
240009,ZZZ3,ZZZ3,Q8IYH5,S677,ZZZ3_S677,23077721,human,yPPEEVEsRRWQKIA,,,1.0,,0
240010,ZZZ3,ZZZ3,Q8IYH5,S777,ZZZ3_S777,41455930,human,NTAVEDAsDDESIPI,,2.0,,,0


In [None]:
#| export
@patch_to(Data)
def get_ochoa_site():
    """Get the phosphoproteomics site dataset from Ochoa et al.

    Loads ``phosphosites/ochoa_site.parquet`` through `Data.read_file`.
    """
    return Data.read_file("phosphosites/ochoa_site.parquet")

In [None]:
Data.get_ochoa_site()

Unnamed: 0,uniprot,position,residue,is_disopred,disopred_score,log10_hotspot_pval_min,isHotspot,uniprot_position,functional_score,current_uniprot,name,gene,Sequence,is_valid,site_seq,gene_site
0,A0A075B6Q4,24,S,1.0,0.91,6.839384,1.0,A0A075B6Q4_24,0.149257,A0A075B6Q4,A0A075B6Q4_HUMAN,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,True,VDDEKGDSNDDYDSA,A0A075B6Q4_S24
1,A0A075B6Q4,35,S,1.0,0.87,9.192622,0.0,A0A075B6Q4_35,0.136966,A0A075B6Q4,A0A075B6Q4_HUMAN,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,True,YDSAGLLSDEDCMSV,A0A075B6Q4_S35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112279,V9GYY5,134,T,1.0,0.83,2.055830,0.0,V9GYY5_134,0.187417,V9GYY5,V9GYY5_HUMAN,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,True,SEEEASSTEKPTKAL,V9GYY5_T134
112280,V9GYY5,138,T,1.0,0.82,0.726611,0.0,V9GYY5_138,0.121025,V9GYY5,V9GYY5_HUMAN,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,True,ASSTEKPTKALPRKS,V9GYY5_T138


In [None]:
#| export
@patch_to(Data)
def get_combine_site_psp_ochoa() -> pd.DataFrame:
    """Get the combined site dataset from Ochoa and PhosphoSitePlus.

    Loads ``phosphosites/combine_site_psp_ochoa.parquet`` through
    `Data.read_file`.
    """
    return Data.read_file("phosphosites/combine_site_psp_ochoa.parquet")

In [None]:
Data.get_combine_site_psp_ochoa()

Unnamed: 0,uniprot,gene,site,site_seq,source,AM_pathogenicity,CDDM_upper,CDDM_max_score
0,A0A024R4G9,C19orf48,S20,ITGSRLLSMVPGPAR,psp,,"PRKX,AKT1,PKG1,P90RSK,HIPK4,AKT3,HIPK1,PKACB,H...",2.407041
1,A0A075B6Q4,,S24,VDDEKGDSNDDYDSA,ochoa,,"CK2A2,CK2A1,GRK7,GRK5,CK1G1,CK1A,IKKA,CK1G2,CA...",2.295654
...,...,...,...,...,...,...,...,...
121417,V9GYY5,,T134,SEEEASSTEKPTKAL,ochoa,,"ASK1,PERK,EEF2K,MAP2K4,MEKK2,MST1,BMPR1B,OSR1,...",1.832532
121418,V9GYY5,,T138,ASSTEKPTKALPRKS,ochoa,,"ASK1,MEK2,MPSK1,TNIK,PBK,MST2,MINK,NEK4,LKB1,MEK5",1.807565


In [None]:
#| export
@patch_to(Data)
def get_combine_site_phosphorylated():
    """Get the combined phosphorylated site dataset (Ochoa + PhosphoSitePlus).

    Loads ``phosphosites/phosphorylated_combine_site.parquet`` through
    `Data.read_file`.
    """
    return Data.read_file("phosphosites/phosphorylated_combine_site.parquet")

In [None]:
Data.get_combine_site_phosphorylated()

Unnamed: 0,uniprot,gene,site,site_seq,source,AM_pathogenicity,CDDM,PSPA,CDDM_max_score,PSPA_max_score
0,A0A024R4G9,C19orf48,S20,ITGSRLLsMVPGPAR,psp,,"PRKX,PKG1,AKT1,AKT3,HIPK4,P90RSK,PKACB,PKACA,P...","MAPKAPK5,AKT1,RSK3,P70S6K,MAPKAPK3,AKT2,DYRK1A...",2.339278,3.726109
1,A0A075B6Q4,,S24,VDDEKGDsNDDYDSA,ochoa,,"CK2A2,CK2A1,GRK7,GRK5,CK1G1,IKKA,CAMK1D,MARK2,...","CAMK2B,CK2A2,CAMK2A,CK2A1,GRK7,TLK2,FAM20C,CAM...",2.253027,4.940056
...,...,...,...,...,...,...,...,...,...,...
120102,V9GYY5,,T134,sEEEAsstEKPtKAL,ochoa,,"PERK,ASK1,EEF2K,MST1,BMPR1B,PBK,MEKK2,OSR1,MST...","CK1G2,GSK3A,ALPHAK3,GRK1,GRK7,GSK3B,BMPR1B,BMP...",1.723089,7.009429
120103,V9GYY5,,T138,AsstEKPtKALPRKS,ochoa,,"ASK1,PBK,TNIK,MPSK1,MINK,MST2,NEK4,MEK2,MST1,BUB1","CK1G3,CK1G2,CK1A2,CK1D,CK1A,GRK3,PASK,GRK2,CK1...",1.651888,4.350109


In [None]:
#| export
@patch_to(Data)
@lru_cache
def get_human_site():
    """Get the combined phosphorylated site dataset, 20-length version.

    Loads ``phosphosites/phosphorylated_combine_site20.parquet`` (Ochoa +
    PhosphoSitePlus) through `Data.read_file`. The function is wrapped in
    `lru_cache`, so repeated calls reuse the first result.
    """
    return Data.read_file("phosphosites/phosphorylated_combine_site20.parquet")

In [None]:
Data.get_human_site()

Unnamed: 0,substrate_uniprot,substrate_genes,site,source,AM_pathogenicity,substrate_sequence,substrate_species,sub_site,substrate_phosphoseq,position,site_seq
0,A0A024R4G9,C19orf48 MGC13170 hCG_2008493,S20,psp,,MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,Homo sapiens (Human),A0A024R4G9_S20,MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTRTWLLSH...,20,_MTVLEAVLEIQAITGSRLLsMVPGPARPPGSCWDPTQCTR
1,A0A075B6Q4,,S24,ochoa,,MDIQKSENEDDSEWEDVDDEKGDSNDDYDSAGLLSDEDCMSVPGKT...,Homo sapiens (Human),A0A075B6Q4_S24,MDIQKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPGKT...,24,QKSENEDDSEWEDVDDEKGDsNDDYDSAGLLsDEDCMSVPG
...,...,...,...,...,...,...,...,...,...,...,...
121330,V9GYY5,,T134,ochoa,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,Homo sapiens (Human),V9GYY5_T134,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,134,LGLtPPEGGAGDRsEEEAsstEKPtKALPRKSRDPLLSQRI
121331,V9GYY5,,T138,ochoa,,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,Homo sapiens (Human),V9GYY5_T138,KRDGDDRRPRLVLSFDEEKRREYLTGFHKRKVERKKAAIEEIKQRL...,138,PPEGGAGDRsEEEAsstEKPtKALPRKSRDPLLSQRISSLT


### Reactome

In [None]:
#| export
@patch_to(Data)
@lru_cache
def get_reactome_pathway_lo() -> pd.DataFrame:
    """Get lowest-level Reactome pathways, identified by UniProt ID.

    Loads ``reactome_lowest_level.parquet`` through `Data.read_file`. The
    function is wrapped in `lru_cache`, so repeated calls reuse the first
    result.
    """
    return Data.read_file("reactome_lowest_level.parquet")

In [None]:
Data.get_reactome_pathway_lo()

Unnamed: 0,uniprot,reactome_id,pathway,type,species
0,A0A023GPK8,R-DME-373753,Nephrin family interactions,IEA,Drosophila melanogaster
1,A0A023GRW3,R-DME-72163,mRNA Splicing - Major Pathway,IEA,Drosophila melanogaster
...,...,...,...,...,...
306903,Z4YHD9,R-BTA-9734091,Drug-mediated inhibition of MET activation,IEA,Bos taurus
306904,Z4YJC3,R-GGA-983168,Antigen processing: Ubiquitination & Proteasom...,IEA,Gallus gallus


In [None]:
#| export
@patch_to(Data)
@lru_cache
def get_reactome_pathway() -> pd.DataFrame:
    """Get Reactome pathways of all levels, identified by UniProt ID.

    Loads ``reactome_all_levels.parquet`` through `Data.read_file`. The
    function is wrapped in `lru_cache`, so repeated calls reuse the first
    result.
    """
    # NOTE: no 'lowest' indicator column is added here. A disabled draft
    # derived it as
    #   path_all.reactome_id.isin(path_lo.reactome_id).astype(int)
    # with path_lo = Data.get_reactome_pathway_lo().
    return Data.read_file("reactome_all_levels.parquet")

The data is from Reactome's download page: https://reactome.org/download-data

Download `UniProt to All pathways` under `Identifier mapping files`

For type, there are IEA (Inferred from Electronic Annotation) and TAS (Traceable Author Statement, higher confidence)

In [None]:
Data.get_reactome_pathway()

Unnamed: 0,uniprot,reactome_id,pathway,type,species
0,A0A023GPK8,R-DME-1500931,Cell-Cell communication,IEA,Drosophila melanogaster
1,A0A023GPK8,R-DME-373753,Nephrin family interactions,IEA,Drosophila melanogaster
...,...,...,...,...,...
888430,Z4YJC3,R-GGA-983168,Antigen processing: Ubiquitination & Proteasom...,IEA,Gallus gallus
888431,Z4YJC3,R-GGA-983169,Class I MHC mediated antigen processing & pres...,IEA,Gallus gallus


## CPTAC

In [None]:
#| export
class CPTAC:
    "A class for fetching CPTAC phosphoproteomics data."

    @staticmethod
    def _parse_site_info(raw: pd.DataFrame, # CPTAC site table; must have a pipe-delimited `idx` column
                         ref: pd.DataFrame, # ID mapping table with `protein`, `gene` and `gene_name` columns
                         cancer: str, # cancer type, only used in the progress messages
                         is_KB: bool=False, # deduplicate on `protein_site` (True) or on `gene_site` (False)
                        ):
        "Build the site table from `raw.idx`, map gene IDs through `ref`, and drop duplicated sites"
        # Split once and reuse: fields 0, 2 and 3 of `idx` become gene, site and site_seq
        parts = raw.idx.str.split('|')
        info = pd.DataFrame({'gene': parts.str[0],
                             'site': parts.str[2],
                             'site_seq': parts.str[3]})
        print(f'the {cancer} dataset length is: {info.shape[0]}')

        # Merge ensembl ID with gene name; `gene` is the only column shared by both tables
        info = info.merge(ref, how='left', on='gene')
        print(f'after id mapping, the length is {info.shape[0]}')
        print(f'{info.gene_name.isna().sum()} sites does not have a mapped gene name')

        # Rows without a mapped gene/protein end up with a missing gene_site/protein_site
        info['gene_site'] = info['gene_name'] + '_' + info['site']
        # Keep only the part of the protein ID before the first '.'
        info['protein_site'] = info['protein'].str.split('.').str[0] + '_' + info['site']

        dedup_key = "protein_site" if is_KB else "gene_site"
        info = info.drop_duplicates(subset=dedup_key).reset_index(drop=True)
        # Report the key that was actually used for deduplication
        print(f'after removing duplicates of {dedup_key}, the length is {info.shape[0]}')
        return info

    @staticmethod
    def _read_file(cancer: str, # cancer type CPTAC
                    is_Tumor: bool=True, # tumor tissue or normal
                    is_KB: bool=False, # whether it is for LinkedOmicsKB or LinkedOmics
                   ):
        "Fetch the ID mapping and phosphosite data of `cancer`; return a DataFrame, or None if the data file cannot be read"

        # path of ID and data
        sample_type = "Tumor" if is_Tumor else "Normal"
        ID_URL = f"https://zenodo.org/records/8196130/files/bcm-{cancer.lower()}-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
        DATA_URL = f"https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/{cancer.upper()}/{cancer.upper()}_phospho_site_abundance_log2_reference_intensity_normalized_{sample_type}.txt"

        # Load ID data (a failure here is not caught and propagates to the caller)
        ref = pd.read_csv(ID_URL, compression='gzip', sep='\t')[['protein','gene','gene_name']].drop_duplicates().reset_index(drop=True)

        # Load CPTAC phosphoproteomics data
        try:
            raw = pd.read_csv(DATA_URL, sep='\t')
        except Exception as e:
            # Best-effort: report the problem and signal "no data" instead of raising
            print(f'{cancer} has {e}')
            return None

        return CPTAC._parse_site_info(raw, ref, cancer, is_KB)
    

In [None]:
#| export
@patch_to(CPTAC)
def list_cancer():
    "Return the CPTAC cancer type codes that are available"
    cancer_types = (
        'HNSCC', 'GBM', 'COAD', 'CCRCC', 'LSCC',
        'BRCA', 'UCEC', 'LUAD', 'PDAC', 'OV',
    )
    # Hand out a fresh list on every call so callers can modify it freely
    return list(cancer_types)

In [None]:
CPTAC.list_cancer()

['HNSCC', 'GBM', 'COAD', 'CCRCC', 'LSCC', 'BRCA', 'UCEC', 'LUAD', 'PDAC', 'OV']

In [None]:
#| export
@patch_to(CPTAC)
def get_id(cancer_type: str, # cancer type, must be one of CPTAC.list_cancer()
           is_Tumor: bool=True, # tumor tissue or normal
           is_KB: bool=False, # whether it is for LinkedOmicsKB or LinkedOmics
          ):
    "Load the CPTAC phosphorylation site information of one cancer type"
    supported = CPTAC.list_cancer()
    # Reject unknown cancer types before any download is attempted
    assert cancer_type in supported, "cancer type is not included, check available cancer types from CPTAC.list_cancer()"
    return CPTAC._read_file(cancer_type, is_Tumor=is_Tumor, is_KB=is_KB)

Use `CPTAC.get_id()` to load CPTAC phosphorylation site information. Fold change of various conditions can be acquired through [LinkedOmics](https://www.linkedomics.org/login.php) or [LinkedOmicsKB](https://kb.linkedomics.org/). Use `is_KB` to indicate whether the phosphorylation site information is for LinkedOmics or LinkedOmicsKB.

```python
# normal tissue
CPTAC.get_id('CCRCC',is_KB=True, is_Tumor=False)
```

```python
# tumor
CPTAC.get_id('CCRCC',is_KB=True, is_Tumor=True)
```

## End

In [None]:
#| hide
import nbdev
nbdev.nbdev_export()