# Import libraries 

In [None]:
# import necessary libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load data

In [None]:
df = pd.read_csv('../data/41586_2022_5575_MOESM5_ESM.csv', delimiter=';')


In [None]:
df

# Data pre processing

In [None]:
# columns needed 
columns_needed = ['Uniprot Primary Accession', 'Protein','Phosphosite','SITE_+/-7_AA' ]

Select the rank columns 

In [None]:
rank_columns = [x for x in df.columns if 'rank' in x]
rank_columns

Cleaned up data 

In [None]:
data = df[columns_needed + rank_columns]
data.head()

In [None]:
data

# Creating Ranked Kinase DataFrame 

In [None]:
def create_kinase_ranked(number: int, data: pd.DataFrame = data, 
                         columns_needed: list = columns_needed, 
                         rank_columns: list = rank_columns) -> pd.DataFrame:
    """
    Creates a DataFrame of kinases ranked based on a rank number.

    Parameters:
    number (int): The rank number to filter the data.
    data (DataFrame): The original DataFrame containing kinase data.
    columns_needed (list): List of column names to be included in the final DataFrame.
    rank_columns (list): List of column names in the original DataFrame that contain ranking information.

    Returns:
    DataFrame: A new DataFrame containing kinases of the specified rank, with columns as specified in columns_needed.
    """
    kinase_ranked = pd.DataFrame()
    for rank in rank_columns: 
        new_df = data[data[rank] == number]
        new_df[f'ranked_{number}'] = rank
        kinase_ranked = pd.concat([kinase_ranked, new_df[columns_needed+[f'ranked_{number}']].reset_index(drop=True)], axis=0)
    return kinase_ranked




## Generating and Saving Ranked Kinase Data to CSV Files

In [None]:
for rank in [1,2,3]: 
    kinase_ranked = create_kinase_ranked(rank)
    # create a csv file in the data folder 
    kinase_ranked.to_csv(f'../data/kinase_ranked_{rank}.csv', index=False)

# Merge the data

In [82]:
# load the kinsase substrate humand data 
df_human = pd.read_csv('../data/Kinase_Substrate_Dataset_human.txt', delimiter='\t')
df_human.head()

Unnamed: 0,GENE,KINASE,KIN_ACC_ID,KIN_ORGANISM,SUBSTRATE,SUB_GENE_ID,SUB_ACC_ID,SUB_GENE,SUB_ORGANISM,SUB_MOD_RSD,SITE_GRP_ID,SITE_+/-7_AA,DOMAIN,IN_VIVO_RXN,IN_VITRO_RXN,CST_CAT#
0,EIF2AK1,HRI,Q9BQI3,human,eIF2-alpha,1965.0,P05198,EIF2S1,human,S52,447635,MILLsELsRRRIRsI,S1,,X,3597; 9721; 3398; 5199; 53085
1,EIF2AK1,HRI,Q9BQI3,human,eIF2-alpha,1965.0,P05198,EIF2S1,human,S49,450210,IEGMILLsELsRRRI,S1,,X,
2,PRKCD,PKCD,Q05655,human,HDAC5,10014.0,Q9UQL6,HDAC5,human,S259,447995,FPLRkTAsEPNLKVR,,,X,3443
3,PRKCD,PKCD,Q05655,human,PTPRA iso2,5786.0,P18433-2,PTPRA,human,S204,447612,PLLARSPsTNRKYPP,,X,,
4,PRKCD,PKCD,Q05655,human,Bcl-2,596.0,P10415,BCL2,human,S70,448395,RDPVARtsPLQtPAA,,X,,2834; 2827


In [83]:
id_columns = [x for x in df_human.columns if 'ID' in x]
id_columns

['KIN_ACC_ID', 'SUB_GENE_ID', 'SUB_ACC_ID', 'SITE_GRP_ID']

In [84]:
column_needed = ['KINASE', 'SUBSTRATE' , 'KIN_ORGANISM', 'SUB_ORGANISM', 'SITE_+/-7_AA','SUB_MOD_RSD']

In [85]:
df_human = df_human[column_needed+id_columns]

In [None]:
df_1 = create_kinase_ranked(1)
df_2 = create_kinase_ranked(2)
df_3 = create_kinase_ranked(3)

In [118]:
data_frame_dict = {'kinase_ranked_1': df_1, 'kinase_ranked_2': df_2, 'kinase_ranked_3': df_3}

In [141]:
current_rank = 1

In [142]:
df_human

Unnamed: 0,KINASE,SUBSTRATE,KIN_ORGANISM,SUB_ORGANISM,SITE_+/-7_AA,SUB_MOD_RSD,KIN_ACC_ID,SUB_GENE_ID,SUB_ACC_ID,SITE_GRP_ID
0,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635
1,HRI,eIF2-alpha,human,human,IEGMILLsELsRRRI,S49,Q9BQI3,1965.0,P05198,450210
2,PKCD,HDAC5,human,human,FPLRkTAsEPNLKVR,S259,Q05655,10014.0,Q9UQL6,447995
3,PKCD,PTPRA iso2,human,human,PLLARSPsTNRKYPP,S204,Q05655,5786.0,P18433-2,447612
4,PKCD,Bcl-2,human,human,RDPVARtsPLQtPAA,S70,Q05655,596.0,P10415,448395
...,...,...,...,...,...,...,...,...,...,...
13728,ULK2,Raptor,human,human,QRVLDtssLtQsAPA,S855,Q8IYT8,57521.0,Q8N122,3205935
13729,ULK2,Raptor,human,human,DtssLtQsAPAsPtN,S859,Q8IYT8,57521.0,Q8N122,2024885
13730,ULK2,SEC16A,human,human,LAQPINFsVSLSNSH,S846,Q8IYT8,9919.0,O15027,55578720
13731,ULK2,PIK3C3,human,human,ESsPILTsFELVKVP,S249,Q8IYT8,5289.0,Q8NEB9,35483209


In [143]:
left = ['SUB_ACC_ID', 'SITE_+/-7_AA', 'SUB_MOD_RSD']
right = ['Uniprot Primary Accession', 'SITE_+/-7_AA', 'Phosphosite']


In [156]:
df_human.merge(data_frame_dict[f'kinase_ranked_{current_rank}'], left_on=['SUB_ACC_ID'], right_on=['Uniprot Primary Accession'], how='left')

Unnamed: 0,KINASE,SUBSTRATE,KIN_ORGANISM,SUB_ORGANISM,SITE_+/-7_AA_x,SUB_MOD_RSD,KIN_ACC_ID,SUB_GENE_ID,SUB_ACC_ID,SITE_GRP_ID,Uniprot Primary Accession,Protein,Phosphosite,SITE_+/-7_AA_y,ranked_1
0,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,P05198,IF2A,T279,QMEPKVVTDTDETEL,ALPHAK3_rank
1,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,P05198,IF2A,T281,EPKVVTDTDETELAR,ALPHAK3_rank
2,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,P05198,IF2A,S91,DLSKRRVSPEEAIKC,CLK4_rank
3,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,P05198,IF2A,S52,MILLSELSRRRIRSI,HRI_rank
4,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,P05198,IF2A,S158,DAFKHAVSDPSILDS,MARK3_rank
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266194,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,A2RUS2,DEND3,S503,TAGCRGSSAVLNVTP,CDKL5_rank
266195,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,A2RUS2,DEND3,S502,DTAGCRGSSAVLNVT,NIM1_rank
266196,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,A2RUS2,DEND3,S490,ELAPRNSSLRLTDTA,PHKG1_rank
266197,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,A2RUS2,DEND3,S489,PELAPRNSSLRLTDT,TSSK2_rank


In [145]:
df_human.merge(data_frame_dict[f'kinase_ranked_{current_rank}'], left_on=['SITE_+/-7_AA'], right_on=['SITE_+/-7_AA'], how='left')

Unnamed: 0,KINASE,SUBSTRATE,KIN_ORGANISM,SUB_ORGANISM,SITE_+/-7_AA,SUB_MOD_RSD,KIN_ACC_ID,SUB_GENE_ID,SUB_ACC_ID,SITE_GRP_ID,Uniprot Primary Accession,Protein,Phosphosite,ranked_1
0,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,,,,
1,HRI,eIF2-alpha,human,human,IEGMILLsELsRRRI,S49,Q9BQI3,1965.0,P05198,450210,,,,
2,PKCD,HDAC5,human,human,FPLRkTAsEPNLKVR,S259,Q05655,10014.0,Q9UQL6,447995,,,,
3,PKCD,PTPRA iso2,human,human,PLLARSPsTNRKYPP,S204,Q05655,5786.0,P18433-2,447612,,,,
4,PKCD,Bcl-2,human,human,RDPVARtsPLQtPAA,S70,Q05655,596.0,P10415,448395,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13728,ULK2,Raptor,human,human,QRVLDtssLtQsAPA,S855,Q8IYT8,57521.0,Q8N122,3205935,,,,
13729,ULK2,Raptor,human,human,DtssLtQsAPAsPtN,S859,Q8IYT8,57521.0,Q8N122,2024885,,,,
13730,ULK2,SEC16A,human,human,LAQPINFsVSLSNSH,S846,Q8IYT8,9919.0,O15027,55578720,,,,
13731,ULK2,PIK3C3,human,human,ESsPILTsFELVKVP,S249,Q8IYT8,5289.0,Q8NEB9,35483209,,,,


In [146]:
df_human.merge(data_frame_dict[f'kinase_ranked_{current_rank}'], left_on=['SUB_MOD_RSD'], right_on=['Phosphosite'], how='left')

Unnamed: 0,KINASE,SUBSTRATE,KIN_ORGANISM,SUB_ORGANISM,SITE_+/-7_AA_x,SUB_MOD_RSD,KIN_ACC_ID,SUB_GENE_ID,SUB_ACC_ID,SITE_GRP_ID,Uniprot Primary Accession,Protein,Phosphosite,SITE_+/-7_AA_y,ranked_1
0,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,P25705,ATPA,S52,KTGTAEMSSILEERI,ALK4_rank
1,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,P01834,IGKC,S52,NALQSGNSQESVTEQ,ATM_rank
2,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,Q9H1P3,OSBL2,S52,GKTGERPSQENGIQK,ATR_rank
3,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,Q9H568,ACTL8,S52,SYARRRVSLGIDICH,AURB_rank
4,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,O43581,SYT7,S52,LGKRYKNSLETVGTP,BCKDK_rank
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762240,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,Q92985,IRF7,S472,TQREGVSSLDSSSLS,RAF1_rank
762241,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,Q13557,KCC2D,S472,VHFHRSGSPTVPIKP,SBK_rank
762242,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,Q66K74,MAP1S,S472,EGPGRAESKESVGSR,SKMLCK_rank
762243,ULK2,DENND3,human,human,THRRMVVsMPNLQDI,S472,Q8IYT8,22898.0,A2RUS2,7997148,Q14123,PDE1C,S472,QRRSSLNSISSSDAK,TTBK1_rank


In [147]:
df_human.merge(data_frame_dict[f'kinase_ranked_{current_rank}'], left_on=left, right_on=right, how='left')

Unnamed: 0,KINASE,SUBSTRATE,KIN_ORGANISM,SUB_ORGANISM,SITE_+/-7_AA,SUB_MOD_RSD,KIN_ACC_ID,SUB_GENE_ID,SUB_ACC_ID,SITE_GRP_ID,Uniprot Primary Accession,Protein,Phosphosite,ranked_1
0,HRI,eIF2-alpha,human,human,MILLsELsRRRIRsI,S52,Q9BQI3,1965.0,P05198,447635,,,,
1,HRI,eIF2-alpha,human,human,IEGMILLsELsRRRI,S49,Q9BQI3,1965.0,P05198,450210,,,,
2,PKCD,HDAC5,human,human,FPLRkTAsEPNLKVR,S259,Q05655,10014.0,Q9UQL6,447995,,,,
3,PKCD,PTPRA iso2,human,human,PLLARSPsTNRKYPP,S204,Q05655,5786.0,P18433-2,447612,,,,
4,PKCD,Bcl-2,human,human,RDPVARtsPLQtPAA,S70,Q05655,596.0,P10415,448395,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13728,ULK2,Raptor,human,human,QRVLDtssLtQsAPA,S855,Q8IYT8,57521.0,Q8N122,3205935,,,,
13729,ULK2,Raptor,human,human,DtssLtQsAPAsPtN,S859,Q8IYT8,57521.0,Q8N122,2024885,,,,
13730,ULK2,SEC16A,human,human,LAQPINFsVSLSNSH,S846,Q8IYT8,9919.0,O15027,55578720,,,,
13731,ULK2,PIK3C3,human,human,ESsPILTsFELVKVP,S249,Q8IYT8,5289.0,Q8NEB9,35483209,,,,


In [148]:
df_human.merge(data_frame_dict[f'kinase_ranked_{current_rank}'], left_on=left, right_on=right, how='left')[f'ranked_{current_rank}']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
13728    NaN
13729    NaN
13730    NaN
13731    NaN
13732    NaN
Name: ranked_1, Length: 13733, dtype: object