# PSSM

> Functions related to PSSMs

## Setup

In [None]:
#| default_exp pssm

In [None]:
#| export
import numpy as np, pandas as pd
from katlas.data import *
from katlas.preprocess import *
from fastcore.meta import delegates

In [None]:
#| hide
from nbdev import show_doc
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', 100) # show all columns

```python
from katlas.pssm import *
```

## PSSM

In [None]:
#| export
def get_prob(df: pd.DataFrame, col: str, aa_order=[i for i in 'PGACSTVILMFYWHKRQNDEsty']):
    """Build a per-position residue frequency matrix from the sequences in `df[col]`.

    Rows follow `aa_order` (with s/t/y relabelled pS/pT/pY) and columns are positions
    counted from the middle of the sequence; each column is divided by its own total."""

    # NOTE(review): check_seq_df is a project helper; presumably it validates and
    # returns the sequence column as equal-length strings - confirm.
    seqs = check_seq_df(df, col)

    # one row per sequence, one column per residue
    residues = np.array(seqs.apply(list).tolist())
    half = residues.shape[1] // 2
    position = list(range(-half, half + 1))  # +1 because range excludes its end point

    long_form = pd.DataFrame(residues, columns=position).melt(var_name='Position', value_name='aa')

    counts = long_form.groupby(['Position', 'aa']).size().reset_index(name='Count')
    # residues that are not listed in aa_order are not counted
    counts = counts[counts.aa.isin(aa_order)].reset_index(drop=True)

    count_matrix = counts.pivot(index='aa', columns='Position', values='Count').fillna(0)
    freq = count_matrix / count_matrix.sum()

    # fixed row/column layout; residue-position pairs never observed become 0
    freq = freq.reindex(index=aa_order, columns=position, fill_value=0)
    return freq.rename(index={'s': 'pS', 't': 'pT', 'y': 'pY'})

In [None]:
ks = Data.get_ks_dataset()

In [None]:
ks_k = ks[ks.kinase_uniprot=='P00519']

In [None]:
pssm_df = get_prob(ks_k,'site_seq')
pssm_df.head()

Position,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
P,0.050061,0.048691,0.062349,0.055489,0.046988,0.054753,0.064787,0.05509,0.056683,0.048272,0.052257,0.054599,0.053974,0.04317,0.060839,0.067138,0.049971,0.055817,0.076968,0.049354,0.0,0.020024,0.049645,0.12537,0.054997,0.056872,0.057382,0.057588,0.062048,0.053463,0.058104,0.052728,0.05114,0.069436,0.063164,0.057716,0.056639,0.051072,0.050697,0.052163,0.060703
G,0.080586,0.080341,0.069007,0.067551,0.08253,0.070397,0.093581,0.073054,0.077566,0.072706,0.067102,0.077745,0.052788,0.07806,0.07088,0.065371,0.087008,0.073443,0.084019,0.065217,0.0,0.091284,0.06974,0.062685,0.070373,0.075237,0.060371,0.072585,0.08012,0.077157,0.072783,0.099939,0.070856,0.071916,0.075672,0.071518,0.064821,0.080076,0.08872,0.062341,0.090735
A,0.080586,0.080341,0.062954,0.054282,0.075301,0.0716,0.070186,0.07006,0.065632,0.070322,0.077791,0.073591,0.053381,0.06505,0.069108,0.071849,0.062316,0.081669,0.063455,0.060517,0.0,0.102473,0.084515,0.075103,0.063276,0.075237,0.083084,0.04919,0.053614,0.073512,0.073394,0.064378,0.077634,0.069436,0.072545,0.063363,0.079924,0.088272,0.087452,0.057888,0.070927
C,0.017094,0.012781,0.013317,0.019903,0.012048,0.017449,0.007798,0.014371,0.013126,0.012515,0.023159,0.01543,0.012456,0.014784,0.010632,0.012956,0.014697,0.009988,0.010576,0.008226,0.0,0.017079,0.01773,0.023655,0.018924,0.010664,0.016736,0.010798,0.008434,0.010328,0.012232,0.007357,0.017868,0.014879,0.012508,0.01192,0.01888,0.019546,0.014575,0.019084,0.014058
S,0.047619,0.03591,0.04661,0.030157,0.037349,0.04272,0.041992,0.041916,0.03401,0.039333,0.037411,0.031454,0.043891,0.035482,0.024808,0.029446,0.026455,0.016451,0.021739,0.009401,0.0,0.019435,0.01773,0.019515,0.031934,0.029028,0.028691,0.034793,0.024699,0.032199,0.029969,0.024525,0.036352,0.047117,0.040025,0.042033,0.040277,0.039092,0.051965,0.041349,0.039617


In [None]:
#| export
def pssm_to_seq(pssm_df, 
                thr=0.4, # threshold of probability to show in sequence
                contain_sty=True, # keep only s,t,y values (last three) in center 0 position
                ):
    """Represent PSSM in string sequence of amino acids.

    For every position, up to three of the most probable residues whose value is
    above `thr` are shown: '.' when none pass, the residue itself when one passes,
    '[a/b]' when several pass. The center position (column 0) is marked with '*'.
    The input dataframe is not modified."""

    pssm_df = pssm_df.copy()
    # Only touch the center when it exists: assigning through .loc to a missing
    # label would append a new column 0 and add a spurious '.*' to the output.
    if contain_sty and 0 in pssm_df.columns:
        pssm_df.loc[pssm_df.index[:-3], 0] = 0  # keep only s,t,y in center 0 position

    pssm_df.index = pssm_df.index.map(lambda x: x.replace('pS', 's').replace('pT', 't').replace('pY', 'y'))

    consensus = []
    for col in pssm_df.columns:
        top = pssm_df[col].nlargest(3)
        passing = [aa for aa, prob in top.items() if prob > thr]

        if not passing:
            symbol = '.'
        elif len(passing) == 1:
            symbol = passing[0]
        else:
            symbol = f"[{'/'.join(passing)}]"

        if col == 0:  # center position
            symbol += '*'

        consensus.append(symbol)

    return ''.join(consensus)

In [None]:
pssm_to_seq(pssm_df,thr=0.1)

'........K.K.K..E.EEVy*[E/A].[L/P]....K..........L.'

In [None]:
#| export
def recover_pssm(flat_pssm:pd.Series,aa_order=list('PGACSTVILMFYWHKRQNDEsty')):
    "Rebuild the 2D pssm (residues x positions) from a flat Series keyed by position+residue labels"
    long_df = flat_pssm.reset_index()
    long_df.columns = ['info', 'value']

    labels = long_df['info'].str
    # leading (optionally negative) integer is the position, the remainder is the residue
    long_df['Position'] = labels.extract(r'(-?\d+)', expand=False).astype(int)
    long_df['aa'] = labels.extract(r'-?\d+\s*(.*)', expand=False)

    wide = long_df.pivot(index='aa', columns='Position', values='value').fillna(0)
    # rows in aa_order; phospho-residues are shown as pS/pT/pY
    ordered = wide.reindex(index=aa_order)
    return ordered.rename(index={'s': 'pS', 't': 'pT', 'y': 'pY'})

In [None]:
pspa = Data.get_pspa_all_norm()

In [None]:
flat_pssm = pspa.loc['AAK1'].dropna()

In [None]:
recovered = recover_pssm(flat_pssm)
recovered

Position,-5,-4,-3,-2,-1,0,1,2,3,4
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P,0.0720,0.0534,0.1084,0.0226,0.1136,0.0,0.0463,0.0527,0.0681,0.0628
G,0.0245,0.0642,0.0512,0.0283,0.0706,0.0,0.7216,0.0749,0.0923,0.0702
...,...,...,...,...,...,...,...,...,...,...
pT,0.0201,0.0332,0.0303,0.0209,0.0121,1.0,0.0123,0.0409,0.0335,0.0251
pY,0.0611,0.0339,0.0274,0.0486,0.0178,0.0,0.0100,0.0410,0.0359,0.0270


In [None]:
#| export
def process_pssm(pssm_df):
    """Keep only s,t,y values in center 0 position; normalize per position.

    Column labels are cast to int, every row except the last three is zeroed in
    column 0 (when that column exists), and each column is divided by its sum.
    Returns a new dataframe; the input is left untouched."""
    pssm_df=pssm_df.copy()
    pssm_df.columns= pssm_df.columns.astype(int)
    # Assigning through .loc to a missing label would append a new column 0 that
    # turns into NaN after normalization, so only zero the center when present.
    if 0 in pssm_df.columns:
        pssm_df.loc[pssm_df.index[:-3], 0] = 0
    pssm_df = pssm_df/pssm_df.sum()
    return pssm_df

In [None]:
norm_pssm = process_pssm(recovered)
norm_pssm.head()

Position,-5,-4,-3,-2,-1,0,1,2,3,4
aa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
P,0.058446,0.041715,0.0861,0.017935,0.096068,0.0,0.042649,0.040482,0.05264,0.05026
G,0.019888,0.050152,0.040667,0.022459,0.059704,0.0,0.664702,0.057536,0.071346,0.056182
A,0.023054,0.055152,0.08888,0.042695,0.032558,0.0,0.02874,0.057613,0.044987,0.051701
C,0.037016,0.043747,0.052025,0.046663,0.026469,0.0,0.020542,0.052543,0.057355,0.048259
S,0.0345,0.048356,0.041859,0.044044,0.046089,0.0,0.013172,0.042403,0.044987,0.044818


In [None]:
#| export
def pssm2dict(pssm_df):
    "Flatten a pssm dataframe into a {position+residue: value} dict, values rounded to 5 decimals"
    # long format: first column holds the column label (position), second the row label (residue)
    flat = pssm_df.unstack().reset_index(name='value')
    position, residue = flat.iloc[:, 0], flat.iloc[:, 1]
    flat['position_residue'] = position.astype(str) + residue
    rounded = flat.set_index('position_residue')['value'].round(5)
    return rounded.to_dict()

In [None]:
pssm2dict(pssm_df.iloc[:1,:10])

{'-20P': 0.05006,
 '-19P': 0.04869,
 '-18P': 0.06235,
 '-17P': 0.05549,
 '-16P': 0.04699,
 '-15P': 0.05475,
 '-14P': 0.06479,
 '-13P': 0.05509,
 '-12P': 0.05668,
 '-11P': 0.04827}

## JS divergence

In [None]:
#| export
def js_divergence(p1, # pssm 
                  p2, # pssm
                  mean=True):
    """p1 and p2 are two arrays (df or np) with index as aa and column as position.

    Computes the Jensen-Shannon divergence (natural log) by summing over axis 0,
    i.e. one value per column for 2D input and a single total for 1D/flat input.
    Entries where both inputs are 0 are skipped. With `mean=True` the mean of the
    result is returned, otherwise the result itself."""
    assert p1.shape==p2.shape
    mask = (p1 + p2) > 0 #skip those with double 0
    if isinstance(p1, np.ndarray) and isinstance(p2, np.ndarray):
        # Boolean indexing flattens a 2D ndarray and would lose the position axis;
        # blank the skipped cells with NaN and use a NaN-aware sum instead.
        p1, p2 = np.where(mask, p1, np.nan), np.where(mask, p2, np.nan)
        total = np.nansum
    else:
        # pandas: a DataFrame mask yields NaN cells, a Series mask drops entries;
        # np.sum dispatches to the pandas sum, which skips NaN.
        p1, p2 = p1[mask], p2[mask]
        total = np.sum

    m = 0.5 * (p1 + p2)
    # the 1e-10 offset keeps log() finite when one side is 0
    js = 0.5 * total(p1 * np.log(p1 / m + 1e-10), axis=0) + \
         0.5 * total(p2 * np.log(p2 / m + 1e-10), axis=0)
    return np.mean(js) if mean else js

In [None]:
js_divergence(pssm_df,pssm_df)

1.0000000826903708e-10

In [None]:
#| export
def js_divergence_flat(p1_flat, # pd.Series of flattened pssm
                       p2_flat, # pd.Series of flattened pssm
                       ):

    "JS divergence between two flattened pssm pd.Series, divided by the number of distinct positions in p1_flat's labels"

    # the position is the (optionally negative) integer found in each index label
    positions = p1_flat.index.str.extract(r'(-?\d+)', expand=False)
    total_position = len(positions.unique())
    divergence = js_divergence(p1_flat, p2_flat, mean=False)
    return divergence / total_position

In [None]:
flat_norm_pssm = pd.Series(pssm2dict(norm_pssm))

In [None]:
js_divergence(flat_norm_pssm,flat_norm_pssm)

1.0000050826907844e-09

## Entropy & Information Content

In [None]:
#| export
def entropy(pssm_df,# a dataframe of pssm with index as aa and column as position
            return_min=False, # return min entropy as a single value or return all entropy as a series
            exclude_zero=False, # exclude the column of 0 (center position) in the entropy calculation
            contain_sty=True, # keep only s,t,y values (last three) in center 0 position
            ): 
    """Calculate the Shannon entropy (bits) of every position of a PSSM.

    Columns are cast to int and re-normalized to sum to 1 before the entropy is
    computed. Returns a Series indexed by position, or its minimum when
    `return_min` is True. The input dataframe is not modified."""
    pssm_df = pssm_df.copy()
    pssm_df.columns= pssm_df.columns.astype(int)
    if 0 in pssm_df.columns:
        if exclude_zero:
            pssm_df = pssm_df.drop(columns=[0])
        elif contain_sty:
            # Must not run after the drop above: assigning through .loc to the
            # missing label would re-create column 0 and report a bogus entropy
            # of 0 for it (making the minimum always 0).
            pssm_df.loc[pssm_df.index[:-3], 0] = 0
    pssm_df = pssm_df/pssm_df.sum()
    # the 1e-9 offset keeps log2() finite for zero probabilities
    per_position = -np.sum(pssm_df * np.log2(pssm_df + 1e-9), axis=0)
    return per_position.min() if return_min else per_position

In [None]:
entropy(pssm_df)

Position
-20    4.324109
-19    4.257291
         ...   
 19    4.293755
 20    4.279981
Length: 41, dtype: float64

In [None]:
#| export
@delegates(entropy)
def entropy_flat(flat_pssm:pd.Series,**kwargs): 
    "Entropy per position of a flat PSSM: rebuild the 2D matrix, then forward kwargs to `entropy`"
    return entropy(recover_pssm(flat_pssm), **kwargs)

In [None]:
#| export
def get_IC_standard(pssm_df):
    """Scale a frequency matrix by the standard information content (bits) of each position.

    The same maximum entropy, log2(len(pssm_df)), is used for every position;
    each column of `pssm_df` is multiplied by (max entropy - entropy) of that position."""

    # information_content = max_entropy - entropy --> log2(N) - entropy
    max_bits = np.log2(len(pssm_df))
    ic_per_position = max_bits - entropy(pssm_df)
    return pssm_df.mul(ic_per_position)

In [None]:
#| export
@delegates(entropy)
def get_IC(pssm_df,**kwargs):
    """Calculate the information content (bits) from a frequency matrix,
    using log2(3) for the middle position and log2(len(pssm_df)) for others.

    Keyword arguments are forwarded to `entropy`. Returns max entropy minus
    entropy for every position."""
    
    entropy_position = entropy(pssm_df,**kwargs)
    
    # Use the positions entropy() reports (integer labels, possibly without the
    # center) so both sides of the subtraction align; fall back to the matrix
    # columns when entropy() returned a single value (return_min=True).
    positions = entropy_position.index if isinstance(entropy_position, pd.Series) else pssm_df.columns
    max_entropy_array = pd.Series(np.log2(len(pssm_df)), index=positions)
    
    # Only override the center when it exists; assigning to a missing label would
    # append a position 0 and produce a NaN entry in the result.
    if 0 in max_entropy_array.index:
        max_entropy_array.loc[0] = np.log2(3)

    # information_content = max_entropy - entropy --> log2(N) - entropy
    IC_position = max_entropy_array - entropy_position
    return IC_position

In [None]:
#| export
@delegates(get_IC)
def get_IC_flat(flat_pssm:pd.Series,**kwargs):
    """Information content (bits) per position of a flattened pssm pd.Series:
    the 2D matrix is rebuilt with `recover_pssm` and passed, with kwargs, to `get_IC`."""
    
    return get_IC(recover_pssm(flat_pssm), **kwargs)

In [None]:
#| export
def get_scaled_IC(pssm_df):
    """For plotting purpose, scale a frequency matrix by its information content (bits):
    every column is multiplied by the value `get_IC` returns for that position."""
    
    ic_per_position = get_IC(pssm_df)
    scaled = pssm_df.mul(ic_per_position, axis=1)
    return scaled

## PSPA normalization

In [None]:
#| export
def raw2norm(df: pd.DataFrame, # single kinase's df has position as index, and single amino acid as columns
             PDHK: bool=False, # whether this kinase belongs to PDHK family 
            ):
    
    """Normalize single ST kinase data.

    Each position (row) is divided by the sum of its randomized amino acids
    (all columns except S, T, C and the phospho-residues s, t, y; additionally Y
    for PDHK). The C column is then rescaled so its median is 1/17 (1/16 for
    PDHK), S and T are set to the per-position median of the randomized amino
    acids, and values are rounded to 4 decimals. Returns a new dataframe."""
    # 's' is listed too: when it has not been dropped upstream it must not be
    # counted among the randomized amino acids.
    non_random = ['S', 'T', 'C', 's', 't', 'y']
    
    if PDHK:
        non_random.append('Y')
        divisor = 16
    else:
        divisor = 17
    
    # only drop what is actually there, so optional columns may be absent
    non_random = [c for c in non_random if c in df.columns]
    
    row_total = df.drop(columns=non_random).sum(1)
    df2 = df.div(row_total, axis=0)
    df2['C'] = df2['C'] / (df2['C'].median() * divisor)
    # S and T share the same per-position median of the randomized amino acids
    position_median = df2.drop(columns=non_random).median(1)
    df2['S'] = position_median
    df2['T'] = position_median
    df2 = df2.round(4)
    
    return df2

This function implements the normalization method from [Johnson et al. Nature: An atlas of substrate specificities for the human serine/threonine kinome](https://www.nature.com/articles/s41586-022-05575-3#Sec6)

Specifically,
> - matrices were column-normalized at all positions by the sum of the 17 randomized amino acids (excluding serine, threonine and cysteine), to yield PSSMs. 
>- PDHK1 and PDHK4 were normalized to the 16 randomized amino acids (excluding serine, threonine, cysteine and additionally tyrosine)
>- The cysteine row was scaled by its median to be 1/17 (1/16 for PDHK1 and PDHK4). 
>- The serine and threonine values in each position were set to be the median of that position.
>- The S0/T0 ratio was determined by summing the values of S and T rows in the matrix (SS and ST, respectively), accounting for the different S vs. T composition of the central (1:1) and peripheral (only S or only T) positions (Sctrl and Tctrl, respectively), and then normalizing to the higher value among the two (S0 and T0, respectively, Supplementary Note 1)

This function is usually applied through the function below, by setting its `normalize` argument to `True`.

In [None]:
#| export
def get_one_kinase(df: pd.DataFrame, #stacked dataframe (paper's raw data)
                   kinase:str, # a specific kinase
                   normalize: bool=False, # normalize according to the paper; special for PDHK1/4
                   drop_s: bool= True, # drop s as s is a duplicates of t in PSPA
                  ):
    """Obtain a specific kinase data from stacked dataframe.

    The row `kinase` of `df` is reshaped into a matrix with position as index and
    amino acid as columns: the integer in each column label is the position and
    the label's last character is the amino acid. With `normalize=True` the
    matrix is passed through `raw2norm` (PDHK mode for PDHK1/PDHK4)."""
    
    # Name the label axis explicitly so this also works when df.columns carries a
    # name (reset_index would then not produce an 'index' column).
    p = df.loc[kinase].rename_axis('substrate').reset_index(name=kinase)
    # raw string: '\d' in a plain literal is an invalid escape sequence
    p['position'] = p['substrate'].str.extract(r'(-?\d+)', expand=False).astype(int)
    p['aa'] = p['substrate'].str[-1]
    pp = p.pivot(index='position', columns='aa', values=kinase)
    if drop_s and 's' in pp.columns:
        pp = pp.drop(columns=['s'])

    if normalize:
        pp = raw2norm(pp, PDHK=kinase in ('PDHK1', 'PDHK4'))
    return pp

Retrieve a single kinase's data from PSPA data, which is formatted with kinases as the index and position + amino acid as the columns.

In [None]:
data = Data.get_pspa_st_norm()

In [None]:
get_one_kinase(data,'PDHK1')

aa,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,t,y
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
-5,0.0594,0.0625,0.0589,0.0550,0.0775,0.0697,0.0687,0.0590,0.0515,0.0657,0.0687,0.0613,0.0451,0.0424,0.0594,0.0594,0.0594,0.0573,0.1001,0.0775,0.0583,0.0658
-4,0.0618,0.0621,0.0550,0.0511,0.0739,0.0715,0.0598,0.0601,0.0520,0.0614,0.0744,0.0549,0.0637,0.0552,0.0617,0.0608,0.0608,0.0519,0.0916,0.0739,0.0528,0.0752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,0.0486,0.0609,0.0938,0.0684,0.1024,0.0676,0.0544,0.0583,0.0388,0.0552,0.0637,0.0505,0.0686,0.0502,0.0561,0.0588,0.0588,0.0593,0.0641,0.1024,0.0539,0.0431
4,0.0565,0.0749,0.0631,0.0535,0.0732,0.0655,0.0664,0.0625,0.0496,0.0552,0.0627,0.0640,0.0677,0.0553,0.0604,0.0626,0.0626,0.0579,0.0864,0.0732,0.0548,0.0575


## End

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [None]:
#| hide
# def get_freq(df_k: pd.DataFrame, # a dataframe for a single kinase that contains phosphorylation sequence splitted by their position
#              aa_order = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the full matrix 
#              aa_order_paper = [i for i in 'PGACSTVILMFYWHKRQNDEsty'], # amino acid to include in the partial matrix
#              position = [i for i in range(-7,8)], # position to include in the full matrix
#              position_paper = [-5,-4,-3,-2,-1,1,2,3,4] # position to include in the partial matrix
#              ):
    
#     "Get frequency matrix given a dataframe of phosphorylation sites for a single kinase"
    

#     #Count frequency for each amino acid at each position
#     melted_k = df_k.melt(
#                     value_vars=[i for i in range(-7, 8)],
#                     var_name='Position', 
#                     value_name='aa')
    
#     # Group by Position and Amino Acid and count occurrences
#     grouped = melted_k.groupby(['Position', 'aa']).size().reset_index(name='Count')
    

#     # Remove wired amino acid
#     aa_include = [i for i in 'PGACSTVILMFYWHKRQNDEsty']
#     grouped = grouped[grouped.aa.isin(aa_include)].reset_index(drop=True)
    
#     # get pivot table
#     pivot_k = grouped.pivot(index='aa', columns='Position', values='Count').fillna(0)
    
#     # Get frequency by dividing the sum of each column
#     freq_k = pivot_k/pivot_k.sum()

    
#     # data from the kinase-substrate dataset, and format is Lew's paper's format
#     paper = freq_k.reindex(index=aa_order_paper,columns=position_paper,fill_value=0)

#     # full pivot data from kinase-substrate dataset
#     full = freq_k.reindex(index=aa_order,columns=position, fill_value=0)

    
#     return paper,full

# # get frequency matrix
# paper_format, full = get_freq(ks_k)
# paper_format.head()

# def get_unique_site(df:pd.DataFrame = None,# dataframe that contains phosphorylation sites
#                     seq_col: str='site_seq', # column name of site sequence
#                     id_col: str='gene_site' # column name of site id
#                    ):
#     "Remove duplicates among phosphorylation sites; return df with new columns of acceptor and number of duplicates"
    
#     unique = df.groupby(seq_col).agg(
#         {id_col: lambda r: '|'.join(r.unique())} )
#     unique['num_site'] = unique[id_col].str.split('|').apply(len) 
#     unique = unique.reset_index()
#     position = len(unique[seq_col][0])//2
#     unique['acceptor'] = unique[seq_col].str[position]
    
#     return unique

# As there are lots of duplicates of the phosphorylation site sequence in the dataset, it could be helpful to remove the duplicated sequences. 

# Implement `get_unique_site` to get unique phosphorylation sites. Need to inform columns of sequence and id.

# df = Data.get_ochoa_site()
# unique = get_unique_site(df,seq_col='site_seq',id_col='gene_site')
# unique.sort_values('num_site',ascending=False).head()