In [1]:
from katlas.data import *
from katlas.pssm import *
import pandas as pd,numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr

In [2]:
pssms = pd.read_parquet('out/all_site_pssms_filter.parquet')

## Cosine similarity with PSPA

In [3]:
pspa=Data.get_pspa_all_scale()

In [4]:
(~pspa.columns.isin(pssms.columns)).sum()

np.int64(0)

In [5]:
df = pssms[pspa.columns].copy()

In [6]:
def js_similarity(pssm1,pssm2):
    "Convert JSD to bits to be in range (0,1) then 1-JSD."
    distance = js_divergence(pssm1,pssm2)/np.log(2)
    similarity = 1-distance
    return similarity

In [7]:
def pearson_surrounding(p1, p2):
    "Compute Pearson correlation between two PSSMs, excluding column 0."
    p1 = p1.drop(columns=0).copy()
    p2 = p2.drop(columns=0).copy()
    s1 = p1.unstack()
    s2 = p2.unstack()
    r, _ = pearsonr(s1, s2)
    return r

In [18]:
def get_js_report(row1,# row1 is motif
                  row2,# row1 is kinase pssm
                 ):
    p1 = recover_pssm(row1) # motif
    p2 = recover_pssm(row2) # kinase pssm
    
    out = js_similarity(p1,p2)
    
    kinase_surrounding_IC=get_IC(p2,exclude_zero=True)
    # motif_surrounding_IC = get_IC(p1,exclude_zero=True)
    
    maxIC_position =kinase_surrounding_IC.idxmax()
    max_surrounding_value = out[maxIC_position]
    
    others = pd.Series(
        {'JSD_mean': out.mean(),
         'pearson_surrounding': pearson_surrounding(p1,p2),
         'kinase_max_IC_position': maxIC_position,
          'max_position_JSD': max_surrounding_value,
          
         # 'motif_IC_surrounding_max': motif_surrounding_IC.max() # -5 to +4
          })
    # return pd.concat([others,out])
    return others

In [9]:
def test(motif,kinase):
    r1 = df.loc[motif]
    r2 = pspa.loc[kinase]
    return get_js_report(r1,r2)

In [13]:
def test_motif(motif):
    a=[]
    for i,r in pspa.iterrows():
        a.append(test(motif,i))
    out = pd.DataFrame(a,index=pspa.index).sort_values('JSD_mean',ascending=False).round(3)
    out['kinase_max_IC_position'] = out['kinase_max_IC_position'].astype(int)
    return out

In [14]:
ss = test_motif(567)

In [15]:
ss

Unnamed: 0_level_0,JSD_mean,pearson_surrounding,kinase_max_IC_position,max_position_JSD,-5,-4,-3,-2,-1,0,1,2,3,4
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
BMPR1B,0.827,0.409,1,0.891,0.911,0.848,0.332,0.899,0.891,0.750,0.891,0.931,0.899,0.919
BMPR1A,0.824,0.460,1,0.887,0.909,0.856,0.379,0.912,0.884,0.726,0.887,0.907,0.872,0.909
COT,0.823,0.232,2,0.911,0.895,0.841,0.290,0.793,0.907,0.907,0.861,0.911,0.887,0.937
TGFBR1,0.816,0.397,-2,0.921,0.897,0.833,0.310,0.921,0.894,0.726,0.844,0.918,0.904,0.910
PLK2,0.814,0.541,-3,0.630,0.895,0.856,0.630,0.833,0.857,0.742,0.816,0.779,0.862,0.869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TNK1,0.678,-0.197,3,0.745,0.876,0.809,0.203,0.737,0.827,0.068,0.791,0.846,0.745,0.875
DDR2,0.675,-0.120,3,0.696,0.871,0.795,0.196,0.728,0.886,0.068,0.785,0.895,0.696,0.833
ZAP70,0.675,0.034,-1,0.707,0.886,0.829,0.284,0.732,0.707,0.068,0.681,0.857,0.779,0.922
FES,0.671,0.011,-1,0.696,0.871,0.816,0.305,0.781,0.696,0.068,0.846,0.814,0.651,0.859


In [17]:
out

Unnamed: 0_level_0,JSD_mean,pearson_surrounding,kinase_max_IC_position,max_position_JSD,-5,-4,-3,-2,-1,0,1,2,3,4
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CDK17,0.897,0.908,1,0.841,0.930,0.928,0.925,0.915,0.901,0.800,0.841,0.897,0.907,0.920
P38G,0.896,0.926,1,0.850,0.933,0.931,0.923,0.930,0.917,0.714,0.850,0.897,0.937,0.925
CDK18,0.895,0.885,1,0.806,0.930,0.934,0.918,0.913,0.887,0.831,0.806,0.898,0.923,0.910
CDK19,0.894,0.865,1,0.789,0.910,0.910,0.900,0.897,0.889,0.896,0.789,0.908,0.920,0.921
CDK3,0.892,0.808,1,0.833,0.937,0.932,0.935,0.931,0.925,0.791,0.833,0.920,0.779,0.936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EPHA2,0.696,-0.145,-1,0.795,0.921,0.913,0.901,0.942,0.795,0.006,0.037,0.764,0.801,0.878
YANK2,0.695,-0.105,2,0.427,0.914,0.907,0.773,0.826,0.906,0.592,0.056,0.427,0.779,0.765
ZAP70,0.687,-0.079,-1,0.710,0.886,0.886,0.828,0.853,0.710,0.006,0.047,0.847,0.867,0.940
FES,0.683,-0.157,-1,0.732,0.914,0.905,0.879,0.922,0.732,0.006,0.056,0.804,0.734,0.875


In [24]:
dfs={}
for idx in tqdm(df.index,total=len(df)):
    out = test_motif(idx)
    dfs[idx]=out
    out.to_parquet(f'fig/motif/{idx}/kinase3.parquet')
    # break

100%|████████████████████████████████████████████████████████████████████████████████████████| 1169/1169 [2:40:25<00:00,  8.23s/it]


In [23]:
out

Unnamed: 0_level_0,JSD_mean,pearson_surrounding,kinase_max_IC_position,max_position_JSD
kinase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BMPR1B,0.573,0.089,1,0.632
TGFBR1,0.568,0.109,-2,0.598
JNK2,0.565,0.103,1,0.601
CLK3,0.565,0.033,1,0.679
KIS,0.564,0.104,1,0.629
...,...,...,...,...
WEE1_TYR,0.435,-0.103,-1,0.514
PDGFRA,0.434,-0.161,3,0.589
MUSK,0.434,-0.055,1,0.457
DDR2,0.433,-0.107,3,0.513


## With CDDM

In [145]:
cddm=Data.get_cddm()

In [152]:
info['idx'] = info['uniprot']+'_' + info['kinase']

In [154]:
[cddm.index.isin(info.idx)]

[array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
      

In [146]:
cddm

Unnamed: 0,-20P,-20G,-20A,-20C,-20S,-20T,-20V,-20I,-20L,-20M,...,20H,20K,20R,20Q,20N,20D,20E,20pS,20pT,20pY
P12931_SRC,0.054538,0.081380,0.060077,0.012356,0.036216,0.032382,0.055816,0.052407,0.083511,0.023434,...,0.013351,0.076992,0.060970,0.037383,0.036938,0.052960,0.086337,0.025367,0.015576,0.023142
P29320_EPHA3,0.044276,0.088013,0.065335,0.008639,0.037797,0.036717,0.072354,0.048596,0.075594,0.026998,...,0.014132,0.083098,0.059356,0.031091,0.033917,0.056529,0.100622,0.025438,0.013567,0.015828
P07332_FES,0.047231,0.082519,0.070575,0.011401,0.034745,0.039088,0.061889,0.053203,0.088491,0.026059,...,0.013053,0.086266,0.055619,0.040863,0.038025,0.059024,0.085698,0.026674,0.013621,0.019296
Q16288_NTRK3,0.044444,0.074644,0.074074,0.017094,0.033048,0.035328,0.060969,0.058120,0.084330,0.026781,...,0.015682,0.091677,0.052473,0.030760,0.044029,0.057298,0.088661,0.021713,0.015682,0.018094
Q9UM73_ALK,0.045748,0.079765,0.073314,0.018182,0.032845,0.035191,0.067449,0.051026,0.076246,0.027566,...,0.015634,0.090198,0.069152,0.030066,0.043897,0.051112,0.096212,0.026458,0.013229,0.017438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q15746_SMMLCK,0.078947,0.052632,0.078947,0.026316,0.052632,0.026316,0.052632,0.052632,0.052632,0.000000,...,0.000000,0.066667,0.022222,0.044444,0.133333,0.044444,0.000000,0.044444,0.022222,0.000000
Q01973_ROR1,0.097561,0.097561,0.024390,0.000000,0.024390,0.146341,0.170732,0.024390,0.048780,0.000000,...,0.000000,0.075000,0.050000,0.000000,0.050000,0.025000,0.125000,0.125000,0.000000,0.025000
O14976_GAK,0.075000,0.075000,0.175000,0.000000,0.025000,0.025000,0.100000,0.025000,0.050000,0.175000,...,0.000000,0.024390,0.024390,0.000000,0.170732,0.195122,0.024390,0.048780,0.024390,0.000000
Q6P0Q8_MAST2,0.046512,0.116279,0.069767,0.000000,0.023256,0.069767,0.069767,0.000000,0.046512,0.046512,...,0.000000,0.048780,0.219512,0.000000,0.048780,0.024390,0.024390,0.024390,0.000000,0.000000
