In [1]:
import numpy as np
import pandas as pd

import math
import matplotlib.pyplot as plt 
from sklearn.metrics import roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay, average_precision_score

In [2]:
# Running totals of the per-protein AUROC / AUPR scores; random_walk() adds to
# these module-level names through `global` on every call.
avg_auroc = avg_aupr = 0

In [3]:
def random_walk(inter, rna_cnt, similarity_matrix, prot_id, Wq, Wu, rq, ru, output_file_name, slice_):
    """Rank RNAs for one protein with a two-class random walk and score the ranking.

    A random fraction ``slice_`` of the RNAs known to interact with ``prot_id``
    is used as the labelled (seed) set; the remaining known interactions are
    hidden and later used as positives when evaluating the ranking.

    Parameters
    ----------
    inter : pandas.DataFrame
        Interaction table; the columns ``PROT_ID`` and ``RNA_ID`` are read.
    rna_cnt : int
        Number of RNA nodes. RNA ids are used directly as row/column positions,
        so they are assumed to be integers in ``[0, rna_cnt)`` -- TODO confirm
        against the input files.
    similarity_matrix : array-like, shape (rna_cnt, rna_cnt)
        RNA-RNA similarity scores. It is copied, never modified.
    prot_id
        Value matched against ``inter.PROT_ID``.
    Wq, Wu : float
        Multipliers applied to the rows of labelled / unlabelled RNAs before
        each row is normalised to sum to 1.
    rq, ru : float
        Weights of the propagation term for labelled / unlabelled rows; the
        complementary ``1 - rq`` / ``1 - ru`` weight the restart term.
    output_file_name : str
        File the ranked, non-labelled RNAs are appended to.
    slice_ : float
        Fraction of the known interactions kept as the labelled set.

    Side effects
    ------------
    Prints ``prot_id``, AUROC and AUPR on one tab-separated line, appends the
    ranking to ``output_file_name`` and adds the two scores to the module-level
    accumulators ``avg_auroc`` and ``avg_aupr``.

    Raises
    ------
    ValueError
        If the walk produces non-finite scores (for instance because the
        similarity matrix contains NaN/inf); without this check the
        convergence loop would never terminate.
        NOTE(review): the sklearn metrics may also raise or warn when the
        held-out set contains a single class -- confirm for the installed
        version.
    """
    global avg_auroc, avg_aupr

    # Explicit copy: the shuffle below works in place and must neither touch the
    # frame's data nor fail on a read-only array returned by pandas.
    verified_association = inter[inter.PROT_ID == prot_id].RNA_ID.to_numpy(copy=True)
    inter_size = verified_association.shape[0]
    np.random.shuffle(verified_association)
    # Keep the first slice_ share as labelled nodes and pretend the remaining
    # (1 - slice_) share of this protein's interactions is unknown.
    associated_rna = verified_association[0:int(slice_ * inter_size)]

    # Boolean membership masks computed once, replacing the repeated
    # `i in array` scans of the original loops.
    rna_ids = np.arange(rna_cnt)
    labelled = np.isin(rna_ids, associated_rna)
    verified = np.isin(rna_ids, verified_association)

    # Row-weighted, row-normalised correlation matrix.
    R = np.array(similarity_matrix, dtype=float)
    R *= np.where(labelled, Wq, Wu)[:, np.newaxis]
    row_sums = R.sum(axis=1, keepdims=True)
    # Rows that sum to zero stay all-zero instead of becoming NaN (0/0).
    R = np.divide(R, row_sums, out=np.zeros_like(R), where=row_sums != 0)

    # Lq keeps only the labelled rows, Lu only the unlabelled ones.
    Lq_T = np.where(labelled[:, np.newaxis], R, 0.0).transpose()
    Lu_T = np.where(labelled[:, np.newaxis], 0.0, R).transpose()

    filt = labelled.astype(float)
    mod_Q = associated_rna.shape[0]
    X_init = np.zeros((rna_cnt, 1))
    if mod_Q:
        # Uniform start distribution over the labelled nodes.
        X_init[labelled, 0] = 1 / mod_Q

    X = X_init.copy()
    pq = np.matmul(filt, X)[0]  # probability mass currently on labelled nodes
    pu = 1 - pq

    while True:
        prev = X
        X = (rq * np.matmul(Lq_T, prev) + pq * (1 - rq) * X_init
             + ru * np.matmul(Lu_T, prev) + pu * (1 - ru) * X_init)
        if not np.isfinite(X).all():
            # A NaN never satisfies the `<` test below, so fail loudly
            # rather than looping forever.
            raise ValueError('random walk produced non-finite scores for protein ' + str(prot_id))
        pq = np.matmul(filt, X)[0]
        pu = 1 - pq

        if np.linalg.norm(X - prev, ord=1) < 1e-10:
            break

    print(str(prot_id), end='\t')

    # Highest score first; only RNAs outside the labelled set are candidates.
    ser = pd.Series(np.reshape(X, rna_cnt)).sort_values(ascending=False)
    candidates = ser[~labelled[ser.index.to_numpy()]]
    y_pred = candidates.to_numpy()
    # 1.0 where the candidate is one of the hidden, verified interactions.
    y_true = verified[candidates.index.to_numpy()].astype(float)

    with open(output_file_name, 'a') as f:
        f.write('\nPROTEIN ID:\t' + str(prot_id) + '\n')
        f.write('RNA\tCorr. score(Sr)\tInteraction verified\n')
        for (index, value), is_verified in zip(candidates.items(), y_true):
            flag = 'YES' if is_verified else 'NO'
            f.write(str(index) + '\t' + str(value) + '\t' + flag + '\n')

    auroc = roc_auc_score(y_true, y_pred)
    print(str(auroc), end='\t')
    avg_auroc += auroc

    aupr = average_precision_score(y_true, y_pred)
    print(str(aupr))
    avg_aupr += aupr

In [4]:
# Read the three tab-separated input tables from the working directory.
rna_sim, prot_sim, inter = (
    pd.read_csv(table_path, sep='\t')
    for table_path in ('rna_similarity.txt', 'prot_similarity.txt', 'list_of_interactions.txt')
)

# Node counts are the numbers of distinct ids that occur in the interaction list.
rna_cnt = len(inter['RNA_ID'].unique())
prot_cnt = len(inter['PROT_ID'].unique())

In [5]:
# Dense RNA-RNA similarity matrix. It starts from zeros (np.empty would leave
# arbitrary memory in every (i, j) pair that is absent from rna_sim), then all
# listed pairs are written in a single vectorised assignment.
# Assumes 'RNA(i)' / 'RNA(j)' hold integer ids in [0, rna_cnt) -- TODO confirm.
similarity_matrix = np.zeros((rna_cnt, rna_cnt))
similarity_matrix[rna_sim['RNA(i)'].to_numpy(), rna_sim['RNA(j)'].to_numpy()] = rna_sim['Sim(i,j)'].to_numpy()

In [6]:
# Experiment configuration: share of known interactions used as labelled nodes
# and the row / restart weights passed to random_walk().
slice_ = 0.8
output_file_name = 'random_walk_verify_slice_' + str(slice_) + '.txt'
Wq = 0.8
Wu = 0.4
rq = 0.8
ru = 0.4

# random_walk() adds to these module-level totals, so clear them here;
# otherwise re-running this cell would inflate the reported averages.
avg_auroc = 0
avg_aupr = 0

# random_walk() shuffles with NumPy's global RNG; seed it so the train/held-out
# split (and therefore every score below) is reproducible across runs.
np.random.seed(0)

# Truncate the report file; random_walk() appends one section per protein.
with open(output_file_name, 'w'):
    pass

print('Protein id\tAUROC\tAUPR')
for prot_id in range(prot_cnt):
    random_walk(inter, rna_cnt, similarity_matrix, prot_id, Wq, Wu, rq, ru, output_file_name, slice_)

Protein id	AUROC	AUPR
0	0.4023496828374877	0.02570172779463087
1	0.47753313268916	0.6207417253769398
2	0.5225215960510078	0.029489806818126886
3	0.3856382978723405	0.024847832592580756
4	0.536737166776823	0.6560147872894405
5	0.3002577319587629	0.003946429753691579
6	0.5165986646884273	0.09606860172383525
7	0.5032197634937361	0.11997318958009341
8	0.6132653061224489	0.012038858096430061
9	0.37696540880503143	0.024533162481208296
10	0.40370479270802706	0.020381597218071285
11	0.5207847295864263	0.11204662323770367
12	0.4540608645086257	0.01094816652488118
13	0.5388327721661055	0.027696689343383818
14	0.5733478576615831	0.02448568505957024
15	0.48821157127346054	0.0497060728461078
16	0.6249413970932958	0.021077777907731916
17	0.4556918882072256	0.003628394336872489
18	0.43822978875122565	0.02758025805386783
19	0.4338758901322482	0.004266389177939646
20	1.0	1.0
21	0.25510204081632654	0.0020002768549280178
22	0.3924116424116424	0.0072397287453545255
23	0.5632008154943935	0.0091912130807990

In [7]:
# Average over the proteins that were actually evaluated: the loop above runs
# once per id in range(prot_cnt), so divide by prot_cnt rather than a literal.
print(f'Average AUROC={avg_auroc/prot_cnt}')
print(f'Average AUPR={avg_aupr/prot_cnt}')

Average AUROC=0.48607436637906126
Average AUPR=0.07390520198216172
