In [1]:
import numpy as np
import pandas as pd

import math
import matplotlib.pyplot as plt 
from sklearn.metrics import roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay, average_precision_score

In [2]:
avg_auroc=0
avg_aupr=0

In [3]:
def random_walk(inter, rna_cnt, similarity_matrix, prot_id, Wq, Wu, rq, ru, output_file_name, slice_):
    
    verified_association=inter[inter.PROT_ID==prot_id].RNA_ID.to_numpy()
    inter_size=verified_association.shape[0]
    np.random.shuffle(verified_association)
    associated_rna=verified_association[0:int(slice_*inter_size)] # slicing the dataset and pretending that we dont know about the remaining (1-slice_)*100% of the interactions associated with 'prot_id'
    # rna in associated_rna == True if rna is labelled node
    
    R=similarity_matrix.copy() # correlation matrix
    for i in range(rna_cnt):
        if i in associated_rna:
            R[i]=R[i]*Wq
        else:
            R[i]=R[i]*Wu
    for i in range(rna_cnt):
        sigma=np.sum(R[i])
        R[i]=R[i]/sigma
    
    Lq=R.copy()
    Lu=R.copy()
    
    for i in range(rna_cnt):
        if i in associated_rna:
            Lu[i]*=0
        else:
            Lq[i]*=0
    
    filt=np.zeros(rna_cnt)
    for i in associated_rna:
        filt[i]=1
    
    mod_Q=associated_rna.shape[0]
    X_init=np.zeros((rna_cnt,1))
    for i in associated_rna:
        X_init[i][0]=1/mod_Q
    
    X=X_init.copy()
    pq=np.matmul(filt,X)[0]
    pu=1-pq
    iter_cnt=0
    # print(pq)
    # print("i:"+str(iter_cnt)+" "+str(X[0][0]))
    
    while(True):
        prev=X.copy()
        X=rq*(np.matmul(Lq.transpose(),prev))+pq*(1-rq)*X_init+ru*(np.matmul(Lu.transpose(),prev))+pu*(1-ru)*X_init
        pq=np.matmul(filt,X)[0]
        pu=1-pq

        if(np.linalg.norm(X-prev,ord=1)<1e-10):
            break
        iter_cnt+=1
        # print("i:"+str(iter_cnt)+" "+str(X[0][0]))
    
    print(str(prot_id),end='\t')
    
    X=np.reshape(X,rna_cnt)
    ser=pd.Series(X)
    
    ser.sort_values(ascending=False, inplace=True)
    
    y_true=np.empty(0)
    y_pred=np.empty(0)
    
    f=open(output_file_name,'a')
    f.write('\nPROTEIN ID:\t'+str(prot_id)+'\n')
    f.write('RNA\tCorr. score(Sr)\tInteraction verified\n')
    for index, value in ser.items():
        if index not in associated_rna:
            y_pred=np.append(y_pred,value)
            if index in verified_association:
                flag='YES'
                y_true=np.append(y_true,1)
            else:
                flag='NO'
                y_true=np.append(y_true,0)
            f.write(str(index)+'\t'+str(value)+'\t'+flag+'\n')
    
    f.close()
    auroc=roc_auc_score(y_true, y_pred)
    # RocCurveDisplay.from_predictions(y_true,y_pred,name=f'IRWNRLMF CVS1 (DS {i})')
    print(str(auroc),end='\t')
    global avg_auroc
    avg_auroc+=auroc
    
    aupr=average_precision_score(y_true,y_pred)
    # PrecisionRecallDisplay.from_predictions(y_true,y_pred,name=f'IRWNRLMF CVS1 (DS {i})')
    print(str(aupr))
    global avg_aupr
    avg_aupr+=aupr

In [4]:
rna_sim=pd.read_csv('rna_similarity.txt',sep='\t')
prot_sim=pd.read_csv('prot_similarity.txt',sep='\t')
inter=pd.read_csv('list_of_interactions.txt',sep='\t')

rna_cnt=inter['RNA_ID'].unique().shape[0]
prot_cnt=inter['PROT_ID'].unique().shape[0]

In [5]:
similarity_matrix=np.empty((rna_cnt,rna_cnt))
for i in rna_sim.index:
    similarity_matrix[rna_sim['RNA(i)'][i]][rna_sim['RNA(j)'][i]]=rna_sim['Sim(i,j)'][i]

In [6]:
output_file_name='random_walk_verify_slice_'
slice_=0.8
output_file_name=output_file_name+str(slice_)+'.txt'
Wq=0.8
Wu=0.4
rq=0.8
ru=0.4
open(output_file_name,'w').close()
print('Protein id\tAUROC\tAUPR')
for i in range(prot_cnt):
    random_walk(inter,rna_cnt,similarity_matrix,i,Wq,Wu,rq,ru,output_file_name,slice_)

Protein id	AUROC	AUPR
0	0.4658938622353256	0.05438577006088061
1	0.4682665399440308	0.7598197611963922
2	0.5088919288645691	0.049549677713608534
3	0.38313829787234044	0.01942461595765332
4	0.5060586032165675	0.7731840500299512
5	0.45940721649484534	0.01333916633722538
6	0.5248720764503844	0.16782128365950388
7	0.4232318849563007	0.014810522016910401
8	0.6469387755102041	0.011258926166610797
9	0.5785464709993011	0.03908625141845696
10	0.4486915613054984	0.03855148355538542
11	0.42088519283362164	0.07213208761399495
12	0.5112194131383897	0.023143380590141614
13	0.5246071829405163	0.05770315828009219
14	0.5437232364210052	0.10492082760994563
15	0.41775244299674263	0.02656468176047771
16	0.5127202779846116	0.025987636906769543
17	0.6239263803680981	0.007761446262938711
18	0.4617499375184604	0.055702054123352523
19	0.3594438792811122	0.004491359557661488
20	0.6889564336372846	0.004792904394724645
21	0.713265306122449	0.019769129114348513
22	0.4950623700623701	0.020340063020093613
23	0.53899

In [7]:
print(f'Average AUROC={avg_auroc/64}')
print(f'Average AUPR={avg_aupr/64}')

Average AUROC=0.4980157323507829
Average AUPR=0.061827459885642394
