In [2]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
import random
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
from functools import partial

In [2]:
# Load the UniProt metadata table used for mapping between UniProt accession
# numbers and UniProt entry names.
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = f"{uniprot_metadata_directory}uniprot_human_all.tab"
# UniProt column names, kept for reference; the list is not passed to the
# reader below.
uniprot_vars = [
    'Entry',
    'Entry name',
    'Status',
    'Protein names',
    'Gene names',
    'Organism',
    'Length',
    'Annotation',
]
# The file is tab-separated.
uniprot_df = pd.read_csv(filepath_or_buffer=uniprot_metadata_file_path, sep='\t')

In [3]:
# Open the TAPE BERT embedding archive (.npz).
# NOTE: allow_pickle=True permits loading pickled object arrays, and unpickling
# can execute arbitrary code -- only open archives from a trusted source.
tape_bert_file_name = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/tape_new_bert/tape_new_transformer.npz"
tape_bert_file = np.load(file=tape_bert_file_name, allow_pickle=True)

In [4]:
# Build one DataFrame per TAPE embedding flavour ('avg' and 'pooled') with one
# row per protein. Rows are collected in plain lists and each frame is built
# once at the end: calling DataFrame.append inside the loop copies the whole
# frame on every iteration (quadratic time), and DataFrame.append was removed
# in pandas 2.0.
avg_records = []
pool_records = []
for prot_id in tqdm_notebook(tape_bert_file, total=len(tape_bert_file)):
    # The second '|'-separated field of the archive key is used as the Entry.
    protein_entry = prot_id.split('|')[1]
    # .tolist() unwraps the stored array into an object that is indexed by
    # 'avg' and 'pooled' below.
    embeddings = tape_bert_file[prot_id].tolist()
    avg_records.append({'Entry': protein_entry, 'Vector': embeddings['avg']})
    pool_records.append({'Entry': protein_entry, 'Vector': embeddings['pooled']})

# Passing columns= keeps the ['Entry', 'Vector'] layout even when the archive
# is empty.
transformer_dataframe_avg = pd.DataFrame(avg_records, columns=['Entry', 'Vector'])
transformer_dataframe_pool = pd.DataFrame(pool_records, columns=['Entry', 'Vector'])

HBox(children=(IntProgress(value=0, max=20007), HTML(value='')))




In [34]:
# Entries that have an embedding; parallelSimilarity checks membership in this
# list before looking up vectors.
protein_names = transformer_dataframe_avg.loc[:, 'Entry'].tolist()

In [3]:
# Persist both embedding frames as pickles so they can be reloaded without
# rebuilding them from the .npz archive.
transformer_dataframe_pool.to_pickle(
    path="/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/transformer_dataframe_pool.pkl"
)
transformer_dataframe_avg.to_pickle(
    path="/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/transformer_dataframe_avg.pkl"
)

NameError: name 'transformer_dataframe_pool' is not defined

In [4]:
# Reload the pooled and averaged embedding frames from their pickle files.
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
transformer_dataframe_pool = pd.read_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/transformer_dataframe_pool.pkl")
transformer_dataframe_avg = pd.read_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/transformer_dataframe_avg.pkl")

In [5]:
# Export the pooled-embedding frame as CSV.
transformer_dataframe_pool.to_csv(
    path_or_buf="/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/transformer_dataframe_pool.csv"
)

In [44]:
# Export the averaged-embedding frame as CSV.
transformer_dataframe_avg.to_csv(
    path_or_buf="/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/transformer_dataframe_avg.csv"
)

In [43]:
# Preview the first ten rows of the averaged-embedding frame.
transformer_dataframe_avg.iloc[0:10]

Unnamed: 0,Entry,Vector
0,P18433,"[0.060395252, -1.1459734, -0.101510145, 0.1245..."
1,O75335,"[0.32916632, -0.9245654, -0.005039262, -0.0488..."
2,Q8IYG6,"[0.52817136, -0.3996036, 0.55867016, -0.369063..."
3,Q6UXU0,"[0.5714641, -0.44153115, -0.121542804, -0.0849..."
4,B1ANY3,"[0.478181, -0.5061363, -0.032993697, 0.0699267..."
5,Q96FZ5,"[0.002832711, -0.43827248, 0.053592034, 0.0450..."
6,Q9GZN7,"[0.33964908, -0.57820886, -0.2407801, -0.07511..."
7,A6NJW4,"[0.7161432, -0.4619242, -0.31101382, -0.465274..."
8,Q6P9A3,"[-0.0010922293, -0.6677896, -0.22753885, 0.526..."
9,P05062,"[0.03655637, -0.71208334, -0.36614433, 0.48408..."


In [36]:
# Module-level state shared with the worker processes.
# The two Manager-backed lists are proxies: parallelSimilarity reads protein
# names from proteinListNew and appends its results to similarity_list.
manager = Manager()
proteinListNew = manager.list()
similarity_list = manager.list()
# Plain module-level list; calculateMSEforOntology assigns its own local
# variable of the same name.
proteinList = []

def parallelSimilarity(paramList):
    """Score one protein pair and append the result to the shared ``similarity_list``.

    Parameters
    ----------
    paramList : sequence
        ``(i, j, aspect, real, protein_embedding_type)``. ``i`` and ``j`` index
        into the shared ``proteinListNew``; ``real`` is the reference similarity
        recorded next to the computed ones; ``protein_embedding_type`` selects
        the embedding frame (``"pool"`` or ``"avg"``). ``aspect`` (position 2)
        is carried in the tuple but not used here.

    Returns
    -------
    The shared ``similarity_list`` proxy. When ``j > i`` and both proteins are
    present in ``protein_names``, one tuple
    ``(real, cosine, 1 - normalized_manhattan, 1 - normalized_euclidean)`` is
    appended to it; otherwise nothing is appended.

    Raises
    ------
    ValueError
        If ``protein_embedding_type`` is neither ``"pool"`` nor ``"avg"``
        (previously this surfaced later as an ``UnboundLocalError``), or, via
        ``Series.item()``, if an entry does not match exactly one row.
    """
    i = paramList[0]
    j = paramList[1]
    real = paramList[3]
    protein_embedding_type = paramList[4]

    if protein_embedding_type == "pool":
        protein_embedding_dataframe = transformer_dataframe_pool
    elif protein_embedding_type == "avg":
        protein_embedding_dataframe = transformer_dataframe_avg
    else:
        raise ValueError(
            "protein_embedding_type must be 'pool' or 'avg', got %r" % (protein_embedding_type,)
        )

    # Only pairs with j > i are scored.
    if j > i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        # Skip pairs for which an embedding is missing.
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
            prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())
            # cosine/cdist expect 2-D input, so each vector becomes a single row.
            prot1row = prot1vec.reshape(1, -1)
            prot2row = prot2vec.reshape(1, -1)
            cos = cosine(prot1row, prot2row).item()
            # Distances are divided by the sum of the two vector norms (an upper
            # bound on the distance by the triangle inequality) and turned into
            # similarities with 1 - d.
            manhattanDist = cdist(prot1row, prot2row, 'cityblock')
            manhattanDistNorm = manhattanDist / (norm(prot1vec, 1) + norm(prot2vec, 1))
            euclideanDist = cdist(prot1row, prot2row, 'euclidean')
            euclideanDistNorm = euclideanDist / (norm(prot1vec, 2) + norm(prot2vec, 2))
            # The reference value and the computed values are stored in one
            # tuple so they stay aligned; the caller splits them afterwards.
            similarity_list.append((real, cos, 1 - manhattanDistNorm.item(), 1 - euclideanDistNorm.item()))
    return similarity_list


In [37]:
def calculateMSEforOntology(aspect,matrix_type,protein_embedding_type):
    """Correlate reference protein similarities with embedding similarities.

    Despite its name, this function computes Spearman rank correlations (not
    an MSE) between the values of a reference similarity matrix and the cosine,
    normalized-Manhattan and normalized-Euclidean similarities produced by
    ``parallelSimilarity``.

    Parameters
    ----------
    aspect : str
        Inserted into the matrix / coordinate file names (the notebook calls
        this with "MF", "BP" and "CC").
    matrix_type : str
        One of "All", "500", "Sparse" or "200". "Sparse" reads the same matrix
        file as "500" but only scores the pairs listed in the sparsified
        coordinates file.
    protein_embedding_type : str
        Forwarded to ``parallelSimilarity`` ("pool" or "avg").

    Returns
    -------
    tuple
        ``(cosineCorr, manhattanCorr, euclidianCorr)``, each the result of
        ``scipy.stats.spearmanr``.

    Raises
    ------
    KeyError
        If ``matrix_type`` is not one of the supported keys.
    """
    # Reset the shared Manager lists so results of a previous call do not leak
    # into this one.
    similarity_list[:] = []
    proteinListNew[:] = []

    matrix_path_prefix = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_" \
        + aspect + "_proteinSimilarityMatrix"
    similarityMatrixNameDict = {
        "All": matrix_path_prefix + ".csv",
        "500": matrix_path_prefix + "_for_highest_annotated_500_proteins.csv",
        "Sparse": matrix_path_prefix + "_for_highest_annotated_500_proteins.csv",
        "200": matrix_path_prefix + "_for_highest_annotated_200_proteins.csv",
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    # Reuse the column labels as row labels so a value can be looked up by a
    # pair of protein names.
    human_proteinSimilarityMatrix = human_proteinSimilarityMatrix.set_index(
        human_proteinSimilarityMatrix.columns)
    proteinList = list(human_proteinSimilarityMatrix.columns)

    # proteinListNew is the Manager-backed copy read by the worker processes;
    # fill it with a single call instead of one IPC round trip per protein.
    proteinListNew.extend(proteinList)

    if matrix_type == "Sparse":
        protParamList = \
        np.load("./auxilary_input/SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        pair_total = len(protParamList)
    else:
        # Only pairs with j > i are scored. combinations() yields exactly those,
        # in the same order as filtering the full i x j product, without
        # materialising the n*n list first.
        protein_count = len(proteinList)
        protParamList = itertools.combinations(range(protein_count), 2)
        pair_total = protein_count * (protein_count - 1) // 2

    # Prepare the parameters for parallel processing; these tuples are consumed
    # concurrently by the worker processes.
    protParamListNew = []
    for tup in tqdm_notebook(protParamList, total=pair_total):
        i = tup[0]
        j = tup[1]
        # Names come from the local list rather than the Manager proxy to avoid
        # two IPC round trips per pair.
        protein1 = proteinList[i]
        protein2 = proteinList[j]
        real = human_proteinSimilarityMatrix.loc[protein1,protein2]
        protParamListNew.append((i,j,aspect,real,protein_embedding_type))

    total_task_num = len(protParamListNew)
    # The context manager terminates the workers if an exception escapes the
    # loop; on the normal path the pool is closed and joined first.
    with Pool() as pool:
        for _ in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew),
                               total=total_task_num):
            pass
        pool.close()
        pool.join()

    # Copy the shared results into a plain list with one round trip; indexing
    # the proxy element by element would cost one IPC call per item.
    scored_pairs = similarity_list[:]

    real_distance_list = [value[0] for value in scored_pairs]
    cosine_distance_list = [value[1] for value in scored_pairs]
    manhattan_distance_list = [value[2] for value in scored_pairs]
    euclidian_distance_list = [value[3] for value in scored_pairs]

    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)

    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    return (cosineCorr,manhattanCorr,euclidianCorr)
    

In [38]:
# Header row written at the top of every results file. It is never reassigned:
# the per-aspect rows use their own variable, so each file starts with the real
# header instead of the last data row of the previous file.
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"

for protein_embedding_type in ["pool","avg"]:
    for similarity_matrix_type in ["Sparse","200","500","All"]:
        saveFileName = "SimilarityTape_Bert"+protein_embedding_type+"_"+similarity_matrix_type+".csv"
        # (Re)create the file with the header; the context manager closes the
        # handle before the file is reopened for appending.
        with open(saveFileName,'w') as f:
            f.write(buffer)
        for aspect in ["MF","BP","CC"]:
            corr = calculateMSEforOntology(aspect,similarity_matrix_type,protein_embedding_type)
            # corr holds three spearmanr results; [0] is the correlation and
            # [1] the p-value, written in the column order of the header.
            row = ",".join(
                [aspect] + [str(corr[metric][field]) for metric in range(3) for field in range(2)]
            ) + "\n"
            # Append and close after every aspect so finished results are on
            # disk even if a later, longer computation fails.
            with open(saveFileName,'a') as f:
                f.write(row)
    

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.40857085164037726, pvalue=5.97080426078973e-11)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.4121568387063042, pvalue=3.906709792337332e-11)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.4090891881318071, pvalue=5.617455645004331e-11)


HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.36167261111207294, pvalue=2.6988151046305797e-08)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.3617786588638607, pvalue=2.6718829433828793e-08)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.3609952040445314, pvalue=2.8771263774883416e-08)


HBox(children=(IntProgress(value=0, max=245), HTML(value='')))

HBox(children=(IntProgress(value=0, max=245), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=0.2649424980891946, pvalue=0.00022937461676639806)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.2613572939440801, pvalue=0.00028060478803552713)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.2656215005599104, pvalue=0.000220711705629393)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.18903197973648384, pvalue=4.4098096282806417e-144)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.18803317913121778, pvalue=1.4758028365616895e-142)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.18915274569924, pvalue=2.880658702101465e-144)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.11632110066329508, pvalue=3.972510942817961e-55)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.11625010035450355, pvalue=4.619384363499269e-55)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.11616314645874072, pvalue=5.5560929044732165e-55)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=0.01540367449568577, pvalue=0.04805683702833609)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.013306948887461492, pvalue=0.08768241607813118)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.015361557612870284, pvalue=0.04867132033498957)


HBox(children=(IntProgress(value=0, max=247009), HTML(value='')))

HBox(children=(IntProgress(value=0, max=123256), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.16226955080047656, pvalue=0.0)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.1626256018946066, pvalue=0.0)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.16231936784184467, pvalue=0.0)


HBox(children=(IntProgress(value=0, max=247009), HTML(value='')))

HBox(children=(IntProgress(value=0, max=123256), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.07841099021370011, pvalue=2.8029648944842544e-154)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.0779094011398529, pvalue=2.4840542351051962e-152)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.07834715795730864, pvalue=4.9677848593015355e-154)


HBox(children=(IntProgress(value=0, max=245025), HTML(value='')))

HBox(children=(IntProgress(value=0, max=122265), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=0.004144158652397764, pvalue=0.18291321958942605)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.002248116674648534, pvalue=0.4699923244798956)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.004088413136357672, pvalue=0.1888720712009469)


HBox(children=(IntProgress(value=0, max=9467929), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4732426), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.0682655052487368, pvalue=0.0)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.06727152628369212, pvalue=0.0)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.06830249689582543, pvalue=0.0)


HBox(children=(IntProgress(value=0, max=37871716), HTML(value='')))

HBox(children=(IntProgress(value=0, max=18932781), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.054392060808297345, pvalue=0.0)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.05465294895829099, pvalue=0.0)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.05441917180914316, pvalue=0.0)


HBox(children=(IntProgress(value=0, max=20529961), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10262715), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=0.01354818997664756, pvalue=0.0)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.012421093852355182, pvalue=0.0)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.013580581982114068, pvalue=0.0)


HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.45783986196924603, pvalue=1.1078198926017823e-13)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.49648612931548936, pvalue=3.732414283758915e-16)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.4583401693653218, pvalue=1.0337369973072905e-13)


HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.22296323389372713, pvalue=0.0007989749604679824)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.20682666170334332, pvalue=0.001903663251382038)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.22117773603199609, pvalue=0.0008822543963720316)


HBox(children=(IntProgress(value=0, max=245), HTML(value='')))

HBox(children=(IntProgress(value=0, max=245), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=0.2672603494551983, pvalue=0.00020104200967715162)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.2217707389039976, pvalue=0.002162915012843233)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.2725430597771023, pvalue=0.00014819844489783477)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.2177809773340571, pvalue=1.0239998575258815e-191)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.22346737255337032, pvalue=5.210985400802653e-202)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.21669924060702617, pvalue=8.618919160957068e-190)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.14680920823618346, pvalue=4.627782031373728e-87)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.14165391062834995, pvalue=3.9943825739184895e-81)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.14741892875730309, pvalue=8.886732146241368e-88)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=-0.006023199949127148, pvalue=0.4395447670534941)
Manhattan Correlation for CC is SpearmanrResult(correlation=-0.01707526894783479, pvalue=0.028421188985284524)
Euclidian Correlation for CC is SpearmanrResult(correlation=-0.009586570242742853, pvalue=0.21859619941718486)


HBox(children=(IntProgress(value=0, max=247009), HTML(value='')))

HBox(children=(IntProgress(value=0, max=123256), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.1976302576435238, pvalue=0.0)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.2046434013408975, pvalue=0.0)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.19662277396915046, pvalue=0.0)


HBox(children=(IntProgress(value=0, max=247009), HTML(value='')))

HBox(children=(IntProgress(value=0, max=123256), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.0738660604588112, pvalue=4.305346908109565e-137)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.07111780733483708, pvalue=3.3477045368964596e-127)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.07207758596050495, pvalue=1.2986651202563268e-130)


HBox(children=(IntProgress(value=0, max=245025), HTML(value='')))

HBox(children=(IntProgress(value=0, max=122265), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=-0.010361052243406001, pvalue=0.000868826840030297)
Manhattan Correlation for CC is SpearmanrResult(correlation=-0.022816350664943284, pvalue=2.2408308882980568e-13)
Euclidian Correlation for CC is SpearmanrResult(correlation=-0.012090085088239826, pvalue=0.00010207554950449406)


HBox(children=(IntProgress(value=0, max=9467929), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4732426), HTML(value='')))

Cosine Correlation for MF is SpearmanrResult(correlation=0.0784358588026524, pvalue=0.0)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.08026996480394573, pvalue=0.0)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.07809771653097003, pvalue=0.0)


HBox(children=(IntProgress(value=0, max=37871716), HTML(value='')))

HBox(children=(IntProgress(value=0, max=18932781), HTML(value='')))

Cosine Correlation for BP is SpearmanrResult(correlation=0.07673499578222312, pvalue=0.0)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.08614328872910196, pvalue=0.0)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.07577506668900945, pvalue=0.0)


HBox(children=(IntProgress(value=0, max=20529961), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10262715), HTML(value='')))

Cosine Correlation for CC is SpearmanrResult(correlation=0.030020623734863378, pvalue=0.0)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.01864563431876151, pvalue=0.0)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.027898873598229082, pvalue=0.0)
