## This notebook calculates similarity and error between protein embeddings and use GO semantic similarity as gold standart.

In [1]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
import matplotlib.pyplot
import random

## Load protein vectors of Gene2Vec

In [2]:
colnames=['Gene', 'Vector'] 
representationFile = '/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/Gene2Vec/gene2vec_dim_200_iter_9.txt'
Gene2Vec = pd.read_csv(representationFile,delimiter='\t',encoding='utf-8', names=colnames, header=None)

In [3]:
# UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t')

In [4]:
Gene2Vec_GenesList = Gene2Vec['Gene'].tolist()
len(Gene2Vec_GenesList)

24447

In [5]:
Gene2Vec4Prots = pd.DataFrame(columns=['Gene', 'Entry', 'Vector'])

In [6]:
for index, row in tqdm_notebook(uniprot_df.iterrows(), total= len(uniprot_df)):
    gene_names = str(row['Gene names']).split()
    new_gene = True
    for gene_name in gene_names:
        if gene_name in Gene2Vec_GenesList and new_gene == True:
            vector = Gene2Vec.query("Gene == @gene_name")['Vector'].item()
            Gene2Vec4Prots = Gene2Vec4Prots.append(pd.Series([gene_name, row['Entry'], vector],\
                                                             index=Gene2Vec4Prots.columns ),ignore_index=True)
            new_gene = False
        

HBox(children=(IntProgress(value=0, max=20421), HTML(value='')))




In [7]:
Gene2Vec4Prots

Unnamed: 0,Gene,Entry,Vector
0,TEX13A,Q9BXU3,-0.024964347 -0.054136768 -0.03164696 0.071467...
1,LARS2,Q15031,-0.21709882 -0.10872821 -0.15145996 -0.1803116...
2,TXNDC11,Q6PKC3,0.016281696 0.0146435285 -0.024859482 -0.25792...
3,TXK,P42681,-0.60876805 0.106821336 0.39637876 -0.04007827...
4,TXLNA,P40222,-0.0943891 0.2919918 -0.15244655 -0.20161043 0...
5,TXLNB,Q8N3L3,0.27845934 -0.049145203 0.28258604 -0.33538082...
6,TXNDC2,Q86VQ3,0.03790296 0.12055474 -0.054069646 0.38271767 ...
7,TXNDC5,Q8NBS9,0.13921826 0.18452647 -0.19664992 -0.2985039 -...
8,TXNDC9,O14530,-0.12764496 0.09772674 0.07011545 -0.032029904...
9,TXLNGY,Q9BZA5,-0.07835789 0.21128362 0.24222846 0.026012164 ...


In [8]:
Gene2VecProteinList = Gene2Vec4Prots['Entry'].tolist()
len(Gene2VecProteinList)

18407

In [9]:
len(np.asarray(Gene2Vec4Prots.query("Entry == 'Q9NV06'")['Vector'].item().split()))

200

In [10]:
Gene2Vec4Prots.to_csv("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/Gene2Vec_dataframe.csv")

In [9]:
def f(row):
    row['Vector'] = row['Vector'].split()
    return row

Gene2Vec4Prots.apply(f, axis=1)

Unnamed: 0,Gene,Entry,Vector
0,TEX13A,Q9BXU3,"[-0.024964347, -0.054136768, -0.03164696, 0.07..."
1,LARS2,Q15031,"[-0.21709882, -0.10872821, -0.15145996, -0.180..."
2,TXNDC11,Q6PKC3,"[0.016281696, 0.0146435285, -0.024859482, -0.2..."
3,TXK,P42681,"[-0.60876805, 0.106821336, 0.39637876, -0.0400..."
4,TXLNA,P40222,"[-0.0943891, 0.2919918, -0.15244655, -0.201610..."
5,TXLNB,Q8N3L3,"[0.27845934, -0.049145203, 0.28258604, -0.3353..."
6,TXNDC2,Q86VQ3,"[0.03790296, 0.12055474, -0.054069646, 0.38271..."
7,TXNDC5,Q8NBS9,"[0.13921826, 0.18452647, -0.19664992, -0.29850..."
8,TXNDC9,O14530,"[-0.12764496, 0.09772674, 0.07011545, -0.03202..."
9,TXLNGY,Q9BZA5,"[-0.07835789, 0.21128362, 0.24222846, 0.026012..."


In [10]:
Gene2Vec4Prots.to_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/Gene2Vec_dataframe.pkl")

In [10]:
'''
#This part was used to be sure parallel and sequential versions gives same results
cosine_distance_list1 = []
real_distance_list1 = []

similarityMatrixFileName = ""
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"

human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)

proteinListTmp = human_proteinSimilarityMatrix.columns[0:10]
for i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):
    for j in range(len(proteinListTmp)):
        if j>i:
            protein2 = proteinListTmp[j]
            if protein1 in Gene2VecProteinList and protein2 in Gene2VecProteinList:
                prot1vec = np.asarray(Gene2Vec4Prots.query("Entry == @protein1")['Vector'].item().split())
                prot2vec = np.asarray(Gene2Vec4Prots.query("Entry == @protein2")['Vector'].item().split())
                #cosine will return in shape of input vectors first dimension
                cosine_dist = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
                cosine_norm = (1+cosine_dist)/2
                cosine_distance_list1.append(cosine_norm)
                real_distance_list1.append(human_proteinSimilarityMatrix.loc[protein1,protein2])

print(len(cosine_distance_list1))
print(mse(real_distance_list1,cosine_distance_list1))

'''

'\n#This part was used to be sure parallel and sequential versions gives same results\ncosine_distance_list1 = []\nreal_distance_list1 = []\n\nsimilarityMatrixFileName = ""\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"\nsimilarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"\n\nhuman_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)\nhuman_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)\n\nproteinListTmp = human_proteinSimilarityMatrix.columns[0:10]\nfor i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):\n    for j in range(len(proteinListTmp)):\n        if j>i:\n            protein2 = proteinListTmp[j]\n            if protein1 in Gene2VecProteinList and protei

In [11]:
'''
# Multiprocess check
proteinList = []
manager = Manager()
similarity_list = manager.list()

def parallelSimilarity(paramList):
    #print(paramList)
    i = paramList[0]
    j = paramList[1]
    if j>i:  
        protein1 = proteinList[i]
        protein2 = proteinList[j]
        prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
        prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
        prot1vec = protVecEmbeddingDict[()][prot1name]
        prot2vec = protVecEmbeddingDict[()][prot2name]
        #cosine will return in shape of input vectors first dimension
        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
        real = human_protienSimilarityMatrix.loc[protein1,protein2]
        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
        similarity_list.append((real,cos))

#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"

human_protienSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_protienSimilarityMatrix.set_index(human_protienSimilarityMatrix.columns, inplace = True)
proteinList = human_protienSimilarityMatrix.columns[0:10]

i = range(len(proteinList))
j = range(len(proteinList))
protParamList = list(itertools.product(i,j))

    #manager = Manager()
    #similarity_list = manager.list()
total_task_num=len(proteinList)**2

pool = Pool()
pool.map(parallelSimilarity, protParamList)
pool.close()
pool.join()

real_distance_list = [value[0] for value in similarity_list]
cosine_distance_list = [value[1] for value in similarity_list]

mseValue = mse(real_distance_list,cosine_distance_list)
print(mseValue)
'''

'\n# Multiprocess check\nproteinList = []\nmanager = Manager()\nsimilarity_list = manager.list()\n\ndef parallelSimilarity(paramList):\n    #print(paramList)\n    i = paramList[0]\n    j = paramList[1]\n    if j>i:  \n        protein1 = proteinList[i]\n        protein2 = proteinList[j]\n        prot1name = uniprot_df.query("Entry == @protein1")[\'Entry name\'].item()\n        prot2name = uniprot_df.query("Entry == @protein2")[\'Entry name\'].item()\n        prot1vec = protVecEmbeddingDict[()][prot1name]\n        prot2vec = protVecEmbeddingDict[()][prot2name]\n        #cosine will return in shape of input vectors first dimension\n        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()\n        real = human_protienSimilarityMatrix.loc[protein1,protein2]\n        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled\n        similarity_list.append((real,cos))\n\n#similarityMatrixFileName = "/media/DATA/serbulent/Code

In [12]:
# define similarity_list and proteinList as global variables
proteinList = []
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

def parallelSimilarity(paramList):
    i = paramList[0]
    j = paramList[1] 
    aspect = paramList[2]
    if j>i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in Gene2VecProteinList and protein2 in Gene2VecProteinList:
            prot1vec = np.asarray(Gene2Vec4Prots.query("Entry == @protein1")['Vector'].item().split())
            prot2vec = np.asarray(Gene2Vec4Prots.query("Entry == @protein2")['Vector'].item().split())
            #print(str(protein1) + str(prot1vec))
            #print(str(protein2) + str(prot2vec))
            #cosine will return in shape of input vectors first dimension
            cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
            manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
            manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
            euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
            euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2)) 
            real = paramList[3]
            #real = human_protienSimilarityMatrix.loc[protein1,protein2]
            # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
            #Similarity = 1-distance
            similarity_list.append((real,1-cos,1-manhattanDistNorm.item(),1-euclideanDistNorm.item()))
    return similarity_list


## Calculate similarity values with parallel processing

In [13]:
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"

In [14]:
human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)

In [15]:
def calculateMSEforOntology(aspect,sparse=False):
    
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    #similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"\
    +aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    proteinList = human_proteinSimilarityMatrix.columns
    
    #proteinListNew is referanced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)
    if sparse:
        #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load("SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:     
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i,j))
    protParamListNew = []
    # Prepare parameters for parallel processing these parameters will be 
    # used concurrently by different processes
    for tup in tqdm_notebook(protParamList):
        i = tup[0]
        j = tup[1]
        
        if sparse:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            tupNew = (tup[0],tup[1],aspect,real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1,protein2]
                tupNew = (tup[0],tup[1],aspect,real)
                protParamListNew.append(tupNew)

    total_task_num=len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew), total=total_task_num):
        pass
    pool.close()
    pool.join()

    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    #mseValue = mse(real_distance_list,cosine_distance_list)
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    
    random.seed(42)
    random_list = []
    for i in range(len(real_distance_list)):
        random_list.append(random.uniform(0, 1))
    
    if sparse:
        cosine_randomCorr = spearmanr(cosine_distance_list, random_list)
        manhattan_randomCorr = spearmanr(manhattan_distance_list, random_list)
        euclidian_randomCorr = spearmanr(euclidian_distance_list, random_list)
        print("Cosine Random Correlation for "+aspect+" is " + str(cosine_randomCorr))
        print("Manhattan Random Correlation for "+aspect+" is " + str(manhattan_randomCorr))
        print("Euclidian Random Correlation for "+aspect+" is " + str(euclidian_randomCorr))
    
    
    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    if sparse:
        return (cosineCorr,manhattanCorr,euclidianCorr,cosine_randomCorr,manhattan_randomCorr,euclidian_randomCorr)
    else:
        return (cosineCorr,manhattanCorr,euclidianCorr)
    

In [16]:
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"
#saveFileName = "SimilarityGene2Vec.csv"
saveFileName = "SimilarityGene2Vec_highest_200.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()
    
# 0.3673674654105104 mse for MF with 0:10
# 0.31965355246378196 mse for BP with 0:10
# 0.29460915219361683 mse for CC with 0:10

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for MF is SpearmanrResult(correlation=-0.0938153249372314, pvalue=3.808962832459843e-40)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.09346250205689033, pvalue=7.420802047333575e-40)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.08696913262192496, pvalue=1.0121064397250687e-34)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for BP is SpearmanrResult(correlation=-0.13119712297365815, pvalue=4.105392235160173e-77)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.09154795168521214, pvalue=2.648653106543004e-38)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.11607747026290008, pvalue=1.1806220689790009e-60)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for CC is SpearmanrResult(correlation=-0.13983360917806933, pvalue=1.9164132414196707e-87)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.13337031998306156, pvalue=1.1960188311617881e-79)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.13011852933456433, pvalue=7.176317134150369e-76)


Calculate Sparsified Similarities

In [17]:
'''buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal\
,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"
#saveFileName = "SimilarityGene2Vec.csv"
saveFileName = "Similarity_Sparse_Gene2Vec_highest_500.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect,True) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])\
    + ","+ str(corr[3][0])+ ","+ str(corr[3][1])+ ","+ str(corr[4][0])+ ","+ str(corr[4][1])+ \
    ","+ str(corr[5][0])+ ","+ str(corr[5][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()'''

'buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"\n#saveFileName = "SimilarityGene2Vec.csv"\nsaveFileName = "Similarity_Sparse_Gene2Vec_highest_500.csv"\nf = open(saveFileName,\'w\')\nf.write(buffer)\n\nfor aspect in ["MF","BP","CC"]:\n    corr = calculateMSEforOntology(aspect,True) \n    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])    + ","+ str(corr[3][0])+ ","+ str(corr[3][1])+ ","+ str(corr[4][0])+ ","+ str(corr[4][1])+     ","+ str(corr[5][0])+ ","+ str(corr[5][1])+"\n" \n    f = open(saveFileName,\'a\')\n    f.write(buffer) #Give your csv text here.\n    ## Python will convert \n to os.linesep\n    f.close()'