## This notebook calculates similarity and error between protein embeddings and use GO semantic similarity as gold standart.

In [1]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
import random


## Load protein vectors of Gene2Vec

In [4]:
colnames=['Gene', 'Vector'] 
representationFile = '/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/representation_vectors/Learned_Embeddings/learned_embed_calculated_human_protein_vectors.csv'
LearnedVec = pd.read_csv(representationFile,delimiter=',',encoding='utf-8', names=colnames, header=None)

In [5]:
def string_to_float_list(stringList):
    l2 = stringList.strip("[").strip("]").split(',')
    result = list(map(float, l2))
    return result

In [6]:
LearnedVec['Vector'] = LearnedVec['Vector'].map(string_to_float_list)

In [7]:
# UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t')
uniprot_df.iloc[0:5]

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Annotation
0,A0A584,TVBK2_HUMAN,reviewed,T cell receptor beta variable 11-2,TRBV11-2 TCRBV21S3A2N2T,Homo sapiens (Human),115,3 out of 5
1,Q9BXU3,TX13A_HUMAN,reviewed,Testis-expressed protein 13A,TEX13A,Homo sapiens (Human),409,2 out of 5
2,Q15031,SYLM_HUMAN,reviewed,"Probable leucine--tRNA ligase, mitochondrial (...",LARS2 KIAA0028,Homo sapiens (Human),903,5 out of 5
3,Q6PKC3,TXD11_HUMAN,reviewed,Thioredoxin domain-containing protein 11 (EF-h...,TXNDC11 EFP1,Homo sapiens (Human),985,5 out of 5
4,P42681,TXK_HUMAN,reviewed,Tyrosine-protein kinase TXK (EC 2.7.10.2) (Pro...,TXK PTK4 RLK,Homo sapiens (Human),527,5 out of 5


In [8]:
protein1 = "Q9BXU3"
prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
prot1name

'TX13A_HUMAN'

In [9]:
LearnedVec_GenesList = LearnedVec['Gene'].tolist()
len(LearnedVec_GenesList)

20421

In [10]:
len(np.asarray(LearnedVec.query("Gene == '1433B_HUMAN'")['Vector'].item()))

64

In [11]:
eName = '1433B_HUMAN'
uniprot_df[uniprot_df['Entry name'] == eName]['Entry'].item()

'P31946'

In [12]:
LearnedVecDF = pd.DataFrame(columns=['Entry', 'Vector'])
for i, row in tqdm_notebook(LearnedVec.iterrows(), total=len(LearnedVec_GenesList)):
    gene =  row['Gene']
    entry = uniprot_df[uniprot_df['Entry name'] == gene]['Entry'].item()
    vector = row['Vector']
    LearnedVecDF.loc[i] = [entry,vector]

HBox(children=(IntProgress(value=0, max=20421), HTML(value='')))




In [14]:
LearnedVecDF[0:10]

Unnamed: 0,Entry,Vector
0,P31946,"[0.1206292550840799, 0.042524608591712997, -0...."
1,P62258,"[0.0865170389789406, 0.06565096681950155, -0.0..."
2,Q04917,"[0.03067628642963321, 0.08261820964118351, -0...."
3,P61981,"[0.056176401725012974, 0.1272234897543238, -0...."
4,P31947,"[0.09433241449567525, 0.05418153837457671, -0...."
5,P27348,"[-0.007474015609202146, 0.12553591556421906, -..."
6,P63104,"[0.04930505288580744, 0.05025088669477938, -0...."
7,P30443,"[-0.09634417381379114, 0.18386099284089832, 0...."
8,P01892,"[-0.11567174365089174, 0.18178789219098315, 0...."
9,P04439,"[-0.12398412279961105, 0.2040801817622601, 0.0..."


In [46]:
LearnedVecDF.to_csv("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/LearnedVec_dataframe.csv")

In [16]:
LearnedVecDF.to_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/representation_vectors/representation_vector_dataframes/learned_vec.pkl")

In [12]:
'''
#This part was used to be sure parallel and sequential versions gives same results
cosine_distance_list1 = []
real_distance_list1 = []

similarityMatrixFileName = ""
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"

human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)

proteinListTmp = human_proteinSimilarityMatrix.columns[0:10]
for i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):
    for j in range(len(proteinListTmp)):
        if j>i:
            protein2 = proteinListTmp[j]
            #print((protein1,protein2))
            prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
            prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
            #if protein1 in LearnedVec_GenesList and protein2 in LearnedVec_GenesList:
            prot1vec = np.asarray(LearnedVec.query("Gene == @prot1name")['Vector'].item())
            prot2vec = np.asarray(LearnedVec.query("Gene == @prot2name")['Vector'].item())
            #cosine will return in shape of input vectors first dimension
            cosine_dist = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
            cosine_norm = (1+cosine_dist)/2
            cosine_distance_list1.append(cosine_norm)
            real_distance_list1.append(human_proteinSimilarityMatrix.loc[protein1,protein2])

print(len(cosine_distance_list1))
print(mse(real_distance_list1,cosine_distance_list1))

'''

'\n#This part was used to be sure parallel and sequential versions gives same results\ncosine_distance_list1 = []\nreal_distance_list1 = []\n\nsimilarityMatrixFileName = ""\nsimilarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"\n\nhuman_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)\nhuman_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)\n\nproteinListTmp = human_proteinSimilarityMatrix.columns[0:10]\nfor i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):\n    for j in range(len(proteinListTmp)):\n        if j>i:\n            protein2 = proteinListTmp[j]\n            #print((protein1,protein2))\n            prot

In [13]:
'''
# Multiprocess check
proteinList = []
manager = Manager()
similarity_list = manager.list()

def parallelSimilarity(paramList):
    #print(paramList)
    i = paramList[0]
    j = paramList[1]
    if j>i:  
        protein1 = proteinList[i]
        protein2 = proteinList[j]
        prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
        prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
        prot1vec = protVecEmbeddingDict[()][prot1name]
        prot2vec = protVecEmbeddingDict[()][prot2name]
        #cosine will return in shape of input vectors first dimension
        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
        real = human_protienSimilarityMatrix.loc[protein1,protein2]
        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
        similarity_list.append((real,cos))

#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"

human_protienSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_protienSimilarityMatrix.set_index(human_protienSimilarityMatrix.columns, inplace = True)
proteinList = human_protienSimilarityMatrix.columns[0:10]

i = range(len(proteinList))
j = range(len(proteinList))
protParamList = list(itertools.product(i,j))

    #manager = Manager()
    #similarity_list = manager.list()
total_task_num=len(proteinList)**2

pool = Pool()
pool.map(parallelSimilarity, protParamList)
pool.close()
pool.join()

real_distance_list = [value[0] for value in similarity_list]
cosine_distance_list = [value[1] for value in similarity_list]

mseValue = mse(real_distance_list,cosine_distance_list)
print(mseValue)
'''

'\n# Multiprocess check\nproteinList = []\nmanager = Manager()\nsimilarity_list = manager.list()\n\ndef parallelSimilarity(paramList):\n    #print(paramList)\n    i = paramList[0]\n    j = paramList[1]\n    if j>i:  \n        protein1 = proteinList[i]\n        protein2 = proteinList[j]\n        prot1name = uniprot_df.query("Entry == @protein1")[\'Entry name\'].item()\n        prot2name = uniprot_df.query("Entry == @protein2")[\'Entry name\'].item()\n        prot1vec = protVecEmbeddingDict[()][prot1name]\n        prot2vec = protVecEmbeddingDict[()][prot2name]\n        #cosine will return in shape of input vectors first dimension\n        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()\n        real = human_protienSimilarityMatrix.loc[protein1,protein2]\n        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled\n        similarity_list.append((real,cos))\n\n#similarityMatrixFileName = "/media/DATA/serbulent/Code

In [14]:
# define similarity_list and proteinList as global variables
proteinList = []
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

def parallelSimilarity(paramList):
    i = paramList[0]
    j = paramList[1] 
    aspect = paramList[2]
    if j>i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]     
        prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
        prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()     
        prot1vec = np.asarray(LearnedVec.query("Gene == @prot1name")['Vector'].item())
        prot2vec = np.asarray(LearnedVec.query("Gene == @prot2name")['Vector'].item())
        #cosine will return in shape of input vectors first dimension
        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
        manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
        manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
        euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
        euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2)) 
        #print([(prot1name,prot2name),(prot1vec,prot2vec)])
        #print((prot1name,prot2name))
        #print()
        #print((cos,euclideanDist,norm(prot1vec,2),norm(prot2vec,2)))
        real = paramList[3]
        #real = human_protienSimilarityMatrix.loc[protein1,protein2]
        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
        #Similarity = 1-distance
        similarity_list.append((real,1-cos,1-manhattanDistNorm.item(),1-euclideanDistNorm.item()))
    return similarity_list


## Calculate similarity values with parallel processing

In [18]:
real_distance_list = []
cosine_distance_list = []
euclidian_distance_list = []

def calculateMSEforOntology(aspect,sparse=False):
    
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    #similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+\
    aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"

    
    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    proteinList = human_proteinSimilarityMatrix.columns
    
    #proteinListNew is referanced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)
        
    if sparse:
        #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load("SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:     
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i,j))
    protParamListNew = []
    # Prepare parameters for parallel processing these parameters will be 
    # used concurrently by different processes
    for tup in tqdm_notebook(protParamList):
        i = tup[0]
        j = tup[1]
        
        if sparse:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            tupNew = (tup[0],tup[1],aspect,real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1,protein2]
                tupNew = (tup[0],tup[1],aspect,real)
                protParamListNew.append(tupNew)

    total_task_num=len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew), total=total_task_num):
        pass
    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    #mseValue = mse(real_distance_list,cosine_distance_list)
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    
    random.seed(42)
    random_list = []
    for i in range(len(real_distance_list)):
        random_list.append(random.uniform(0, 1))
    
    if sparse:
        cosine_randomCorr = spearmanr(cosine_distance_list, random_list)
        manhattan_randomCorr = spearmanr(manhattan_distance_list, random_list)
        euclidian_randomCorr = spearmanr(euclidian_distance_list, random_list)
        print("Cosine Random Correlation for "+aspect+" is " + str(cosine_randomCorr))
        print("Manhattan Random Correlation for "+aspect+" is " + str(manhattan_randomCorr))
        print("Euclidian Random Correlation for "+aspect+" is " + str(euclidian_randomCorr))
      
    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    #return (cosine_distance_list,manhattan_distance_list,euclidian_distance_list)

    
    if sparse:
        return (cosineCorr,manhattanCorr,euclidianCorr,cosine_randomCorr,manhattan_randomCorr,euclidian_randomCorr)
    else:
        return (cosineCorr,manhattanCorr,euclidianCorr)
    

    

In [19]:
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"
#saveFileName = "SimilarityLearnedEmbeddingVec.csv"
saveFileName = "SimilarityLearnedEmbeddingVec_highest_200.csv"
f = open(saveFileName,'w')
f.write(buffer)
for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect)  
    print(corr)
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    # Python will convert \n to os.linesep
    f.close()
    

# 0.3673674654105104 mse for MF with 0:10
# 0.31965355246378196 mse for BP with 0:10
# 0.29460915219361683 mse for CC with 0:10
'''dl = calculateMSEforOntology("MF")
import matplotlib.pyplot
matplotlib.pyplot.hist(dl[0])
matplotlib.pyplot.hist(dl[1])
matplotlib.pyplot.hist(dl[2])'''

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for MF is SpearmanrResult(correlation=-0.2401017901513225, pvalue=6.523584505513545e-259)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.2503067046108497, pvalue=6.147398899937215e-282)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.2401017901514139, pvalue=6.523584502488044e-259)
(SpearmanrResult(correlation=-0.2401017901513225, pvalue=6.523584505513545e-259), SpearmanrResult(correlation=0.2503067046108497, pvalue=6.147398899937215e-282), SpearmanrResult(correlation=0.2401017901514139, pvalue=6.523584502488044e-259))


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for BP is SpearmanrResult(correlation=-0.15863480297908533, pvalue=2.6726065060063238e-112)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.14337665824409587, pvalue=7.036197229742844e-92)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.15863480297908533, pvalue=2.6726065060063238e-112)
(SpearmanrResult(correlation=-0.15863480297908533, pvalue=2.6726065060063238e-112), SpearmanrResult(correlation=0.14337665824409587, pvalue=7.036197229742844e-92), SpearmanrResult(correlation=0.15863480297908533, pvalue=2.6726065060063238e-112))


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for CC is SpearmanrResult(correlation=-0.028530847547553053, pvalue=5.689356569503309e-05)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.009651732466011226, pvalue=0.17335870257584748)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.028530847547553053, pvalue=5.689356569503309e-05)
(SpearmanrResult(correlation=-0.028530847547553053, pvalue=5.689356569503309e-05), SpearmanrResult(correlation=0.009651732466011226, pvalue=0.17335870257584748), SpearmanrResult(correlation=0.028530847547553053, pvalue=5.689356569503309e-05))


'dl = calculateMSEforOntology("MF")\nimport matplotlib.pyplot\nmatplotlib.pyplot.hist(dl[0])\nmatplotlib.pyplot.hist(dl[1])\nmatplotlib.pyplot.hist(dl[2])'

In [17]:
'''
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal\
,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"
#saveFileName = "SimilarityGene2Vec.csv"
saveFileName = "Similarity_Sparse_LearnedEmbeddingVec_highest_500.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect,True) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()
    
'''

'\nbuffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"\n#saveFileName = "SimilarityGene2Vec.csv"\nsaveFileName = "Similarity_Sparse_LearnedEmbeddingVec_highest_500.csv"\nf = open(saveFileName,\'w\')\nf.write(buffer)\n\nfor aspect in ["MF","BP","CC"]:\n    corr = calculateMSEforOntology(aspect,True) \n    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" \n    f = open(saveFileName,\'a\')\n    f.write(buffer) #Give your csv text here.\n    ## Python will convert \n to os.linesep\n    f.close()\n    \n'