## This notebook calculates similarity and error between protein embeddings and use GO semantic similarity as gold standart.

In [1]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr

## Load protein vectors of SMILESVec

In [4]:
dir = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/representations/SMILESVecProteinRepresentation/source"
with open(dir+'/protein.vec', 'rb') as f:
        SmilesVecTmp = pickle.load(f, encoding='bytes')

In [5]:
#SmilesVecTmp

[b'none',
 b'none',
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 b'none',
 [-0.011579623250945506,
  -0.0030352282429762134,
  -0.015902195280684213,
  -0.0186054159069135,
  0.03845849909245425,
  0.017461331691691444,
  -0.02543278002254595,
  0.010407785675375437,
  -0.0025903173725196583,
  0.027720137242756966,
  -0.025913873412750053,


In [3]:
protNamesFile = '/media/DATA/serbulent/Code/Thesis/ReviewPaper/representations/SMILESVecProteinRepresentation/source/utils/prots_sample.txt'
protNames = [line.rstrip('\n') for line in open(protNamesFile)]

In [4]:
SmilesVecDictTmp = dict(zip(protNames, SmilesVecTmp))

In [52]:
#SmilesVecDictTmp

{'Entry': b'none',
 'A0A584': b'none',
 'Q9BXU3': b'none',
 'Q15031': b'none',
 'Q6PKC3': b'none',
 'P42681': [-0.011579623250945506,
  -0.0030352282429762134,
  -0.015902195280684213,
  -0.0186054159069135,
  0.03845849909245425,
  0.017461331691691444,
  -0.02543278002254595,
  0.010407785675375437,
  -0.0025903173725196583,
  0.027720137242756966,
  -0.025913873412750053,
  -0.003181346934659988,
  -0.04116342294465953,
  0.0032499054808979477,
  0.02177170010103968,
  0.0355184216657196,
  0.020602162572366876,
  0.0067042465659046336,
  0.019752834958010418,
  -0.0011752074858402662,
  0.04225662029116363,
  -0.010004511405733605,
  -0.010289393306373748,
  -0.038631011713293494,
  0.01548805304727205,
  -0.018132852381648062,
  0.03768075253240829,
  -0.00382887458272924,
  0.0018206210666556505,
  0.034942309894501554,
  0.008391669246429636,
  -0.039028722395969814,
  0.058096257522216366,
  -0.028354512168216384,
  -0.024229849316872863,
  -0.017091153920446703,
  0.0015532069

In [5]:
SmilesVecDict = dict()
for k,v in SmilesVecDictTmp.items():
    if v != b'none':
        SmilesVecDict[k] = v

In [51]:
SmilesVecDict

{'P42681': [-0.011579623250945506,
  -0.0030352282429762134,
  -0.015902195280684213,
  -0.0186054159069135,
  0.03845849909245425,
  0.017461331691691444,
  -0.02543278002254595,
  0.010407785675375437,
  -0.0025903173725196583,
  0.027720137242756966,
  -0.025913873412750053,
  -0.003181346934659988,
  -0.04116342294465953,
  0.0032499054808979477,
  0.02177170010103968,
  0.0355184216657196,
  0.020602162572366876,
  0.0067042465659046336,
  0.019752834958010418,
  -0.0011752074858402662,
  0.04225662029116363,
  -0.010004511405733605,
  -0.010289393306373748,
  -0.038631011713293494,
  0.01548805304727205,
  -0.018132852381648062,
  0.03768075253240829,
  -0.00382887458272924,
  0.0018206210666556505,
  0.034942309894501554,
  0.008391669246429636,
  -0.039028722395969814,
  0.058096257522216366,
  -0.028354512168216384,
  -0.024229849316872863,
  -0.017091153920446703,
  0.0015532069447019522,
  -0.012483578691549419,
  -0.019800632310843638,
  0.02233543462166993,
  -0.0201054381

In [None]:
SmilesVecDF = pd.DataFrame(columns=['Entry', 'Vector'])

for entry_name,vector in tqdm_notebook(SmilesVecDict.items()):
    entry = uniprot_df[uniprot_df['Entry name'] == entry_name]['Entry'].item()
    protVecDF.loc[i] = [entry,vector]


In [6]:
# UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t', header=None, names=uniprot_vars)

In [None]:
'''
#This part was used to be sure parallel and sequential versions gives same results
cosine_distance_list1 = []
real_distance_list1 = []

similarityMatrixFileName = ""
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"

human_protienSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_protienSimilarityMatrix.set_index(human_protienSimilarityMatrix.columns, inplace = True)

proteinListTmp = human_protienSimilarityMatrix.columns
for i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):
    for j in range(len(proteinListTmp)):
        if j>i:
            protein2 = proteinListTmp[j]
            if protein1 in SmilesVecDict and protein2 in SmilesVecDict:
                prot1vec = np.asarray(SmilesVecDict[protein1])
                prot2vec = np.asarray(SmilesVecDict[protein2])
                prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
                prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
                #cosine will return in shape of input vectors first dimension
                cosine_distance_list1.append(cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item())
                real_distance_list1.append(human_protienSimilarityMatrix.loc[protein1,protein2])

print(len(cosine_distance_list1))
print(mse(real_distance_list1,cosine_distance_list1))
'''


In [None]:
'''
# Multiprocess check
proteinList = []
manager = Manager()
similarity_list = manager.list()

def parallelSimilarity(paramList):
    #print(paramList)
    i = paramList[0]
    j = paramList[1]
    if j>i:  
        protein1 = proteinList[i]
        protein2 = proteinList[j]
        prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
        prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
        prot1vec = protVecEmbeddingDict[()][prot1name]
        prot2vec = protVecEmbeddingDict[()][prot2name]
        #cosine will return in shape of input vectors first dimension
        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
        real = human_protienSimilarityMatrix.loc[protein1,protein2]
        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
        similarity_list.append((real,cos))

#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"

human_protienSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_protienSimilarityMatrix.set_index(human_protienSimilarityMatrix.columns, inplace = True)
proteinList = human_protienSimilarityMatrix.columns[0:10]

i = range(len(proteinList))
j = range(len(proteinList))
protParamList = list(itertools.product(i,j))

    #manager = Manager()
    #similarity_list = manager.list()
total_task_num=len(proteinList)**2

pool = Pool()
pool.map(parallelSimilarity, protParamList)
pool.close()
pool.join()

real_distance_list = [value[0] for value in similarity_list]
cosine_distance_list = [value[1] for value in similarity_list]

mseValue = mse(real_distance_list,cosine_distance_list)
print(mseValue)
'''

In [48]:
# define similarity_list and proteinList as global variables
proteinList = []
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

def parallelSimilarity(paramList):
    i = paramList[0]
    j = paramList[1] 
    aspect = paramList[2]
    if j>i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in SmilesVecDict and protein2 in SmilesVecDict:
            print((protein1,protein2))
            prot1vec = np.asarray(SmilesVecDict[protein1])
            prot2vec = np.asarray(SmilesVecDict[protein2])
            print((prot1vec,prot2vec))
            prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
            prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
            #cosine will return in shape of input vectors first dimension
            cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
            #print(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1))
            manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
            manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))

            euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
            euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2))     
            real = paramList[3]
            #real = human_protienSimilarityMatrix.loc[protein1,protein2]
            # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
            similarity_list.append((real,cos,manhattanDistNorm.item(),euclideanDistNorm.item()))
    return similarity_list


## Calculate similarity values with parallel processing

In [49]:
def calculateMSEforOntology(aspect):
    
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    proteinList = human_proteinSimilarityMatrix.columns[0:20]
    
    #proteinListNew is referanced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)
    
    i = range(len(proteinList))
    j = range(len(proteinList))
    protParamList = list(itertools.product(i,j))
    protParamListNew = []
    for tup in tqdm_notebook(protParamList):
        i = tup[0]
        j = tup[1]
        if j > i:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            tupNew = (tup[0],tup[1],aspect,real)
            protParamListNew.append(tupNew)

    total_task_num=len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew), total=total_task_num):
        pass
    pool.close()
    pool.join()

    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    #mseValue = mse(real_distance_list,cosine_distance_list)
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    
    print(euclidian_distance_list)
    print(manhattan_distance_list)
     
    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))
    
    return (cosineCorr,manhattanCorr,euclidianCorr)
    

In [50]:
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"
saveFileName = "SimilaritySmilesVec.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF"]:#,"BP","CC"]:
    corr = calculateMSEforOntology(aspect) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()
    
# 0.4787368636363636 mse for MF with 0:40
# 0.5559994999999999 mse for BP with 0:40
# 0.2630654 mse for CC with 0:40

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))

HBox(children=(IntProgress(value=0, max=190), HTML(value='')))

('O00141', 'O00204')
('O00141', 'O00182')
('O00182', 'O00204')
(array([-0.01157962, -0.00303523, -0.0159022 , -0.01860542,  0.0384585 ,
        0.01746133, -0.02543278,  0.01040779, -0.00259032,  0.02772014,
       -0.02591387, -0.00318135, -0.04116342,  0.00324991,  0.0217717 ,
        0.03551842,  0.02060216,  0.00670425,  0.01975283, -0.00117521,
        0.04225662, -0.01000451, -0.01028939, -0.03863101,  0.01548805,
       -0.01813285,  0.03768075, -0.00382887,  0.00182062,  0.03494231,
        0.00839167, -0.03902872,  0.05809626, -0.02835451, -0.02422985,
       -0.01709115,  0.00155321, -0.01248358, -0.01980063,  0.02233543,
       -0.02010544, -0.02407252, -0.021841  ,  0.01770697, -0.01955368,
        0.02219863,  0.02786354, -0.02743089, -0.0017235 ,  0.0272165 ,
       -0.00252399,  0.03731852,  0.00657341, -0.04398925,  0.04397143,
        0.03162259, -0.0070724 , -0.02533   ,  0.01864967,  0.01869695,
        0.03277477, -0.01470132,  0.00648498,  0.04486331,  0.0212148 ,


       -0.0078021 ,  0.01150403, -0.01369833, -0.02030842,  0.03164586]))
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
Cosine Correlation for MF is SpearmanrResult(correlation=nan, pvalue=nan)
Manhattan Correlation for MF is SpearmanrResult(correlation=nan, pvalue=nan)
Euclidian Correlation for MF is SpearmanrResult(correlation=nan, pvalue=nan)
