This notebook calculates similarity and error between protein embeddings and use GO semantic similarity as gold standart.

In [1]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
import random

In [2]:
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"\
    +"CC"+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"

In [3]:
human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)

In [4]:
len(human_proteinSimilarityMatrix)

200

## Load protein vectors of ProtVec

In [5]:
protVecFile = gzip.GzipFile('/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/ProtVec/calculated_human_protein_vectors.npy.gz', "r")
protVecEmbeddingDict = np.load(protVecFile)

In [6]:
# UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t', header=None, names=uniprot_vars)

In [7]:
protVecDF = pd.DataFrame(columns=['Entry', 'Vector'])
i=0
for entry_name,vector in tqdm_notebook(protVecEmbeddingDict[()].items()):
    try:
        if len(vector) != 100:
            print("Size exception")
            print(entry_name)
            print(len(vector))
        entry = uniprot_df[uniprot_df['Entry name'] == entry_name]['Entry'].item()
        protVecDF.loc[i] = [entry,vector]
        i+=1
    except:
        print(entry_name)
        print(uniprot_df[uniprot_df['Entry name'] == entry_name])
        pass

HBox(children=(IntProgress(value=0, max=20422), HTML(value='')))

P01892
Empty DataFrame
Columns: [Entry, Entry name, Status, Protein names, Gene names, Organism, Length, Annotation]
Index: []



In [8]:
for vec in protVecDF['Vector']:
    if len(vec) != 100:
        print(len(vec))

In [9]:
protVecDF.to_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/protVec_dataframe.pkl")

In [10]:
protVecDF.to_csv("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/protVec_dataframe.csv")

In [11]:
'''
#This part was used to be sure parallel and sequential versions gives same results
cosine_distance_list1 = []
real_distance_list1 = []

similarityMatrixFileName = ""
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"

human_protienSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_protienSimilarityMatrix.set_index(human_protienSimilarityMatrix.columns, inplace = True)

proteinListTmp = human_protienSimilarityMatrix.columns[0:10] 
for i,protein1 in enumerate(proteinListTmp):
    for j in range(len(proteinListTmp)):
        if j>i:
            protein2 = proteinListTmp[j]
            prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
            prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
            prot1vec = protVecEmbeddingDict[()][prot1name]
            prot2vec = protVecEmbeddingDict[()][prot2name]
            #cosine will return in shape of input vectors first dimension
            cosine_distance_list1.append(cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item())
            real_distance_list1.append(human_protienSimilarityMatrix.loc[protein1,protein2])

print(mse(real_distance_list1,cosine_distance_list1))
'''


'\n#This part was used to be sure parallel and sequential versions gives same results\ncosine_distance_list1 = []\nreal_distance_list1 = []\n\nsimilarityMatrixFileName = ""\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"\nsimilarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"\n\nhuman_protienSimilarityMatrix = pd.read_csv(similarityMatrixFileName)\nhuman_protienSimilarityMatrix.set_index(human_protienSimilarityMatrix.columns, inplace = True)\n\nproteinListTmp = human_protienSimilarityMatrix.columns[0:10] \nfor i,protein1 in enumerate(proteinListTmp):\n    for j in range(len(proteinListTmp)):\n        if j>i:\n            protein2 = proteinListTmp[j]\n            prot1name = uniprot_df.query("Entry == @protein1")[\'Entry 

In [12]:
'''
# Multiprocess check
proteinList = []
manager = Manager()
similarity_list = manager.list()

def parallelSimilarity(paramList):
    #print(paramList)
    i = paramList[0]
    j = paramList[1]
    if j>i:  
        protein1 = proteinList[i]
        protein2 = proteinList[j]
        prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
        prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
        prot1vec = protVecEmbeddingDict[()][prot1name]
        prot2vec = protVecEmbeddingDict[()][prot2name]
        #cosine will return in shape of input vectors first dimension
        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
        real = human_protienSimilarityMatrix.loc[protein1,protein2]
        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
        similarity_list.append((real,cos))

#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"

human_protienSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_protienSimilarityMatrix.set_index(human_protienSimilarityMatrix.columns, inplace = True)
proteinList = human_protienSimilarityMatrix.columns[0:10]

i = range(len(proteinList))
j = range(len(proteinList))
protParamList = list(itertools.product(i,j))

    #manager = Manager()
    #similarity_list = manager.list()
total_task_num=len(proteinList)**2

pool = Pool()
pool.map(parallelSimilarity, protParamList)
pool.close()
pool.join()

real_distance_list = [value[0] for value in similarity_list]
cosine_distance_list = [value[1] for value in similarity_list]

mseValue = mse(real_distance_list,cosine_distance_list)
print(mseValue)
'''

'\n# Multiprocess check\nproteinList = []\nmanager = Manager()\nsimilarity_list = manager.list()\n\ndef parallelSimilarity(paramList):\n    #print(paramList)\n    i = paramList[0]\n    j = paramList[1]\n    if j>i:  \n        protein1 = proteinList[i]\n        protein2 = proteinList[j]\n        prot1name = uniprot_df.query("Entry == @protein1")[\'Entry name\'].item()\n        prot2name = uniprot_df.query("Entry == @protein2")[\'Entry name\'].item()\n        prot1vec = protVecEmbeddingDict[()][prot1name]\n        prot2vec = protVecEmbeddingDict[()][prot2name]\n        #cosine will return in shape of input vectors first dimension\n        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()\n        real = human_protienSimilarityMatrix.loc[protein1,protein2]\n        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled\n        similarity_list.append((real,cos))\n\n#similarityMatrixFileName = "/media/DATA/serbulent/Code

In [13]:
# define similarity_list and proteinList as global variables
proteinList = []
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

def parallelSimilarity(paramList):
    i = paramList[0]
    j = paramList[1] 
    aspect = paramList[2]

    if j>i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
        prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
        prot1vec = protVecEmbeddingDict[()][prot1name]
        prot2vec = protVecEmbeddingDict[()][prot2name]
        #cosine will return in shape of input vectors first dimension
        #print(str(prot1name) + str(prot1vec))
        #print(str(prot2name) + str(prot2vec))
        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
        manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
        manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
        euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
        euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2)) 
        real = paramList[3]
        #real = human_protienSimilarityMatrix.loc[protein1,protein2]
        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
        similarity_list.append((real,1-cos,1-manhattanDistNorm.item(),1-euclideanDistNorm.item()))
    return similarity_list


## Calculate similarity values with parallel processing

In [14]:
def calculateMSEforOntology(aspect,sparse=False):
    
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    #similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"\
    +aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"

    
    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    #proteinList = human_proteinSimilarityMatrix
    proteinList = human_proteinSimilarityMatrix.columns
    
     #proteinListNew is referanced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)
    if sparse:
        #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load("SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:     
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i,j))
    protParamListNew = []
    # Prepare parameters for parallel processing these parameters will be 
    # used concurrently by different processes
    for tup in tqdm_notebook(protParamList):
        i = tup[0]
        j = tup[1]
        
        if sparse:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            tupNew = (tup[0],tup[1],aspect,real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1,protein2]
                tupNew = (tup[0],tup[1],aspect,real)
                protParamListNew.append(tupNew)

    total_task_num=len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew), total=total_task_num):
        pass
    pool.close()
    pool.join()

    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    #mseValue = mse(real_distance_list,cosine_distance_list)
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    
    random.seed(42)
    random_list = []
    for i in range(len(real_distance_list)):
        random_list.append(random.uniform(0, 1))
    
    if sparse:
        cosine_randomCorr = spearmanr(cosine_distance_list, random_list)
        manhattan_randomCorr = spearmanr(manhattan_distance_list, random_list)
        euclidian_randomCorr = spearmanr(euclidian_distance_list, random_list)
        print("Cosine Random Correlation for "+aspect+" is " + str(cosine_randomCorr))
        print("Manhattan Random Correlation for "+aspect+" is " + str(manhattan_randomCorr))
        print("Euclidian Random Correlation for "+aspect+" is " + str(euclidian_randomCorr))
    
    
    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    if sparse:
        return (cosineCorr,manhattanCorr,euclidianCorr,cosine_randomCorr,manhattan_randomCorr,euclidian_randomCorr)
    else:
        return (cosineCorr,manhattanCorr,euclidianCorr)
    

In [15]:
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"
saveFileName = "tmp_CC_SimilarityProtVec_highest_200.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()
    
# 0.3673674654105104 mse for MF with 0:10
# 0.31965355246378196 mse for BP with 0:10
# 0.29460915219361683 mse for CC with 0:10

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for MF is SpearmanrResult(correlation=-0.02631840171325433, pvalue=0.0002047542857679604)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.08097384767506213, pvalue=2.6134536449824976e-30)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.08288474325035798, pvalue=1.109619961090317e-31)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for BP is SpearmanrResult(correlation=-0.03662264288717128, pvalue=2.3703231046938388e-07)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.08312061352119392, pvalue=7.47480595371219e-32)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.08437288744151857, pvalue=9.005691295013815e-33)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for CC is SpearmanrResult(correlation=-0.07048255781813766, pvalue=2.4092284708430528e-23)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.07242533908546427, pvalue=1.4605386494772896e-24)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.07168915481034553, pvalue=4.2626027668260024e-24)


In [16]:
'''buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal\
,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"
#saveFileName = "SimilarityGene2Vec.csv"
saveFileName = "Similarity_Sparse_ProtVec_highest_500.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect,True) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()
'''

'buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"\n#saveFileName = "SimilarityGene2Vec.csv"\nsaveFileName = "Similarity_Sparse_ProtVec_highest_500.csv"\nf = open(saveFileName,\'w\')\nf.write(buffer)\n\nfor aspect in ["MF","BP","CC"]:\n    corr = calculateMSEforOntology(aspect,True) \n    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" \n    f = open(saveFileName,\'a\')\n    f.write(buffer) #Give your csv text here.\n    ## Python will convert \n to os.linesep\n    f.close()\n'