## This notebook calculates similarity and error between protein embeddings and use GO semantic similarity as gold standart.

In [1]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
import random

In [3]:
colnames=['Gene', 'Vector'] 
representationFile = '/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/representation_vectors/Mut2Vec/Mut2Vec+PI+R_ENSG.txt'
Mut2Vec = pd.read_csv(representationFile,delimiter=' ',encoding='utf-8', names=None, header=None)

In [4]:
#ensemble_ids_path = '/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/Mut2Vec/ensemble_ids.txt'
#Mut2Vec[0].to_csv(ensemble_ids_path, index=False)

In [5]:
colnames=['Ensemble_ID', 'Translate', 'Entry','Entry_name','Status','Protein_names','Gene_names','Organism','Length'] 
ensemble2gene_file = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/ensembl2uniprot.tab"
ensemble2gene = pd.read_csv(ensemble2gene_file,delimiter='\t',encoding='utf-8',names=colnames, header=None,skiprows=1)

In [6]:
Mut2Vec_dataframe = pd.DataFrame(columns = ['Gene', 'Entry', 'Vector'])

for index, row in tqdm_notebook(Mut2Vec.iterrows(), total = len(Mut2Vec)):
    gene_id = row[0]
    protein_entry = ensemble2gene.loc[ensemble2gene['Ensemble_ID'] == gene_id]['Entry']
    protein_id = ""
    if len(protein_entry) >= 1:
        protein_id = list(protein_entry)[0]
    gene_vector = list(row[1:301])
    Mut2Vec_dataframe = Mut2Vec_dataframe.append({'Gene': gene_id,'Entry':protein_id,'Vector':gene_vector},ignore_index=True)

HBox(children=(IntProgress(value=0, max=18584), HTML(value='')))




In [7]:
len(Mut2Vec_dataframe.iloc[0]['Vector'])

300

In [8]:
Mut2Vec_dataframe.to_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/representation_vectors/representation_vector_dataframes/mut2vec_processed.pkl")

In [9]:
Mut2Vec_dataframe.to_csv("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/Mut2Vec/mut2vec_processed.csv")

In [6]:
# UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t')

In [7]:
Mut2VecProteinList = list(set(Mut2Vec_dataframe['Entry']))
Mut2VecProteinList.remove('')

In [8]:
'''
#This part was used to be sure parallel and sequential versions gives same results
cosine_distance_list1 = []
real_distance_list1 = []

#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"

human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)

proteinListTmp = human_proteinSimilarityMatrix.columns[0:10]
for i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):
    for j in range(len(proteinListTmp)):
        if j>i:
            protein2 = proteinListTmp[j]
            if protein1 in Mut2VecProteinList and protein2 in Mut2VecProteinList:
                
                prot1vec = np.asarray(Mut2Vec_dataframe.query("Protein_Entry == @protein1")['Vector'].item())
                prot2vec = np.asarray(Mut2Vec_dataframe.query("Protein_Entry == @protein2")['Vector'].item())
                #cosine will return in shape of input vectors first dimension
                cosine_dist = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
                cosine_norm = (1+cosine_dist)/2
                cosine_distance_list1.append(cosine_norm)
                real_distance_list1.append(human_proteinSimilarityMatrix.loc[protein1,protein2])

print(len(cosine_distance_list1))
print(spearmanr(real_distance_list1,cosine_distance_list1))


# MF    0.1866509691526721
# BP    0.3116883116883117
# CC   -0.2711730421384723
'''

'\n#This part was used to be sure parallel and sequential versions gives same results\ncosine_distance_list1 = []\nreal_distance_list1 = []\n\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"\n\nhuman_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)\nhuman_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)\n\nproteinListTmp = human_proteinSimilarityMatrix.columns[0:10]\nfor i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):\n    for j in range(len(proteinListTmp)):\n        if j>i:\n            protein2 = proteinListTmp[j]\n            if protein1 in Mut2VecProteinList and protein2 in Mut2VecProteinList:\n    

In [9]:
# define similarity_list and proteinList as global variables
proteinList = []
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

def parallelSimilarity(paramList):
    i = paramList[0]
    j = paramList[1] 
    aspect = paramList[2]
    if j>i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in Mut2VecProteinList and protein2 in Mut2VecProteinList:
            prot1vec = np.asarray(Mut2Vec_dataframe.query("Entry == @protein1")['Vector'].item())
            prot2vec = np.asarray(Mut2Vec_dataframe.query("Entry == @protein2")['Vector'].item())
            #print(str(protein1) + str(prot1vec))
            #print(str(protein2) + str(prot2vec))
            #cosine will return in shape of input vectors first dimension
            cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
            manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
            manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
            euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
            euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2)) 
            real = paramList[3]
            #real = human_protienSimilarityMatrix.loc[protein1,protein2]
            # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
            similarity_list.append((real,1-cos,1-manhattanDistNorm.item(),1-euclideanDistNorm.item()))
    return similarity_list

In [10]:
def calculateMSEforOntology(aspect,sparse=False):
    
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    #similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"\
    +aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    proteinList = human_proteinSimilarityMatrix.columns
    
    #proteinListNew is referanced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)
    if sparse:
        #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load("SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:     
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i,j))
    protParamListNew = []
    # Prepare parameters for parallel processing these parameters will be 
    # used concurrently by different processes
    for tup in tqdm_notebook(protParamList):
        i = tup[0]
        j = tup[1]
        
        if sparse:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            tupNew = (tup[0],tup[1],aspect,real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1,protein2]
                tupNew = (tup[0],tup[1],aspect,real)
                protParamListNew.append(tupNew)


    total_task_num=len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew), total=total_task_num):
        pass
    pool.close()
    pool.join()

    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    #mseValue = mse(real_distance_list,cosine_distance_list)
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)

    random.seed(42)
    random_list = []
    for i in range(len(real_distance_list)):
        random_list.append(random.uniform(0, 1))
    
    if sparse:
        cosine_randomCorr = spearmanr(cosine_distance_list, random_list)
        manhattan_randomCorr = spearmanr(manhattan_distance_list, random_list)
        euclidian_randomCorr = spearmanr(euclidian_distance_list, random_list)
        print("Cosine Random Correlation for "+aspect+" is " + str(cosine_randomCorr))
        print("Manhattan Random Correlation for "+aspect+" is " + str(manhattan_randomCorr))
        print("Euclidian Random Correlation for "+aspect+" is " + str(euclidian_randomCorr))
    
    
    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    if sparse:
        return (cosineCorr,manhattanCorr,euclidianCorr,cosine_randomCorr,manhattan_randomCorr,euclidian_randomCorr)
    else:
        return (cosineCorr,manhattanCorr,euclidianCorr)
    

Calculate Normal Correlations

In [11]:
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"
#saveFileName = "SimilarityMut2Vec.csv"
saveFileName = "SimilarityMut2Vec_highest_200.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()
    

# MF    0.1866509691526721
# BP    0.3116883116883117
# CC   -0.2711730421384723

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for MF is SpearmanrResult(correlation=-0.20664982255238126, pvalue=7.638675917122441e-178)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.19279289122553758, pvalue=1.2769773912789466e-154)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.20664982255238126, pvalue=7.638675917122441e-178)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for BP is SpearmanrResult(correlation=-0.2586085107136244, pvalue=1.5272830050960912e-266)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.24968551219897236, pvalue=5.069595486252185e-248)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.2586085107136244, pvalue=1.5272830050960912e-266)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for CC is SpearmanrResult(correlation=-0.10479894789959376, pvalue=6.045123802331219e-46)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.09861306676873008, pvalue=7.403060708066283e-41)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.10479894789959376, pvalue=6.045123802331219e-46)


Calculate Sparse Correlations

In [12]:
'''buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal\
,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"
#saveFileName = "SimilarityGene2Vec.csv"
saveFileName = "Similarity_Sparse_Mut2Vec_highest_500.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect,True) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()'''

'buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"\n#saveFileName = "SimilarityGene2Vec.csv"\nsaveFileName = "Similarity_Sparse_Mut2Vec_highest_500.csv"\nf = open(saveFileName,\'w\')\nf.write(buffer)\n\nfor aspect in ["MF","BP","CC"]:\n    corr = calculateMSEforOntology(aspect,True) \n    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" \n    f = open(saveFileName,\'a\')\n    f.write(buffer) #Give your csv text here.\n    ## Python will convert \n to os.linesep\n    f.close()'