In [1]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
import random

In [2]:
# UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t')

In [3]:
#colnames=['Gene', 'Vector'] 
representationFile = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/UniRep/UniRep_calculated_human_protein_vectors.npy"
UniRep_embedding = np.load(representationFile)

In [10]:
UniRep_embedding = UniRep_embedding.item()

In [11]:
UniRep_embedding

defaultdict(float,
            {'CRBA1_HUMAN': array([ 0.00656064, -0.04175284,  0.02659303, ...,  1.5471516 ,
                     3.1354516 ,  2.2944388 ], dtype=float32),
             'IWS1_HUMAN': array([ 0.01218513, -0.03578384,  0.02830771, ...,  0.43539655,
                     0.24572667,  4.3294578 ], dtype=float32),
             'BORC6_HUMAN': array([ 0.00484324, -0.05044174,  0.01257045, ...,  2.5620964 ,
                    -3.4162796 ,  1.0339303 ], dtype=float32),
             'RDH8_HUMAN': array([0.01840623, 0.10624367, 0.088156  , ..., 0.95909464, 0.06288785,
                    0.17586654], dtype=float32),
             'ZO3_HUMAN': array([ 0.00945478, -0.02987122,  0.04905148, ...,  0.7417364 ,
                     3.6578367 ,  6.471432  ], dtype=float32),
             'KANK4_HUMAN': array([ 0.00632587, -0.06175105,  0.02536356, ...,  3.1720257 ,
                    -0.078334  , -1.4627335 ], dtype=float32),
             'APOD_HUMAN': array([ 0.00499755, -0.0434384 ,  

In [12]:
UniRep_embedding['CRBA1_HUMAN']

array([ 0.00656064, -0.04175284,  0.02659303, ...,  1.5471516 ,
        3.1354516 ,  2.2944388 ], dtype=float32)

In [13]:
UniRepDF = pd.DataFrame(columns=['Entry', 'Vector'])
i=0
for entry_name,vector in tqdm_notebook(UniRep_embedding.items()):
    try:
        entry = uniprot_df[uniprot_df['Entry name'] == entry_name]['Entry'].item()
        UniRepDF.loc[i] = [entry,vector.tolist()]
        i+=1
    except:
        print(entry_name)
        print(uniprot_df[uniprot_df['Entry name'] == entry_name])
        pass

HBox(children=(IntProgress(value=0, max=20421), HTML(value='')))




In [14]:
UniRepDF.to_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/UniRep_dataframe.pkl")

In [11]:
UniRepDF.to_csv("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/UniRep_dataframe.csv")

In [6]:
'''
#This part was used to be sure parallel and sequential versions gives same results
cosine_distance_list1 = []
real_distance_list1 = []

similarityMatrixFileName = ""
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"

human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)

proteinListTmp = human_proteinSimilarityMatrix.columns[0:10]
for i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):
    for j in range(len(proteinListTmp)):
        if j>i:
            protein2 = proteinListTmp[j]
            #print((protein1,protein2))
            prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
            prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()
            #if protein1 in LearnedVec_GenesList and protein2 in LearnedVec_GenesList:
            prot1vec = np.asarray(UniRep_embedding[prot1name].tolist())
            prot2vec = np.asarray(UniRep_embedding[prot2name].tolist())
            #cosine will return in shape of input vectors first dimension
            cosine_dist = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
            cosine_distance_list1.append(cosine_dist)
            real_distance_list1.append(human_proteinSimilarityMatrix.loc[protein1,protein2])

print(len(cosine_distance_list1))
print(spearmanr(real_distance_list1,cosine_distance_list1))

# MF corr 0.37317002949322214
# BP corr 0.4704371029265753
# CC corr 0.0991534082649176

'''


'\n#This part was used to be sure parallel and sequential versions gives same results\ncosine_distance_list1 = []\nreal_distance_list1 = []\n\nsimilarityMatrixFileName = ""\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"\n\nhuman_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)\nhuman_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)\n\nproteinListTmp = human_proteinSimilarityMatrix.columns[0:10]\nfor i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):\n    for j in range(len(proteinListTmp)):\n        if j>i:\n            protein2 = proteinListTmp[j]\n            #print((protein1,protein2))\n            pro

In [7]:
# define similarity_list and proteinList as global variables
proteinList = []
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

def parallelSimilarity(paramList):
    i = paramList[0]
    j = paramList[1] 
    aspect = paramList[2]
    if j>i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]     
        prot1name = uniprot_df.query("Entry == @protein1")['Entry name'].item()
        prot2name = uniprot_df.query("Entry == @protein2")['Entry name'].item()     
        prot1vec = np.asarray(UniRep_embedding[prot1name].tolist())
        prot2vec = np.asarray(UniRep_embedding[prot2name].tolist())
        #cosine will return in shape of input vectors first dimension
        cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
        manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
        manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
        euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
        euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2)) 
        #print([(prot1name,prot2name),(prot1vec,prot2vec)])
        #print((prot1name,prot2name))
        #print()
        #print((cos,euclideanDist,norm(prot1vec,2),norm(prot2vec,2)))
        real = paramList[3]
        #real = human_protienSimilarityMatrix.loc[protein1,protein2]
        # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
        #Similarity = 1 - Normlized_distance
        similarity_list.append((real,1-cos,1-manhattanDistNorm.item(),1-euclideanDistNorm.item()))
    return similarity_list


In [8]:
real_distance_list = []
cosine_distance_list = []
euclidian_distance_list = []

def calculateMSEforOntology(aspect,sparse=False):
    
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

#    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"\
    +aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    proteinList = human_proteinSimilarityMatrix.columns
    
    #proteinListNew is referanced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)
        
    if sparse:
        #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load("SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:     
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i,j))
    protParamListNew = []
    # Prepare parameters for parallel processing these parameters will be 
    # used concurrently by different processes
    for tup in tqdm_notebook(protParamList):
        i = tup[0]
        j = tup[1]
        
        if sparse:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            tupNew = (tup[0],tup[1],aspect,real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1,protein2]
                tupNew = (tup[0],tup[1],aspect,real)
                protParamListNew.append(tupNew)
    total_task_num=len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew), total=total_task_num):
        pass
    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    #mseValue = mse(real_distance_list,cosine_distance_list)
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    
    random.seed(42)
    random_list = []
    for i in range(len(real_distance_list)):
        random_list.append(random.uniform(0, 1))
    
    if sparse:
        cosine_randomCorr = spearmanr(cosine_distance_list, random_list)
        manhattan_randomCorr = spearmanr(manhattan_distance_list, random_list)
        euclidian_randomCorr = spearmanr(euclidian_distance_list, random_list)
        print("Cosine Random Correlation for "+aspect+" is " + str(cosine_randomCorr))
        print("Manhattan Random Correlation for "+aspect+" is " + str(manhattan_randomCorr))
        print("Euclidian Random Correlation for "+aspect+" is " + str(euclidian_randomCorr))
    
    
    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    if sparse:
        return (cosineCorr,manhattanCorr,euclidianCorr,cosine_randomCorr,manhattan_randomCorr,euclidian_randomCorr)
    else:
        return (cosineCorr,manhattanCorr,euclidianCorr)
    

In [9]:
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"
#saveFileName = "SimilarityUniRep.csv"
saveFileName = "SimilarityUniRep_highest_200.csv"

f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()
    
# MF corr 0.37317002949322214
# BP corr 0.4704371029265753
# CC corr 0.0991534082649176

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for MF is SpearmanrResult(correlation=-0.26724953213922437, pvalue=0.0)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.1800707227181251, pvalue=1.1771460757225988e-144)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.16228398097340532, pvalue=1.6722952265847623e-117)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for BP is SpearmanrResult(correlation=-0.16480476169178904, pvalue=3.5987631764445505e-121)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.13806024911163078, pvalue=2.878970733349847e-85)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.13810684441134438, pvalue=2.5258661779012862e-85)


HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19900), HTML(value='')))


Cosine Correlation for CC is SpearmanrResult(correlation=-0.013956885972626384, pvalue=0.04897224101546829)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.02904511420886262, pvalue=4.168708589920952e-05)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.057978215859549306, pvalue=2.7183949050689583e-16)


In [10]:
'''buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal\
,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"
#saveFileName = "SimilarityGene2Vec.csv"
saveFileName = "Similarity_Sparse_UniRep_highest_500.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect,True) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()'''

'buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"\n#saveFileName = "SimilarityGene2Vec.csv"\nsaveFileName = "Similarity_Sparse_UniRep_highest_500.csv"\nf = open(saveFileName,\'w\')\nf.write(buffer)\n\nfor aspect in ["MF","BP","CC"]:\n    corr = calculateMSEforOntology(aspect,True) \n    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" \n    f = open(saveFileName,\'a\')\n    f.write(buffer) #Give your csv text here.\n    ## Python will convert \n to os.linesep\n    f.close()'