In [1]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
import random

In [2]:
# UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
uniprot_metadata_directory = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/Uniprot/"
uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
uniprot_vars = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Annotation' ]
uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t')

In [3]:
colnames=['Gene', 'Vector'] 
representationFile = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/tcga_embedding/gemb_normal.csv"
tcga_embedding = pd.read_csv(representationFile,delimiter=',',encoding='utf-8', names=None, header=None,skiprows=1)

In [4]:
tcga_dataframe = pd.DataFrame(columns = ['Gene', 'Protein_Entry', 'Vector'])

for index, row in tqdm_notebook(tcga_embedding.iterrows(), total = len(tcga_embedding)):
    gene_name = row[0]
    vector = list(row[1:51])
    protein_id = ""
    tcga_dataframe = tcga_dataframe.append({'Gene': gene_name,'Protein_Entry':protein_id,'Vector':vector},ignore_index=True)

HBox(children=(IntProgress(value=0, max=20531), HTML(value='')))




In [5]:
tcgaGeneList = list(set(tcga_dataframe['Gene']))

In [6]:
tcga_embedding = pd.DataFrame(columns=['Gene', 'Entry', 'Vector'])

for index, row in tqdm_notebook(uniprot_df.iterrows(), total= len(uniprot_df)):
    gene_names = str(row['Gene names']).split()
    new_gene = True
    for gene_name in gene_names:
        if gene_name in tcgaGeneList and new_gene == True:
            vector = tcga_dataframe.query("Gene == @gene_name")['Vector'].item()           
            tcga_embedding = tcga_embedding.append({'Gene': gene_name,'Entry': row['Entry'],\
                                                          'Vector': vector},ignore_index=True)
            new_gene = False
        

HBox(children=(IntProgress(value=0, max=20421), HTML(value='')))




In [7]:
tcga_embedding[0:5]

Unnamed: 0,Gene,Entry,Vector
0,TEX13A,Q9BXU3,"[-0.16341070000000002, 0.13177536, 0.027271714..."
1,LARS2,Q15031,"[0.012629689, 0.01917274, -0.022130901, 0.0146..."
2,TXNDC11,Q6PKC3,"[0.04593855, -0.012116345, -0.11811426, 0.0997..."
3,TXK,P42681,"[0.038961213, 0.069318585, 0.00691180160000000..."
4,TXLNA,P40222,"[0.03339354, -0.017174669, -0.021231595, 0.054..."


In [8]:
tcga_embedding.to_pickle("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/tcga_embedding_dataframe.pkl")

In [9]:
tcga_embedding.to_csv("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/embedding_dataframes/tcga_embedding_dataframe.csv")

In [10]:
tcgaProteinList = list(tcga_embedding['Entry'])

In [11]:
'''
#This part was used to be sure parallel and sequential versions gives same results
cosine_distance_list1 = []
real_distance_list1 = []

similarityMatrixFileName = ""
similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"
#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"

human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)

proteinListTmp = human_proteinSimilarityMatrix.columns[0:10]
for i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):
    for j in range(len(proteinListTmp)):
        if j>i:
            protein2 = proteinListTmp[j]
            if protein1 in tcgaProteinList and protein2 in tcgaProteinList:
                prot1vec = np.asarray(tcga_embedding.query("Entry == @protein1")['Vector'].item())
                prot2vec = np.asarray(tcga_embedding.query("Entry == @protein2")['Vector'].item())
                #cosine will return in shape of input vectors first dimension
                cosine_dist = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
                cosine_distance_list1.append(cosine_dist)
                real_distance_list1.append(human_proteinSimilarityMatrix.loc[protein1,protein2])

print(len(cosine_distance_list1))
print(spearmanr(real_distance_list1,cosine_distance_list1))

# MF corr 0.26526862117740185
# BP corr 0.12727272727272726
# CC corr 0.0625226474707022
'''

'\n#This part was used to be sure parallel and sequential versions gives same results\ncosine_distance_list1 = []\nreal_distance_list1 = []\n\nsimilarityMatrixFileName = ""\nsimilarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_MF_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_BP_protienSimilarityMatrix.csv"\n#similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_CC_protienSimilarityMatrix.csv"\n\nhuman_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)\nhuman_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)\n\nproteinListTmp = human_proteinSimilarityMatrix.columns[0:10]\nfor i,protein1 in tqdm_notebook(enumerate(proteinListTmp)):\n    for j in range(len(proteinListTmp)):\n        if j>i:\n            protein2 = proteinListTmp[j]\n            if protein1 in tcgaProteinList and protein2 i

In [12]:
# define similarity_list and proteinList as global variables
proteinList = []
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

def parallelSimilarity(paramList):
    i = paramList[0]
    j = paramList[1] 
    aspect = paramList[2]
    if j>i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in tcgaProteinList and protein2 in tcgaProteinList:
            prot1vec = np.asarray(tcga_embedding.query("Entry == @protein1")['Vector'].item())
            prot2vec = np.asarray(tcga_embedding.query("Entry == @protein2")['Vector'].item())
            #print(str(protein1) + str(prot1vec))
            #print(str(protein2) + str(prot2vec))
            #cosine will return in shape of input vectors first dimension
            cos = cosine(prot1vec.reshape(1,-1),prot2vec.reshape(1,-1)).item()
            manhattanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'cityblock')
            manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
            euclideanDist = cdist(prot1vec.reshape(1,-1), prot2vec.reshape(1,-1), 'euclidean')
            euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2)) 
            real = paramList[3]
            #real = human_protienSimilarityMatrix.loc[protein1,protein2]
            # To ensure real and calculated values appended to same postion they saved similtanously and then decoupled
            #Similarity = 1-distance
            similarity_list.append((real,1-cos,1-manhattanDistNorm.item(),1-euclideanDistNorm.item()))
    return similarity_list

In [16]:
def calculateMSEforOntology(aspect,sparse=False):
    
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    #similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"+aspect+"_protienSimilarityMatrix.csv"
    similarityMatrixFileName = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/preprocess/human_"\
    +aspect+"_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"


    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace = True)
    proteinList = human_proteinSimilarityMatrix.columns
    
    #proteinListNew is referanced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)
        
    if sparse:
        #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
        sparse_sim_dir = "/media/DATA/serbulent/Code/Thesis/ReviewPaper/embedding_similarity/auxilary_input/"
        sparsified_similarity_coordinates = np.load(sparse_sim_dir+"SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:     
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i,j))
    protParamListNew = []
    # Prepare parameters for parallel processing these parameters will be 
    # used concurrently by different processes
    for tup in tqdm_notebook(protParamList):
        i = tup[0]
        j = tup[1]
        
        if sparse:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            tupNew = (tup[0],tup[1],aspect,real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1,protein2]
                tupNew = (tup[0],tup[1],aspect,real)
                protParamListNew.append(tupNew)
    total_task_num=len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    for similarity_listRet in tqdm_notebook(pool.imap_unordered(parallelSimilarity, protParamListNew), total=total_task_num):
        pass
    pool.close()
    pool.join()

    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    #mseValue = mse(real_distance_list,cosine_distance_list)
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    
    random.seed(42)
    random_list = []
    for i in range(len(real_distance_list)):
        random_list.append(random.uniform(0, 1))
    
    if sparse:
        cosine_randomCorr = spearmanr(cosine_distance_list, random_list)
        manhattan_randomCorr = spearmanr(manhattan_distance_list, random_list)
        euclidian_randomCorr = spearmanr(euclidian_distance_list, random_list)
        print("Cosine Random Correlation for "+aspect+" is " + str(cosine_randomCorr))
        print("Manhattan Random Correlation for "+aspect+" is " + str(manhattan_randomCorr))
        print("Euclidian Random Correlation for "+aspect+" is " + str(euclidian_randomCorr))
    
    
    print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    if sparse:
        return (cosineCorr,manhattanCorr,euclidianCorr,cosine_randomCorr,manhattan_randomCorr,euclidian_randomCorr)
    else:
        return (cosineCorr,manhattanCorr,euclidianCorr)
    

In [17]:
'''
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"
#saveFileName = "SimilarityTCGA_Embedding.csv"
saveFileName = "SimilarityTCGA_Embedding_highest_500.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()

# MF corr 0.26526862117740185
# BP corr 0.12727272727272726
# CC corr 0.0625226474707022

'''

'\nbuffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"\n#saveFileName = "SimilarityTCGA_Embedding.csv"\nsaveFileName = "SimilarityTCGA_Embedding_highest_500.csv"\nf = open(saveFileName,\'w\')\nf.write(buffer)\n\nfor aspect in ["MF","BP","CC"]:\n    corr = calculateMSEforOntology(aspect) \n    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" \n    f = open(saveFileName,\'a\')\n    f.write(buffer) #Give your csv text here.\n    ## Python will convert \n to os.linesep\n    f.close()\n\n# MF corr 0.26526862117740185\n# BP corr 0.12727272727272726\n# CC corr 0.0625226474707022\n\n'

In [18]:
buffer = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal\
,random_cosineCorr,random_cosineCorrPVal,random_manhattanCorr,random_manhattanCorrPVal,random_euclidianCorr,random_euclidianCorrPVal\n"
#saveFileName = "SimilarityGene2Vec.csv"
saveFileName = "SimilarityTCGA_Embedding_Sparse_highest_500.csv"
f = open(saveFileName,'w')
f.write(buffer)

for aspect in ["MF","BP","CC"]:
    corr = calculateMSEforOntology(aspect,True) 
    buffer = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
    + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n" 
    f = open(saveFileName,'a')
    f.write(buffer) #Give your csv text here.
    ## Python will convert \n to os.linesep
    f.close()

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))




HBox(children=(IntProgress(value=0, max=247), HTML(value='')))


Cosine Random Correlation for MF is SpearmanrResult(correlation=-0.017621512908510887, pvalue=0.7833190749517686)
Manhattan Random Correlation for MF is SpearmanrResult(correlation=0.006071866882429874, pvalue=0.9245139277839889)
Euclidian Random Correlation for MF is SpearmanrResult(correlation=0.010656096150319512, pvalue=0.867930461432932)
Cosine Correlation for MF is SpearmanrResult(correlation=-0.039360126394786206, pvalue=0.5389289889673823)
Manhattan Correlation for MF is SpearmanrResult(correlation=0.037544813521340197, pvalue=0.55782521975525)
Euclidian Correlation for MF is SpearmanrResult(correlation=0.04983686769859518, pvalue=0.4364727787706798)


HBox(children=(IntProgress(value=0, max=247), HTML(value='')))




HBox(children=(IntProgress(value=0, max=247), HTML(value='')))


Cosine Random Correlation for BP is SpearmanrResult(correlation=0.0012032668974985424, pvalue=0.9849887640655707)
Manhattan Random Correlation for BP is SpearmanrResult(correlation=-0.004430825292972794, pvalue=0.9447643623853753)
Euclidian Random Correlation for BP is SpearmanrResult(correlation=-0.0010153311014630322, pvalue=0.9873331244174579)
Cosine Correlation for BP is SpearmanrResult(correlation=-0.4925860921140228, pvalue=1.6732887251948711e-16)
Manhattan Correlation for BP is SpearmanrResult(correlation=0.4822408636127631, pvalue=8.634372908060867e-16)
Euclidian Correlation for BP is SpearmanrResult(correlation=0.4962874716901798, pvalue=9.176679617487395e-17)


HBox(children=(IntProgress(value=0, max=245), HTML(value='')))




HBox(children=(IntProgress(value=0, max=245), HTML(value='')))


Cosine Random Correlation for CC is SpearmanrResult(correlation=0.0313326483325226, pvalue=0.6276642496376734)
Manhattan Random Correlation for CC is SpearmanrResult(correlation=-0.03121071988166167, pvalue=0.6290043389327324)
Euclidian Random Correlation for CC is SpearmanrResult(correlation=-0.025290416212080995, pvalue=0.6954630425306461)
Cosine Correlation for CC is SpearmanrResult(correlation=-0.4738956690421547, pvalue=5.947607454519805e-15)
Manhattan Correlation for CC is SpearmanrResult(correlation=0.5033396964743626, pvalue=6.003047049037164e-17)
Euclidian Correlation for CC is SpearmanrResult(correlation=0.5055615038011618, pvalue=4.166986047730295e-17)
