In [2]:
import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
import random
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
from functools import partial

In [7]:
class SimilarityCorrelation:
    """Correlate representation-space similarities with precomputed protein similarity matrices.

    For every evaluated protein pair the class computes the cosine similarity and the
    normalised Manhattan and Euclidean similarities of the two representation vectors,
    then reports the Spearman correlation of each against the value read from the
    similarity matrix CSV.
    """

    def __init__(self,pkl_dataframe_file_name,representation_name):
        """Load the UniProt metadata and the representation dataframe.

        Parameters
        ----------
        pkl_dataframe_file_name : str
            Path to a pickled dataframe with an ``Entry`` column (UniProt entry id) and
            a ``Vector`` column (the representation vector of that protein).
        representation_name : str
            Prefix used for the result CSV files written by ``calculate_all_correlations``.
        """
        # UNIPROT data for mapping between UNIPROT accession numbers and UNIPROT entry names
        uniprot_metadata_directory = "./DATA/Uniprot/"
        uniprot_metadata_file_path = uniprot_metadata_directory + "uniprot_human_all.tab"
        self.uniprot_df = pd.read_csv(uniprot_metadata_file_path, sep='\t')
        # NOTE(security): unpickling executes arbitrary code; only load trusted files here.
        self.representation_dataframe = pd.read_pickle(pkl_dataframe_file_name)
        self.protein_names = self.representation_dataframe['Entry'].tolist()
        # Column labels of the similarity matrix currently being evaluated
        self.proteinList = []
        # Manager-backed lists are shared between the parent and the pool workers
        self.manager = Manager()
        self.similarity_list = self.manager.list()
        self.proteinListNew = self.manager.list()
        self.representation_name = representation_name

    def __getstate__(self):
        """Return the instance state without the (unpicklable) manager object.

        ``parallelSimilarity`` is handed to the pool as a bound method, so the instance
        is pickled for the workers. The manager itself cannot be pickled, whereas the
        list proxies it created can.
        """
        state = self.__dict__.copy()
        state.pop('manager', None)
        return state

    def parallelSimilarity(self,paramList):
        """Compute the similarity measures for one protein pair.

        Parameters
        ----------
        paramList : sequence
            ``(i, j, aspect, real, ...)`` where ``i`` and ``j`` index ``self.proteinListNew``
            and ``real`` is the reference similarity of the pair.

        Returns
        -------
        tuple or None
            ``(real, cosine, 1 - normalised Manhattan, 1 - normalised Euclidean)``, which
            is also appended to ``self.similarity_list``; ``None`` when the pair is
            skipped (``j <= i`` or a protein has no representation).
        """
        i = paramList[0]
        j = paramList[1]
        # Only pairs above the diagonal are evaluated
        if not (j > i):
            return None
        protein1 = self.proteinListNew[i]
        protein2 = self.proteinListNew[j]
        if protein1 not in self.protein_names or protein2 not in self.protein_names:
            return None

        protein_embedding_dataframe = self.representation_dataframe
        prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
        prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())
        row1 = prot1vec.reshape(1,-1)
        row2 = prot2vec.reshape(1,-1)
        # cosine returns a (1, 1) array for two single-row inputs
        cos = cosine(row1,row2).item()
        manhattanDist = cdist(row1, row2, 'cityblock')
        manhattanDistNorm = manhattanDist/(norm(prot1vec,1) + norm(prot2vec,1))
        euclideanDist = cdist(row1, row2, 'euclidean')
        euclideanDistNorm = euclideanDist/(norm(prot1vec,2) + norm(prot2vec,2))
        real = paramList[3]
        # The reference and computed values are stored as one tuple so they stay
        # aligned regardless of the order in which workers finish
        result = (real,cos,1-manhattanDistNorm.item(),1-euclideanDistNorm.item())
        self.similarity_list.append(result)
        return result

    def calculateMSEforOntology(self,aspect,matrix_type,protein_embedding_type):
        """Compute Spearman correlations for one aspect and similarity matrix type.

        Parameters
        ----------
        aspect : str
            Aspect label used to build the input file names (e.g. ``"MF"``).
        matrix_type : str
            One of ``"All"``, ``"500"``, ``"Sparse"`` or ``"200"``.
        protein_embedding_type : object
            Carried along in every task tuple; not read by ``parallelSimilarity``.

        Returns
        -------
        tuple
            ``(cosineCorr, manhattanCorr, euclidianCorr)`` as returned by ``spearmanr``.

        Raises
        ------
        KeyError
            If ``matrix_type`` is not one of the supported values.
        """
        # Clear shared lists before each aspect
        self.similarity_list[:] = []
        self.proteinListNew[:] = []

        matrix_prefix = "./DATA/preprocess/human_" + aspect + "_proteinSimilarityMatrix"
        similarityMatrixNameDict = {
            "All": matrix_prefix + ".csv",
            "500": matrix_prefix + "_for_highest_annotated_500_proteins.csv",
            # "Sparse" reads the same matrix as "500"; the pairs to evaluate come from
            # the coordinate file loaded below
            "Sparse": matrix_prefix + "_for_highest_annotated_500_proteins.csv",
            "200": matrix_prefix + "_for_highest_annotated_200_proteins.csv",
        }
        similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

        human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
        human_proteinSimilarityMatrix = human_proteinSimilarityMatrix.set_index(
            human_proteinSimilarityMatrix.columns)
        self.proteinList = human_proteinSimilarityMatrix.columns

        # proteinListNew is shared through the Manager; fill it with a single call
        protein_ids = list(self.proteinList)
        self.proteinListNew.extend(protein_ids)

        if matrix_type == "Sparse":
            protParamList = \
            np.load("./auxilary_input/SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        else:
            # Every (i, j) with j > i, in the same order as filtering the full product
            protParamList = list(itertools.combinations(range(len(protein_ids)), 2))

        # Prepare the parameters that the worker processes consume concurrently
        protParamListNew = []
        for tup in tqdm_notebook(protParamList):
            protein1 = protein_ids[tup[0]]
            protein2 = protein_ids[tup[1]]
            real = human_proteinSimilarityMatrix.loc[protein1,protein2]
            protParamListNew.append((tup[0],tup[1],aspect,real,protein_embedding_type))

        total_task_num = len(protParamListNew)
        # The bound method (and therefore this instance) is pickled once per chunk, so
        # batch the tasks instead of shipping the instance with every single pair
        chunksize = max(1, -(-total_task_num // (4 * multiprocessing.cpu_count())))
        with Pool() as pool:
            for _ in tqdm_notebook(pool.imap_unordered(self.parallelSimilarity,
                                                       protParamListNew, chunksize),
                                   total=total_task_num):
                pass
            pool.close()
            pool.join()

        # Slicing copies the shared list into a plain local list in one request
        similarity_listRet = self.similarity_list[:]
        self.similarity_listRet = similarity_listRet

        real_distance_list = [value[0] for value in similarity_listRet]
        cosine_distance_list = [value[1] for value in similarity_listRet]
        manhattan_distance_list = [value[2] for value in similarity_listRet]
        euclidian_distance_list = [value[3] for value in similarity_listRet]

        cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
        manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
        euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)

        print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
        print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
        print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

        return (cosineCorr,manhattanCorr,euclidianCorr)

    def calculate_all_correlations(self):
        """Run every matrix type and aspect and write one CSV per matrix type.

        Each file is named ``<representation_name>_<matrix type>.csv`` and holds a header
        line followed by one row per aspect. Rows are appended as soon as they are
        computed, so finished aspects are kept if a later one fails.
        """
        header = "aspect,cosineCorr,cosineCorrPVal,manhattanCorr,manhattanCorrPVal,euclidianCorr,euclidianCorrPVal \n"

        for similarity_matrix_type in ["Sparse","200","500","All"]:
            saveFileName = self.representation_name+"_"+similarity_matrix_type+".csv"
            with open(saveFileName,'w') as f:
                f.write(header)
            for aspect in ["MF","BP","CC"]:
                # The embedding type is only carried along in the task tuples, so the
                # representation name is passed for it
                corr = self.calculateMSEforOntology(aspect,similarity_matrix_type,self.representation_name)
                row = "" + aspect + ","+ str(corr[0][0])+ ","+ str(corr[0][1])\
                + ","+ str(corr[1][0])+ ","+ str(corr[1][1])+ ","+ str(corr[2][0])+ ","+ str(corr[2][1])+"\n"
                with open(saveFileName,'a') as f:
                    f.write(row)
        