In [1]:
import pandas as pd
import glob
import os
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from functools import reduce
import collections
from matplotlib.ticker import AutoMinorLocator
import matplotlib.ticker as ticker
import math
#import pixiedust

from pathlib import Path

# Prepare the data table for data-based grouping

In [27]:
# Row order of the summary table: one row per protein representation method.
method_ordered_list = ["BLAST","HMMER","PFAM","K-SEP","ENSEMBL-ORTHOLOGY","UNIRULE2GO","INTERPRO2GO","AAC","APAAC","PROTVEC",
                       "LEARNED-VEC","UNIREP","SEQVEC","CPC-PROT","BERT-PFAM","BERT-BFD","ESMB1","XLNET","ALBERT",
                       "T5","MUT2VEC","TCGA-EMBEDDING","GENE2VEC"]

# One NaN-filled float64 column per benchmark score. "Sim_MF" used to be declared
# with dtype 'str' although it receives float correlations like the other columns;
# all columns are now float64 so numeric operations behave the same on each of them.
data_based_group_table = pd.DataFrame(
    index=method_ordered_list,
    columns=["Sim_MF", "Sim_BP", "Sim_CC",
             "Func_MF", "Func_BP", "Func_CC",
             "Fam_Pred", "Affinity_Pred"],
    dtype=np.float64,
)

data_based_group_table

Unnamed: 0,Sim_MF,Sim_BP,Sim_CC,Func_MF,Func_BP,Func_CC,Fam_Pred,Affinity_Pred
BLAST,,,,,,,,
HMMER,,,,,,,,
PFAM,,,,,,,,
K-SEP,,,,,,,,
ENSEMBL-ORTHOLOGY,,,,,,,,
UNIRULE2GO,,,,,,,,
INTERPRO2GO,,,,,,,,
AAC,,,,,,,,
APAAC,,,,,,,,
PROTVEC,,,,,,,,


# Get data for semantic similarity

In [28]:
#%%pixie_debugger
path = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/revision-1/semantic_similarity/"

# NOTE(review): `first` is not read anywhere in the visible notebook; kept in case another cell uses it.
first = False
# Empty template that fixes the column names (and their order) of the combined table.
similarity_table = pd.DataFrame({"Semantic Aspect": pd.Series([], dtype='str'),
                                "CosineSim_Correlation": pd.Series([], dtype=np.float64),
                                "CosineSim_Correlation p-value": pd.Series([], dtype=np.float64),
                                "ManhattanSim_Correlation": pd.Series([], dtype=np.float64),
                                "ManhattanSim_Correlation p-value": pd.Series([], dtype=np.float64),
                                "EuclidianSim_Correlation": pd.Series([], dtype=np.float64),
                                "EuclidianSim_Correlation p-value": pd.Series([], dtype=np.float64)})


# None disables column-width truncation; passing a negative integer is deprecated.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 4000
embedding_name_set = set()

# Layout of the combined table: for every result file one all-NaN "header" row whose
# index is the embedding name taken from the file name, followed by that file's rows
# with their integer index. The frames are collected first and concatenated once,
# because DataFrame.append no longer exists in pandas 2.x.
similarity_frames = []
for filename in sorted(glob.glob(os.path.join(path, '*.csv'))):
    try:
        embedding_name = filename.split('Semantic_sim_pred_')[1].replace('.csv', '')
        tmp_table = pd.read_csv(filename).iloc[:, :7]
        # Map the file's first seven columns positionally onto the template's column names.
        new_cols = dict(zip(tmp_table.columns, similarity_table.columns))
        tmp_table = tmp_table.rename(columns=new_cols)
    except (IndexError, OSError, ValueError) as error:
        # IndexError: file name without the 'Semantic_sim_pred_' marker;
        # OSError / ValueError: unreadable or unparsable CSV. The file is skipped as a
        # whole, so no header row is left behind without data.
        print(filename, error)
        continue
    header_row = pd.DataFrame(np.nan, index=[embedding_name], columns=similarity_table.columns)
    similarity_frames.extend([header_row, tmp_table])
    embedding_name_set.add(embedding_name.split('_')[0])

if similarity_frames:
    similarity_table = pd.concat(similarity_frames)

  


In [29]:
def df_empty(columns, dtypes, index=None):
    """Build a DataFrame that has the given columns, each created from an empty typed Series.

    columns -- column names, in the order they are added.
    dtypes  -- one dtype per entry of ``columns``.
    index   -- optional row labels handed to the DataFrame constructor.

    Raises AssertionError when ``columns`` and ``dtypes`` differ in length.
    """
    assert len(columns)==len(dtypes)
    typed_frame = pd.DataFrame(index=index)
    for column_name, column_dtype in zip(columns, dtypes):
        # An empty Series carries only the dtype; no values are supplied.
        typed_frame[column_name] = pd.Series(dtype=column_dtype)
    return typed_frame

In [30]:
#%%pixie_debugger
def drawEmbeddingSimilarity(measure):
    """Collect one similarity measure per embedding, aspect and dataset type.

    Reads the module-level ``similarity_table`` and ``embedding_name_set``.
    ``similarity_table`` is walked row by row: a row with a string index names the
    embedding ("<name>_<dataset type>") for the rows that follow it; every other row
    contributes ``row[measure]`` when its 'Semantic Aspect' equals the current aspect.

    measure -- name of the ``similarity_table`` column to extract.

    Returns a DataFrame with the columns "Type", "Aspect" and one column per
    embedding name. For each aspect in MF, BP, CC (in that order) it holds three
    rows: Well_Annotated_500, Well_Annotated_200, Sparse_Uniform.
    """
    sim_corr_colnames = ["Type", "Aspect"] + list(embedding_name_set)

    # Rows are gathered as plain dicts and turned into a frame once at the end;
    # DataFrame.append (used previously) no longer exists in pandas 2.x.
    records = []
    for aspect in ["MF", "BP", "CC"]:
        rowDictSparse_Uniform = {'Type': "Sparse_Uniform"}
        rowDictWell_Annotated_500 = {'Type': "Well_Annotated_500"}
        rowDictWell_Annotated_200 = {'Type': "Well_Annotated_200"}
        for index, row in similarity_table.iterrows():
            if isinstance(index, str):
                # Header row: remember which embedding the following rows belong to.
                embedding = index.split("_")
                embedding_name = embedding[0]
                embedding_type = '_'.join(embedding[1:])
            elif row['Semantic Aspect'] == aspect:
                # Same precedence as before: "Sparse" wins over "500", which wins over "200".
                if "Sparse" in embedding_type:
                    target_row = rowDictSparse_Uniform
                elif "500" in embedding_type:
                    target_row = rowDictWell_Annotated_500
                elif "200" in embedding_type:
                    target_row = rowDictWell_Annotated_200
                else:
                    continue
                # 'Aspect' is only set once a matching row was found for this type.
                target_row['Aspect'] = aspect
                target_row[embedding_name] = row[measure]
        records.extend([rowDictWell_Annotated_500,
                        rowDictWell_Annotated_200,
                        rowDictSparse_Uniform])

    SimilarityCorrDf = pd.DataFrame(records)
    # Expected columns first; any embedding missing from embedding_name_set stays
    # at the end, as it did when rows were appended one at a time.
    extra_columns = [name for name in SimilarityCorrDf.columns if name not in sim_corr_colnames]
    return SimilarityCorrDf.reindex(columns=sim_corr_colnames + extra_columns)

In [31]:
def prepare_data_for_measure(measure):
    """Pivot the similarity results for ``measure`` into one table per GO aspect.

    The frame returned by ``drawEmbeddingSimilarity(measure)`` is melted and pivoted
    so that rows are (embedding, Aspect) pairs and columns are the dataset types.

    Returns ``(MF, BP, CC, display_labels)``: the three aspect slices, each limited
    to the columns Well_Annotated_500, Well_Annotated_200, Sparse_Uniform and
    reindexed on the embedding level by ``display_labels``.
    """
    corr_wide = drawEmbeddingSimilarity(measure)
    corr_long = pd.melt(corr_wide, id_vars=["Type","Aspect"])
    corr_pivot = corr_long.pivot_table(index=['variable','Aspect'], columns='Type', values='value')

    type_columns = ['Well_Annotated_500','Well_Annotated_200', 'Sparse_Uniform']

    display_labels = ['BLAST','HMMER','K-SEP','APAAC','PFAM','AAC','PROTVEC',
                      'GENE2VEC','LEARNED-VEC','MUT2VEC','TCGA-EMBEDDING','CPC-PROT','SEQVEC','BERT-BFD',
                      'BERT-PFAM','ESMB1','ALBERT','XLNET','UNIREP','T5']

    def slice_for_aspect(aspect_name):
        # Keep the rows of one aspect, fix the column order, then order the embeddings.
        aspect_mask = corr_pivot.index.get_level_values('Aspect').isin([aspect_name])
        aspect_rows = corr_pivot[aspect_mask][type_columns]
        return aspect_rows.reindex(level=0, labels=display_labels)

    return (slice_for_aspect('MF'),
            slice_for_aspect('BP'),
            slice_for_aspect('CC'),
            display_labels)


In [32]:
# NOTE: despite the "cosine_" prefix, these frames hold the 'ManhattanSim_Correlation' measure.
cosine_MF, cosine_BP, cosine_CC, embedding_lables_MF = prepare_data_for_measure('ManhattanSim_Correlation')
# First index level = method name.
similarity_methods = list(cosine_MF.index.get_level_values(0))
similarity_sources = {'Sim_MF': cosine_MF, 'Sim_BP': cosine_BP, 'Sim_CC': cosine_CC}
for method in similarity_methods:
    for sim_column, sim_frame in similarity_sources.items():
        # Only the Sparse_Uniform score is copied into the summary table.
        data_based_group_table.at[method, sim_column] = sim_frame.loc[method]['Sparse_Uniform'].item()


# Get data for function prediction

In [33]:
def create_index_from_model_name(index_names):
    """Drop the first "_"-separated token from every name.

    Example: "AAC_BP_High_Shallow" -> "BP_High_Shallow". A name without "_"
    becomes the empty string.

    The previous implementation sliced the tokens with ``[1:len(index_names)]``,
    i.e. it bounded the slice by the number of names in the list, which truncated
    the names whenever the list was shorter than the number of tokens.

    index_names -- iterable of strings.

    Returns a list with one shortened name per input name, in input order.
    """
    return ['_'.join(index_name.split("_")[1:]) for index_name in index_names]

In [34]:
path = '/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/revision-1/function_prediction/'
go_pred_table = pd.DataFrame()
measure = "F1_Weighted"
index = None
for filename in sorted(glob.glob(os.path.join(path, '*_5cv_mean.tsv'))):
    # The token after "Ontology_based_function_prediction_" in the file name is the column name.
    col_name = filename.split("Ontology_based_function_prediction")[-1].split("_")[1]

    tmp_column = pd.read_csv(filename,sep="\t")
    # Sort by the first column and renumber the rows. Without the renumbering the
    # column assignment below aligns on the ORIGINAL row labels, so files whose rows
    # are stored in a different order would be combined row-for-row incorrectly.
    tmp_column = tmp_column.sort_values(tmp_column.columns[0]).reset_index(drop=True)

    go_pred_table[col_name] = tmp_column[measure]
    index = create_index_from_model_name(list(tmp_column.iloc[:, 0]))

if index is None:
    # Fail with a clear message instead of a NameError on `index`.
    raise FileNotFoundError("No '*_5cv_mean.tsv' files found in " + path)

# Row labels come from the last file read (first column with its leading token removed).
go_pred_table["index_col"] = index
go_pred_table = go_pred_table.set_index('index_col').sort_index()
#go_pred_table

In [35]:
def create_pred_table(measure, path='/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/revision-1/function_prediction/'):
    """Read the function-prediction results and build one table for ``measure``.

    Every ``*_5cv_mean.tsv`` file in ``path`` contributes one column, named after the
    token that follows "Ontology_based_function_prediction_" in the file name. The
    rows of each file are sorted by its first column before use, so the columns line
    up by name even if the files store their rows in different orders. (The previous
    version called ``sort_values`` without keeping the result, so nothing was sorted.)

    measure -- name of the score column to read from every file.
    path    -- directory holding the result files; defaults to the project location.

    Returns a DataFrame indexed by the model names with their leading token removed
    (taken from the last file read), sorted alphabetically by that index.

    Raises FileNotFoundError when no result file matches.
    """
    go_pred_table = pd.DataFrame()
    index = None
    for filename in sorted(glob.glob(os.path.join(path, '*_5cv_mean.tsv'))):
        col_name = filename.split("Ontology_based_function_prediction")[-1].split("_")[1]

        tmp_column = pd.read_csv(filename, sep="\t")
        # reset_index makes the column assignment below align on the sorted position
        # rather than on the original row labels of the file.
        tmp_column = tmp_column.sort_values(tmp_column.columns[0]).reset_index(drop=True)

        go_pred_table[col_name] = tmp_column[measure]
        index = create_index_from_model_name(list(tmp_column.iloc[:, 0]))

    if index is None:
        raise FileNotFoundError("No '*_5cv_mean.tsv' files found in " + str(path))

    go_pred_table["index_col"] = index
    return go_pred_table.set_index('index_col').sort_index()

In [36]:
def get_go_pred_table_for_aspect(aspect,go_pred_table):
    """Return the rows of ``go_pred_table`` for one aspect, in a fixed subgroup order.

    Rows are selected by their index label. The previous version first cut the table
    by position (0:9, 9:17, 17:25) and then reindexed, which only gave the right rows
    when the table held exactly the expected 25 rows in sorted order.

    aspect        -- "BP", "CC" or "MF".
    go_pred_table -- DataFrame indexed by names such as "BP_High_Shallow".

    Returns the reindexed DataFrame (labels missing from the table yield NaN rows),
    or None for any other ``aspect`` value.
    """
    # The CC and MF lists contain no "<aspect>_High_Specific" entry.
    aspect_row_order = {
        "BP": ["BP_High_Shallow", "BP_High_Normal", "BP_High_Specific",
               "BP_Middle_Shallow", "BP_Middle_Normal", "BP_Middle_Specific",
               "BP_Low_Shallow", "BP_Low_Normal", "BP_Low_Specific"],
        "CC": ["CC_High_Shallow", "CC_High_Normal",
               "CC_Middle_Shallow", "CC_Middle_Normal", "CC_Middle_Specific",
               "CC_Low_Shallow", "CC_Low_Normal", "CC_Low_Specific"],
        "MF": ["MF_High_Shallow", "MF_High_Normal",
               "MF_Middle_Shallow", "MF_Middle_Normal", "MF_Middle_Specific",
               "MF_Low_Shallow", "MF_Low_Normal", "MF_Low_Specific"],
    }
    new_index = aspect_row_order.get(aspect)
    if new_index is None:
        # Unknown aspect: same result as before (implicit None).
        return None
    return go_pred_table.reindex(new_index)



In [37]:
def drawBenchmarks(dataset,embedding_lables,title):
    """Draw a grouped bar chart of the "Accuracy" and "F1_Weighted" columns of ``dataset``.

    dataset          -- DataFrame containing the two plotted columns.
    embedding_lables -- tick labels for the x axis.
    title            -- text appended to 'Prediction Benchmark for ' in the chart title.
    """
    plotted_columns = ["Accuracy","F1_Weighted"]
    bar_colors = ['peachpuff', 'palegreen','lightskyblue', 'orange']

    axes = dataset[plotted_columns].plot.bar(
        width=.8,
        ylim=[-0.2, 1],
        color=bar_colors,
        figsize=(12,8),
        edgecolor="violet",
    )

    # Rebuild the legend from at most the first four labels.
    legend_labels = axes.get_legend_handles_labels()[1]
    axes.legend(legend_labels[0:4])
    axes.set_title('Prediction Benchmark for ' + title )
    axes.set_xticklabels(embedding_lables)
    axes.set_xlabel('')

In [38]:
def prepare_figure_data_for_aspect(aspect):
    """Calculate the mean of every measure for one aspect and return detail tables.

    For each of the five measures the prediction table is built with
    ``create_pred_table`` and reduced to the rows of ``aspect`` with
    ``get_go_pred_table_for_aspect``.

    aspect -- aspect name handed to ``get_go_pred_table_for_aspect``.

    Returns a tuple of three DataFrames whose columns follow ``display_labels``:
      * column-wise means, one row per measure
        ("Accuracy", "F1-Weighted", "Precision", "Recall", "Hamming"),
      * the F1_Weighted table of the aspect,
      * the Precision_Weighted table of the aspect.
    """
    # Row label of the mean table -> column name in the result files (order matters).
    measure_by_row_label = {
        "Accuracy": "Accuracy",
        "F1-Weighted": "F1_Weighted",
        "Precision": "Precision_Weighted",
        "Recall": "Recall_Weighted",
        "Hamming": "Hamming_Distance",
    }
    aspect_tables = {
        row_label: get_go_pred_table_for_aspect(aspect, create_pred_table(measure_name))
        for row_label, measure_name in measure_by_row_label.items()
    }

    # One Series of column means per measure; transposed so that measures become rows.
    # Built in one step because DataFrame.append no longer exists in pandas 2.x.
    pred_mean_df = pd.DataFrame(
        {row_label: table.mean(axis=0) for row_label, table in aspect_tables.items()}
    ).T

    # Column order used for all returned tables.
    display_labels = ['INTERPRO2GO','UNIRULE2GO','ENSEMBL-ORTHOLOGY','BLAST','HMMER','K-SEP','APAAC','PFAM','AAC','PROTVEC',
                      'GENE2VEC','LEARNED-VEC','MUT2VEC','TCGA-EMBEDDING','SEQVEC','CPC-PROT','BERT-BFD',
                      'BERT-PFAM','ESMB1','ALBERT','XLNET','UNIREP','T5']

    pred_mean_df = pred_mean_df.reindex(columns=display_labels)
    go_pred_tableF1_aspect = aspect_tables["F1-Weighted"].reindex(columns=display_labels)
    go_pred_tablePR_aspect = aspect_tables["Precision"].reindex(columns=display_labels)

    return pred_mean_df,go_pred_tableF1_aspect,go_pred_tablePR_aspect

In [39]:
#Create dataframes for figures
# One (mean table, F1 table, precision table) triple per aspect.
(pred_mean_df_BP,
 go_pred_tableF1_BP,
 go_pred_tablePR_Precision_BP) = prepare_figure_data_for_aspect("BP")
(pred_mean_df_CC,
 go_pred_tableF1_CC,
 go_pred_tablePR_Precision_CC) = prepare_figure_data_for_aspect("CC")
(pred_mean_df_MF,
 go_pred_tableF1_MF,
 go_pred_tablePR_Precision_MF) = prepare_figure_data_for_aspect("MF")

In [40]:
# Spot check: mean F1-Weighted score of INTERPRO2GO in the MF table.
pred_mean_df_MF.loc['F1-Weighted', 'INTERPRO2GO']

0.37151595578781754

In [41]:
func_pred_methods = pred_mean_df_MF.loc['F1-Weighted'].index.values
function_sources = {'Func_MF': pred_mean_df_MF, 'Func_BP': pred_mean_df_BP, 'Func_CC': pred_mean_df_CC}
for method in func_pred_methods:
    for func_column, mean_frame in function_sources.items():
        # Copy the mean F1-Weighted score of this method into the summary table.
        data_based_group_table.at[method, func_column] = mean_frame.loc['F1-Weighted'][method]

In [42]:
# Display the summary table after the Func_* columns have been filled.
data_based_group_table

Unnamed: 0,Sim_MF,Sim_BP,Sim_CC,Func_MF,Func_BP,Func_CC,Fam_Pred,Affinity_Pred
BLAST,0.19675,0.1434,0.05113,0.874888,0.555932,0.573139,,
HMMER,0.2456,0.30467,0.24449,0.890796,0.611008,0.595703,,
PFAM,0.34895,0.42222,0.51073,0.864411,0.557242,0.580642,,
K-SEP,0.22224,0.29241,0.29435,0.809005,0.516863,0.502334,,
ENSEMBL-ORTHOLOGY,,,,0.199461,0.240556,0.262632,,
UNIRULE2GO,,,,0.011459,0.011506,0.038385,,
INTERPRO2GO,,,,0.371516,0.112134,0.269393,,
AAC,-0.01205,0.2141,0.09121,0.409741,0.187196,0.232186,,
APAAC,0.17167,0.27358,0.24087,0.582846,0.344661,0.395269,,
PROTVEC,0.18544,0.29526,0.20583,0.636666,0.362433,0.380771,,


# Get Affinity Data

In [43]:
mse_list = []
mae_list = []
representation_name_list = []
affinity_result_dir = Path("/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/revision-1/affinity_prediction/")
# sorted() makes the read order reproducible; Path.glob gives no ordering guarantee.
for detail_path in sorted(affinity_result_dir.glob("Affinit_prediction_skempiv1_*_detail.csv")):
    # The representation name is the part of the file name between "skempiv1_" and "_detail.csv".
    representation_name_list.append(str(detail_path).split("skempiv1_")[1].split("_detail.csv")[0])
    # Read every file once and take both error columns from the same frame.
    detail_table = pd.read_csv(detail_path)
    mse_list.append(detail_table['val_mse_errors'])
    mae_list.append(detail_table['val_mae_errors'])

if not representation_name_list:
    # pd.concat would otherwise fail with an unspecific "No objects to concatenate".
    raise FileNotFoundError(
        "No 'Affinit_prediction_skempiv1_*_detail.csv' files found in " + str(affinity_result_dir)
    )

df_mse = pd.concat(mse_list, axis=1)
df_mse.columns = representation_name_list
#Sorting columns by their mean value
df_mse = df_mse.reindex(df_mse.mean().sort_values().index, axis=1)

df_mae = pd.concat(mae_list, axis=1)
df_mae.columns = representation_name_list
#Sorting columns by their mean value
df_mae = df_mae.reindex(df_mae.mean().sort_values().index, axis=1)

In [44]:
# Mean validation MSE per representation, computed once for the whole loop.
mean_mse_by_method = df_mse.mean()
for method in mean_mse_by_method.index.values:
    data_based_group_table.at[method, 'Affinity_Pred'] = mean_mse_by_method[method]

In [45]:
# Display the summary table after the Affinity_Pred column has been filled.
data_based_group_table

Unnamed: 0,Sim_MF,Sim_BP,Sim_CC,Func_MF,Func_BP,Func_CC,Fam_Pred,Affinity_Pred
BLAST,0.19675,0.1434,0.05113,0.874888,0.555932,0.573139,,
HMMER,0.2456,0.30467,0.24449,0.890796,0.611008,0.595703,,
PFAM,0.34895,0.42222,0.51073,0.864411,0.557242,0.580642,,2.257884
K-SEP,0.22224,0.29241,0.29435,0.809005,0.516863,0.502334,,0.970504
ENSEMBL-ORTHOLOGY,,,,0.199461,0.240556,0.262632,,
UNIRULE2GO,,,,0.011459,0.011506,0.038385,,
INTERPRO2GO,,,,0.371516,0.112134,0.269393,,
AAC,-0.01205,0.2141,0.09121,0.409741,0.187196,0.232186,,1.847783
APAAC,0.17167,0.27358,0.24087,0.582846,0.344661,0.395269,,1.79005
PROTVEC,0.18544,0.29526,0.20583,0.636666,0.362433,0.380771,,1.134063


In [46]:
# Write the assembled summary table to disk (the index is written as the first column).
group_table_csv_path = "/media/DATA/serbulent/DATA/Thesis/ReviewPaper/results/revision-1/data_based_group_table.csv"
data_based_group_table.to_csv(group_table_csv_path)