In [None]:
import pandas as pd
import numpy as np
import re
import csv
import os 
import math
import gzip
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from progressbar import ProgressBar
import matplotlib.pyplot as plt

## Plot ROC curves based on the metrics 

Using the false and true positive rates stored in the metrics file, we can plot the ROC curves. Use the function that matches the type of measure (IC-based, edge-based or direct groupwise), because the results files have slightly different formats.

In [None]:
# directory to contain ROC curves
# exist_ok=True makes this safe to re-run and avoids the check-then-create race
os.makedirs('ROCfigures', exist_ok=True)

In [None]:
# output folder for the IC-based ROC curves (safe to re-run; parents are created too)
os.makedirs('ROCfigures/ICPairwise', exist_ok=True)

def plotROC_ic(metrics_file, close=False):
    
    ''' Plot and save one ROC curve per row of an IC-based metrics file.
    
    Depending on the format of your SML computation results, this function
    may need adapting to suit. 
    Due to splitting files to lessen
    the computational demands in our run, we had slightly different results 
    formats hence 3 separate functions with different naming methods.
    
    metrics_file: path to a tab-separated metrics file with 'measure', 'fpr'
        and 'tpr' columns. 'fpr' and 'tpr' hold comma-separated lists that
        were saved as strings, e.g. '[0.0, 0.5, 1.0]'.
    close: if True, each figure is closed once it has been saved. This keeps
        memory use flat for files with many measures, but the curves are then
        not displayed inline. Defaults to False (figures stay open).
    
    The IC name used in the title and file name is taken from metrics_file by
    removing the 'ROCmetrics/metrics_' prefix and the
    '_indirect_groupwise_combinations.tsv' or '_indirect_groupwise.tsv' suffix.
    
    Each curve is written to 'ROCfigures/ICPairwise/ROC_<ic>_<measure>.png'. '''
    
    data = pd.read_csv(metrics_file, sep='\t')
    
    # the saved index column is not needed; tolerate files written without it
    data = data.drop('Unnamed: 0', axis=1, errors='ignore')
    
    # the IC name depends only on the file name, so work it out once
    ic_name = metrics_file.replace('ROCmetrics/metrics_', '')
    for suffix in ('_indirect_groupwise_combinations.tsv', '_indirect_groupwise.tsv'):
        if ic_name.endswith(suffix):
            ic_name = ic_name[:-len(suffix)]
            break
    
    for measure_name, fpr, tpr in zip(data['measure'], data['fpr'], data['tpr']):
        
        # these are saved as strings so have to clean before converting
        x = [float(value) for value in fpr.strip('[] ').split(',')]
        y = [float(value) for value in tpr.strip('[] ').split(',')]
        
        # plot the curve 
        fig = plt.figure(figsize=(8, 8))
        plt.plot(x, y, linestyle='--', marker='o', color='orange', lw = 2, label='ROC curve', clip_on=False)
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--') # chance diagonal
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('{} IC-based {} ROC curve,'.format(ic_name, measure_name) + ' AUC = %.4f'%metrics.auc(x, y))
        plt.legend(loc="lower right")
        
        plt.savefig('ROCfigures/ICPairwise/ROC_{}_{}.png'.format(ic_name, measure_name))
        
        if close:
            plt.close(fig)

In [None]:
# output folder for the edge-based ROC curves (safe to re-run; parents are created too)
os.makedirs('ROCfigures/EdgePairwise', exist_ok=True)
    
def plotROC_edge(metrics_file, close=False):
    
    ''' Plot and save one ROC curve per row of an edge-based metrics file.
    
    metrics_file: path to a tab-separated metrics file with 'measure', 'fpr'
        and 'tpr' columns. 'fpr' and 'tpr' hold comma-separated lists that
        were saved as strings, e.g. '[0.0, 0.5, 1.0]'.
    close: if True, each figure is closed once it has been saved. This keeps
        memory use flat for files with many measures, but the curves are then
        not displayed inline. Defaults to False (figures stay open).
    
    Each curve is written to 'ROCfigures/EdgePairwise/ROC_<measure>.png'. '''
    
    data = pd.read_csv(metrics_file, sep='\t')
    
    # the saved index column is not needed; tolerate files written without it
    data = data.drop('Unnamed: 0', axis=1, errors='ignore')
    
    for measure_name, fpr, tpr in zip(data['measure'], data['fpr'], data['tpr']):
        
        # rates are saved as strings so have to clean before converting
        x = [float(value) for value in fpr.strip('[] ').split(',')]
        y = [float(value) for value in tpr.strip('[] ').split(',')]
        
        fig = plt.figure(figsize=(8, 8))
        plt.plot(x, y, linestyle='--', marker='o', color='orange', lw = 2, label='ROC curve', clip_on=False)
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--') # chance diagonal
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Edge-based {} ROC curve,'.format(measure_name) + ' AUC = %.4f'%metrics.auc(x, y))
        plt.legend(loc="lower right")
        
        plt.savefig('ROCfigures/EdgePairwise/ROC_{}.png'.format(measure_name))
        
        if close:
            plt.close(fig)

In [None]:
# output folder for the direct groupwise ROC curves (safe to re-run; parents are created too)
os.makedirs('ROCfigures/DirectGroupwise', exist_ok=True)

def plotROC_direct(metrics_file, measures_file='GenerateXML/measures_revised.csv', close=False):
    
    ''' Plot and save one ROC curve per row of a direct groupwise metrics file.
    
    metrics_file: path to a tab-separated metrics file with 'measure', 'fpr'
        and 'tpr' columns. 'fpr' and 'tpr' hold comma-separated lists that
        were saved as strings, e.g. '[0.0, 0.5, 1.0]'.
    measures_file: comma-separated file with 'ID' and 'Flag' columns, used to
        replace each measure ID with its flag for naming the plots.
    close: if True, each figure is closed once it has been saved. This keeps
        memory use flat for files with many measures, but the curves are then
        not displayed inline. Defaults to False (figures stay open).
    
    A measure that has no usable entry in measures_file keeps its original 
    name instead of stopping the run. Any 'SIM_' in the name is removed.
    
    Each curve is written to 'ROCfigures/DirectGroupwise/ROC_<measure>.png'. '''
    
    data = pd.read_csv(metrics_file, sep='\t')
    
    # the saved index column is not needed; tolerate files written without it
    data = data.drop('Unnamed: 0', axis=1, errors='ignore')
    
    # dictionary for replacing bad measure names
    measures_revised = pd.read_csv(measures_file, sep=',')
    measures_dict = dict(zip(measures_revised['ID'], measures_revised['Flag']))
    
    for raw_name, fpr, tpr in zip(data['measure'], data['fpr'], data['tpr']):
        
        # fall back to the raw name when the lookup has no (string) flag for it
        measure_name = measures_dict.get(raw_name)
        if not isinstance(measure_name, str):
            measure_name = str(raw_name)
        measure_name = measure_name.replace('SIM_', '')
        
        # rates are saved as strings so have to clean before converting
        x = [float(value) for value in fpr.strip('[] ').split(',')]
        y = [float(value) for value in tpr.strip('[] ').split(',')]
        
        fig = plt.figure(figsize=(8, 8))
        plt.plot(x, y, linestyle='--', marker='o', color='orange', lw = 2, label='ROC curve', clip_on=False)
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--') # chance diagonal
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Direct {} ROC curve,'.format(measure_name) + ' AUC = %.4f'%metrics.auc(x, y))
        plt.legend(loc="lower right")
        
        plt.savefig('ROCfigures/DirectGroupwise/ROC_{}.png'.format(measure_name))
        
        if close:
            plt.close(fig)

As seen below, we ended up splitting the IC-based measures. 
Here, each SML run used a single IC measure together with all available suitable pairwise and groupwise measures. 
All direct groupwise measures, and all non-IC-based pairwise measures with groupwise measures, were completed in a single run.

In [None]:
# Direct groupwise measures: one metrics file covers all of them
direct_metrics_path = 'ROCmetrics/metrics_direct_groupwise_combinations.tsv'
plotROC_direct(direct_metrics_path)

## NB: some measure names in the updated GenerateXML/measures_revised.csv have changed since this notebook was last run

In [None]:
# Edge-based measures: plot every curve stored in the indirect groupwise metrics file
edge_metrics_path = 'ROCmetrics/metrics_indirect_groupwise_combinations.tsv'
plotROC_edge(edge_metrics_path)

In [None]:
# IC-based measures: one metrics file per IC measure, plotted in turn
ic_metrics_paths = (
    'ROCmetrics/metrics_max_indirect_groupwise_combinations.tsv',
    'ROCmetrics/metrics_min_indirect_groupwise_combinations.tsv',
    'ROCmetrics/metrics_resnik_indirect_groupwise_combinations.tsv',
    'ROCmetrics/metrics_sanchez_indirect_groupwise_combinations.tsv',
    'ROCmetrics/metrics_seco_indirect_groupwise_combinations.tsv',
    'ROCmetrics/metrics_zhou_indirect_groupwise_combinations.tsv',
    # this metric was run separately due to different requirements
    'ROCmetrics/metrics_GIC_indirect_groupwise.tsv',
)

for ic_metrics_path in ic_metrics_paths:
    plotROC_ic(ic_metrics_path)

## Plot histogram of all AUC scores 

We can load in all the metrics files and extract the AUC scores to plot the distribution.

In [None]:
def hist_AUC(metrics_files):
    
    ''' Plot and save a histogram of the AUC scores from all metrics files.
    
    Input a list of the names of all metrics files in the ROCmetrics (or 
    equivalent) folder. Each name is read from 'ROCmetrics/<name>' as a 
    tab-separated table that must contain 'measure' and 'AUC' columns.
    
    Scores below 0.5 are shown as 1 - score and rows without an AUC are left
    out. The 0.25, 0.5 and 0.75 quantiles are drawn as dashed lines and the 
    figure is written to 'ROCfigures/hist_AUC.svg'.
    
    Raises ValueError if metrics_files is empty. 
    Returns the result of plt.show(). '''
    
    # read each file once and stack the results in a single concat
    # (DataFrame.append is no longer available in pandas 2.x)
    frames = [
        pd.read_csv('ROCmetrics/{}'.format(file), sep='\t').loc[:, ['measure', 'AUC']]
        for file in metrics_files
    ]
    
    if not frames:
        raise ValueError('metrics_files is empty: there are no AUC scores to plot')
    
    auc_scores = pd.concat(frames, ignore_index=True)
    
    # rows with no AUC cannot be placed on either side of the threshold
    auc_scores = auc_scores.dropna(subset=['AUC'])
    
    # use a threshold of 0.5 as some measures give an inverted score (< 0.5);
    # represent those as 1 - score so that all scores are >= 0.5
    auc_scores = auc_scores.assign(
        AUC=auc_scores['AUC'].where(auc_scores['AUC'] >= 0.5, 1 - auc_scores['AUC'])
    )
    
    # calculate quantiles of AUC score distribution
    q25, q50, q75 = np.quantile(auc_scores['AUC'].tolist(), [0.25, 0.5, 0.75])
    
    # start from a fresh figure so the histogram is never drawn over an open plot
    plt.figure()
    
    # plot histogram with quantiles displayed
    plt.hist(auc_scores['AUC'], color = 'orange', edgecolor = 'white',
             bins = 14) # edit bins as required 
    plt.title('AUC scores across all measures')
    plt.xlabel('AUC')
    plt.ylabel('Frequency')
    # line heights are fixed values; edit alongside the bins if the counts differ
    plt.vlines(q25, 0, 130, color='red', linestyle='dashed', label='0.25 quantile AUC = %.4f'%q25)
    plt.vlines(q50, 0, 115, color='navy', linestyle='dashed', label='0.50 quantile AUC = %.4f'%q50)
    plt.vlines(q75, 0, 115, color='darkgreen', linestyle='dashed', label='0.75 quantile AUC = %.4f'%q75)
    plt.legend( fontsize='x-small',loc="upper right")
    
    # same folder as the ROC curves; the name is case-sensitive on most systems
    os.makedirs('ROCfigures', exist_ok=True)
    plt.savefig('ROCfigures/hist_AUC.svg')
    
    return plt.show()


In [None]:
# get all metrics files and run 

# os.listdir also returns hidden files and folders (e.g. .DS_Store,
# .ipynb_checkpoints) that are not metrics tables, so keep the .tsv files only;
# sorting makes the order the same on every run
metrics_files = sorted(name for name in os.listdir('ROCmetrics') if name.lower().endswith('.tsv'))

hist_AUC(metrics_files)

## Plot histogram of all MRR scores

Using the same format as above, we can plot a histogram of all MRR scores (NA and zero-based).

In [None]:
def hist_MRR_NA(metrics_files):
    
    ''' Plot and save a histogram of the 'MRR_NA' scores from all metrics files.
    
    Input a list of the names of all metrics files in the ROCmetrics (or 
    equivalent) folder. Each name is read from 'ROCmetrics/<name>' as a 
    tab-separated table that must contain 'measure' and 'MRR_NA' columns.
    
    The 0.25, 0.5 and 0.75 quantiles are drawn as dashed lines and the figure
    is written to 'ROCfigures/hist_MRR_NA.svg'.
    
    Raises ValueError if metrics_files is empty. 
    Returns the result of plt.show(). '''
    
    # read each file once and stack the results in a single concat
    # (DataFrame.append is no longer available in pandas 2.x)
    frames = [
        pd.read_csv('ROCmetrics/{}'.format(file), sep='\t').loc[:, ['measure', 'MRR_NA']]
        for file in metrics_files
    ]
    
    if not frames:
        raise ValueError('metrics_files is empty: there are no MRR scores to plot')
    
    mrr_scores = pd.concat(frames, ignore_index=True)
    
    # calculate quantiles of MRR score distribution
    q25, q50, q75 = np.quantile(mrr_scores['MRR_NA'].tolist(), [0.25, 0.5, 0.75])
    
    # start from a fresh figure so the histogram is never drawn over an open plot
    plt.figure()
    
    # plot histogram with quantiles displayed
    plt.hist(mrr_scores['MRR_NA'], color = 'cornflowerblue', edgecolor = 'white',
             bins = 13) # edit bins as required 
    plt.title('NaN-based MRR scores across all measures')
    plt.xlabel('MRR')
    plt.ylabel('Frequency')
    # line heights are fixed values; edit alongside the bins if the counts differ
    plt.vlines(q25, 0, 130, color='red', linestyle='dashed', label='0.25 quantile MRR = %.4f'%q25)
    plt.vlines(q50, 0, 115, color='navy', linestyle='dashed', label='0.50 quantile MRR = %.4f'%q50)
    plt.vlines(q75, 0, 115, color='darkgreen', linestyle='dashed', label='0.75 quantile MRR = %.4f'%q75)
    plt.legend( fontsize='x-small',loc="upper right")
    
    # same folder as the ROC curves; the name is case-sensitive on most systems
    os.makedirs('ROCfigures', exist_ok=True)
    plt.savefig('ROCfigures/hist_MRR_NA.svg')
    
    return plt.show()


In [None]:
# get all metrics files and run 

# os.listdir also returns hidden files and folders (e.g. .DS_Store,
# .ipynb_checkpoints) that are not metrics tables, so keep the .tsv files only;
# sorting makes the order the same on every run
metrics_files = sorted(name for name in os.listdir('ROCmetrics') if name.lower().endswith('.tsv'))

hist_MRR_NA(metrics_files)

In [None]:
def hist_MRR_0(metrics_files):
    
    ''' Plot and save a histogram of the 'MRR_0' scores from all metrics files.
    
    Input a list of the names of all metrics files in the ROCmetrics (or 
    equivalent) folder. Each name is read from 'ROCmetrics/<name>' as a 
    tab-separated table that must contain 'measure' and 'MRR_0' columns.
    
    The 0.25, 0.5 and 0.75 quantiles are drawn as dashed lines and the figure
    is written to 'ROCfigures/hist_MRR_0.svg'.
    
    Raises ValueError if metrics_files is empty. 
    Returns the result of plt.show(). '''
    
    # read each file once and stack the results in a single concat
    # (DataFrame.append is no longer available in pandas 2.x)
    frames = [
        pd.read_csv('ROCmetrics/{}'.format(file), sep='\t').loc[:, ['measure', 'MRR_0']]
        for file in metrics_files
    ]
    
    if not frames:
        raise ValueError('metrics_files is empty: there are no MRR scores to plot')
    
    mrr_scores = pd.concat(frames, ignore_index=True)
    
    # calculate quantiles of MRR score distribution
    q25, q50, q75 = np.quantile(mrr_scores['MRR_0'].tolist(), [0.25, 0.5, 0.75])
    
    # start from a fresh figure so the histogram is never drawn over an open plot
    plt.figure()
    
    # plot histogram with quantiles displayed
    plt.hist(mrr_scores['MRR_0'], color = 'mediumturquoise', edgecolor = 'white',
             bins = 13) # edit bins as required 
    plt.title('Zero-based MRR scores across all measures')
    plt.xlabel('MRR')
    plt.ylabel('Frequency')
    # line heights are fixed values; edit alongside the bins if the counts differ
    plt.vlines(q25, 0, 130, color='red', linestyle='dashed', label='0.25 quantile MRR = %.4f'%q25)
    plt.vlines(q50, 0, 115, color='navy', linestyle='dashed', label='0.50 quantile MRR = %.4f'%q50)
    plt.vlines(q75, 0, 115, color='darkgreen', linestyle='dashed', label='0.75 quantile MRR = %.4f'%q75)
    plt.legend( fontsize='x-small',loc="upper right")
    
    # same folder as the ROC curves; the name is case-sensitive on most systems
    os.makedirs('ROCfigures', exist_ok=True)
    plt.savefig('ROCfigures/hist_MRR_0.svg')
    
    return plt.show()


In [None]:
# get all metrics files and run 

# os.listdir also returns hidden files and folders (e.g. .DS_Store,
# .ipynb_checkpoints) that are not metrics tables, so keep the .tsv files only;
# sorting makes the order the same on every run
metrics_files = sorted(name for name in os.listdir('ROCmetrics') if name.lower().endswith('.tsv'))

hist_MRR_0(metrics_files)

## Plot histogram of top 10 accuracy

Plot the distribution of top 10 accuracy across all scores.


In [None]:
def hist_top10(metrics_files):
    
    ''' Plot and save a histogram of the 'Top10_Acc' values from all metrics files.
    
    Input a list of the names of all metrics files in the ROCmetrics (or 
    equivalent) folder. Each name is read from 'ROCmetrics/<name>' as a 
    tab-separated table that must contain 'measure' and 'Top10_Acc' columns.
    
    The 0.25, 0.5 and 0.75 quantiles are drawn as dashed lines and the figure
    is written to 'ROCfigures/hist_top10.svg'.
    
    Raises ValueError if metrics_files is empty. 
    Returns the result of plt.show(). '''
    
    # read each file once and stack the results in a single concat
    # (DataFrame.append is no longer available in pandas 2.x)
    frames = [
        pd.read_csv('ROCmetrics/{}'.format(file), sep='\t').loc[:, ['measure', 'Top10_Acc']]
        for file in metrics_files
    ]
    
    if not frames:
        raise ValueError('metrics_files is empty: there are no top 10 accuracy values to plot')
    
    top10_scores = pd.concat(frames, ignore_index=True)
    
    # calculate quantiles of the top 10 accuracy distribution
    q25, q50, q75 = np.quantile(top10_scores['Top10_Acc'].tolist(), [0.25, 0.5, 0.75])
    
    # start from a fresh figure so the histogram is never drawn over an open plot
    plt.figure()
    
    # plot histogram with quantiles displayed
    plt.hist(top10_scores['Top10_Acc'], color = 'coral', edgecolor = 'white',
             bins = 12) # edit bins as required 
    plt.title('Accuracy of the 10 highest scores across all measures')
    plt.xlabel('Top 10 accuracy')
    plt.ylabel('Frequency')
    # line heights are fixed values; edit alongside the bins if the counts differ
    plt.vlines(q25, 0, 130, color='red', linestyle='dashed', label='0.25 quantile top 10 accuracy = %.4f'%q25)
    plt.vlines(q50, 0, 115, color='navy', linestyle='dashed', label='0.50 quantile top 10 accuracy = %.4f'%q50)
    plt.vlines(q75, 0, 115, color='darkgreen', linestyle='dashed', label='0.75 quantile top 10 accuracy = %.4f'%q75)
    plt.legend( fontsize='x-small',loc="upper right")
    
    # same folder as the ROC curves; the name is case-sensitive on most systems
    os.makedirs('ROCfigures', exist_ok=True)
    plt.savefig('ROCfigures/hist_top10.svg')
    
    return plt.show()


In [None]:
# get all metrics files and run 

# os.listdir also returns hidden files and folders (e.g. .DS_Store,
# .ipynb_checkpoints) that are not metrics tables, so keep the .tsv files only;
# sorting makes the order the same on every run
metrics_files = sorted(name for name in os.listdir('ROCmetrics') if name.lower().endswith('.tsv'))

hist_top10(metrics_files)