In [31]:
# [1] COLLECT ALL GROUND TRUTH DATA
import collections
from algorithm_output_parser import process_algorithm_output
from algorithm_output_parser import toboggan_clean_output_parser
from algorithm_output_parser import catfish_output_parser

from parse_truth_ALL import main as parse_truth_all
        
# NOTE(review): presumably toggles showing figures vs. saving them; the
# visible plotting calls pass their own literal flag -- confirm usage.
show_flag = False

# Species are addressed by position: froots[i] is the short name used in
# result-file names and datadirs[i] is the matching data directory name.
froots = ['zebra', 'mouse', 'human', 'salmon']
datadirs = ['zebrafish', 'mouse', 'human', 'salmon']
# Positions (into the two lists above) of the species to process.
froot_indices = list(range(len(froots)))


# This is used for loading toboggan pathset info
def convert_text_to_path(line):
    """Parse the bracketed node list in the second tab-separated field of *line*.

    The field is expected to look like ``[1, 2, 3]``; its first and last
    characters (the brackets) are dropped and the remainder is split on
    commas.  Whitespace around each number is tolerated.

    Returns:
        A list of ints.  An empty bracket pair (``[]``) yields ``[]``.

    Raises:
        IndexError: if *line* has no tab-separated second field.
        ValueError: if an entry is not an integer.
    """
    field = line.strip().split('\t')[1]
    inner = field[1:-1].strip()  # omit the brackets
    if not inner:
        # '[]' used to reach int('') and raise ValueError.
        return []
    return [int(node) for node in inner.split(',')]

def compute_jaccard_index(set_1, set_2):
    """Return the Jaccard index |A & B| / |A | B| of two sets as a float.

    Two empty sets are treated as identical and score 1.0 (the usual
    convention); previously that case raised ZeroDivisionError.
    """
    n = len(set_1.intersection(set_2))
    union_size = len(set_1) + len(set_2) - n
    if union_size == 0:
        return 1.0
    return n / float(union_size)


def get_tob_results( inputfile, truth_dict ):
    """Tally how often the path sets listed in *inputfile* match the ground truth.

    File layout, as read by this parser: any line containing '#' is an
    instance header whose 2nd and 3rd whitespace-separated tokens form the
    key ``'filenum instancenum'``; every other non-blank line is one path,
    given as whitespace-separated integers.

    An instance is only tallied when its parsed path set has the same number
    of paths as ``truth_dict[key]``; other instances are skipped entirely.

    Args:
        inputfile: path of the text file to read.
        truth_dict: maps instance key -> set of ground-truth paths (tuples
            of ints).

    Returns:
        A 4-tuple ``(alg_correct_counter, alg_jaccard_counter,
        truth_optsize_counter, optsizedict)``:
          * alg_correct_counter[k]: tallied instances of size k whose path
            set equals the ground truth;
          * alg_jaccard_counter[k]: sum of Jaccard indices over tallied
            instances of size k that differ from the ground truth;
          * truth_optsize_counter[k]: number of tallied instances of size k;
          * optsizedict[key] = 1 for every tallied instance.

    Raises:
        KeyError: if an instance key is missing from *truth_dict*.
    """
    alg_correct_counter = collections.defaultdict(int)
    alg_jaccard_counter = collections.defaultdict(float)
    truth_optsize_counter = collections.defaultdict(int)
    optsizedict = {}

    def tally(key, path_set):
        # Score one completed instance against its ground-truth path set.
        true_pathset = truth_dict[key]
        size = len(path_set)
        if size != len(true_pathset):
            return
        truth_optsize_counter[size] += 1
        optsizedict[key] = 1
        if path_set == true_pathset:
            alg_correct_counter[size] += 1
        else:
            alg_jaccard_counter[size] += compute_jaccard_index(path_set, true_pathset)

    current_key_pattern = None
    temp_path_set = set()
    with open(inputfile, 'r') as reader:
        for line in reader:
            current_line = line.strip().split()
            if not current_line:
                # A blank line would otherwise be recorded as an empty path ().
                continue
            if '#' in line:
                if current_key_pattern is not None:  # we just finished collecting a pathset
                    tally(current_key_pattern, temp_path_set)
                current_key_pattern = current_line[1] + ' ' + current_line[2]  # key = 'filenum instancenum'
                temp_path_set = set()
            else:
                temp_path_set.add(tuple(int(x) for x in current_line))

    # The last instance has no following header to trigger its tally, so
    # flush it here; it used to be dropped silently.
    if current_key_pattern is not None:
        tally(current_key_pattern, temp_path_set)

    return alg_correct_counter, alg_jaccard_counter, truth_optsize_counter, optsizedict



def get_cat_results( inputfile, truth_dict, optsizedict ):
    """Tally how often the path sets in a catfish log match the ground truth.

    File layout, as read by this parser: any line containing '#' is an
    instance header whose 2nd and 3rd whitespace-separated tokens form the
    key ``'filenum instancenum'``; on every other non-blank line the path is
    the integers from the 8th token onwards (the first seven tokens are
    ignored).

    Only instances whose key is present in *optsizedict* are scored.

    Args:
        inputfile: path of the log file to read.
        truth_dict: maps instance key -> set of ground-truth paths (tuples
            of ints).
        optsizedict: container of instance keys to score (only membership
            is tested).

    Returns:
        ``(alg_correct_counter, alg_jaccard_counter)``, both keyed by the
        number of paths parsed for the instance: the count of exact matches,
        and the summed Jaccard index of the non-matching instances.

    Raises:
        KeyError: if a scored instance key is missing from *truth_dict*.
    """
    alg_correct_counter = collections.defaultdict(int)
    alg_jaccard_counter = collections.defaultdict(float)

    def tally(key, path_set):
        # Score one completed instance, skipping keys not in optsizedict.
        if key not in optsizedict:
            return
        true_pathset = truth_dict[key]
        size = len(path_set)
        if path_set == true_pathset:
            alg_correct_counter[size] += 1
        else:
            alg_jaccard_counter[size] += compute_jaccard_index(path_set, true_pathset)

    current_key_pattern = None
    temp_path_set = set()
    with open(inputfile, 'r') as reader:
        for line in reader:
            current_line = line.strip().split()
            if not current_line:
                # A blank line would otherwise be recorded as an empty path ().
                continue
            if '#' in line:
                if current_key_pattern is not None:  # we just finished collecting a pathset
                    tally(current_key_pattern, temp_path_set)
                current_key_pattern = current_line[1] + ' ' + current_line[2]  # key = 'filenum instancenum'
                temp_path_set = set()
            else:
                temp_path_set.add(tuple(int(x) for x in current_line[7:]))

    # The last instance has no following header to trigger its tally, so
    # flush it here; it used to be dropped silently.
    if current_key_pattern is not None:
        tally(current_key_pattern, temp_path_set)

    return alg_correct_counter, alg_jaccard_counter

In [None]:
# Running totals over every processed species, keyed by path-set size.
total_ksize_counter = collections.defaultdict(int)
total_toboggan_correct_counter = collections.defaultdict(int)
total_toboggan_jaccard_counter = collections.defaultdict(float)
total_catfish_correct_counter = collections.defaultdict(int)
total_catfish_jaccard_counter = collections.defaultdict(float)

for froot_idx in froot_indices:
    froot = froots[froot_idx]
    datadirending = datadirs[froot_idx]
    datadir = '/home/kyle/data/rnaseq/' + datadirending + '/'

    # Step 1: ground-truth path sets for this species.
    print("getting groundtruth for {}".format(froot))
    truth_dict = parse_truth_all(datadir)

    # Step 2: score toboggan's output and fold it into the totals.
    print("toboggan")
    tobogganfile = "./data/master-clean-" + froot + ".txt"
    (toboggan_correct_counter,
     toboggan_jaccard_counter,
     truth_optsize_counter,
     optsizedict) = get_tob_results(tobogganfile, truth_dict)
    for key, val in toboggan_correct_counter.items():
        total_toboggan_correct_counter[key] += val
    for key, val in toboggan_jaccard_counter.items():
        total_toboggan_jaccard_counter[key] += val

    # Step 3: score catfish on the instances toboggan tallied (optsizedict).
    print("catfish")
    catfishfile = "../catfish-comparison/catfish-log-" + froot + ".txt"
    catfish_correct_counter, catfish_jaccard_counter = get_cat_results(catfishfile, truth_dict, optsizedict)
    for key, val in catfish_correct_counter.items():
        total_catfish_correct_counter[key] += val
    for key, val in catfish_jaccard_counter.items():
        total_catfish_jaccard_counter[key] += val

    # Step 4: accumulate how many instances of each size were tallied.
    for key in truth_optsize_counter:
        total_ksize_counter[key] += truth_optsize_counter[key]
    

In [36]:
# PRODUCES TABLE FOR EXACT RECOVERY
#
# Prints one LaTeX table row per path-set size k >= 2 that has at least one
# tallied instance: k, the share of nontrivial instances with that size, and
# the fraction recovered exactly by catfish and by toboggan (in that order),
# followed by an "all" summary row.

# Size 1 is excluded as trivial; an empty counter yields an empty range
# instead of crashing in max().
max_ksize = max(total_ksize_counter.keys(), default=1)
nontrivial_sizes = range(2, 1 + max_ksize)

# .get() is used throughout so that reading a size does not insert a zero
# entry into the defaultdict counters.
total_nontrivial = 0
for ksize in nontrivial_sizes:
    total_nontrivial += total_ksize_counter.get(ksize, 0)

cat_total = 0
tob_total = 0
ksize_totals = 0

for ksize in nontrivial_sizes:
    val1 = total_toboggan_correct_counter.get(ksize, 0)
    val2 = total_catfish_correct_counter.get(ksize, 0)
    num_ksize = total_ksize_counter.get(ksize, 0)
    cat_total += val2
    tob_total += val1

    ksize_totals += num_ksize
    if num_ksize == 0:
        continue
    # "\\%" emits the same backslash-percent as the former invalid escape "\%".
    print("{:2d} & {:14.4f}\\% & {:7.3f} & {:7.3f} \\\\".format(ksize, 100*num_ksize/total_nontrivial, val2/num_ksize, val1/num_ksize))

print("\\midrule")
if total_nontrivial:
    print("all & {:3.2f}\\% & {:7.3f} & {:7.3f} \\\\".format( 100*ksize_totals/total_nontrivial, cat_total/total_nontrivial, tob_total/total_nontrivial))
else:
    # Nothing to summarise; emit a LaTeX comment rather than divide by zero.
    print("% no nontrivial instances were tallied")

 2 &        43.0663\% &   0.992 &   0.972 \\
 3 &        24.3228\% &   0.972 &   0.972 \\
 4 &        14.4028\% &   0.945 &   0.946 \\
 5 &         8.6815\% &   0.913 &   0.913 \\
 6 &         5.2393\% &   0.884 &   0.881 \\
 7 &         3.1083\% &   0.868 &   0.861 \\
 8 &         1.1555\% &   0.846 &   0.837 \\
 9 &         0.0222\% &   0.740 &   0.741 \\
10 &         0.0012\% &   0.752 &   0.802 \\
11 &         0.0001\% &   0.500 &   0.500 \\
\midrule
all & 100.00\% &   0.962 &   0.953 \\


In [None]:
"""

EVERYTHING BELOW THIS I HAVEN'T EDITED YET


"""

In [5]:
import numpy

# data_by_size_combined[size] = [instance count, ratio for alg 0, ratio for alg 1, ...]
# The instance count comes from the first algorithm's size list
# (combined_stats[0]) and is the denominator of every ratio in the row.
data_by_size_combined = {}
for size, count in collections.Counter(numpy.array(combined_stats[0])).items():
    data_by_size_combined[size] = [count]
total_num = sum( [ val[0] for val in data_by_size_combined.values() ] )
for which_alg in range(len(alg_names)):
    identical_counter = collections.Counter(numpy.array(combined_identicals[which_alg]))
    # Walk every known size (not just those with an identical instance) so
    # each row gets exactly one ratio per algorithm.  Previously a size with
    # no identical instances got no entry, which shifted the later columns.
    for size, row in data_by_size_combined.items():
        row.append(identical_counter[size]/row[0])

In [6]:
# Print the LaTeX table of recovery ratios over all instances combined.
max_size = max(data_by_size_combined.keys())
# One row per size that actually occurs, in ascending order.  The former
# fixed-length list indexed by size-2 left integer placeholders for absent
# sizes (crashing on row[1]) and wrapped around for sizes below 2.
ordered_list = [ [ size, *data_by_size_combined[size] ] for size in sorted(data_by_size_combined) ]

total_num_insts = sum(row[1] for row in ordered_list)

print("All instances combined, {}".format(total_num_insts))

print("{} & {}  &  &  &  {} &  {}  \\\\".format("$k$", "instances", "\\catfish", "\\toboggan"))
print("\\midrule")

for row in ordered_list:
    # Row layout: [size, instance count, catfish ratio, toboggan ratio]
    inst_size = row[0]
    num_inst = row[1]
    catfish = row[2]
    toboggan = row[3]
    # "\\%" emits the same backslash-percent as the former invalid escape "\%".
    print(" {:2d} & {:14.4f}\\% & & & {:7.3f} & {:7.3f} \\\\".format(inst_size, 100*num_inst/total_num_insts, catfish, toboggan))

print(" All & 100\\% & & & {:7.3f} & {:7.3f} \\\\".format( len(combined_identicals[0])/total_num_insts, len(combined_identicals[1])/total_num_insts ))

All instances combined, 1442629
$k$ & instances  &  &  &  \catfish &  \toboggan  \\
\midrule
  2 &        63.2791\% & & &   0.992 &   0.995 \\
  3 &        22.0775\% & & &   0.967 &   0.969 \\
  4 &         8.5237\% & & &   0.931 &   0.930 \\
  5 &         3.4920\% & & &   0.886 &   0.886 \\
  6 &         1.5375\% & & &   0.830 &   0.828 \\
  7 &         0.6698\% & & &   0.788 &   0.780 \\
  8 &         0.2889\% & & &   0.767 &   0.766 \\
  9 &         0.1241\% & & &   0.740 &   0.743 \\
 10 &         0.0070\% & & &   0.752 &   0.802 \\
 11 &         0.0004\% & & &   0.500 &   0.500 \\
 All & 100\% & & &   0.973 &   0.975 \\


In [7]:
# Make table DIVIDED BY SPECIES
import numpy

alg_names = ['catfish', 'toboggan']
# Per species: [sizes seen by alg 0, sizes seen by alg 1]
individual_stats = {}
# Per species: sizes of the instances selected by indices_identical, per alg
individual_identicals = {}
# data_by_size[froot][size] = [instance count, ratio for alg 0, ratio for alg 1]
data_by_size = {}
for idx_froot, froot in enumerate(froots):
    individual_stats[froot] = [[],[]]
    individual_identicals[froot] = [[],[]]
    for idx_alg, which_alg in enumerate(alg_names):
        print( idx_alg, idx_froot, froot)
        # Column 0 of each stats array is read as the instance size.
        individual_stats[froot][idx_alg].extend( [ int(x) for x in all_stats[idx_froot][idx_alg][:,0] ]  )
        individual_identicals[froot][idx_alg].extend( [int(x) for x in all_stats[idx_froot][idx_alg][indices_identical[idx_froot][idx_alg],0]] )

    data_by_size[froot] = {}
    for size, count in collections.Counter(numpy.array(individual_stats[froot][0])).items():
        data_by_size[froot][size] = [count]
    total_num = sum( [ val[0] for val in data_by_size[froot].values() ] )
    for which_alg in range(len(alg_names)):
        identical_counter = collections.Counter(numpy.array(individual_identicals[froot][which_alg]))
        # Walk every known size so each row gets exactly one ratio per
        # algorithm; a size with no identical instances now records 0
        # instead of leaving the row short and shifting later columns.
        for size, row in data_by_size[froot].items():
            row.append(identical_counter[size]/row[0])

0 0 human
1 0 human
0 1 mouse
1 1 mouse
0 2 zebra
1 2 zebra


In [8]:
# Print one LaTeX recovery table per species.
for froot in froots:
    max_size = max(data_by_size[froot].keys())
    # One row per size that actually occurs, in ascending order.  The former
    # fixed-length list indexed by size-2 left integer placeholders for
    # absent sizes (crashing on row[1]) and wrapped around for sizes below 2.
    ordered_list = [ [ size, *data_by_size[froot][size] ] for size in sorted(data_by_size[froot]) ]
    total_num_insts = sum(row[1] for row in ordered_list)

    print(froot, total_num_insts)
    print("{} & {}  &  {} &  {}  \\\\".format("$k$", "instances", "\\catfish", "\\toboggan"))
    print("\\midrule")

    for row in ordered_list:
        # Row layout: [size, instance count, catfish ratio, toboggan ratio]
        inst_size = row[0]
        num_inst = row[1]
        catfish = row[2]
        toboggan = row[3]
        # "\\%" emits the same backslash-percent as the former invalid escape "\%".
        print(" {:2d} & {:14.4f}\\% & {:7.3f} & {:7.3f} \\\\".format(inst_size, 100*num_inst/total_num_insts, catfish, toboggan))
    print("\\midrule")
    print(" All & 100\\% &  {:7.3f} & {:7.3f} \\\\".format( len(individual_identicals[froot][0])/total_num_insts, len(individual_identicals[froot][1])/total_num_insts ))

human 526815
$k$ & instances  &  \catfish &  \toboggan  \\
\midrule
  2 &        55.3832\% &   0.992 &   0.996 \\
  3 &        24.8378\% &   0.970 &   0.973 \\
  4 &        11.0312\% &   0.937 &   0.939 \\
  5 &         4.9135\% &   0.897 &   0.894 \\
  6 &         2.2336\% &   0.846 &   0.847 \\
  7 &         0.9848\% &   0.805 &   0.798 \\
  8 &         0.4212\% &   0.766 &   0.767 \\
  9 &         0.1860\% &   0.734 &   0.748 \\
 10 &         0.0087\% &   0.761 &   0.848 \\
\midrule
 All & 100\% &    0.969 &   0.972 \\
mouse 470349
$k$ & instances  &  \catfish &  \toboggan  \\
\midrule
  2 &        59.4943\% &   0.992 &   0.995 \\
  3 &        23.4974\% &   0.966 &   0.968 \\
  4 &         9.6369\% &   0.930 &   0.928 \\
  5 &         4.1312\% &   0.880 &   0.883 \\
  6 &         1.8605\% &   0.821 &   0.814 \\
  7 &         0.8402\% &   0.776 &   0.769 \\
  8 &         0.3755\% &   0.773 &   0.770 \\
  9 &         0.1573\% &   0.751 &   0.746 \\
 10 &         0.0066\% &   0.677 &  

In [None]:
# PLOT: GROUNDTRUTH RECOVERY -- main image

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy
import collections
# Presumably a filename prefix for the recovery figures.
# NOTE(review): the plotting functions in this file take their own
# `figurename` argument, so this module-level value is not read by them --
# confirm whether it is still needed.
figurename = 'groundtruth-recovery-'

def rebin_above_counter(input_counter, threshold=5):
    """Split a size->count mapping at *threshold*, pooling the upper part.

    Returns two defaultdict(int) objects ``(kept, pooled)``: ``kept`` holds
    the entries whose key is not >= threshold, unchanged; ``pooled`` has the
    single key ``threshold`` carrying the summed counts of all other entries
    (and is empty when there are none).
    """
    kept = collections.defaultdict(int)
    pooled = collections.defaultdict(int)
    for size in input_counter:
        count = input_counter[size]
        if not (size >= threshold):
            kept[size] = count
            continue
        pooled[threshold] += count
    return kept, pooled

def rebin_below_counter(input_counter, threshold=5):
    """Split a size->count mapping at *threshold*, pooling the lower part.

    Returns two defaultdict(int) objects ``(kept, pooled)``: ``kept`` holds
    the entries whose key is not < threshold, unchanged; ``pooled`` has the
    single key ``threshold`` carrying the summed counts of all other entries
    (and is empty when there are none).
    """
    kept = collections.defaultdict(int)
    pooled = collections.defaultdict(int)
    for size in input_counter:
        count = input_counter[size]
        if not (size < threshold):
            kept[size] = count
            continue
        pooled[threshold] += count
    return kept, pooled

def get_font_info():
    """Return the shared font settings used by the recovery plots.

    The result is the 7-tuple ``(yaxis_font_size, xaxis_font_size,
    legend_font_size, axis_font, title_font, xtick_font, ytick_font)``;
    ``axis_font`` and ``title_font`` are keyword dicts with 'fontname' and
    'size' entries, the other items are integer point sizes.
    """
    family = 'DejaVu Sans'
    yaxis_font_size, xaxis_font_size, legend_font_size = 12, 13, 10
    xtick_font = ytick_font = 9
    axis_font = {'fontname': family, 'size': str(xaxis_font_size)}
    title_font = {'fontname': family, 'size': '16'}
    return (
        yaxis_font_size,
        xaxis_font_size,
        legend_font_size,
        axis_font,
        title_font,
        xtick_font,
        ytick_font,
    )

def plot_recovery_outset(figurename, all_stats, identical_stats, counter_cutoff=5, show_flag=True, annotation_flag=True):
    """Draw the recovery bar chart with all sizes >= counter_cutoff pooled into one bar.

    For every decomposition size an outlined bar shows the percentage of
    instances having that size; inside it, one coloured bar per algorithm is
    scaled by the fraction of those instances listed in *identical_stats*.
    Sizes >= counter_cutoff are pooled into a single hatched bar at
    x = counter_cutoff, labelled "<cutoff>+".

    Args:
        figurename: file stem used under ./figures/ when saving.
        all_stats: indexable by algorithm (0 = catfish, 1 = toboggan); each
            entry is a sequence of decomposition sizes, one per instance.
        identical_stats: same layout, restricted to the instances counted as
            identical (presumably: exactly recovered -- confirm with caller).
        counter_cutoff: smallest size that goes into the pooled bar.
        show_flag: if true call plt.show(); otherwise save pdf/svg/png
            copies to ./figures/ (the directory must already exist).
        annotation_flag: if true the note by the pooled bar reads
            'see table', otherwise 'see inset'.

    Side effects: sets the global rcParams 'hatch.color' and
    'hatch.linewidth', and prints a completion message.
    """

    def tally_ratios_and_counts(all_counter, identical_counter):
        # Sizes are visited in ascending order so that the lists built for
        # the two algorithms line up index by index (Counter insertion
        # order is not guaranteed to agree between them).
        data_ratios = []
        soln_sizes = []
        num_datapoints = []
        for soln_size in sorted(all_counter):
            x = all_counter[soln_size]
            y = identical_counter[soln_size]
            if x != 0:
                data_ratios.append(y/x)
                num_datapoints.append(int(x))
                soln_sizes.append(int(soln_size))
        return data_ratios, soln_sizes, num_datapoints

    # [1] collect data
    alg_names = ['catfish', 'toboggan']
    ratio_correctness_both = {}
    ratio_correctness_outliers = {}

    for which_alg in range(len(alg_names)):
        all_counter = collections.Counter(numpy.array(all_stats[which_alg]))
        identical_counter = collections.Counter(numpy.array(identical_stats[which_alg]))
        # Move all datapoints of size >= counter_cutoff into the last bin
        all_counter, threshold_all_counter = rebin_above_counter(all_counter, counter_cutoff)
        identical_counter, threshold_identical_counter = rebin_above_counter(identical_counter, counter_cutoff)
        ratio_correctness_both[which_alg], soln_sizes_xdata, instance_count_ydata = tally_ratios_and_counts(all_counter, identical_counter)
        ratio_correctness_outliers[which_alg], soln_sizes_outliers_xdata, outliers_count_ydata = tally_ratios_and_counts(threshold_all_counter, threshold_identical_counter)

    # [1.1] get bar heights for "instance count by decomp size"
    # The x positions and counts come from the last algorithm processed.
    # Normalise by ALL instances, pooled ones included, so the bars add up
    # to 100%; the former total left the pooled instances out.
    total_num_instances = sum(instance_count_ydata) + sum(outliers_count_ydata)
    instance_ratio_bar_heights_ydata = list(numpy.array(instance_count_ydata)/total_num_instances)
    outlier_ratio_bar_heights_ydata = list(numpy.array(outliers_count_ydata)/total_num_instances)

    # [2] Set up the figure (the former extra plt.clf() after plt.close()
    # created a second, never-closed figure on every call)
    plt.close()
    fig = plt.figure(figsize=(4,3))
    ax = fig.gca()

    color_toboggan =  tuple( x/256 for x in [212,170,0] )
    color_catfish = tuple( x/256 for x in [0.0,160.0,212.0] )
    colors = [color_catfish, color_toboggan]

    yaxis_font_size, xaxis_font_size, legend_font_size, axis_font, title_font, xtick_font, ytick_font = get_font_info()

    width = 0.3       # the width of the bars
    top_zorder = 10
    bottom_zorder = 1
    percent_or_ratio_scalar = 100 # change to 1 to make ratio instead of percent
    bar_offsets = (-width/2, width/2)  # catfish left of centre, toboggan right

    # [3] PLOT MAIN DATA
    # [3.1] Plot bar silhouette for instance ratios
    ydata = [ x*percent_or_ratio_scalar for x in instance_ratio_bar_heights_ydata ]
    plt.bar( soln_sizes_xdata, ydata, 2*width, color='none',
            edgecolor='black', linewidth=0.5, zorder=top_zorder, label='ground truth')

    # [3.2] Make colored bars for toboggan / catfish.  New lists are built
    # instead of scaling the ratio lists in place through a shallow copy.
    for which_alg, offset in enumerate(bar_offsets):
        heights = [ ratio*height for ratio, height in zip(ratio_correctness_both[which_alg], ydata) ]
        plt.bar( [x+offset for x in soln_sizes_xdata ], heights, width,
                 alpha=1, color=colors[which_alg], label=alg_names[which_alg], zorder=bottom_zorder)

    # [4] PLOT OUTLIER BARS
    # The hatch style must be configured BEFORE the hatched bars exist;
    # setting it afterwards meant the first figure drawn in a session did
    # not pick it up.
    mpl.rcParams['hatch.color'] = 'white'
    mpl.rcParams['hatch.linewidth'] = 2.25

    # [4.1] Plot bar silhouette for instance ratios
    ydata = [ x*percent_or_ratio_scalar for x in outlier_ratio_bar_heights_ydata ]
    plt.bar( soln_sizes_outliers_xdata, ydata, 2*width, color='none',
            edgecolor='black', linewidth=0.5, zorder=top_zorder)

    # [4.2] Make hatched colored bars for toboggan / catfish
    hatches = ['//////', '\\\\\\\\\\\\']
    for which_alg, offset in enumerate(bar_offsets):
        heights = [ ratio*height for ratio, height in zip(ratio_correctness_outliers[which_alg], ydata) ]
        plt.bar( [x+offset for x in soln_sizes_outliers_xdata ], heights, width,
                 alpha=1, color=colors[which_alg], zorder=bottom_zorder, hatch=hatches[which_alg])

    annotation_text = 'see\ninset'
    if annotation_flag:
        annotation_text = 'see\ntable'
    plt.annotate(annotation_text, xy=(counter_cutoff, .3*percent_or_ratio_scalar), xytext=(counter_cutoff, .11*percent_or_ratio_scalar), fontsize=9, fontstyle='italic', horizontalalignment='center' )

    # MODIFY AXES
    #   X-AXIS
    plt.xlabel("Decomposition size (k)", **axis_font)
    xlims = plt.xlim()
    # Keep the automatic left limit; leave room on the right for the pooled bar
    xlimmin = xlims[0]
    xlimmax = max(soln_sizes_xdata) + 1.5
    plt.xlim([xlimmin, xlimmax])
    ax.tick_params('x', labelsize=xtick_font)
    ax.set_xticks( [x for x in range(2,counter_cutoff+1) ] )
    new_xticklabels = [str(x) for x in list(range(2, counter_cutoff))]
    new_xticklabels.extend( ['{}+'.format(counter_cutoff)] )
    ax.set_xticklabels( new_xticklabels )

    #   Y-AXIS
    plt.ylabel("Percent of instances", color='black')
    plt.gca().yaxis.label.set_size(yaxis_font_size)
    plt.ylim([0,percent_or_ratio_scalar])

    plt.legend(loc='upper right', frameon=False, prop={'size':legend_font_size})
    ax.tick_params('y', colors='black', labelsize=ytick_font)

    fig.tight_layout()

    # REMOVE SPINES
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.set_facecolor(tuple([x/256 for x in [229,229,229] ]))

    # Show y ticks on the left only and hide the x tick marks.
    ax.yaxis.set_ticks_position('left')
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False)      # a bool is required; the string 'off' is truthy

    if show_flag:
        plt.show()
    else:
        plt.savefig( './figures/' + ''.join([figurename, '.pdf']), format='pdf', dpi=300)
        plt.savefig( './figures/' + ''.join([figurename, '.svg']), format='svg', dpi=300)
        plt.savefig( './figures/' + ''.join([figurename, '.png']), format='png', dpi=300)
    plt.close()
    print("Done with plot {}".format(figurename))

In [None]:
counter_cutoff = 5

# Draw the combined outset figure twice: first shown on screen
# (show_flag=True), then written to ./figures/ (show_flag=False).
for display_only in (True, False):
    plot_recovery_outset(
        'all-recovery-stats-outset',
        combined_stats.copy(),
        combined_identicals.copy(),
        counter_cutoff,
        display_only,
    )

In [None]:
# PLOT: GROUNDTRUTH RECOVERY -- inset image

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy
import collections

def plot_recovery_inset(figurename, all_stats, identical_stats, counter_cutoff=5, show_flag=True):
    """Draw the recovery bar chart restricted to sizes >= counter_cutoff.

    For every decomposition size at or above the cutoff an outlined bar
    shows the percentage of ALL instances having that size; inside it, one
    coloured bar per algorithm is scaled by the fraction of those instances
    listed in *identical_stats*.  Smaller sizes are not drawn.

    Args:
        figurename: file stem used under ./figures/ when saving.
        all_stats: indexable by algorithm (0 = catfish, 1 = toboggan); each
            entry is a sequence of decomposition sizes, one per instance.
        identical_stats: same layout, restricted to the instances counted as
            identical (presumably: exactly recovered -- confirm with caller).
        counter_cutoff: smallest size that is drawn.
        show_flag: if true call plt.show(); otherwise save pdf/svg/png
            copies to ./figures/ (the directory must already exist).

    Side effects: prints each algorithm's retained size counter and a
    completion message.
    """

    def tally_ratios_and_counts(all_counter, identical_counter):
        # Sizes are visited in ascending order so that the lists built for
        # the two algorithms line up index by index (Counter insertion
        # order is not guaranteed to agree between them).
        data_ratios = []
        soln_sizes = []
        num_datapoints = []
        for soln_size in sorted(all_counter):
            x = all_counter[soln_size]
            y = identical_counter[soln_size]
            if x != 0:
                data_ratios.append(y/x)
                num_datapoints.append(int(x))
                soln_sizes.append(int(soln_size))
        return data_ratios, soln_sizes, num_datapoints

    # [1] collect data
    alg_names = ['catfish', 'toboggan']
    ratio_correctness_both = {}

    for which_alg in range(len(alg_names)):
        all_counter = collections.Counter(numpy.array(all_stats[which_alg]))
        # Total is taken before rebinning, so it covers every instance.
        total_num_instances = sum(all_counter.values())
        identical_counter = collections.Counter(numpy.array(identical_stats[which_alg]))
        # Drop all datapoints of size < counter_cutoff (their pooled bin is unused)
        all_counter, _ = rebin_below_counter(all_counter, counter_cutoff)
        identical_counter, _ = rebin_below_counter(identical_counter, counter_cutoff)
        ratio_correctness_both[which_alg], soln_sizes_xdata, instance_count_ydata = tally_ratios_and_counts(all_counter, identical_counter)
        print(all_counter)

    # [1.1] get bar heights for "instance count by decomp size"
    # The x positions, counts and total come from the last algorithm processed.
    instance_ratio_bar_heights_ydata = list(numpy.array(instance_count_ydata)/total_num_instances)

    # [2] Set up the figure (the former extra plt.clf() after plt.close()
    # created a second, never-closed figure on every call)
    plt.close()
    fig = plt.figure(figsize=(4,3))
    ax = fig.gca()

    color_toboggan =  tuple( x/256 for x in [212,170,0] )
    color_catfish = tuple( x/256 for x in [0.0,160.0,212.0] )
    colors = [color_catfish, color_toboggan]

    yaxis_font_size, xaxis_font_size, legend_font_size, axis_font, title_font, xtick_font, ytick_font = get_font_info()

    width = 0.3       # the width of the bars
    top_zorder = 10
    bottom_zorder = 1
    percent_or_ratio_scalar = 100

    # [3] PLOT MAIN DATA
    # [3.1] Plot bar silhouette for instance ratios
    ydata = [ x*percent_or_ratio_scalar for x in instance_ratio_bar_heights_ydata ]
    plt.bar( soln_sizes_xdata, ydata, 2*width, color='none',
            edgecolor='black', linewidth=0.5, zorder=top_zorder)

    # [3.2] Make colored bars for toboggan / catfish.  New lists are built
    # instead of scaling the ratio lists in place through a shallow copy.
    for which_alg, offset in enumerate((-width/2, width/2)):
        heights = [ ratio*height for ratio, height in zip(ratio_correctness_both[which_alg], ydata) ]
        plt.bar( [x+offset for x in soln_sizes_xdata ], heights, width,
                 alpha=1, color=colors[which_alg], label=alg_names[which_alg], zorder=bottom_zorder)

    # MODIFY AXES
    #   X-AXIS
    xlims = plt.xlim()
    # Keep the automatic left limit; clip just past the largest size
    xlimmin = xlims[0]
    xlimmax = max(soln_sizes_xdata) + 0.5
    plt.xlim([xlimmin, xlimmax])
    ax.tick_params('x', labelsize=xtick_font)
    tick_sizes = list(range(counter_cutoff, max(soln_sizes_xdata)+1))
    ax.set_xticks( tick_sizes )
    ax.set_xticklabels( [str(x) for x in tick_sizes] )

    #   Y-AXIS
    ax.tick_params('y', colors='black', labelsize=ytick_font)

    fig.tight_layout()

    # REMOVE SPINES
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.set_facecolor(tuple([x/256 for x in [229,229,229] ]))

    # Show y ticks on the left only and hide the x tick marks.
    ax.yaxis.set_ticks_position('left')
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False)      # a bool is required; the string 'off' is truthy

    if show_flag:
        plt.show()
    else:
        plt.savefig( './figures/' + ''.join([figurename, '.pdf']), format='pdf', dpi=300)
        plt.savefig( './figures/' + ''.join([figurename, '.svg']), format='svg', dpi=300)
        plt.savefig( './figures/' + ''.join([figurename, '.png']), format='png', dpi=300)
    plt.close()
    print("Done with plot {}".format(figurename))

In [None]:
# Draw the combined inset figure twice: first shown on screen
# (show_flag=True), then written to ./figures/ (show_flag=False).
for display_only in (True, False):
    plot_recovery_inset(
        'all-recovery-stats-inset',
        combined_stats.copy(),
        combined_identicals.copy(),
        counter_cutoff,
        display_only,
    )

In [None]:
# Species-specific outset plots.
#
# For each species, rebuild combined_stats / combined_identicals from that
# species' data only (one list per algorithm), then draw the outset figure
# once on screen and once to disk.
counter_cutoff = 5
alg_names = ['catfish', 'toboggan']
for idx_froot, froot in enumerate(froots):
    combined_stats = [[],[]]
    combined_identicals = [[],[]]
    for idx_alg, which_alg in enumerate(alg_names):
        print( idx_alg, idx_froot, froot)
        species_alg_stats = all_stats[idx_froot][idx_alg]
        identical_rows = indices_identical[idx_froot][idx_alg]
        # Column 0 is read as the instance size.
        combined_stats[idx_alg].extend( [ int(x) for x in species_alg_stats[:,0] ] )
        combined_identicals[idx_alg].extend( [ int(x) for x in species_alg_stats[identical_rows,0] ] )
    figure_stem = 'recovery-stats-outset-' + froot
    for display_only in (True, False):
        plot_recovery_outset( figure_stem, combined_stats.copy(), combined_identicals.copy(), counter_cutoff, display_only)

In [15]:
# NOW DO BOXPLOTS ON INSTANCES WHERE ALGORITHM MISSES GROUNDTRUTH

def get_stats(set1, set2):
    """Compare two sets and return ``(recall, jaccard, identical)``.

    * recall: share of set1's elements that are also in set2; 1.0 when set1
      is empty.
    * jaccard: |set1 & set2| / |set1 | set2|; 1.0 when both sets are empty
      (this case used to raise ZeroDivisionError even though the recall
      computation was already guarded).
    * identical: True when the sets are equal.
    """
    identical = set1 == set2
    len_intersect = len(set1.intersection(set2))
    union_size = len(set1) + len(set2) - len_intersect
    if union_size == 0:
        jaccard = 1.0
    else:
        jaccard = len_intersect / union_size
    if len(set1) == 0:
        recall = 1.0
    else:
        recall = len_intersect / len(set1)

    return recall, jaccard, identical

# all_path_info, all_toboggan_data = get_all_data( [0,1,2] )
# froots = ['human', 'mouse', 'zebra']

# Per species: the nontrivial instances each algorithm failed to recover
# exactly, stored as (ground-truth size, recall, jaccard, key) tuples.
data_catfish = {}
data_toboggan = {}
# Number of nontrivial instances per species, in the order of `froots`.
num_nontrivials = []
# Per species: instances that BOTH algorithms failed to recover exactly.
num_both_wrong = collections.defaultdict(int)

froots = ['zebra', 'human', 'mouse']

for froot in froots:
    gt_pathset_dict = all_path_info['groundtruth'][froot]
    toboggan_paths = all_path_info['toboggan'][froot]
    catfish_paths = all_path_info['catfish'][froot]

    this_catfish, this_toboggan = [], []
    this_num_nontrivial = 0

    for key in gt_pathset_dict:
        gt_path = gt_pathset_dict[key]
        gt_size = len(gt_path)
        if gt_size != 1:  # instances with a single ground-truth path are trivial
            cat_recall, cat_jaccard, cat_identical = get_stats(gt_path, catfish_paths[key])
            tob_recall, tob_jaccard, tob_identical = get_stats(gt_path, toboggan_paths[key])
            if not cat_identical:
                this_catfish.append( (gt_size, cat_recall, cat_jaccard, key) )
            if not tob_identical:
                this_toboggan.append( (gt_size, tob_recall, tob_jaccard, key) )
            this_num_nontrivial += 1
            if not (cat_identical or tob_identical):
                num_both_wrong[froot] += 1

    data_catfish[froot] = this_catfish
    data_toboggan[froot] = this_toboggan
    num_nontrivials.append(this_num_nontrivial)

    # Report the miss counts for this species.
    print("On {}".format(froot))
    print("catfish has {}".format(len(this_catfish)))
    print("toboggan has {}".format(len(this_toboggan)))
    print("intersection is {}".format(num_both_wrong[froot]))

On zebra
catfish has 9069
toboggan has 8085
intersection is 5692
On human
catfish has 18088
toboggan has 17292
intersection is 12262
On mouse
catfish has 16660
toboggan has 16406
intersection is 11657


In [38]:
# Violinplots for ground truth recovery

import numpy
import matplotlib.pyplot as plt
import collections

# One violin plot per dataset comparing the Jaccard index of the instances
# that catfish and toboggan did not recover exactly.
froots = ['zebra', 'human', 'mouse']
figurename = 'alg-jaccard-'
alg_names = ['catfish', 'toboggan']

# NOTE(review): the 0-255 components are divided by 300, not 255, which
# renders slightly darker than the nominal colour -- presumably intentional.
color_toboggan = tuple(x/300 for x in [212,170,0])  # cm.colors[0]
color_catfish = tuple(x/300 for x in [0.0,160.0,212.0])  # cm.colors[3]
violin_colors = [color_catfish, color_toboggan]
violin_bar_color = [0, 0, 0]

violin_positions = [1,1.45]

violin_width = 0.4

show_flag = False

# Colour of the min/max/median marks; [0, 0, 0] is black.
violin_bar_rgb = tuple(x/256 for x in violin_bar_color)

for idx, froot in enumerate(froots):

    # Each record is a 4-tuple whose third field is the Jaccard index;
    # indexing (rather than zip(*records)) also tolerates an empty list.
    jaccard_cat = [record[2] for record in data_catfish[froot]]
    jaccard_tob = [record[2] for record in data_toboggan[froot]]
    if not jaccard_cat or not jaccard_tob:
        # A violin needs at least one data point.
        print("Skipping plot {}{}: nothing to draw".format(figurename, froot))
        continue

    fig, ax = plt.subplots()

    violin_parts = ax.violinplot(
        [jaccard_cat, jaccard_tob], violin_positions,
        widths=violin_width, showmeans=False, showextrema=True,
        showmedians=True)

    # Fill each violin body with its algorithm's colour (opaque).
    for body, body_color in zip(violin_parts['bodies'], violin_colors):
        body.set(facecolor=body_color, alpha=1)
    # Draw all the violin statistics marks in violin_bar_color.
    for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
        violin_parts[partname].set_edgecolor(violin_bar_rgb)

    if idx == 0:
        # Only the first panel carries the y axis label and tick labels.
        ax.set_ylabel('Jaccard index', fontsize=16)
        ax.tick_params('y', labelsize=15)
        ax.yaxis.set_ticks_position('left')
        ax.yaxis.set_label_position('left')
    else:
        # y ticks live on the left side; tick_params needs a real boolean.
        ax.tick_params(axis='y', which='both', left=False)
        ax.set_yticks([])

    # Keep the automatic lower limit but cap the axis at 1.
    ax.set_ylim(ax.get_ylim()[0], 1)

    # Hide the whole frame.
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_visible(False)

    # Hide the x tick marks but keep their labels.  The string 'off' is
    # truthy, so it must be the boolean False to actually hide them.
    ax.tick_params(axis='x', which='both', bottom=False)

    ax.set_xticks( violin_positions )
    ax.set_xticklabels( [alg_names[0] , alg_names[1] ], fontsize=14 )

    fig.tight_layout()

    if show_flag:
        plt.show()
    else:
        for ext in ('pdf', 'png', 'svg'):
            fig.savefig('./figures/' + ''.join([figurename, froot, '.', ext]),
                        format=ext, dpi=300)
    plt.close(fig)
    print("Done with plot {}".format(figurename))

Done with plot alg-jaccard-
Done with plot alg-jaccard-
Done with plot alg-jaccard-


In [39]:
# Aggregate Violinplot for ground truth recovery

# Pool the Jaccard indices of all datasets into one pair of violins.
all_jaccard_cat = []
all_jaccard_tob = []

for froot in froots:
    # Each record is a 4-tuple whose third field is the Jaccard index;
    # indexing (rather than zip(*records)) also tolerates an empty dataset.
    all_jaccard_cat.extend(record[2] for record in data_catfish[froot])
    all_jaccard_tob.extend(record[2] for record in data_toboggan[froot])

# plt.figure(figsize=(3,3))
fig, ax = plt.subplots()

violin_parts = ax.violinplot(
    [all_jaccard_cat, all_jaccard_tob], violin_positions,
    widths=violin_width, showmeans=False, showextrema=True, showmedians=True)

# Fill each violin body with its algorithm's colour (violin_colors is
# defined in the per-dataset plotting cell).
for body, body_color in zip(violin_parts['bodies'], violin_colors):
    body.set(facecolor=body_color, alpha=1)
# Draw all the violin statistics marks in violin_bar_color.
aggregate_bar_rgb = tuple(x/256 for x in violin_bar_color)
for partname in ('cbars', 'cmins', 'cmaxes', 'cmedians'):
    violin_parts[partname].set_edgecolor(aggregate_bar_rgb)

ax.set_ylabel('Jaccard index', fontsize=16)
# Keep the automatic lower limit but cap the axis at 1.
ax.set_ylim(ax.get_ylim()[0], 1)
ax.tick_params('y', labelsize=15)

# Hide the whole frame.
for side in ('right', 'top', 'left', 'bottom'):
    ax.spines[side].set_visible(False)
# ax.set_facecolor(tuple([x/256 for x in [229,229,229] ]))

# The aggregate panel carries its y ticks and label on the right-hand side.
ax.yaxis.set_ticks_position('right')
ax.yaxis.set_label_position('right')
# Hide the x tick marks but keep their labels.  The string 'off' is truthy,
# so it must be the boolean False to actually hide them.
ax.tick_params(axis='x', which='both', bottom=False)

ax.set_xticks( violin_positions )
ax.set_xticklabels( [alg_names[0] , alg_names[1] ], fontsize=15 )

fig.tight_layout()

if show_flag:
    plt.show()
else:
    # The '-all.<ext>' suffix is appended to figurename unchanged, so the
    # output path is exactly the prefix followed by '-all.<ext>'.
    for ext in ('pdf', 'png', 'svg'):
        fig.savefig('./figures/' + ''.join([figurename, '-all.', ext]),
                    format=ext, dpi=300)
plt.close(fig)
print("Done with plot {}".format(figurename))

Done with plot alg-jaccard-


In [None]:
# FILTER DATA
from load_all_data import get_all_data

# [1] COLLECT ALL GROUND TRUTH DATA
from parse_truth_ALL import main as parse_truth_all
import collections
from algorithm_output_parser import toboggan_output_parser

# Print a LaTeX table summarising, per dataset and overall, how many
# instances are non-trivial and in what share of them toboggan's path set has
# the same size as the ground truth ("optimal") or a different size
# ("non-optimal").

def _percent(count, total):
    """Return `count` as a percentage of `total`; 0.0 when `total` is zero."""
    return 100 * count / total if total else 0.0


print("\\toprule")
print("Dataset &  instances &  non-trivial &  optimal &  non-optimal \\\\")
print("\\midrule")
# Per dataset: instances where the sizes match but the path sets differ.
list_eq_k_wrong_path = []
froots = ['zebra', 'mouse', 'human']
total_nontrivial = 0
total_instances = 0
total_num_gt_optimal = 0
total_num_gt_wrong = 0

for froot in froots:
    gt_pathset_dict = all_path_info['groundtruth'][froot]
    all_toboggan_paths = all_path_info['toboggan'][froot]

    # COUNT RELEVANT INFO
    num_eq_k_wrong_path = 0
    num_nontrivial = 0
    num_gt_optimal_size = 0
    num_gt_wrong_size = 0
    num_timeouts = 0

    total_instances += len(gt_pathset_dict)

    for key, gt_pathset in gt_pathset_dict.items():
        if len(gt_pathset) == 1:
            # Single-path ground truths are trivial and not counted.
            continue
        tob_pathset = all_toboggan_paths[key]
        num_nontrivial += 1
        # An empty result is counted as a timeout.  It stays in the
        # non-trivial total but in neither size column, so the two
        # percentages need not add up to 100.  Checking the length (rather
        # than `== set()`) also recognises an empty list or tuple.
        if len(tob_pathset) == 0:
            num_timeouts += 1
            continue
        if len(gt_pathset) == len(tob_pathset):
            num_gt_optimal_size += 1
            if set(gt_pathset) != set(tob_pathset):
                num_eq_k_wrong_path += 1
        else:
            num_gt_wrong_size += 1
    list_eq_k_wrong_path.append(num_eq_k_wrong_path)

    total_nontrivial += num_nontrivial
    total_num_gt_optimal += num_gt_optimal_size
    total_num_gt_wrong += num_gt_wrong_size
    # The trailing "\% {}" emits the timeout count as a LaTeX comment.  The
    # backslash is doubled because "\%" is not a valid Python escape.
    print("\\texttt{{ {} }} & {:10d}& {:10d} & {:10.3f}\\% & {:10.3f}\\% \\\\ \\% {}".format(
        froot, len(gt_pathset_dict), num_nontrivial,
        _percent(num_gt_optimal_size, num_nontrivial),
        _percent(num_gt_wrong_size, num_nontrivial),
        num_timeouts))

print(" All & {:10d} & {:10d} & {:10.3f}\\% & {:10.3f}\\% \\\\ ".format(
        total_instances, total_nontrivial,
        _percent(total_num_gt_optimal, total_nontrivial),
        _percent(total_num_gt_wrong, total_nontrivial)) )
print(list_eq_k_wrong_path)