In [10]:
import json, os, pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import re
pd.set_option('display.max_rows', None)

SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

def set_axis_style(ax, labels, x_label, y_label):
    ax.get_xaxis().set_tick_params(direction='out')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_xticks(np.arange(1, len(labels) + 1))
    ax.set_xticklabels(labels)
    ax.set_xlim(0.25, len(labels) + 0.75)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    
def flatten(nested_list):
    return [item for sublist in nested_list for item in sublist]

def unzip(zipped_data):
    return list(zip(*zipped_data))


def highlight_max(data, color='yellow'):
    '''
    highlight the maximum in a Series or DataFrame
    '''
    attr = 'background-color: {}'.format(color)
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

In [11]:
results_sop = []
results_cs = []
results_time = []
for program in os.listdir("../evaluation-data/"):
    if "max-dim" in program:
        continue
    results_for_prog_sop = []
    results_for_prog_cs = []
    results_for_prog_time = []

    for ref in os.listdir(f"../evaluation-data/{program}"):
        with open(f'../evaluation-data/{program}/{ref}/aggr.json') as f:
            data = json.load(f)
            results_for_prog_sop.append(data['sum_of_pairs'])
            results_for_prog_cs.append(data['column_score'])
            results_for_prog_time.append(data['time_ms'])
    results_sop.append((program, results_for_prog_sop))
    results_cs.append((program, results_for_prog_cs))
    results_time.append((program, results_for_prog_time))    

In [12]:
column_mapping = ['RV40', 'RV11', 'RV50', 'RV12', 'RV20', 'RV30']
df_sop = pd.DataFrame.from_items(results_sop, columns=column_mapping, orient='index')
df_cs = pd.DataFrame.from_items(results_cs, columns=column_mapping, orient='index')
df_time = pd.DataFrame.from_items(results_time, columns=column_mapping, orient='index')

TypeError: Argument 'objects' has incorrect type (expected numpy.ndarray, got list)

In [13]:
df_sop = df_sop[sorted(df_sop.columns.tolist())].sort_index()
df_cs = df_cs[sorted(df_cs.columns.tolist())].sort_index()
df_time = df_time[sorted(df_time.columns.tolist())].sort_index()

NameError: name 'df_sop' is not defined

In [None]:
results_sop_detail = {}
results_cs_detail = {}
results_time_detail = {}
for program in os.listdir("../evaluation-data/"):
#     if "Mafft-Fast" not in program and "Dialign" not in program and "spam-align-w-3_d-10" not in program:
#         continue
    results_for_prog_sop = defaultdict(list)
    results_for_prog_cs = defaultdict(list)
    results_for_prog_time = defaultdict(list)

    for ref in os.listdir(f"../evaluation-data/{program}"):
        for file in os.listdir(f"../evaluation-data/{program}/{ref}"):
            if not file.endswith("json") or "aggr" in file:
                continue
            with open(f"../evaluation-data/{program}/{ref}/{file}") as f:
                data = json.load(f)
                results_for_prog_sop[ref].append(data['sum_of_pairs'])
                results_for_prog_cs[ref].append(data['column_score'])
                results_for_prog_time[ref].append(data['time_ms'])

    results_sop_detail[program] = results_for_prog_sop
    results_cs_detail[program] = results_for_prog_cs
    results_time_detail[program] = results_for_prog_time

# Comparison of SoP and CS scores between Dialign, Mafft-Fast and spam-align

Spam-align has been run with a pattern set of 5 patterns with a weight of 3 and len of 13 each. This pattern set resulted in the best results on the balibase dataset.

Mafft-Fast stands for a call of Mafft (v7.310) with the arguments `--quiet --retree 1 --maxiterate 0`  representing the strategy `FFT-NS-1 (very fast; recommended for >2000 sequences; progressive method with a rough guide tree)`.  
Mafft-Accurate in the later tables stands for a call with the arguments `--quiet --localpair --maxiterate 1000` representig the strategy `L-INS-i (probably most accurate; recommended for <200 sequences; iterative refinement method incorporating local pairwise alignment information)`

In [None]:

def set_box_color(bp, color):
    plt.setp(bp['boxes'], color=color)
    plt.setp(bp['whiskers'], color=color)
    plt.setp(bp['caps'], color=color)
    plt.setp(bp['medians'], color=color)

def plot_detail_data(data, filename):
    ticks, data_mafft = unzip(sorted(data['Mafft-Fast'].items()))
    data_dialign = unzip(sorted(data['Dialign'].items()))[1]
    data_spam = unzip(sorted(data['spam-align-w-3_d-10'].items()))[1]
    
    fig = plt.figure(figsize=(10,7))
    bp1 = plt.boxplot(data_mafft, positions=np.array(range(len(data_mafft)))*3.0-0.7, sym='', widths=0.6)
    bp2 = plt.boxplot(data_dialign, positions=np.array(range(len(data_dialign)))*3.0, sym='', widths=0.6)
    bp3 = plt.boxplot(data_spam, positions=np.array(range(len(data_spam)))*3.0+0.7, sym='', widths=0.6)
    
    set_box_color(bp1, '#D7191C') # colors are from http://colorbrewer2.org/
    set_box_color(bp2, '#2C7BB6')
    set_box_color(bp3, '#31a354')

    plt.plot([], c='#D7191C', label='Mafft-Fast')
    plt.plot([], c='#2C7BB6', label='Dialign')
    plt.plot([], c='#31a354', label='Spam-Align')
    plt.legend()
    
    plt.xticks(range(0, len(ticks) * 3, 3), ticks)
    plt.xlim(-3, len(ticks)*3)
    plt.tight_layout()
    plt.savefig(filename)
plot_detail_data(results_sop_detail, 'sop-boxplot.png')
plot_detail_data(results_cs_detail, 'cs-boxplot.png')

## Sum of pairs score

In [None]:
df_sop

In [None]:
df_sop.loc['spam-align-w-2_d-10':].style.apply(highlight_max)

In [None]:
df_sop_best = df_sop.loc[['Dialign','Mafft-Accurate', 'Mafft-Fast','spam-align-w-3_d-10'],:]
with open('df_sop_best.tex', 'w') as f:
    f.writelines(df_sop_best.to_latex())

## Column score

In [None]:
df_cs

## Execution time in ms

In [None]:
df_time

In [None]:
def aggr_data(data):
    aggr_data = defaultdict(dict)
    for key, values in data.items():
        m = re.search('spam-align-w-(?P<w>\d)_d-(?P<d>\d+)', key)
        if m is None:
            continue
        w = int(m.group('w'))
        d = int(m.group('d'))
        aggr = np.mean(flatten(values.values()))
        aggr_data[w][d] = aggr 
    return aggr_data
sop_scores = aggr_data(results_sop_detail)
cs_scores = aggr_data(results_cs_detail)
time_aggr = aggr_data(results_time_detail)

In [None]:
plt.figure(figsize=(10,7))
for weight, data in sorted(sop_scores.items()):
    x, y = unzip(sorted(data.items()))
    plt.plot(x, y,label=weight)
plt.xlabel('# don\'t care positions')
plt.ylabel('SoP score')
plt.legend(title='weight')

In [None]:
plt.figure(figsize=(10,7))
for weight, data in sorted(cs_scores.items()):
    x, y = unzip(sorted(data.items()))
    plt.plot(x, y,label=weight)
plt.xlabel('# don\'t care positions')
plt.ylabel('Column score')
plt.legend(title='weight')

In [None]:
plt.figure(figsize=(10,7))
for weight, data in sorted(time_aggr.items()):
    x, y = unzip(sorted(data.items()))
    plt.plot(x, y,label=weight)
plt.xlabel('# don\'t care positions')
plt.ylabel('Time in ms')
plt.legend(title='weight')