In [1]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so that the wide result tables fit on screen.
display(HTML("<style>.container { width:90% !important; }</style>"))


import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import os
import pandas as pd
import seaborn as sb
import statistics as st

from Bio import SeqIO

# Activate seaborn's default styling for all subsequent figures.
sb.set()
# Display data frames in full (no row truncation).
pd.options.display.max_rows = None

# Evaluation of Swarm on Franzén data

The following notebook describes the steps and results of the evaluation.

In [None]:
# Initial files and directories:
#
# model_supported_franzen
# |- data # will contain the in-silico sequenced data sets
# |
# |- evaluation # will contain the evaluation plots and tables
# |
# |- outputs # will contain the cluster and metric outputs
# |
# |- tasks  # task files for the different runs of Swarm
# |
# \- 40168_2015_105_MOESM9_ESM.csv  # CSV version of Additional file 9 of Franzén et al. (see below)

## Analysis workflow

The data sets are prepared as described in Franzén et al., *Improved OTU-picking using long-read 16S rRNA gene amplicon sequencing and generic hierarchical clustering* (https://doi.org/10.1186/s40168-015-0105-6),
except that no Ns are inserted between the forward and reverse portions of the reads.

The taxonomic assignment is obtained from the known sources during the creation of the mock communities (simulated sequencing).

Swarm (v3) is executed with and without the fastidious refinement step. Runs without refinement have the suffix `__nf`, while those with refinement end with `__2f`.

## Commands

The following commands prepare and cluster the data sets. The results are evaluated below.

In order to execute the workflow as provided here, the `tools` subdirectory of the overall repository has to contain the binaries of Infernal (cmbuild, cmalign) and ART (art_illumina), but the paths can be adjusted. 

IMPORTANT: The commands are not intended to be executed from this notebook. They should be executed from the root directory of the overall repository.

In [None]:
%%bash

# Prepares the reference data, creates the mock communities and clusters them with Swarm.
# All expansions are quoted so that paths containing spaces or glob characters are handled correctly.

TOOLS_DIR=tools
ANALYSIS_DIR=analyses/model_supported_franzen
DATA_DIR="${ANALYSIS_DIR}/data"
OUTPUT_DIR="${ANALYSIS_DIR}/outputs"

SWARM="${TOOLS_DIR}/swarm-3.0.0/bin/swarm" # adjust to your system

RUNS=( swarm_v3__nf swarm_v3__2f )

# prepare tools and reference data
CMBUILD_PATH="${TOOLS_DIR}/infernal-1.1.2-linux-intel-gcc/binaries/cmbuild" # adjust to your system
python -m scripts.analyses.analysis_franzen prepare "${DATA_DIR}" --cmbuild "${CMBUILD_PATH}"

# create list files of mock communities
python -m scripts.analyses.analysis_franzen lists "${ANALYSIS_DIR}/40168_2015_105_MOESM9_ESM.csv" "${DATA_DIR}/list_files"

# create mock communities
GG_DB="${DATA_DIR}/gg_13_5.fasta"
MOCK_LIST="${DATA_DIR}/list_files/list.txt"
MSA_MODEL="${DATA_DIR}/bacteria16S_508_mod5.cmfile"
CMALIGN_PATH="${TOOLS_DIR}/infernal-1.1.2-linux-intel-gcc/binaries/cmalign" # adjust to your system
ART_PATH="${TOOLS_DIR}/art_bin_VanillaIceCream/art_illumina" # adjust to your system
python -m scripts.analyses.analysis_franzen mock "${GG_DB}" "${MOCK_LIST}" "${MSA_MODEL}" "${DATA_DIR}" --cmalign "${CMALIGN_PATH}" --art "${ART_PATH}" --miseq --skip_ambiguous --num_n 0

# run Swarm and determine clustering quality
DATA=( "LC" "MC" "HC" )
REGIONS=( "V3-V4" "V4" )
NUM_SAMPLES=10 # number of samples per complexity
for D in "${DATA[@]}"; do
  for ((I=1; I<=NUM_SAMPLES; I++)); do
    for R in "${REGIONS[@]}"; do
      NAME="${D}_${I}_${R}"
      echo "${NAME}"
      READS="${NAME}:${DATA_DIR}/${D}_${I}/${NAME}_miseq_rs.fastq"
      TAX="franzen:${DATA_DIR}/${D}_${I}/${NAME}_miseq_rs.tax"
      for RUN in "${RUNS[@]}"; do
        RUN_DIR="${OUTPUT_DIR}/${NAME}/${RUN}"
        python -m scripts.analyses.analysis_franzen run_swarm "${RUN}" "${READS}" "${ANALYSIS_DIR}/tasks/${RUN}.txt" "${RUN_DIR}" --tax_files "${TAX}" --swarm "${SWARM}"
        # prefix the metrics files with the name of the run
        for F in "${RUN_DIR}"/*__metrics.csv; do
          # an unmatched glob is passed on literally: skip it instead of calling mv on a non-existent file
          [ -e "${F}" ] || continue
          BASE="${F##*/}"
          # files prefixed by a previous execution must not be prefixed a second time
          case "${BASE}" in
            "${RUN}_"*) continue ;;
          esac
          mv "${F}" "${RUN_DIR}/${RUN}_${BASE}"
        done
      done
    done
  done
done

## Evaluation

**Configuration**

In [2]:
# Evaluated data: complexities, samples per complexity, read types and ground truths.
data_sets = ["LC", "MC", "HC"]
regions = ["V3-V4", "V4"]
num_samples = 10
ground_truths = ["franzen"]

# Names of the Swarm runs (without / with fastidious refinement).
opts = ["swarm_v3__nf", "swarm_v3__2f"]

# Directories (relative to the location of this notebook).
data_dir, results_dir, eval_dir = "data", "outputs", "evaluation"

### Number of clusters and amplicons

Reads the input files and the cluster outputs for all data sets and compares the number of clusters and amplicons.

In [3]:
# Requires the input and OTU files. Alternatively, the evaluation can use the stored information (see below).
df_columns = ['data_set', 'tool', 'mode', 'refinement', 'threshold', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass', 'ds', 'rt']


def _abundance(identifier):
    """Return the abundance stored after the last '_' of an identifier, or 1 if it contains no '_'."""
    return int(identifier.split('_')[-1]) if '_' in identifier else 1


def _count_input(seq_file):
    """Count the records of a FASTQ file and sum up their abundances.

    Returns a tuple (number of records, summed abundance).
    """
    num_amplicons, mass = 0, 0
    with open(seq_file, 'r') as in_file:
        for record in SeqIO.parse(in_file, 'fastq'):
            num_amplicons += 1
            mass += _abundance(record.id)
    return num_amplicons, mass


def _count_clusters(otu_file):
    """Count the clusters, member amplicons and summed abundance of an OTU file.

    Each non-blank line is treated as one cluster listing its members separated by single spaces.
    Returns a tuple (number of clusters, number of amplicons, summed abundance).
    """
    num_clusters, num_amplicons, mass = 0, 0, 0
    with open(otu_file, 'r') as in_file:
        for line in in_file:
            line = line.strip()
            if not line:
                continue  # a blank line is not a cluster (and has no parsable members)
            members = line.split(' ')
            num_clusters += 1
            num_amplicons += len(members)
            # same abundance rule as for the input, so both masses are comparable
            mass += sum(_abundance(m) for m in members)
    return num_clusters, num_amplicons, mass


rows = []

for ds in data_sets:
    for rt in regions:
        for s in range(1, num_samples + 1):
            run_name = '%s_%i_%s' % (ds, s, rt)

            # the input sequences
            seq_file = os.path.join(data_dir, '%s_%i' % (ds, s), '%s_miseq_rs.fastq' % run_name)
            num_input_amplicons, input_mass = _count_input(seq_file)

            for opt in opts:
                run_dir = os.path.join(results_dir, run_name, opt)
                # sorted: os.listdir returns the entries in arbitrary order
                otu_files = sorted(f for f in os.listdir(run_dir) if f.endswith('_otus.txt'))

                for f in otu_files:
                    num_clusters, num_output_amplicons, output_mass = _count_clusters(os.path.join(run_dir, f))

                    tool = 'swarm'
                    mode = 'swarm'
                    refinement = opt.split('__')[1]  # 'nf' or '2f'
                    threshold = float(f.split('_')[-2])  # second-to-last '_'-separated token of the file name

                    rows.append([run_name, tool, mode, refinement, threshold, num_input_amplicons, input_mass, num_clusters, num_output_amplicons, output_mass, ds, rt])

df_counts = pd.DataFrame(rows, columns = df_columns).sort_values(by = ['data_set', 'tool', 'mode', 'refinement', 'threshold'])

*Column descriptions:*   
`num_input_amplicons`: The number of entries in the corresponding input file.   
`input_mass`: The sum of the abundances of all entries in the input file.   
`num_clusters`: The number of computed clusters.   
`num_output_amplicons`: The number of amplicons contained in the clusters.   
`output_mass`: The sum of the abundances of all amplicons contained in the clusters.   

In [4]:
# Show the counting information without the helper columns ('mode', 'ds', 'rt').
shown_count_cols = ['data_set', 'tool', 'refinement', 'threshold', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass']
df_counts.loc[:, shown_count_cols]

Unnamed: 0,data_set,tool,refinement,threshold,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass
549,HC_10_V3-V4,swarm,2f,1.0,9940,9940,9831,9935,9940
539,HC_10_V3-V4,swarm,nf,1.0,9940,9940,9884,9935,9940
541,HC_10_V3-V4,swarm,nf,2.0,9940,9940,9651,9935,9940
542,HC_10_V3-V4,swarm,nf,3.0,9940,9940,9002,9935,9940
543,HC_10_V3-V4,swarm,nf,4.0,9940,9940,7890,9935,9940
544,HC_10_V3-V4,swarm,nf,5.0,9940,9940,6234,9935,9940
545,HC_10_V3-V4,swarm,nf,6.0,9940,9940,4462,9935,9940
546,HC_10_V3-V4,swarm,nf,7.0,9940,9940,2956,9935,9940
547,HC_10_V3-V4,swarm,nf,8.0,9940,9940,1799,9935,9940
548,HC_10_V3-V4,swarm,nf,9.0,9940,9940,1090,9935,9940


In [5]:
# Store the counting table. To reuse a stored table instead of recomputing it,
# skip the cell computing df_counts and load it with the commented line below.
counts_csv = '%s/df_counts.csv' % eval_dir
df_counts.to_csv(counts_csv, sep = ';', index = False)
#df_counts = pd.read_csv(counts_csv, sep = ';')

### Clustering quality

In [6]:
# Requires the metrics files. Alternatively, the evaluation can use the stored information (see below).
# Collects one data frame per metrics file and annotates it with the run information.
dfs = []
for ds in data_sets:
    for rt in regions:
        for s in range(1, num_samples + 1):
            run_name = '%s_%i_%s' % (ds, s, rt)

            for opt in opts:
                for gt in ground_truths:
                    metrics_file = '%s/%s/%s/%s_%s_%s__metrics.csv' % (results_dir, run_name, opt, opt, run_name, gt)
                    df = pd.read_csv(metrics_file, sep = ';')

                    dfs.append(df.assign(
                        gt = gt,
                        mode = 'swarm',
                        refinement = [m.split('__')[1] for m in df['task']],  # part of the task name after '__'
                        ds = ds,
                        rt = rt
                    ))

df_quality = (
    pd.concat(dfs, ignore_index = True)
    .rename(columns = {'task': 'run', 'reads': 'data_set'})
    .sort_values(by = ['data_set', 'gt', 'tool', 'mode', 'refinement', 'threshold'])
)

*Column descriptions:*   
`precision`: Quantifies the extent to which amplicons in a cluster are also from the same species.   
`recall`: Measures the proportion of amplicons from the same species that are grouped in the same cluster.   
`adjrandindex`: Measures the agreement between the clusters and the taxonomic assignment and corrects for chance.   

In [7]:
# Show the quality metrics without the helper columns.
shown_quality_cols = ['data_set', 'gt', 'tool', 'refinement', 'threshold', 'precision', 'recall', 'adjrandindex']
df_quality.loc[:, shown_quality_cols]

Unnamed: 0,data_set,gt,tool,refinement,threshold,precision,recall,adjrandindex
549,HC_10_V3-V4,franzen,swarm,2f,1.0,0.99698,0.058983,0.003869
539,HC_10_V3-V4,franzen,swarm,nf,1.0,0.998993,0.054454,0.001077
540,HC_10_V3-V4,franzen,swarm,nf,2.0,0.995571,0.075893,0.00988
541,HC_10_V3-V4,franzen,swarm,nf,3.0,0.982788,0.1385,0.047334
542,HC_10_V3-V4,franzen,swarm,nf,4.0,0.963463,0.248012,0.134872
543,HC_10_V3-V4,franzen,swarm,nf,5.0,0.924811,0.41238,0.275227
544,HC_10_V3-V4,franzen,swarm,nf,6.0,0.880624,0.589834,0.421022
545,HC_10_V3-V4,franzen,swarm,nf,7.0,0.825767,0.740312,0.502953
546,HC_10_V3-V4,franzen,swarm,nf,8.0,0.77685,0.85546,0.508718
547,HC_10_V3-V4,franzen,swarm,nf,9.0,0.743634,0.92622,0.515905


In [8]:
# Store the quality table. To reuse a stored table instead of recomputing it,
# skip the cell computing df_quality and load it with the commented line below.
quality_csv = '%s/df_quality.csv' % eval_dir
df_quality.to_csv(quality_csv, sep = ';', index = False)
#df_quality = pd.read_csv(quality_csv, sep = ';')

Combine counting and quality information:

In [9]:
# Attach the counting information to the quality metrics of the same clustering run.
# The frames are merged on the actual key columns instead of a concatenated string key:
# a string key without separators can be ambiguous and depends on how the (float)
# threshold is rendered as text, so that equal thresholds of different dtype would not match.
# Note: the result gets a fresh RangeIndex (rows keep the order of df_quality).
key_cols = ['data_set', 'tool', 'mode', 'refinement', 'threshold']
df_c, df_q = df_counts.copy(), df_quality.copy()
# Columns shared by both frames that are not part of the key (e.g. 'ds' and 'rt')
# are suffixed with '_counts' on the counts side and dropped as duplicates.
drop_cols = ['%s_counts' % c for c in df_c.columns if c in set(df_q.columns) and c not in key_cols]
df_joined = df_q.merge(df_c, how = 'left', on = key_cols, suffixes = ('', '_counts')).drop(drop_cols, axis = 1)

In [10]:
# Store the joined table. To reuse a stored table instead of recomputing it,
# skip the cell computing df_joined and load it with the commented line below.
joined_csv = '%s/df_joined.csv' % eval_dir
df_joined.to_csv(joined_csv, sep = ';', index = False)
#df_joined = pd.read_csv(joined_csv, sep = ';')

Determine the maximum, average and N-best average clustering quality (for N = 5).

In [11]:
# Summarise the runs of each (data set, ground truth, tool, mode, refinement) group in three ways:
#  - max:   the row with the highest adjusted Rand index,
#  - mean:  the column-wise average over all rows,
#  - nbest: the column-wise average over the n rows with the highest adjusted Rand index.
df_columns = ['data_set', 'gt', 'tool', 'mode', 'refinement', 'precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass', 'ds', 'rt']

group_cols = ['data_set', 'gt', 'tool', 'mode', 'refinement', 'ds', 'rt']
value_cols = ['precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass']
n = 5

max_rows, mean_rows, nbest_rows = [], [], []

for (d, g, t, m, f, ds, rt), grp in df_joined.groupby(by = group_cols):
    prefix, suffix = [d, g, t, m, f], [ds, rt]
    best = grp.nlargest(1, 'adjrandindex')
    nbest = grp.nlargest(n, 'adjrandindex')

    max_rows.append(prefix + [best[c].values[0] for c in value_cols] + suffix)
    mean_rows.append(prefix + [grp[c].mean() for c in value_cols] + suffix)
    nbest_rows.append(prefix + [nbest[c].mean() for c in value_cols] + suffix)

df_joined_max, df_joined_mean, df_joined_nbest = (
    pd.DataFrame(summary_rows, columns = df_columns) for summary_rows in (max_rows, mean_rows, nbest_rows)
)

In [12]:
# Store the three summary tables (one CSV file per table, named after the variable).
for table_name, table in (('df_joined_max', df_joined_max), ('df_joined_mean', df_joined_mean), ('df_joined_nbest', df_joined_nbest)):
    table.to_csv('%s/%s.csv' % (eval_dir, table_name), sep = ';', index = False)
# Alternative: load previously stored tables instead of recomputing them.
#df_joined_max = pd.read_csv('%s/df_joined_max.csv' % eval_dir, sep = ';')
#df_joined_mean = pd.read_csv('%s/df_joined_mean.csv' % eval_dir, sep = ';')
#df_joined_nbest = pd.read_csv('%s/df_joined_nbest.csv' % eval_dir, sep = ';')

In [13]:
# Keep only the rows evaluated against the 'franzen' ground truth.
df_max = df_joined_max[df_joined_max['gt'].eq('franzen')]
df_mean = df_joined_mean[df_joined_mean['gt'].eq('franzen')]
df_nbest = df_joined_nbest[df_joined_nbest['gt'].eq('franzen')]

For the chosen ground truth, average the maximum, average and N-best average values per complexity (e.g. LC) and read type (e.g. V3-V4). 

In [14]:
df_columns = ['data_set', 'gt', 'tool', 'mode', 'refinement', 'precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass', 'ds', 'rt']

def average_complexity(df):
    """Average the metric and count columns over all samples of the same complexity and read type.

    The rows of `df` are grouped by ground truth, complexity ('ds'), read type ('rt'), tool,
    mode and refinement. Each group is reduced to a single row containing the column-wise means;
    its 'data_set' entry is '<ds>_<rt>'.

    The column layout is built inside the function rather than read from the global
    `df_columns`, because that name is rebound by other cells and out-of-order
    execution must not change the result.

    Parameters:
        df: data frame providing the group columns and the averaged columns listed below.

    Returns:
        A new data frame with one row per group (empty if `df` has no rows).
    """
    id_cols = ['data_set', 'gt', 'tool', 'mode', 'refinement']
    value_cols = ['precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass']
    columns = id_cols + value_cols + ['ds', 'rt']

    rows = []
    for (gt, ds, rt, tool, mode, f), grp in df.groupby(by = ['gt', 'ds', 'rt', 'tool', 'mode', 'refinement']):
        rows.append(['%s_%s' % (ds, rt), gt, tool, mode, f] + [grp[c].mean() for c in value_cols] + [ds, rt])
    return pd.DataFrame(rows, columns = columns)

In [15]:
# Average each summary table per complexity and read type.
df_joined_max_avg, df_joined_mean_avg, df_joined_nbest_avg = (
    average_complexity(summary) for summary in (df_max, df_mean, df_nbest)
)

In [16]:
# Store the averaged summary tables (one CSV file per table, named after the variable).
for table_name, table in (('df_joined_max_avg', df_joined_max_avg), ('df_joined_mean_avg', df_joined_mean_avg), ('df_joined_nbest_avg', df_joined_nbest_avg)):
    table.to_csv('%s/%s.csv' % (eval_dir, table_name), sep = ';', index = False)
# Alternative: load previously stored tables instead of recomputing them.
#df_joined_max_avg = pd.read_csv('%s/df_joined_max_avg.csv' % eval_dir, sep = ';')
#df_joined_mean_avg = pd.read_csv('%s/df_joined_mean_avg.csv' % eval_dir, sep = ';')
#df_joined_nbest_avg = pd.read_csv('%s/df_joined_nbest_avg.csv' % eval_dir, sep = ';')

**Maximum clustering quality**

Rank by adjusted Rand index (per data set):

In [17]:
# One table per (data set, tool), best adjusted Rand index first.
for key, grp in df_joined_max_avg.groupby(by = ['data_set', 'tool']):
    print('Data set: %s / Tool: %s' % key)
    ranked = grp.sort_values(by = 'adjrandindex', ascending = False)
    display(ranked)

Data set: HC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
1,HC_V3-V4,franzen,swarm,swarm,nf,0.739601,0.925437,0.522373,9958.0,9958.0,1098.7,9952.4,9958.0,HC,V3-V4
0,HC_V3-V4,franzen,swarm,swarm,2f,0.996222,0.059555,0.004003,9958.0,9958.0,9840.7,9952.4,9958.0,HC,V3-V4


Data set: HC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
2,HC_V4,franzen,swarm,swarm,2f,0.825773,0.760851,0.515218,9958.0,9958.0,2334.0,8208.7,9958.0,HC,V4
3,HC_V4,franzen,swarm,swarm,nf,0.708053,0.876749,0.470145,9958.0,9958.0,1325.6,8208.7,9958.0,HC,V4


Data set: LC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
5,LC_V3-V4,franzen,swarm,swarm,nf,0.874385,0.949943,0.799444,1988.0,1988.0,185.5,1987.1,1988.0,LC,V3-V4
4,LC_V3-V4,franzen,swarm,swarm,2f,0.999196,0.055759,0.002253,1988.0,1988.0,1975.0,1987.1,1988.0,LC,V3-V4


Data set: LC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
7,LC_V4,franzen,swarm,swarm,nf,0.868492,0.991508,0.820684,1988.0,1988.0,99.7,1659.5,1988.0,LC,V4
6,LC_V4,franzen,swarm,swarm,2f,0.942811,0.7577,0.671789,1988.0,1988.0,493.2,1659.5,1988.0,LC,V4


Data set: MC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
9,MC_V3-V4,franzen,swarm,swarm,nf,0.807498,0.957864,0.695459,4966.0,4966.0,406.9,4962.6,4966.0,MC,V3-V4
8,MC_V3-V4,franzen,swarm,swarm,2f,0.998791,0.057249,0.00283,4966.0,4966.0,4924.1,4962.6,4966.0,MC,V3-V4


Data set: MC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
11,MC_V4,franzen,swarm,swarm,nf,0.781186,0.965133,0.679616,4966.0,4966.0,331.2,4101.5,4966.0,MC,V4
10,MC_V4,franzen,swarm,swarm,2f,0.892488,0.762384,0.620727,4966.0,4966.0,1184.4,4101.5,4966.0,MC,V4


Average maximum values over all data sets and sort by adjusted Rand index:

In [18]:
# Mean precision, recall and adjusted Rand index per (tool, mode, refinement) over all data sets,
# best adjusted Rand index first.
metric_cols = ['precision', 'recall', 'adjrandindex']
(
    df_joined_max_avg
    .groupby(by = ['tool', 'mode', 'refinement'], as_index = False)[metric_cols]
    .mean()
    .sort_values(by = 'adjrandindex', ascending = False)
)

Unnamed: 0,tool,mode,refinement,precision,recall,adjrandindex
1,swarm,swarm,nf,0.796536,0.944439,0.66462
0,swarm,swarm,2f,0.942547,0.408916,0.302803


**Average clustering quality**

Rank by adjusted Rand index (per data set):

In [19]:
# One table per (data set, tool), best adjusted Rand index first.
for key, grp in df_joined_mean_avg.groupby(by = ['data_set', 'tool']):
    print('Data set: %s / Tool: %s' % key)
    ranked = grp.sort_values(by = 'adjrandindex', ascending = False)
    display(ranked)

Data set: HC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
1,HC_V3-V4,franzen,swarm,swarm,nf,0.875615,0.502917,0.288483,9958.0,9958.0,5349.75,9952.4,9958.0,HC,V3-V4
0,HC_V3-V4,franzen,swarm,swarm,2f,0.996222,0.059555,0.004003,9958.0,9958.0,9840.7,9952.4,9958.0,HC,V3-V4


Data set: HC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
2,HC_V4,franzen,swarm,swarm,2f,0.825773,0.760851,0.515218,9958.0,9958.0,2334.0,8208.7,9958.0,HC,V4
3,HC_V4,franzen,swarm,swarm,nf,0.593928,0.908861,0.373045,9958.0,9958.0,1014.2,8208.7,9958.0,HC,V4


Data set: LC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
5,LC_V3-V4,franzen,swarm,swarm,nf,0.950822,0.489486,0.387664,1988.0,1988.0,1106.33,1987.1,1988.0,LC,V3-V4
4,LC_V3-V4,franzen,swarm,swarm,2f,0.999196,0.055759,0.002253,1988.0,1988.0,1975.0,1987.1,1988.0,LC,V3-V4


Data set: LC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
7,LC_V4,franzen,swarm,swarm,nf,0.86653,0.910635,0.734009,1988.0,1988.0,232.75,1659.5,1988.0,LC,V4
6,LC_V4,franzen,swarm,swarm,2f,0.942811,0.7577,0.671789,1988.0,1988.0,493.2,1659.5,1988.0,LC,V4


Data set: MC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
9,MC_V3-V4,franzen,swarm,swarm,nf,0.918595,0.498619,0.357547,4966.0,4966.0,2705.35,4962.6,4966.0,MC,V3-V4
8,MC_V3-V4,franzen,swarm,swarm,2f,0.998791,0.057249,0.00283,4966.0,4966.0,4924.1,4962.6,4966.0,MC,V3-V4


Data set: MC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
10,MC_V4,franzen,swarm,swarm,2f,0.892488,0.762384,0.620727,4966.0,4966.0,1184.4,4101.5,4966.0,MC,V4
11,MC_V4,franzen,swarm,swarm,nf,0.747419,0.910547,0.583258,4966.0,4966.0,544.08,4101.5,4966.0,MC,V4


Average mean values over all data sets and sort by adjusted Rand index:

In [20]:
# Mean precision, recall and adjusted Rand index per (tool, mode, refinement) over all data sets,
# best adjusted Rand index first.
metric_cols = ['precision', 'recall', 'adjrandindex']
(
    df_joined_mean_avg
    .groupby(by = ['tool', 'mode', 'refinement'], as_index = False)[metric_cols]
    .mean()
    .sort_values(by = 'adjrandindex', ascending = False)
)

Unnamed: 0,tool,mode,refinement,precision,recall,adjrandindex
1,swarm,swarm,nf,0.825485,0.703511,0.454001
0,swarm,swarm,2f,0.942547,0.408916,0.302803


**N-best average clustering quality**

Rank by adjusted Rand index (per data set):

In [21]:
# One table per (data set, tool), best adjusted Rand index first.
for key, grp in df_joined_nbest_avg.groupby(by = ['data_set', 'tool']):
    print('Data set: %s / Tool: %s' % key)
    ranked = grp.sort_values(by = 'adjrandindex', ascending = False)
    display(ranked)

Data set: HC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
1,HC_V3-V4,franzen,swarm,swarm,nf,0.782445,0.814887,0.48181,9958.0,9958.0,2207.44,9952.4,9958.0,HC,V3-V4
0,HC_V3-V4,franzen,swarm,swarm,2f,0.996222,0.059555,0.004003,9958.0,9958.0,9840.7,9952.4,9958.0,HC,V3-V4


Data set: HC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
2,HC_V4,franzen,swarm,swarm,2f,0.825773,0.760851,0.515218,9958.0,9958.0,2334.0,8208.7,9958.0,HC,V4
3,HC_V4,franzen,swarm,swarm,nf,0.639093,0.928892,0.439021,9958.0,9958.0,873.68,8208.7,9958.0,HC,V4


Data set: LC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
5,LC_V3-V4,franzen,swarm,swarm,nf,0.911315,0.801278,0.681821,1988.0,1988.0,483.16,1987.1,1988.0,LC,V3-V4
4,LC_V3-V4,franzen,swarm,swarm,2f,0.999196,0.055759,0.002253,1988.0,1988.0,1975.0,1987.1,1988.0,LC,V3-V4


Data set: LC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
7,LC_V4,franzen,swarm,swarm,nf,0.856714,0.988027,0.806465,1988.0,1988.0,104.08,1659.5,1988.0,LC,V4
6,LC_V4,franzen,swarm,swarm,2f,0.942811,0.7577,0.671789,1988.0,1988.0,493.2,1659.5,1988.0,LC,V4


Data set: MC_V3-V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
9,MC_V3-V4,franzen,swarm,swarm,nf,0.855711,0.810055,0.615375,4966.0,4966.0,1145.98,4962.6,4966.0,MC,V3-V4
8,MC_V3-V4,franzen,swarm,swarm,2f,0.998791,0.057249,0.00283,4966.0,4966.0,4924.1,4962.6,4966.0,MC,V3-V4


Data set: MC_V4 / Tool: swarm


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
11,MC_V4,franzen,swarm,swarm,nf,0.759424,0.963268,0.65049,4966.0,4966.0,333.26,4101.5,4966.0,MC,V4
10,MC_V4,franzen,swarm,swarm,2f,0.892488,0.762384,0.620727,4966.0,4966.0,1184.4,4101.5,4966.0,MC,V4


Average N-best average values over all data sets and sort by adjusted Rand index:

In [22]:
# Mean precision, recall and adjusted Rand index per (tool, mode, refinement) over all data sets,
# best adjusted Rand index first.
metric_cols = ['precision', 'recall', 'adjrandindex']
(
    df_joined_nbest_avg
    .groupby(by = ['tool', 'mode', 'refinement'], as_index = False)[metric_cols]
    .mean()
    .sort_values(by = 'adjrandindex', ascending = False)
)

Unnamed: 0,tool,mode,refinement,precision,recall,adjrandindex
1,swarm,swarm,nf,0.800784,0.884401,0.612497
0,swarm,swarm,2f,0.942547,0.408916,0.302803
