In [1]:
# `display` and `HTML` are part of the public `IPython.display` API;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Widen the notebook container so that the wide result tables fit on screen.
display(HTML("<style>.container { width:90% !important; }</style>"))


import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import os
import pandas as pd
import seaborn as sb
import statistics as st

from Bio import SeqIO

# Activate seaborn's default plot styling.
sb.set()
# Remove the row limit so that displayed tables are shown in full.
pd.set_option("display.max_rows", None)

# Evaluation of USEARCH and VSEARCH on Franzén data

The following notebook describes the steps and results of the evaluation.

In [None]:
# Initial files and directories:
#
# uvsearch_franzen
# |- data # will contain the in-silico sequenced data sets
# |
# |- evaluation # will contain the evaluation plots and tables
# |
# |- outputs # will contain the cluster and metric outputs
# |
# |- tasks  # task files for the different runs of U/VSEARCH
# |
# \- 40168_2015_105_MOESM9_ESM.csv  # CSV version of Additional file 9 of Franzén et al. (see below)

## Analysis workflow

The data sets are prepared as described in Franzén et al., *Improved OTU-picking using long-read 16S rRNA gene amplicon sequencing and generic hierarchical clustering* (https://doi.org/10.1186/s40168-015-0105-6),
except that there are no Ns between the forward and reverse portions of the reads.

The taxonomic assignment is obtained from the known sources during the creation of the mock communities (simulated sequencing).

Both tools are run with several clustering options:
 - USEARCH: `-cluster_fast`, `-cluster_smallmem`
 - VSEARCH: `--cluster_fast`, `--cluster_size`, `--cluster_smallmem`

## Commands

The following commands prepare and cluster the data sets. The results are evaluated below.

In order to execute the workflow as provided here, the `tools` subdirectory of the overall repository has to contain the binaries of Infernal (cmbuild, cmalign) and ART (art_illumina),
a VSEARCH binary and the USEARCH binary `usearch11.0.667_i86linux32`, but the paths can be adjusted. 

IMPORTANT: The commands are not intended to be executed from this notebook. They should be executed from the root directory of the overall repository.

In [None]:
%%bash
# Prepares the reference data, simulates the mock communities and clusters every
# data set with all configured USEARCH / VSEARCH runs.
# All expansions are quoted so that paths containing spaces or glob characters
# are passed through unchanged.

TOOLS_DIR=tools
ANALYSIS_DIR=analyses/uvsearch_franzen
DATA_DIR="${ANALYSIS_DIR}/data"
OUTPUT_DIR="${ANALYSIS_DIR}/outputs"

USEARCH="${TOOLS_DIR}/usearch11.0.667_i86linux32" # adjust to your system
VSEARCH="${TOOLS_DIR}/vsearch-2.14.2-linux-x86_64/bin/vsearch" # adjust to your system

RUNS=( usearch_fast_length usearch_fast_size usearch_smallmem_length usearch_smallmem_size vsearch_fast vsearch_size vsearch_smallmem_length vsearch_smallmem_size )

# prepare tools and reference data
CMBUILD_PATH="${TOOLS_DIR}/infernal-1.1.2-linux-intel-gcc/binaries/cmbuild" # adjust to your system
python -m scripts.analyses.analysis_franzen prepare "${DATA_DIR}" --cmbuild "${CMBUILD_PATH}"

# create list files of mock communities
python -m scripts.analyses.analysis_franzen lists "${ANALYSIS_DIR}/40168_2015_105_MOESM9_ESM.csv" "${DATA_DIR}/list_files"

# create mock communities
GG_DB="${DATA_DIR}/gg_13_5.fasta"
MOCK_LIST="${DATA_DIR}/list_files/list.txt"
MSA_MODEL="${DATA_DIR}/bacteria16S_508_mod5.cmfile"
CMALIGN_PATH="${TOOLS_DIR}/infernal-1.1.2-linux-intel-gcc/binaries/cmalign" # adjust to your system
ART_PATH="${TOOLS_DIR}/art_bin_VanillaIceCream/art_illumina" # adjust to your system
python -m scripts.analyses.analysis_franzen mock "${GG_DB}" "${MOCK_LIST}" "${MSA_MODEL}" "${DATA_DIR}" --cmalign "${CMALIGN_PATH}" --art "${ART_PATH}" --miseq --skip_ambiguous --num_n 0

# run U/VSEARCH and determine clustering quality
DATA=( "LC" "MC" "HC" )
REGIONS=( "V3-V4" "V4" )
for D in "${DATA[@]}"; do
  for ((I=1; I<=10; I++)); do
    for R in "${REGIONS[@]}"; do
      SAMPLE="${D}_${I}_${R}"
      echo "${SAMPLE}"
      READS="${SAMPLE}:${DATA_DIR}/${D}_${I}/${SAMPLE}_miseq_rs.fastq"
      TAX="franzen:${DATA_DIR}/${D}_${I}/${SAMPLE}_miseq_rs.tax"
      for RUN in "${RUNS[@]}"; do
        RUN_DIR="${OUTPUT_DIR}/${SAMPLE}/${RUN}"
        python -m scripts.analyses.analysis_franzen run_uvsearch "${RUN}" "${READS}" "${ANALYSIS_DIR}/tasks/${RUN}.txt" "${RUN_DIR}" --tax_files "${TAX}" --usearch "${USEARCH}" --vsearch "${VSEARCH}"
        # Prefix the metrics files with the run name.
        for F in "${RUN_DIR}"/*__metrics.csv; do
          # An unmatched glob stays a literal pattern; skip it instead of calling mv on it.
          [ -e "${F}" ] || continue
          NAME="${F##*/}"
          # Files renamed by an earlier execution already carry the prefix; do not prefix them twice.
          case "${NAME}" in
            "${RUN}_"*) continue ;;
          esac
          mv "${F}" "${RUN_DIR}/${RUN}_${NAME}"
        done
      done
    done
  done
done

## Evaluation

**Configuration**

In [2]:
# Evaluated data sets: complexities, samples per complexity and read types.
data_sets = ['LC', 'MC', 'HC']
num_samples = 10
regions = ['V3-V4', 'V4']
ground_truths = ['franzen']

# Clustering runs; each name matches a task file and an output subdirectory.
opts = [
    'usearch_fast_length',
    'usearch_fast_size',
    'usearch_smallmem_length',
    'usearch_smallmem_size',
    'vsearch_fast',
    'vsearch_size',
    'vsearch_smallmem_length',
    'vsearch_smallmem_size',
]

# Directories (relative to the notebook) for inputs, cluster outputs and evaluation results.
data_dir, results_dir, eval_dir = 'data', 'outputs', 'evaluation'

### Number of clusters and amplicons

Reads the input files and the cluster outputs for all data sets and compares the number of clusters and amplicons.

In [3]:
# Requires the input and OTU files. Alternatively, the evaluation can use the stored information (see below).
df_columns = ['data_set', 'tool', 'mode', 'refinement', 'threshold', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass', 'ds', 'rt']

otu_suffix = '_otus.txt'
rows = []

for ds in data_sets:
    for rt in regions:
        for s in range(1, num_samples + 1):
            run_name = '%s_%i_%s' % (ds, s, rt)

            # Count the input entries and sum their abundances. The abundance is the
            # part of the record id after the last '_'; ids without '_' count as 1.
            seq_file = '%s/%s_%i/%s_miseq_rs.fastq' % (data_dir, ds, s, run_name) # the input sequences
            num_input_amplicons = 0
            input_mass = 0
            with open(seq_file, 'r') as in_file:
                for record in SeqIO.parse(in_file, 'fastq'):
                    num_input_amplicons += 1
                    input_mass += int(record.id.split('_')[-1]) if ('_' in record.id) else 1

            for opt in opts:
                opt_dir = '%s/%s/%s' % (results_dir, run_name, opt)

                # The OTU file names are parsed as <tool>__<mode>_<threshold>_otus.txt.
                # Taking the threshold as the last '_'-separated field of the stem also
                # works for thresholds that do not start with '0' (e.g. 1.0).
                otu_files = []
                for f in os.listdir(opt_dir):
                    if not f.endswith(otu_suffix):
                        continue
                    tool, mode_and_threshold = f[:-len(otu_suffix)].split('__')
                    mode, threshold = mode_and_threshold.rsplit('_', 1)
                    otu_files.append((tool, mode, float(threshold), f))

                # os.listdir() returns entries in arbitrary order; sort by tool, mode and
                # numeric threshold so that the table is reproducible.
                for tool, mode, threshold, f in sorted(otu_files):
                    otu_file = '%s/%s' % (opt_dir, f)

                    num_output_amplicons = 0
                    num_clusters = 0
                    output_mass = 0
                    with open(otu_file, 'r') as in_file:
                        for line in in_file:
                            members = line.split() # one cluster per line, whitespace-separated members
                            if not members:
                                continue # a blank line is not a cluster
                            num_output_amplicons += len(members)
                            num_clusters += 1
                            output_mass += sum(int(m.split('_')[-1]) for m in members)

                    refinement = 'nf'

                    rows.append([run_name, tool, mode, refinement, threshold, num_input_amplicons, input_mass, num_clusters, num_output_amplicons, output_mass, ds, rt])

df_counts = pd.DataFrame(rows, columns = df_columns)

*Column descriptions:*   
`num_input_amplicons`: The number of entries in the corresponding input file.   
`input_mass`: The sum of the abundances of all entries in the input file.   
`num_clusters`: The number of computed clusters.   
`num_output_amplicons`: The number of amplicons contained in the clusters.   
`output_mass`: The sum of the abundances of all amplicons contained in the clusters.   

In [4]:
# Columns of the count table to display (the helper columns 'refinement', 'ds' and 'rt' are omitted).
count_columns = [
    'data_set', 'tool', 'mode', 'threshold',
    'num_input_amplicons', 'input_mass',
    'num_clusters', 'num_output_amplicons', 'output_mass',
]
df_counts[count_columns]

Unnamed: 0,data_set,tool,mode,threshold,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass
0,LC_1_V3-V4,usearch,fast_length,0.9,1980,1980,62,1979,1980
1,LC_1_V3-V4,usearch,fast_length,0.91,1980,1980,68,1979,1980
2,LC_1_V3-V4,usearch,fast_length,0.92,1980,1980,74,1979,1980
3,LC_1_V3-V4,usearch,fast_length,0.93,1980,1980,76,1979,1980
4,LC_1_V3-V4,usearch,fast_length,0.94,1980,1980,79,1979,1980
5,LC_1_V3-V4,usearch,fast_length,0.95,1980,1980,85,1979,1980
6,LC_1_V3-V4,usearch,fast_length,0.96,1980,1980,95,1979,1980
7,LC_1_V3-V4,usearch,fast_length,0.97,1980,1980,143,1979,1980
8,LC_1_V3-V4,usearch,fast_length,0.98,1980,1980,563,1979,1980
9,LC_1_V3-V4,usearch,fast_length,0.99,1980,1980,1667,1979,1980


In [5]:
# Store the counts; uncomment the last line to load the stored table instead of recomputing it.
counts_csv = '%s/df_counts.csv' % eval_dir
df_counts.to_csv(counts_csv, sep = ';', index = False)
#df_counts = pd.read_csv('%s/df_counts.csv' % eval_dir, sep = ';')

### Clustering quality

In [6]:
# Requires the metrics files. Alternatively, the evaluation can use the stored information (see below).

# One entry per sample, in the order complexity -> read type -> sample number.
runs = [(ds, rt, '%s_%i_%s' % (ds, s, rt)) for ds in data_sets for rt in regions for s in range(1, num_samples + 1)]

metric_frames = []
for ds, rt, run_name in runs:
    for opt in opts:
        for gt in ground_truths:
            metrics_file = '%s/%s/%s/%s_%s_%s__metrics.csv' % (results_dir, run_name, opt, opt, run_name, gt)
            metrics = pd.read_csv(metrics_file, sep = ';')

            # Annotate every row with the ground truth, the mode (the part of the
            # task name after the last '__') and the data set descriptors.
            metrics['gt'] = gt
            metrics['mode'] = [task.split('__')[-1] for task in metrics['task']]
            metrics['refinement'] = 'nf'
            metrics['ds'] = ds
            metrics['rt'] = rt

            metric_frames.append(metrics)

df_quality = pd.concat(metric_frames, ignore_index = True).rename(columns = {'task': 'run', 'reads': 'data_set'})

*Column descriptions:*   
`precision`: Quantifies the extent to which amplicons in a cluster are also from the same species.   
`recall`: Measures the proportion of amplicons from the same species that are grouped in the same cluster.   
`adjrandindex`: Measures the agreement between the clusters and the taxonomic assignment and corrects for chance.   

In [7]:
# Columns of the quality table to display.
quality_columns = [
    'data_set', 'gt', 'tool', 'mode', 'threshold',
    'precision', 'recall', 'adjrandindex',
]
df_quality[quality_columns]

Unnamed: 0,data_set,gt,tool,mode,threshold,precision,recall,adjrandindex
0,LC_1_V3-V4,franzen,usearch,fast_length,0.9,0.621021,0.994442,0.620083
1,LC_1_V3-V4,franzen,usearch,fast_length,0.91,0.668519,0.990904,0.644716
2,LC_1_V3-V4,franzen,usearch,fast_length,0.92,0.730167,0.980798,0.692766
3,LC_1_V3-V4,franzen,usearch,fast_length,0.93,0.764528,0.995958,0.726177
4,LC_1_V3-V4,franzen,usearch,fast_length,0.94,0.787772,0.996968,0.740274
5,LC_1_V3-V4,franzen,usearch,fast_length,0.95,0.829207,0.979282,0.7779
6,LC_1_V3-V4,franzen,usearch,fast_length,0.96,0.879737,0.966145,0.847008
7,LC_1_V3-V4,franzen,usearch,fast_length,0.97,0.936331,0.934816,0.882608
8,LC_1_V3-V4,franzen,usearch,fast_length,0.98,0.971198,0.671551,0.649214
9,LC_1_V3-V4,franzen,usearch,fast_length,0.99,0.997979,0.182921,0.080414


In [8]:
# Store the quality metrics; uncomment the last line to load the stored table instead of recomputing it.
quality_csv = '%s/df_quality.csv' % eval_dir
df_quality.to_csv(quality_csv, sep = ';', index = False)
#df_quality = pd.read_csv('%s/df_quality.csv' % eval_dir, sep = ';')

Combine counting and quality information:

In [9]:
# Attach the count information to every quality row. The frames are matched
# column-wise on the key columns instead of on a concatenated string key, which
# would be ambiguous without a separator between its parts.
join_keys = ['data_set', 'tool', 'mode', 'refinement', 'threshold']
df_c, df_q = df_counts.copy(), df_quality.copy()
# Columns present in both frames that are not keys (e.g. 'ds', 'rt') are kept
# from the quality frame only, so no suffixed duplicates appear in the result.
shared_cols = [c for c in df_c.columns if c in df_q.columns and c not in join_keys]
df_joined = df_q.merge(df_c.drop(columns = shared_cols), on = join_keys, how = 'left')

In [10]:
# Store the joined table; uncomment the last line to load the stored table instead of recomputing it.
joined_csv = '%s/df_joined.csv' % eval_dir
df_joined.to_csv(joined_csv, sep = ';', index = False)
#df_joined = pd.read_csv('%s/df_joined.csv' % eval_dir, sep = ';')

Determine the maximum, average and N-best average clustering quality (for N = 5).

In [11]:
df_columns = ['data_set', 'gt', 'tool', 'mode', 'refinement', 'precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass', 'ds', 'rt']

# Grouping keys and the summarised value columns (in output order).
key_cols = ['data_set', 'gt', 'tool', 'mode', 'refinement', 'ds', 'rt']
value_cols = ['precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass']

n = 5
max_rows, mean_rows, nbest_rows = [], [], []

for (d, g, t, m, f, ds, rt), grp in df_joined.groupby(by = key_cols):
    head, tail = [d, g, t, m, f], [ds, rt]

    # Row with the highest adjusted Rand index, and the n rows with the highest values.
    best = grp.nlargest(1, 'adjrandindex')
    nbest = grp.nlargest(n, 'adjrandindex')

    max_rows.append(head + [best[col].values[0] for col in value_cols] + tail)
    mean_rows.append(head + [grp[col].mean() for col in value_cols] + tail)
    nbest_rows.append(head + [nbest[col].mean() for col in value_cols] + tail)

df_joined_max = pd.DataFrame(max_rows, columns = df_columns)
df_joined_mean = pd.DataFrame(mean_rows, columns = df_columns)
df_joined_nbest = pd.DataFrame(nbest_rows, columns = df_columns)

In [12]:
# Store the three summaries; uncomment the lines below to load the stored tables instead of recomputing them.
summary_targets = [
    (df_joined_max, '%s/df_joined_max.csv' % eval_dir),
    (df_joined_mean, '%s/df_joined_mean.csv' % eval_dir),
    (df_joined_nbest, '%s/df_joined_nbest.csv' % eval_dir),
]
for summary, csv_path in summary_targets:
    summary.to_csv(csv_path, sep = ';', index = False)
#df_joined_max = pd.read_csv('%s/df_joined_max.csv' % eval_dir, sep = ';')
#df_joined_mean = pd.read_csv('%s/df_joined_mean.csv' % eval_dir, sep = ';')
#df_joined_nbest = pd.read_csv('%s/df_joined_nbest.csv' % eval_dir, sep = ';')

In [13]:
# Keep only the rows evaluated against the 'franzen' ground truth.
df_max = df_joined_max[df_joined_max['gt'] == 'franzen']
df_mean = df_joined_mean[df_joined_mean['gt'] == 'franzen']
df_nbest = df_joined_nbest[df_joined_nbest['gt'] == 'franzen']

For the chosen ground truth, average the maximum, average and N-best average values per complexity (e.g. LC) and read type (e.g. V3-V4).   

In [14]:
df_columns = ['data_set', 'gt', 'tool', 'mode', 'refinement', 'precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass', 'ds', 'rt']

def average_complexity(df):
    """Average the value columns over all samples of a complexity / read type.

    The rows of `df` are grouped by ground truth, complexity (`ds`), read type
    (`rt`), tool, mode and refinement, and every value column is replaced by its
    group mean.

    Parameters:
        df: DataFrame containing the columns 'gt', 'ds', 'rt', 'tool', 'mode',
            'refinement' and the value columns listed in `value_cols`.

    Returns:
        DataFrame with one row per group, whose 'data_set' column is '<ds>_<rt>'.
    """
    # The column lists are local so that the function does not depend on the
    # global `df_columns`, which other cells rebind to a different list.
    value_cols = ['precision', 'recall', 'adjrandindex', 'num_input_amplicons', 'input_mass', 'num_clusters', 'num_output_amplicons', 'output_mass']
    out_cols = ['data_set', 'gt', 'tool', 'mode', 'refinement'] + value_cols + ['ds', 'rt']

    rows = []
    for (gt, ds, rt, tool, mode, f), grp in df.groupby(by = ['gt', 'ds', 'rt', 'tool', 'mode', 'refinement']):
        rows.append(['%s_%s' % (ds, rt), gt, tool, mode, f] + [grp[col].mean() for col in value_cols] + [ds, rt])
    return pd.DataFrame(rows, columns = out_cols)

In [15]:
# Average each of the three summaries per complexity and read type.
df_joined_max_avg, df_joined_mean_avg, df_joined_nbest_avg = [
    average_complexity(summary) for summary in (df_max, df_mean, df_nbest)
]

In [16]:
# Store the averaged summaries; uncomment the lines below to load the stored tables instead of recomputing them.
averaged_targets = [
    (df_joined_max_avg, '%s/df_joined_max_avg.csv' % eval_dir),
    (df_joined_mean_avg, '%s/df_joined_mean_avg.csv' % eval_dir),
    (df_joined_nbest_avg, '%s/df_joined_nbest_avg.csv' % eval_dir),
]
for averaged, csv_path in averaged_targets:
    averaged.to_csv(csv_path, sep = ';', index = False)
#df_joined_max_avg = pd.read_csv('%s/df_joined_max_avg.csv' % eval_dir, sep = ';')
#df_joined_mean_avg = pd.read_csv('%s/df_joined_mean_avg.csv' % eval_dir, sep = ';')
#df_joined_nbest_avg = pd.read_csv('%s/df_joined_nbest_avg.csv' % eval_dir, sep = ';')

**Maximum clustering quality**

Rank by adjusted Rand index (per data set):

In [17]:
# One table per data set and tool, best adjusted Rand index first.
for (set_name, tool_name), options in df_joined_max_avg.groupby(by = ['data_set', 'tool']):
    ranked = options.sort_values(by = 'adjrandindex', ascending = False)
    print('Data set: %s / Tool: %s' % (set_name, tool_name))
    display(ranked)

Data set: HC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
0,HC_V3-V4,franzen,usearch,fast_length,nf,0.791259,0.890521,0.634581,9958.0,9958.0,706.1,9952.4,9958.0,HC,V3-V4
2,HC_V3-V4,franzen,usearch,smallmem_length,nf,0.789708,0.897318,0.623739,9958.0,9958.0,669.3,9952.4,9958.0,HC,V3-V4
3,HC_V3-V4,franzen,usearch,smallmem_size,nf,0.789708,0.897318,0.623739,9958.0,9958.0,669.3,9952.4,9958.0,HC,V3-V4
1,HC_V3-V4,franzen,usearch,fast_size,nf,0.789497,0.889525,0.618244,9958.0,9958.0,712.0,9952.4,9958.0,HC,V3-V4


Data set: HC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
4,HC_V3-V4,franzen,vsearch,fast,nf,0.811487,0.924978,0.655363,9958.0,9958.0,668.3,9952.4,9958.0,HC,V3-V4
5,HC_V3-V4,franzen,vsearch,size,nf,0.811477,0.924978,0.655353,9958.0,9958.0,668.2,9952.4,9958.0,HC,V3-V4
6,HC_V3-V4,franzen,vsearch,smallmem_length,nf,0.811477,0.924978,0.655353,9958.0,9958.0,668.2,9952.4,9958.0,HC,V3-V4
7,HC_V3-V4,franzen,vsearch,smallmem_size,nf,0.811477,0.924978,0.655353,9958.0,9958.0,668.2,9952.4,9958.0,HC,V3-V4


Data set: HC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
8,HC_V4,franzen,usearch,fast_length,nf,0.73889,0.894667,0.569112,9958.0,9958.0,725.0,8208.7,9958.0,HC,V4
10,HC_V4,franzen,usearch,smallmem_length,nf,0.783703,0.83813,0.541965,9958.0,9958.0,1588.3,8208.7,9958.0,HC,V4
11,HC_V4,franzen,usearch,smallmem_size,nf,0.783703,0.83813,0.541965,9958.0,9958.0,1588.3,8208.7,9958.0,HC,V4
9,HC_V4,franzen,usearch,fast_size,nf,0.784459,0.833643,0.539726,9958.0,9958.0,1635.9,8208.7,9958.0,HC,V4


Data set: HC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
12,HC_V4,franzen,vsearch,fast,nf,0.786198,0.832504,0.542587,9958.0,9958.0,1585.2,8208.7,9958.0,HC,V4
13,HC_V4,franzen,vsearch,size,nf,0.786198,0.832504,0.542587,9958.0,9958.0,1585.2,8208.7,9958.0,HC,V4
14,HC_V4,franzen,vsearch,smallmem_length,nf,0.786198,0.832504,0.542587,9958.0,9958.0,1585.2,8208.7,9958.0,HC,V4
15,HC_V4,franzen,vsearch,smallmem_size,nf,0.786198,0.832504,0.542587,9958.0,9958.0,1585.2,8208.7,9958.0,HC,V4


Data set: LC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
17,LC_V3-V4,franzen,usearch,fast_size,nf,0.910176,0.938597,0.843113,1988.0,1988.0,135.9,1987.1,1988.0,LC,V3-V4
16,LC_V3-V4,franzen,usearch,fast_length,nf,0.907172,0.943068,0.841104,1988.0,1988.0,133.0,1987.1,1988.0,LC,V3-V4
18,LC_V3-V4,franzen,usearch,smallmem_length,nf,0.892913,0.95751,0.837057,1988.0,1988.0,118.9,1987.1,1988.0,LC,V3-V4
19,LC_V3-V4,franzen,usearch,smallmem_size,nf,0.892913,0.95751,0.837057,1988.0,1988.0,118.9,1987.1,1988.0,LC,V3-V4


Data set: LC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
20,LC_V3-V4,franzen,vsearch,fast,nf,0.90591,0.94766,0.846127,1988.0,1988.0,132.4,1987.1,1988.0,LC,V3-V4
21,LC_V3-V4,franzen,vsearch,size,nf,0.90591,0.94766,0.846127,1988.0,1988.0,132.4,1987.1,1988.0,LC,V3-V4
22,LC_V3-V4,franzen,vsearch,smallmem_length,nf,0.90591,0.94766,0.846127,1988.0,1988.0,132.4,1987.1,1988.0,LC,V3-V4
23,LC_V3-V4,franzen,vsearch,smallmem_size,nf,0.90591,0.94766,0.846127,1988.0,1988.0,132.4,1987.1,1988.0,LC,V3-V4


Data set: LC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
24,LC_V4,franzen,usearch,fast_length,nf,0.913169,0.943146,0.845603,1988.0,1988.0,132.6,1659.5,1988.0,LC,V4
26,LC_V4,franzen,usearch,smallmem_length,nf,0.886956,0.969671,0.832461,1988.0,1988.0,118.9,1659.5,1988.0,LC,V4
27,LC_V4,franzen,usearch,smallmem_size,nf,0.886956,0.969671,0.832461,1988.0,1988.0,118.9,1659.5,1988.0,LC,V4
25,LC_V4,franzen,usearch,fast_size,nf,0.878528,0.984331,0.832083,1988.0,1988.0,97.8,1659.5,1988.0,LC,V4


Data set: LC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
28,LC_V4,franzen,vsearch,fast,nf,0.890804,0.971638,0.836464,1988.0,1988.0,118.9,1659.5,1988.0,LC,V4
29,LC_V4,franzen,vsearch,size,nf,0.890804,0.971638,0.836464,1988.0,1988.0,118.9,1659.5,1988.0,LC,V4
30,LC_V4,franzen,vsearch,smallmem_length,nf,0.890804,0.971638,0.836464,1988.0,1988.0,118.9,1659.5,1988.0,LC,V4
31,LC_V4,franzen,vsearch,smallmem_size,nf,0.890804,0.971638,0.836464,1988.0,1988.0,118.9,1659.5,1988.0,LC,V4


Data set: MC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
33,MC_V3-V4,franzen,usearch,fast_size,nf,0.861779,0.919627,0.773512,4966.0,4966.0,339.7,4962.6,4966.0,MC,V3-V4
34,MC_V3-V4,franzen,usearch,smallmem_length,nf,0.859436,0.920857,0.767829,4966.0,4966.0,335.5,4962.6,4966.0,MC,V3-V4
35,MC_V3-V4,franzen,usearch,smallmem_size,nf,0.859436,0.920857,0.767829,4966.0,4966.0,335.5,4962.6,4966.0,MC,V3-V4
32,MC_V3-V4,franzen,usearch,fast_length,nf,0.863253,0.917003,0.767498,4966.0,4966.0,342.6,4962.6,4966.0,MC,V3-V4


Data set: MC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
36,MC_V3-V4,franzen,vsearch,fast,nf,0.873321,0.937516,0.790458,4966.0,4966.0,335.1,4962.6,4966.0,MC,V3-V4
37,MC_V3-V4,franzen,vsearch,size,nf,0.873321,0.937516,0.790458,4966.0,4966.0,335.1,4962.6,4966.0,MC,V3-V4
38,MC_V3-V4,franzen,vsearch,smallmem_length,nf,0.873321,0.937516,0.790458,4966.0,4966.0,335.1,4962.6,4966.0,MC,V3-V4
39,MC_V3-V4,franzen,vsearch,smallmem_size,nf,0.873321,0.937516,0.790458,4966.0,4966.0,335.1,4962.6,4966.0,MC,V3-V4


Data set: MC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
40,MC_V4,franzen,usearch,fast_length,nf,0.835802,0.923611,0.737577,4966.0,4966.0,334.8,4101.5,4966.0,MC,V4
41,MC_V4,franzen,usearch,fast_size,nf,0.836071,0.90775,0.706774,4966.0,4966.0,535.3,4101.5,4966.0,MC,V4
42,MC_V4,franzen,usearch,smallmem_length,nf,0.844354,0.897491,0.705809,4966.0,4966.0,580.7,4101.5,4966.0,MC,V4
43,MC_V4,franzen,usearch,smallmem_size,nf,0.844354,0.897491,0.705809,4966.0,4966.0,580.7,4101.5,4966.0,MC,V4


Data set: MC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
44,MC_V4,franzen,vsearch,fast,nf,0.839025,0.907263,0.707587,4966.0,4966.0,531.5,4101.5,4966.0,MC,V4
45,MC_V4,franzen,vsearch,size,nf,0.839025,0.907263,0.707587,4966.0,4966.0,531.5,4101.5,4966.0,MC,V4
46,MC_V4,franzen,vsearch,smallmem_length,nf,0.839025,0.907263,0.707587,4966.0,4966.0,531.5,4101.5,4966.0,MC,V4
47,MC_V4,franzen,vsearch,smallmem_size,nf,0.839025,0.907263,0.707587,4966.0,4966.0,531.5,4101.5,4966.0,MC,V4


The best VSEARCH option is better than the best USEARCH option on each V3-V4 data set, and vice versa on the V4 data sets.

Average maximum values over all data sets and sort by adjusted Rand index:

In [18]:
# Average the metrics of every tool / mode over all data sets, best adjusted Rand index first.
metric_cols = ['precision', 'recall', 'adjrandindex']
rows = [
    [t, m, f] + [grp[col].mean() for col in metric_cols]
    for (t, m, f), grp in df_joined_max_avg.groupby(by = ['tool', 'mode', 'refinement'])
]
pd.DataFrame(rows, columns = ['tool', 'mode', 'refinement'] + metric_cols).sort_values(by = 'adjrandindex', ascending = False)

Unnamed: 0,tool,mode,refinement,precision,recall,adjrandindex
0,usearch,fast_length,nf,0.841591,0.918669,0.732579
4,vsearch,fast,nf,0.851124,0.92026,0.729764
5,vsearch,size,nf,0.851122,0.92026,0.729763
6,vsearch,smallmem_length,nf,0.851122,0.92026,0.729763
7,vsearch,smallmem_size,nf,0.851122,0.92026,0.729763
1,usearch,fast_size,nf,0.843418,0.912246,0.718909
2,usearch,smallmem_length,nf,0.842845,0.913496,0.718144
3,usearch,smallmem_size,nf,0.842845,0.913496,0.718144


The VSEARCH options make almost no difference. For USEARCH, sorting by length is slightly better. There are only minor differences between the normal / fast and the smallmem versions.

`USEARCH` pick: `fast_length` = `-cluster_fast -sort length`

`VSEARCH` pick: `size` = `--cluster_size`

**Average clustering quality**

Rank by adjusted Rand index (per data set):

In [19]:
# One table per data set and tool, best adjusted Rand index first.
for (set_name, tool_name), options in df_joined_mean_avg.groupby(by = ['data_set', 'tool']):
    ranked = options.sort_values(by = 'adjrandindex', ascending = False)
    print('Data set: %s / Tool: %s' % (set_name, tool_name))
    display(ranked)

Data set: HC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
0,HC_V3-V4,franzen,usearch,fast_length,nf,0.643079,0.844202,0.409204,9958.0,9958.0,1434.93,9952.4,9958.0,HC,V3-V4
2,HC_V3-V4,franzen,usearch,smallmem_length,nf,0.637531,0.854703,0.406743,9958.0,9958.0,1350.42,9952.4,9958.0,HC,V3-V4
3,HC_V3-V4,franzen,usearch,smallmem_size,nf,0.637531,0.854703,0.406743,9958.0,9958.0,1350.42,9952.4,9958.0,HC,V3-V4
1,HC_V3-V4,franzen,usearch,fast_size,nf,0.63975,0.847558,0.403198,9958.0,9958.0,1411.98,9952.4,9958.0,HC,V3-V4


Data set: HC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
4,HC_V3-V4,franzen,vsearch,fast,nf,0.643781,0.868622,0.417428,9958.0,9958.0,1341.19,9952.4,9958.0,HC,V3-V4
5,HC_V3-V4,franzen,vsearch,size,nf,0.643685,0.868548,0.417372,9958.0,9958.0,1340.79,9952.4,9958.0,HC,V3-V4
6,HC_V3-V4,franzen,vsearch,smallmem_length,nf,0.643685,0.868548,0.417372,9958.0,9958.0,1340.79,9952.4,9958.0,HC,V3-V4
7,HC_V3-V4,franzen,vsearch,smallmem_size,nf,0.643685,0.868548,0.417372,9958.0,9958.0,1340.79,9952.4,9958.0,HC,V3-V4


Data set: HC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
8,HC_V4,franzen,usearch,fast_length,nf,0.480523,0.918688,0.337962,9958.0,9958.0,583.98,8208.7,9958.0,HC,V4
9,HC_V4,franzen,usearch,fast_size,nf,0.428316,0.947932,0.299213,9958.0,9958.0,358.14,8208.7,9958.0,HC,V4
10,HC_V4,franzen,usearch,smallmem_length,nf,0.425858,0.948767,0.298826,9958.0,9958.0,348.42,8208.7,9958.0,HC,V4
11,HC_V4,franzen,usearch,smallmem_size,nf,0.425858,0.948767,0.298826,9958.0,9958.0,348.42,8208.7,9958.0,HC,V4


Data set: HC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
12,HC_V4,franzen,vsearch,fast,nf,0.426159,0.953002,0.300856,9958.0,9958.0,345.12,8208.7,9958.0,HC,V4
13,HC_V4,franzen,vsearch,size,nf,0.426046,0.952956,0.300791,9958.0,9958.0,345.06,8208.7,9958.0,HC,V4
14,HC_V4,franzen,vsearch,smallmem_length,nf,0.426046,0.952956,0.300791,9958.0,9958.0,345.06,8208.7,9958.0,HC,V4
15,HC_V4,franzen,vsearch,smallmem_size,nf,0.426046,0.952956,0.300791,9958.0,9958.0,345.06,8208.7,9958.0,HC,V4


Data set: LC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
18,LC_V3-V4,franzen,usearch,smallmem_length,nf,0.807545,0.872907,0.655469,1988.0,1988.0,285.47,1987.1,1988.0,LC,V3-V4
19,LC_V3-V4,franzen,usearch,smallmem_size,nf,0.807545,0.872907,0.655469,1988.0,1988.0,285.47,1987.1,1988.0,LC,V3-V4
16,LC_V3-V4,franzen,usearch,fast_length,nf,0.807054,0.870572,0.653672,1988.0,1988.0,290.69,1987.1,1988.0,LC,V3-V4
17,LC_V3-V4,franzen,usearch,fast_size,nf,0.807294,0.870027,0.65292,1988.0,1988.0,290.85,1987.1,1988.0,LC,V3-V4


Data set: LC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
20,LC_V3-V4,franzen,vsearch,fast,nf,0.808588,0.877711,0.66106,1988.0,1988.0,285.14,1987.1,1988.0,LC,V3-V4
21,LC_V3-V4,franzen,vsearch,size,nf,0.808588,0.877711,0.66106,1988.0,1988.0,285.14,1987.1,1988.0,LC,V3-V4
22,LC_V3-V4,franzen,vsearch,smallmem_length,nf,0.808588,0.877711,0.66106,1988.0,1988.0,285.14,1987.1,1988.0,LC,V3-V4
23,LC_V3-V4,franzen,vsearch,smallmem_size,nf,0.808588,0.877711,0.66106,1988.0,1988.0,285.14,1987.1,1988.0,LC,V3-V4


Data set: LC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
24,LC_V4,franzen,usearch,fast_length,nf,0.804936,0.95421,0.732341,1988.0,1988.0,133.26,1659.5,1988.0,LC,V4
25,LC_V4,franzen,usearch,fast_size,nf,0.781879,0.97777,0.730771,1988.0,1988.0,97.64,1659.5,1988.0,LC,V4
26,LC_V4,franzen,usearch,smallmem_length,nf,0.780947,0.977511,0.729202,1988.0,1988.0,98.16,1659.5,1988.0,LC,V4
27,LC_V4,franzen,usearch,smallmem_size,nf,0.780947,0.977511,0.729202,1988.0,1988.0,98.16,1659.5,1988.0,LC,V4


Data set: LC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
28,LC_V4,franzen,vsearch,fast,nf,0.779134,0.977709,0.727734,1988.0,1988.0,97.76,1659.5,1988.0,LC,V4
29,LC_V4,franzen,vsearch,size,nf,0.779134,0.977709,0.727734,1988.0,1988.0,97.76,1659.5,1988.0,LC,V4
30,LC_V4,franzen,vsearch,smallmem_length,nf,0.779134,0.977709,0.727734,1988.0,1988.0,97.76,1659.5,1988.0,LC,V4
31,LC_V4,franzen,vsearch,smallmem_size,nf,0.779134,0.977709,0.727734,1988.0,1988.0,97.76,1659.5,1988.0,LC,V4


Data set: MC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
32,MC_V3-V4,franzen,usearch,fast_length,nf,0.726632,0.858781,0.547816,4966.0,4966.0,714.04,4962.6,4966.0,MC,V3-V4
33,MC_V3-V4,franzen,usearch,fast_size,nf,0.725209,0.859935,0.547732,4966.0,4966.0,707.73,4962.6,4966.0,MC,V3-V4
34,MC_V3-V4,franzen,usearch,smallmem_length,nf,0.721807,0.863611,0.545058,4966.0,4966.0,693.77,4962.6,4966.0,MC,V3-V4
35,MC_V3-V4,franzen,usearch,smallmem_size,nf,0.721807,0.863611,0.545058,4966.0,4966.0,693.77,4962.6,4966.0,MC,V3-V4


Data set: MC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
36,MC_V3-V4,franzen,vsearch,fast,nf,0.725665,0.872111,0.553586,4966.0,4966.0,692.74,4962.6,4966.0,MC,V3-V4
37,MC_V3-V4,franzen,vsearch,size,nf,0.725655,0.872067,0.553544,4966.0,4966.0,692.77,4962.6,4966.0,MC,V3-V4
38,MC_V3-V4,franzen,vsearch,smallmem_length,nf,0.725655,0.872067,0.553544,4966.0,4966.0,692.77,4962.6,4966.0,MC,V3-V4
39,MC_V3-V4,franzen,vsearch,smallmem_size,nf,0.725655,0.872067,0.553544,4966.0,4966.0,692.77,4962.6,4966.0,MC,V3-V4


Data set: MC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
40,MC_V4,franzen,usearch,fast_length,nf,0.638981,0.938934,0.540493,4966.0,4966.0,301.29,4101.5,4966.0,MC,V4
42,MC_V4,franzen,usearch,smallmem_length,nf,0.597981,0.964818,0.514676,4966.0,4966.0,205.25,4101.5,4966.0,MC,V4
43,MC_V4,franzen,usearch,smallmem_size,nf,0.597981,0.964818,0.514676,4966.0,4966.0,205.25,4101.5,4966.0,MC,V4
41,MC_V4,franzen,usearch,fast_size,nf,0.598479,0.96393,0.514325,4966.0,4966.0,206.12,4101.5,4966.0,MC,V4


Data set: MC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
44,MC_V4,franzen,vsearch,fast,nf,0.597457,0.966312,0.515035,4966.0,4966.0,203.77,4101.5,4966.0,MC,V4
45,MC_V4,franzen,vsearch,size,nf,0.597378,0.966332,0.514919,4966.0,4966.0,203.75,4101.5,4966.0,MC,V4
46,MC_V4,franzen,vsearch,smallmem_length,nf,0.597378,0.966332,0.514919,4966.0,4966.0,203.75,4101.5,4966.0,MC,V4
47,MC_V4,franzen,vsearch,smallmem_size,nf,0.597378,0.966332,0.514919,4966.0,4966.0,203.75,4101.5,4966.0,MC,V4


The mean values show the same tendencies as the maximum values.

Average the mean values over all data sets and sort by adjusted Rand index.

In [20]:
# Collapse the mean scores of all data sets into one row per
# (tool, mode, refinement) combination and list the combinations from the
# highest to the lowest average adjusted Rand index.
metric_cols = ['precision', 'recall', 'adjrandindex']
rows = [
    [tool, mode, refinement] + [scores[col].mean() for col in metric_cols]
    for (tool, mode, refinement), scores in df_joined_mean_avg.groupby(by = ['tool', 'mode', 'refinement'])
]
pd.DataFrame(rows, columns = ['tool', 'mode', 'refinement'] + metric_cols).sort_values(by = 'adjrandindex', ascending = False)

Unnamed: 0,tool,mode,refinement,precision,recall,adjrandindex
0,usearch,fast_length,nf,0.683534,0.897564,0.536915
4,vsearch,fast,nf,0.663464,0.919245,0.529283
5,vsearch,size,nf,0.663415,0.919221,0.529237
6,vsearch,smallmem_length,nf,0.663415,0.919221,0.529237
7,vsearch,smallmem_size,nf,0.663415,0.919221,0.529237
2,usearch,smallmem_length,nf,0.661945,0.913719,0.524996
3,usearch,smallmem_size,nf,0.661945,0.913719,0.524996
1,usearch,fast_size,nf,0.663488,0.911192,0.524693


**N-best average clustering quality**

Rank by adjusted Rand index (per data set and tool):

In [21]:
# For every (data set, tool) pair, print a heading and show that pair's
# N-best averages ordered from the highest to the lowest adjusted Rand index.
nbest_by_data_set_and_tool = df_joined_nbest_avg.groupby(by = ['data_set', 'tool'])
for group_key, group_frame in nbest_by_data_set_and_tool:
    data_set_name, tool_name = group_key
    print('Data set: %s / Tool: %s' % (data_set_name, tool_name))
    ranked_frame = group_frame.sort_values(by = 'adjrandindex', ascending = False)
    display(ranked_frame)

Data set: HC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
0,HC_V3-V4,franzen,usearch,fast_length,nf,0.727073,0.875593,0.530104,9958.0,9958.0,994.7,9952.4,9958.0,HC,V3-V4
2,HC_V3-V4,franzen,usearch,smallmem_length,nf,0.721751,0.887509,0.523642,9958.0,9958.0,912.02,9952.4,9958.0,HC,V3-V4
3,HC_V3-V4,franzen,usearch,smallmem_size,nf,0.721751,0.887509,0.523642,9958.0,9958.0,912.02,9952.4,9958.0,HC,V3-V4
1,HC_V3-V4,franzen,usearch,fast_size,nf,0.723113,0.879085,0.518181,9958.0,9958.0,974.32,9952.4,9958.0,HC,V3-V4


Data set: HC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
4,HC_V3-V4,franzen,vsearch,fast,nf,0.737256,0.905547,0.544724,9958.0,9958.0,908.36,9952.4,9958.0,HC,V3-V4
5,HC_V3-V4,franzen,vsearch,size,nf,0.737155,0.905428,0.544618,9958.0,9958.0,907.94,9952.4,9958.0,HC,V3-V4
6,HC_V3-V4,franzen,vsearch,smallmem_length,nf,0.737155,0.905428,0.544618,9958.0,9958.0,907.94,9952.4,9958.0,HC,V3-V4
7,HC_V3-V4,franzen,vsearch,smallmem_size,nf,0.737155,0.905428,0.544618,9958.0,9958.0,907.94,9952.4,9958.0,HC,V3-V4


Data set: HC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
8,HC_V4,franzen,usearch,fast_length,nf,0.656963,0.875002,0.459497,9958.0,9958.0,1018.3,8208.7,9958.0,HC,V4
9,HC_V4,franzen,usearch,fast_size,nf,0.585323,0.936799,0.421687,9958.0,9958.0,581.34,8208.7,9958.0,HC,V4
10,HC_V4,franzen,usearch,smallmem_length,nf,0.581744,0.938165,0.420205,9958.0,9958.0,562.92,8208.7,9958.0,HC,V4
11,HC_V4,franzen,usearch,smallmem_size,nf,0.581744,0.938165,0.420205,9958.0,9958.0,562.92,8208.7,9958.0,HC,V4


Data set: HC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
12,HC_V4,franzen,vsearch,fast,nf,0.589836,0.941817,0.427803,9958.0,9958.0,561.58,8208.7,9958.0,HC,V4
13,HC_V4,franzen,vsearch,size,nf,0.589802,0.941827,0.427797,9958.0,9958.0,561.56,8208.7,9958.0,HC,V4
14,HC_V4,franzen,vsearch,smallmem_length,nf,0.589802,0.941827,0.427797,9958.0,9958.0,561.56,8208.7,9958.0,HC,V4
15,HC_V4,franzen,vsearch,smallmem_size,nf,0.589802,0.941827,0.427797,9958.0,9958.0,561.56,8208.7,9958.0,HC,V4


Data set: LC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
17,LC_V3-V4,franzen,usearch,fast_size,nf,0.823174,0.975791,0.781254,1988.0,1988.0,94.04,1987.1,1988.0,LC,V3-V4
16,LC_V3-V4,franzen,usearch,fast_length,nf,0.821724,0.976833,0.780688,1988.0,1988.0,93.36,1987.1,1988.0,LC,V3-V4
18,LC_V3-V4,franzen,usearch,smallmem_length,nf,0.82126,0.976424,0.77894,1988.0,1988.0,94.48,1987.1,1988.0,LC,V3-V4
19,LC_V3-V4,franzen,usearch,smallmem_size,nf,0.82126,0.976424,0.77894,1988.0,1988.0,94.48,1987.1,1988.0,LC,V3-V4


Data set: LC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
20,LC_V3-V4,franzen,vsearch,fast,nf,0.825956,0.979512,0.785849,1988.0,1988.0,94.44,1987.1,1988.0,LC,V3-V4
21,LC_V3-V4,franzen,vsearch,size,nf,0.825956,0.979512,0.785849,1988.0,1988.0,94.44,1987.1,1988.0,LC,V3-V4
22,LC_V3-V4,franzen,vsearch,smallmem_length,nf,0.825956,0.979512,0.785849,1988.0,1988.0,94.44,1987.1,1988.0,LC,V3-V4
23,LC_V3-V4,franzen,vsearch,smallmem_size,nf,0.825956,0.979512,0.785849,1988.0,1988.0,94.44,1987.1,1988.0,LC,V3-V4


Data set: LC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
24,LC_V4,franzen,usearch,fast_length,nf,0.843425,0.981534,0.795446,1988.0,1988.0,92.92,1659.5,1988.0,LC,V4
25,LC_V4,franzen,usearch,fast_size,nf,0.853257,0.967484,0.790793,1988.0,1988.0,125.0,1659.5,1988.0,LC,V4
26,LC_V4,franzen,usearch,smallmem_length,nf,0.852668,0.966977,0.789185,1988.0,1988.0,126.1,1659.5,1988.0,LC,V4
27,LC_V4,franzen,usearch,smallmem_size,nf,0.852668,0.966977,0.789185,1988.0,1988.0,126.1,1659.5,1988.0,LC,V4


Data set: LC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
28,LC_V4,franzen,vsearch,fast,nf,0.855534,0.967248,0.791847,1988.0,1988.0,126.1,1659.5,1988.0,LC,V4
29,LC_V4,franzen,vsearch,size,nf,0.855534,0.967248,0.791847,1988.0,1988.0,126.1,1659.5,1988.0,LC,V4
30,LC_V4,franzen,vsearch,smallmem_length,nf,0.855534,0.967248,0.791847,1988.0,1988.0,126.1,1659.5,1988.0,LC,V4
31,LC_V4,franzen,vsearch,smallmem_size,nf,0.855534,0.967248,0.791847,1988.0,1988.0,126.1,1659.5,1988.0,LC,V4


Data set: MC_V3-V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
32,MC_V3-V4,franzen,usearch,fast_length,nf,0.783201,0.922425,0.673199,4966.0,4966.0,379.32,4962.6,4966.0,MC,V3-V4
33,MC_V3-V4,franzen,usearch,fast_size,nf,0.793068,0.910746,0.672723,4966.0,4966.0,427.72,4962.6,4966.0,MC,V3-V4
34,MC_V3-V4,franzen,usearch,smallmem_length,nf,0.791128,0.912764,0.668301,4966.0,4966.0,420.16,4962.6,4966.0,MC,V3-V4
35,MC_V3-V4,franzen,usearch,smallmem_size,nf,0.791128,0.912764,0.668301,4966.0,4966.0,420.16,4962.6,4966.0,MC,V3-V4


Data set: MC_V3-V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
36,MC_V3-V4,franzen,vsearch,fast,nf,0.806653,0.917768,0.685002,4966.0,4966.0,447.2,4962.6,4966.0,MC,V3-V4
37,MC_V3-V4,franzen,vsearch,size,nf,0.806641,0.917703,0.684954,4966.0,4966.0,447.18,4962.6,4966.0,MC,V3-V4
38,MC_V3-V4,franzen,vsearch,smallmem_length,nf,0.806641,0.917703,0.684954,4966.0,4966.0,447.18,4962.6,4966.0,MC,V3-V4
39,MC_V3-V4,franzen,vsearch,smallmem_size,nf,0.806641,0.917703,0.684954,4966.0,4966.0,447.18,4962.6,4966.0,MC,V3-V4


Data set: MC_V4 / Tool: usearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
40,MC_V4,franzen,usearch,fast_length,nf,0.773033,0.90854,0.645536,4966.0,4966.0,452.58,4101.5,4966.0,MC,V4
41,MC_V4,franzen,usearch,fast_size,nf,0.728843,0.953799,0.629817,4966.0,4966.0,296.24,4101.5,4966.0,MC,V4
42,MC_V4,franzen,usearch,smallmem_length,nf,0.728389,0.954215,0.629736,4966.0,4966.0,294.76,4101.5,4966.0,MC,V4
43,MC_V4,franzen,usearch,smallmem_size,nf,0.728389,0.954215,0.629736,4966.0,4966.0,294.76,4101.5,4966.0,MC,V4


Data set: MC_V4 / Tool: vsearch


Unnamed: 0,data_set,gt,tool,mode,refinement,precision,recall,adjrandindex,num_input_amplicons,input_mass,num_clusters,num_output_amplicons,output_mass,ds,rt
44,MC_V4,franzen,vsearch,fast,nf,0.736342,0.955908,0.637638,4966.0,4966.0,294.88,4101.5,4966.0,MC,V4
45,MC_V4,franzen,vsearch,size,nf,0.736342,0.955908,0.637638,4966.0,4966.0,294.88,4101.5,4966.0,MC,V4
46,MC_V4,franzen,vsearch,smallmem_length,nf,0.736342,0.955908,0.637638,4966.0,4966.0,294.88,4101.5,4966.0,MC,V4
47,MC_V4,franzen,vsearch,smallmem_size,nf,0.736342,0.955908,0.637638,4966.0,4966.0,294.88,4101.5,4966.0,MC,V4


The N-best averages show the same tendencies as the maximum values.

Average the N-best values over all data sets and sort by adjusted Rand index.

In [22]:
# Average the N-best scores of all data sets for each
# (tool, mode, refinement) combination, then order the combinations by
# descending average adjusted Rand index.
key_cols = ['tool', 'mode', 'refinement']
score_cols = ['precision', 'recall', 'adjrandindex']
rows = []
for combo, combo_frame in df_joined_nbest_avg.groupby(by = key_cols):
    tool, mode, refinement = combo
    averages = [combo_frame[col].mean() for col in score_cols]
    rows.append([tool, mode, refinement] + averages)
pd.DataFrame(rows, columns = key_cols + score_cols).sort_values(by = 'adjrandindex', ascending = False)

Unnamed: 0,tool,mode,refinement,precision,recall,adjrandindex
0,usearch,fast_length,nf,0.76757,0.923321,0.647412
4,vsearch,fast,nf,0.758596,0.944633,0.645477
5,vsearch,size,nf,0.758572,0.944604,0.645451
6,vsearch,smallmem_length,nf,0.758572,0.944604,0.645451
7,vsearch,smallmem_size,nf,0.758572,0.944604,0.645451
1,usearch,fast_size,nf,0.75113,0.937284,0.635742
2,usearch,smallmem_length,nf,0.74949,0.939342,0.635002
3,usearch,smallmem_size,nf,0.74949,0.939342,0.635002


The result is the same as for the maximum values.