-
Notifications
You must be signed in to change notification settings - Fork 0
/
expanding_the_unifrac_toolbox_tracked_changes.tex
806 lines (654 loc) · 74.9 KB
/
expanding_the_unifrac_toolbox_tracked_changes.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
% Template for PLoS
%DIF LATEXDIFF DIFFERENCE FILE
%DIF DEL /Users/dphamnyghonca/Downloads/unifrac_paper-submission/expanding_the_unifrac_toolbox.tex Thu Mar 10 10:32:28 2016
%DIF ADD expanding_the_unifrac_toolbox.tex Mon May 2 22:11:18 2016
% Version 3.1 February 2015
%
% To compile to pdf, run:
% latex plos.template
% bibtex plos.template
% latex plos.template
% latex plos.template
% dvipdf plos.template
%
% % % % % % % % % % % % % % % % % % % % % %
%
% -- IMPORTANT NOTE
%
% This template contains comments intended
% to minimize problems and delays during our production
% process. Please follow the template instructions
% whenever possible.
%
% % % % % % % % % % % % % % % % % % % % % % %
%
% Once your paper is accepted for publication,
% PLEASE REMOVE ALL TRACKED CHANGES in this file and leave only
% the final text of your manuscript.
%
% There are no restrictions on package use within the LaTeX files except that
% no packages listed in the template may be deleted.
%
% Please do not include colors or graphics in the text.
%
% Please do not create a heading level below \subsection. For 3rd level headings, use \paragraph{}.
%
% % % % % % % % % % % % % % % % % % % % % % %
%
% -- FIGURES AND TABLES
%
% Please include tables/figure captions directly after the paragraph where they are first cited in the text.
%
% DO NOT INCLUDE GRAPHICS IN YOUR MANUSCRIPT
% - Figures should be uploaded separately from your manuscript file.
% - Figures generated using LaTeX should be extracted and removed from the PDF before submission.
% - Figures containing multiple panels/subfigures must be combined into one image file before submission.
% For figure citations, please use "Fig." instead of "Figure".
% See http://www.plosone.org/static/figureGuidelines for PLOS figure guidelines.
%
% Tables should be cell-based and may not contain:
% - tabs/spacing/line breaks within cells to alter layout or alignment
% - vertically-merged cells (no tabular environments within tabular environments, do not use \multirow)
% - colors, shading, or graphic objects
% See http://www.plosone.org/static/figureGuidelines#tables for table guidelines.
%
% For tables that exceed the width of the text column, use the adjustwidth environment as illustrated in the example table in text below.
%
% % % % % % % % % % % % % % % % % % % % % % % %
%
% -- EQUATIONS, MATH SYMBOLS, SUBSCRIPTS, AND SUPERSCRIPTS
%
% IMPORTANT
% Below are a few tips to help format your equations and other special characters according to our specifications. For more tips to help reduce the possibility of formatting errors during conversion, please see our LaTeX guidelines at http://www.plosone.org/static/latexGuidelines
%
% Please be sure to include all portions of an equation in the math environment.
%
% Do not include text that is not math in the math environment. For example, CO2 will be CO\textsubscript{2}.
%
% Please add line breaks to long display equations when possible in order to fit size of the column.
%
% For inline equations, please do not include punctuation (commas, etc) within the math environment unless this is part of the equation.
%
% % % % % % % % % % % % % % % % % % % % % % % %
%
% Please contact latex@plos.org with any questions.
%
% % % % % % % % % % % % % % % % % % % % % % % %
\documentclass[10pt,letterpaper]{article}
\usepackage[top=0.85in,left=2.75in,footskip=0.75in]{geometry}
% Use adjustwidth environment to exceed column width (see example table in text)
\usepackage{changepage}
% Use Unicode characters when possible
\usepackage[utf8]{inputenc}
% textcomp package and marvosym package for additional characters
\usepackage{textcomp,marvosym}
% fixltx2e package for \textsubscript
\usepackage{fixltx2e}
% listings package for code
\usepackage{listings}
% amsmath and amssymb packages, useful for mathematical formulas and symbols
\usepackage{amsmath,amssymb}
% cite package, to clean up citations in the main text. Do not remove.
\usepackage{cite}
% Use nameref to cite supporting information files (see Supporting Information section for more info)
\usepackage{nameref,hyperref}
% line numbers
\usepackage[right]{lineno}
% ligatures disabled
\usepackage{microtype}
\DisableLigatures[f]{encoding = *, family = * }
% rotating package for sideways tables
\usepackage{rotating}
%DIF 112a112-127
% prevent figures from floating too far with \FloatBarrier %DIF >
\usepackage{placeins} %DIF >
%DIF >
% make sure that eps to pdf files are output into current dir %DIF >
\usepackage[outdir=./]{epstopdf} %DIF >
%DIF >
% Make subsections numbered 1, 2, 3 ... instead of 0.1, 0.2, 0.3 ... as requested by GLBIO/CCBC 2016 reviewer. %DIF >
%DIF >
\renewcommand{\thesubsection}{\arabic{subsection}} %DIF >
\makeatletter %DIF >
\def\@seccntformat#1{\@ifundefined{#1@cntformat}% %DIF >
{\csname the#1\endcsname\quad}% default %DIF >
{\csname #1@cntformat\endcsname}}% enable individual control %DIF >
\newcommand\section@cntformat{} %DIF >
\makeatother %DIF >
%DIF >
%DIF -------
% Remove comment for double spacing
%\usepackage{setspace}
%\doublespacing
% Text layout
\raggedright
\setlength{\parindent}{0.5cm}
\textwidth 5.25in
\textheight 8.75in
\setlength{\headheight}{28pt}
% Bold the 'Figure #' in the caption and separate it from the title/caption with a period
% Captions will be left justified
\usepackage[aboveskip=1pt,labelfont=bf,labelsep=period,justification=raggedright,singlelinecheck=off]{caption}
% Use the PLoS provided BiBTeX style
\bibliographystyle{plos2015}
% Remove brackets from numbering in List of References
\makeatletter
\renewcommand{\@biblabel}[1]{\quad#1.}
\makeatother
% Leave date blank
\date{}
% Header and Footer with logo
\usepackage{lastpage,fancyhdr,graphicx}
\usepackage{epstopdf}
\pagestyle{myheadings}
\pagestyle{fancy}
\fancyhf{}
\lhead{\includegraphics[width=2.0in]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/PLOS-submission.eps}}
\rfoot{\thepage/\pageref{LastPage}}
\renewcommand{\footrule}{\hrule height 2pt \vspace{2mm}}
\fancyheadoffset[L]{2.25in}
\fancyfootoffset[L]{2.25in}
\lfoot{\sf PLOS}
%% Include all macros below
\newcommand{\lorem}{{\bf LOREM}}
\newcommand{\ipsum}{{\bf IPSUM}}
%% END MACROS SECTION
%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF
%DIF UNDERLINE PREAMBLE %DIF PREAMBLE
\RequirePackage[normalem]{ulem} %DIF PREAMBLE
\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} %DIF PREAMBLE
\providecommand{\DIFadd}[1]{{\protect\color{blue}\uwave{#1}}} %DIF PREAMBLE
\providecommand{\DIFdel}[1]{{\protect\color{red}\sout{#1}}} %DIF PREAMBLE
%DIF SAFE PREAMBLE %DIF PREAMBLE
\providecommand{\DIFaddbegin}{} %DIF PREAMBLE
\providecommand{\DIFaddend}{} %DIF PREAMBLE
\providecommand{\DIFdelbegin}{} %DIF PREAMBLE
\providecommand{\DIFdelend}{} %DIF PREAMBLE
%DIF FLOATSAFE PREAMBLE %DIF PREAMBLE
\providecommand{\DIFaddFL}[1]{\DIFadd{#1}} %DIF PREAMBLE
\providecommand{\DIFdelFL}[1]{\DIFdel{#1}} %DIF PREAMBLE
\providecommand{\DIFaddbeginFL}{} %DIF PREAMBLE
\providecommand{\DIFaddendFL}{} %DIF PREAMBLE
\providecommand{\DIFdelbeginFL}{} %DIF PREAMBLE
\providecommand{\DIFdelendFL}{} %DIF PREAMBLE
%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF
\begin{document}
\vspace*{0.35in}
% Title must be 250 characters or less.
% Please capitalize all terms in the title except conjunctions, prepositions, and articles.
\begin{flushleft}
{\Large
\textbf\newline{Expanding the UniFrac toolbox}
}
\newline
% Insert author names, affiliations and corresponding author email (do not include titles, positions, or degrees).
\\
Ruth G Wong\textsuperscript{1 \Yinyang},
Jia R Wu\textsuperscript{1 \Yinyang},
Gregory B Gloor\textsuperscript{1 *}
\\
\bigskip
\bf{1} Department of Biochemistry, University of Western Ontario, London, Ontario, Canada
\\
\bigskip
% Insert additional author notes using the symbols described below. Insert symbol callouts after author names as necessary.
%
% Remove or comment out the author notes below if they aren't used.
%
% Primary Equal Contribution Note
\Yinyang These authors contributed equally to this work.
% Additional Equal Contribution Note
% Also use this double-dagger symbol for special authorship notes, such as senior authorship.
% \ddag These authors also contributed equally to this work.
% Current address notes
%\textcurrency a Insert current address of first author with an address update
% \textcurrency b Insert current address of second author with an address update
% \textcurrency c Insert current address of third author with an address update
% Deceased author note
%\dag Deceased
% Group/Consortium Author Note
%\textpilcrow Membership list can be found in the Acknowledgments section.
% Use the asterisk to denote corresponding authorship and provide email address in note below.
* ggloor@uwo.ca
\end{flushleft}
% Please keep the abstract below 300 words
\section*{Abstract}
\DIFdelbegin \DIFdel{Microbiome analysis is frequently performed using the }\DIFdelend \DIFaddbegin \DIFadd{The }\DIFaddend UniFrac distance metric \DIFaddbegin \DIFadd{is often used }\DIFaddend to separate groups \DIFaddbegin \DIFadd{in microbiome analysis, but requires a constant sequencing depth to work properly}\DIFaddend . Here we demonstrate that unweighted UniFrac is highly sensitive to rarefaction instance and to sequencing depth in uniform data sets \DIFaddbegin \DIFadd{with no clear structure or separation between groups}\DIFaddend . We show that this arises because of subcompositional effects. We introduce information UniFrac and \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac, two new weightings that are not \DIFaddbegin \DIFadd{as }\DIFaddend sensitive to rarefaction and allow greater separation of outliers than classic unweighted and weighted UniFrac. With this expansion of the UniFrac toolbox, we hope to empower researchers to extract more varied information from their data.
% Please keep the Author Summary between 150 and 200 words
% Use first person. PLOS ONE authors please skip this step.
% Author Summary not valid for PLOS ONE submissions.
% \section*{Author Summary}
% Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur eget porta erat. Morbi consectetur est vel gravida pretium. Suspendisse ut dui eu ante cursus gravida non sed sem. Nullam sapien tellus, commodo id velit id, eleifend volutpat quam. Phasellus mauris velit, dapibus finibus elementum vel, pulvinar non tellus. Nunc pellentesque pretium diam, quis maximus dolor faucibus id. Nunc convallis sodales ante, ut ullamcorper est egestas vitae. Nam sit amet enim ultrices, ultrices elit pulvinar, volutpat risus.
\linenumbers
\section*{Introduction}
In 2005, Lozupone et al introduced the UniFrac distance metric, a measure to calculate the difference between \DIFdelbegin \DIFdel{microbiomes }\DIFdelend \DIFaddbegin \DIFadd{microbiome samples }\DIFaddend that incorporated phylogenetic distance \cite{lozupone2005unifrac}. The goal of \DIFdelbegin \DIFdel{UniFrac }\DIFdelend \DIFaddbegin \DIFadd{the UniFrac distance metric }\DIFaddend was to enable objective comparison between microbiome samples from different conditions. In 2007, Lozupone added a proportional weighting to the original unweighted method \cite{lozupone2007quantitative}. Since then, papers reporting these metrics have garnered over a thousand citations, and enabled research about everything from how kwashiorkor causes malnutrition \cite{smith2013gut} to how people can have similar microbiomes to their pet dogs \cite{song2013cohabiting}. Except for generalized UniFrac, used to make hybrid unweighted and weighted UniFrac comparisons \cite{chen2012associating}, few advances in the metric have occurred since 2007. In this paper we examine \DIFdelbegin \DIFdel{a data set where unweighted }\DIFdelend \DIFaddbegin \DIFadd{data sets where }\DIFaddend UniFrac gives misleading results, and \DIFaddbegin \DIFadd{present and }\DIFaddend discuss some alternative weightings for UniFrac.
\DIFaddbegin \paragraph{\DIFadd{Operational Taxonomic Units}}\DIFadd{\mbox{}}\\
\DIFadd{Unlike more distinct species, such as mammalian species, bacterial species are not well defined. Bacterial genomes are highly variable, and regions used to identify bacteria vary in a continuum rather than clusters of similar sequences.
}
\DIFadd{Historically bacteria that are have 97\% identity in a 16S rRNA gene variable region are considered to be the same taxa \mbox{%DIFAUXCMD
\cite{ciccarelli2006toward}}%DIFAUXCMD
. The 97\% cutoff was arbitrarily chosen to best map sequence data to bacterial classifications. This threshold is thought to maximizes the grouping of bacteria classified as the same species while minimizing the grouping of bacteria classified as different species \mbox{%DIFAUXCMD
\cite{caporaso2011global}}%DIFAUXCMD
. Before sequencing bacterial classification was often done by appearance or by metabolic products, so there are outliers where bacteria classified in the same species are actually genetically very different, or bacteria classified in different genus are genetically very similar.
}
\DIFadd{However, it is difficult to determine how a batch of sequences should be partitioned into groups of 97\% identity. One way is to perform a clustering algorithm (using software such as UCLUST \mbox{%DIFAUXCMD
\cite{edgar2010search}}%DIFAUXCMD
) that partitions the groups and then later assign taxonomic identity by matching the seed or central sequences with public databases, such as SILVA \mbox{%DIFAUXCMD
\cite{quast2013silva}}%DIFAUXCMD
, the Ribosomal Database Project \mbox{%DIFAUXCMD
\cite{cole2009ribosomal}}%DIFAUXCMD
, or Greengenes \mbox{%DIFAUXCMD
\cite{desantis2006greengenes}}%DIFAUXCMD
. Another method is closed reference OTU picking, which starts off with seed sequences from known bacteria and perform the clustering such that the 97\% identity groups are centered on the seed sequences. In any case, the resulting taxonomic groupings are known as Operational Taxonomic Units (OTUs), and are used consistently within the same experiment. While OTUs can be annotated with standard taxonomic names such that results can be compared between experiments, technically the taxonomic groupings used by different experiments are not the same, except with closed reference OTUs, or individual sequence unit methods. Individual sequence unit (ISU) methods which do not use OTUs can be run with software such as DADA2 \mbox{%DIFAUXCMD
\cite{callahan2015dada2}}%DIFAUXCMD
.
}
\DIFadd{Grouping of amplicon sequences into OTUs allows for the data to be summarized into a table of counts per OTU per sample.
}
\subsection{\DIFadd{Data}}
\DIFadd{UniFrac requires two pieces of information: a phylogenetic tree and a table of counts per inferred taxa per sample. These are derived from a gene tag sequencing experiment, such as the commonly used 16S rRNA gene \mbox{%DIFAUXCMD
\cite{tringe2008renaissance}}%DIFAUXCMD
. The sequenced gene contains a variable region, allowing the sequences to be grouped into OTUs as described in the previous section. A count table can then be generated with the number of reads per OTU per sample. The center sequence of each OTU group can be put into a multiple sequence alignment, from which a phylogenetic tree can be inferred.
}
\DIFadd{The phylogenetic tree is created through a multiple sequence alignment with the representative OTU sequences, using software such as MUSCLE \mbox{%DIFAUXCMD
\cite{edgar2004muscle} }%DIFAUXCMD
and FastTree \mbox{%DIFAUXCMD
\cite{price2010fasttree}}%DIFAUXCMD
, or using a guide tree, such as through Greengenes \mbox{%DIFAUXCMD
\cite{desantis2006greengenes} }%DIFAUXCMD
or the QIIME software \mbox{%DIFAUXCMD
\cite{caporaso2010qiime}}%DIFAUXCMD
. Each leaf of the tree represents one of the OTUs, and each of the branches of the tree has a length. Additionally, the tree needs to be rooted for the UniFrac calculation to be performed. This is often done by rooting the tree at its midpoint.
}
\subsection{\DIFadd{Compositional Data Analysis}}
\DIFadd{Microbiome data is in the form of a list of counts per feature (OTUs in this case), with the features composing an aspect of the microbiome for each sample. This is compositional data because the total sum of reads for a sample is arbitrary, being determined by the capacity of the sequencing instrument \mbox{%DIFAUXCMD
\cite{fernandes2013anova} }%DIFAUXCMD
\mbox{%DIFAUXCMD
\cite{fernandes2014unifying} }%DIFAUXCMD
\mbox{%DIFAUXCMD
\cite{gloor2010microbiome}}%DIFAUXCMD
. There are several core truths about microbiome data and its compositional nature that should be considered when making an analysis strategy.
}
\DIFadd{First, the total number of reads per sample is influenced by sample collection, extraction, sequencing library preparation, and sequencing platform, and is irrelevant to the biological implications of the data. Additionally, the constraint of the count total causes the abundance of different taxa to appear to be negatively correlated with each other when analyzed by conventional statistics \mbox{%DIFAUXCMD
\cite{lovell2015proportionality}}%DIFAUXCMD
. When one taxa increases in abundance, the counts detected in other taxa decrease in abundance, even if the taxa are not decreasing in abundance biologically. For example, one study compared the microbiome of vaginal swab samples from women with bacterial vaginosis (BV), women without BV, and women with intermediate BV, using qPCR to quantify the taxa \mbox{%DIFAUXCMD
\cite{zozaya2010quantitative}}%DIFAUXCMD
. }\textit{\DIFadd{Prevotella}} \DIFadd{was found to increase through non-BV to intermediate to BV, while }\textit{\DIFadd{Lactobacillus iners}} \DIFadd{stayed relatively the same \mbox{%DIFAUXCMD
\cite{zozaya2010quantitative}}%DIFAUXCMD
. If the same samples were put through a gene tag sequencing experiment where the taxa could not be quantified and the total read counts were constrained, one might incorrectly conclude that the abundance of }\textit{\DIFadd{Lactobacillus iners}} \DIFadd{was decreasing while }\textit{\DIFadd{Prevotella}} \DIFadd{was increasing.
}
\DIFadd{To prevent incorrect conclusions, data should be analyzed in a compositional way. In Euclidean space, data points can increase or decrease freely. Compositional data is under a sum constraint, and exist in a non-Euclidean space known as the Aitchison simplex \mbox{%DIFAUXCMD
\cite{aitchison1982statistical}}%DIFAUXCMD
. A data transformation can be performed to put the data into Euclidean space, so that it can be analyzed with standard statistical methods that depend on Cartesian coordinates and linear relationships. These transformations involve examining the ratios of different OTU abundances to each other, so that the total number of reads do not unduly affect the result \mbox{%DIFAUXCMD
\cite{gloor2016s} }%DIFAUXCMD
\mbox{%DIFAUXCMD
\cite{gloor2016compositional}}%DIFAUXCMD
. In the example with bacterial vaginosis, using ratios of taxa to each other would elucidate the nature of the biological change in the data.
}
\DIFaddend \subsection{Unweighted UniFrac}
Unweighted UniFrac \cite{lozupone2005unifrac} uses an inferred evolutionary distance to measure similarity between samples. It requires a reference phylogenetic tree containing all the taxa present in the samples to be examined\DIFaddbegin \DIFadd{, plus information about which taxa were detected in each sample}\DIFaddend . The calculation is performed by dividing the branch lengths \DIFaddbegin \DIFadd{that are not }\DIFaddend shared between the two samples by the branch lengths covered by either sample. \DIFaddbegin \DIFadd{Figure ~\ref{fig1} shows example calculations for UniFrac based on the tree overlap. }\DIFaddend A distance of 0 means that the samples are identical, and a distance of 1 means that the two samples share no taxa in common.
\begin{figure}[h]
\includegraphics[scale=0.5]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/unifrac_distance.eps}
\DIFdelbeginFL %DIFDELCMD < \caption{%%%
\DIFdelendFL \DIFaddbeginFL \caption[Unweighted UniFrac.]{\DIFaddendFL {\bf Unweighted UniFrac.}
When two samples do not share any branches of the phylogenetic tree, the unweighted UniFrac distance is maximized at 1. When two samples share half of their branch lengths on the phylogenetic tree, the unweighted UniFrac distance is 0.5. If the two samples contain exactly the same taxa, the unweighted UniFrac distance is minimized at 0, since the samples share all branches.}
\label{fig1}
\end{figure}
\DIFaddbegin \DIFadd{As UniFrac is a binary test of absence, it is sensitive to sequencing depth, and assumes that the data has been normalized to a common sequencing depth \mbox{%DIFAUXCMD
\cite{lozupone2011unifrac}}%DIFAUXCMD
. Thus, rarefaction prior to unweighted UniFrac has become a standard part of the microbiome analysis workflow, with built in rarefaction functions in QIIME \mbox{%DIFAUXCMD
\cite{caporaso2010qiime} }%DIFAUXCMD
and mothur \mbox{%DIFAUXCMD
\cite{schloss2009introducing}}%DIFAUXCMD
.
}
\FloatBarrier
\DIFaddend \subsection{Weighted UniFrac}
Weighted UniFrac \cite{lozupone2007quantitative} is an implementation of the Kantorovich–Rubinstein distance in mathematics, also known as the earth mover’s distance \cite{evans2012phylogenetic}. Rather than looking only at the presence or absence of taxa, each branch length of the phylogenetic tree is weighted by the difference in proportional abundance of the taxa between the two samples.
This technique reduces the problem of low abundance taxa being represented as a 0 or by a low count depending on sampling depth. In unweighted UniFrac, such taxa would flip from absent to present, and could skew the measurement: this would be especially problematic if the taxa are on a long branch. In weighted UniFrac, low abundance taxa have a much lower weight and so will have a lower impact on the total distance reported by the metric.
UniFrac is constituted as either a \DIFdelbegin \DIFdel{presence/absence }\DIFdelend \DIFaddbegin \DIFadd{binary weighting }\DIFaddend (unweighted UniFrac) \cite{lozupone2005unifrac}, a linear proportion (weighted UniFrac) \cite{lozupone2007quantitative}, or some combination of the two (generalized UniFrac) \cite{chen2012associating}. However, \DIFaddbegin \DIFadd{it is a misconception that }\DIFaddend the data are \DIFdelbegin \DIFdel{not linear , }\DIFdelend \DIFaddbegin \DIFadd{linear }\DIFaddend because the sum of the total number of reads is constrained by the sequencing machinery \cite{friedman2012inferring} \cite{fernandes2013anova} \cite{fernandes2014unifying} \cite{lovell2015proportionality} \DIFdelbegin \DIFdel{. }\DIFdelend \DIFaddbegin \DIFadd{as described above.
}
\DIFadd{Microbiome communities can exhibit tremendous variation in their total bacterial count. For example, a stool sample may produce more highly concentrated DNA extract than a skin swab sample, resulting in a different number of input molecules but a similar read count total. Vaginal samples from patients with bacterial vaginosis compared to patients without can have DNA extract concentrations that differ one magnitude \mbox{%DIFAUXCMD
\cite{zozaya2010quantitative}}%DIFAUXCMD
. }\DIFaddend Alternative weightings and non-linear transformations of data need to be explored. Furthermore, unweighted UniFrac is known to be unreliable, but it is not generally \DIFdelbegin \DIFdel{known or }\DIFdelend understood how this can impact results.
% You may title this section "Methods" or "Models".
% "Models" is not a valid title for PLoS ONE authors. However, PLoS ONE
% authors may use "Analysis"
\section*{Materials and Methods}
\subsection{Analytical techniques}
\paragraph{Rarefaction}\mbox{}\\
Rarefaction normalizes the samples OTU counts to a standard sequencing depth \DIFaddbegin \DIFadd{by sampling without replacement }\DIFaddend \cite{simberloff1978use}. This resulting table can be thought of as a random point estimate of the dataset, as the output is a sub-sample \DIFaddbegin \DIFadd{without replacement }\DIFaddend of the original table. This standardization process is recommended by the authors of UniFrac \cite{de2011evaluation} in order to account for the sensitivity of UniFrac to sequencing depth.
Rarefactions can be performed using the \DIFdelbegin \DIFdel{Qiime }\DIFdelend \DIFaddbegin \DIFadd{QIIME }\DIFaddend software \cite{caporaso2010qiime} or using the vegan package in R \cite{oksanen2007vegan}.
\paragraph{Unweighted UniFrac}\mbox{}\\
Unweighted UniFrac is calculated based on the presence or absence of counts for each branch in the phylogenetic tree, when comparing two samples. A branch \DIFdelbegin \DIFdel{is unshared when one sample has a non-zero abundance but not the other, and a branch is shared when both samples }\DIFdelend \DIFaddbegin \DIFadd{belongs to a sample when at least one of the OTUs in the leaves below it }\DIFaddend have a non-zero abundance. The formula for unweighted UniFrac is as follows, where $b$ is the set of branch lengths in the phylogenetic tree\DIFaddbegin \DIFadd{, $A$ and $B$ represent the two samples being compared, $\triangle$ is the symmetric difference between two sets, and $\cup$ is the union between two sets}\DIFaddend :
\begin{align*}
\DIFdelbegin \DIFdel{\frac{\sum_{}{} b_{unshared}}{\sum_{}{} b_{unshared} + \sum_{}{} b_{shared}}
}\DIFdelend \DIFaddbegin \DIFadd{Unweighted_{AB} = \frac{\sum_{}{} b_{A} \triangle b_{B}}{\sum_{}{} b_{A} \cup b_{B}}
}\DIFaddend \end{align*}
\DIFaddbegin \DIFadd{The sum of the branch lengths that belong to one sample but not the other is divided by the sum of the branch lengths that belong to one or both samples.
}
\DIFadd{Note that the implementation of unweighted UniFrac in QIIME (see Fig.~\ref{fig2}) and also GUniFrac (see Supporting figures ~\ref{gunifrac_1}, ~\ref{gunifrac_2}, and ~\ref{gunifrac_3}) includes a tree pruning procedure, where the tree is pruned to only include OTUs that are present in each pairwise sample comparison. Except for in figure ~\ref{fig2} and supporting figures ~\ref{gunifrac_1}, ~\ref{gunifrac_2}, and ~\ref{gunifrac_3}, the scripts used in this paper do not prune the tree, in order to be consistent with weighted UniFrac. In weighted UniFrac, pruning the tree makes the measurement a dissimilarity rather than a distance (Supporting Information ~\ref{S4_Fig}).
}
\DIFaddend \paragraph{Weighted UniFrac}\mbox{}\\
Weighted UniFrac \cite{lozupone2007quantitative} also incorporates each branch length of the phylogenetic tree, and weights them according to proportional abundance of the two samples. The formula for weighed UniFrac is as follows, where $A$ and $B$ are the two samples, $b$ is the set of branch lengths, and $\frac{A_{i}}{A_{T}}$ and $\frac{B_{i}}{B_{T}}$ are the proportional abundances associated with branch length $b_{i}$:
\begin{align*}
\DIFdelbegin \DIFdel{\sum_{i}^{n} b_{i} \times }%DIFDELCMD < \left| %%%
\DIFdel{\frac{A_{i}}{A_{T}} - \frac{B_{i}}{B_{T}} }%DIFDELCMD < \right|
%DIFDELCMD < %%%
\DIFdelend \DIFaddbegin \DIFadd{Weighted_{AB} = \frac{\sum_{i}^{n} b_{i} \times \left| \frac{A_{i}}{A_{T}} - \frac{B_{i}}{B_{T}} \right|}{\sum_{i}^{n} b_{i}}
}\DIFaddend \end{align*}
\paragraph{Information UniFrac}\mbox{}\\
Information UniFrac is calculated by weighing each branch length by the difference in the uncertainty of the taxa abundance between the two samples. Uncertainty \DIFaddbegin \DIFadd{or information ($I$) }\DIFaddend is calculated as follows, where p is the proportional abundance \cite{shannon2001mathematical}:
\begin{equation}\label{eq:schemeP}
\DIFaddbegin \DIFadd{I = }\DIFaddend - p \times log_{2}(p)
\end{equation}
If a sample is \DIFaddbegin \DIFadd{composed of }\DIFaddend 50\% taxa A and 50\% taxa B, then the proportional abundances have maximum uncertainty about what taxa is likely to be seen in a given sequence read. If a sample is 80\% taxa A and 20\% taxa B, then there is less uncertainty \DIFaddbegin \DIFadd{about both taxa}\DIFaddend , because a given sequence read is more likely to be taxa A \DIFaddbegin \DIFadd{and less likely to be taxa B}\DIFaddend . When the amount of uncertainty that a taxa has in one sample corresponds with the amount of uncertainty the same taxa has in a different sample, the abundance of that taxa is mutually informative between samples. Weighting UniFrac by uncertainty combines the the concept of uncertainty with phylogenetic relationships to identify taxa that are differentially informative between groups.
The formula for Information UniFrac is as follows:
\begin{align*}
\DIFdelbegin \DIFdel{\sum_{i}^{n} b_{i} \times }%DIFDELCMD < \left| %%%
\DIFdel{\frac{A_{i}}{A_{T}}log}%DIFDELCMD < \left(%%%
\DIFdel{\frac{A_{i}}{A_{T}}}%DIFDELCMD < \right) %%%
\DIFdel{- \frac{B_{i}}{B_{T}}log}%DIFDELCMD < \left(%%%
\DIFdel{\frac{B_{i}}{B_{T}}}%DIFDELCMD < \right) \right|
%DIFDELCMD < %%%
\DIFdelend \DIFaddbegin \DIFadd{Information_{AB} = \frac{\sum_{i}^{n} b_{i} \times \left| \frac{A_{i}}{A_{T}}log\left(\frac{A_{i}}{A_{T}}\right) - \frac{B_{i}}{B_{T}}log\left(\frac{B_{i}}{B_{T}}\right) \right|}{\sum_{i}^{n} b_{i}}
}\DIFaddend \end{align*}
\DIFaddbegin \DIFadd{Information UniFrac approaches a minimum of zero (Fig.~\ref{fig5}) when a sample is composed of a monoculture. It also related to the Aitchison distance in compositional data analysis \mbox{%DIFAUXCMD
\cite{egozcue2011evidence}}%DIFAUXCMD
.
}
\DIFaddend \paragraph{\DIFdelbegin \DIFdel{Centered }\DIFdelend Ratio UniFrac}\mbox{}\\
In complex microbiome communities, there \DIFdelbegin \DIFdel{are very many }\DIFdelend \DIFaddbegin \DIFadd{may be a large number of }\DIFaddend bacterial taxa with \DIFdelbegin \DIFdel{a low level of counts}\DIFdelend \DIFaddbegin \DIFadd{few counts, such that the data is sparse}\DIFaddend . Taking the geometric mean of the proportional abundances of taxa in a microbiome sample represents an unbiased baseline \DIFaddbegin \DIFadd{of the average abundance of features with geometric growth characteristics - such as bacteria which divide by fission }\DIFaddend \cite{aitchison1982statistical}. Experiments generally do not have power to detect differences at abundances below the mean \cite{fernandes2013anova}. Centering the proportional abundances around the geometric mean thus allows one to examine the data in \DIFaddbegin \DIFadd{this }\DIFaddend context, muting differences that are close to \DIFdelbegin \DIFdel{baseline and accentuating outliers}\DIFdelend \DIFaddbegin \DIFadd{the baseline abundance and accentuating OTUs that are much more abundant than the mean}\DIFaddend . The formula for \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac is as follows, where $gm$ is the geometric mean:
\begin{align*}
\DIFdelbegin \DIFdel{\sum_{i}^{n} b_{i} \times }%DIFDELCMD < \left| %%%
\DIFdel{\frac{\frac{A_{i}}{A_{T}}}{gm(A_{i})} - \frac{\frac{B_{i}}{B_{T}}}{{gm(B_{i})}} }%DIFDELCMD < \right|
%DIFDELCMD < %%%
\DIFdelend \DIFaddbegin \DIFadd{Ratio_{AB} = \frac{\sum_{i}^{n} b_{i} \times \left| \frac{\frac{A_{i}}{A_{T}}}{gm(A_{i})} - \frac{\frac{B_{i}}{B_{T}}}{{gm(B_{i})}} \right|}{\sum_{i}^{n} b_{i}}
}\DIFaddend \end{align*}
Note that the geometric mean is calculated by combining all children in the subtree of $b_{i}$ into $\frac{A_{i}}{A_{T}}$ for sample $A$ or $\frac{B_{i}}{B_{T}}$ for sample $B$, and including the rest of the single taxa proportional abundances separately. The one combined proportional abundance and the remaining single taxa proportional abundances are input into the geometric mean formula, as set $a$:
\begin{align*}
\DIFaddbegin \DIFadd{gm(a) = }\DIFaddend \left( \prod_{i}^{n} a_{i}\right)^{1/n}
\end{align*}
One challenge when it comes to the analysis of read count data is \DIFdelbegin \DIFdel{the presence of zero counts}\DIFdelend \DIFaddbegin \DIFadd{that the data is very sparse}\DIFaddend . Whether a low-abundance taxa \DIFaddbegin \DIFadd{or feature }\DIFaddend appears in the data as a zero or a low positive count is up to chance, and assuming that a zero count represents the absence of a taxa can be very misleading \cite{fernandes2013anova}. A Bayesian approach can be used to \DIFdelbegin \DIFdel{estimate the likelihood that a zero could be changed to a positive count if the sample were resequenced, }\DIFdelend \DIFaddbegin \DIFadd{give a posterior estimate of the likelihood for zero count OTUs: this is }\DIFaddend implemented by the cmultRepl command in the zCompositions package in R \cite{palarea2015zcompositions}.
The use of \DIFdelbegin \DIFdel{this }\DIFdelend \DIFaddbegin \DIFadd{ratio }\DIFaddend weighting for UniFrac produces measurements that violate the \DIFaddbegin \DIFadd{metric }\DIFaddend triangle inequality, such that Euclidean statistics are technically invalid. Thus this \DIFaddbegin \DIFadd{metric}\DIFaddend , like the Bray-Curtis metric, is a dissimilarity, not a distance.
For this paper, we calculate UniFrac metrics using a custom R script, which includes unweighted UniFrac, weighted UniFrac, information UniFrac, and \DIFdelbegin \DIFdel{centered ratio UniFrac : https://github.com/ruthgrace/exponentUnifrac/blob/master/UniFrac.r
}\DIFdelend \DIFaddbegin \DIFadd{ratio UniFrac \mbox{%DIFAUXCMD
\cite{ruth_unifrac_workshop}}%DIFAUXCMD
:
}\DIFaddend
\paragraph{Bray-Curtis dissimilarity metric}\mbox{}\\
The Bray Curtis dissimilarity metric \cite{beals1984bray} quantifies how dissimilar two sites are based on counts. A \DIFaddbegin \DIFadd{Bray-Curtis }\DIFaddend index of 0 means that two samples are identical, while a \DIFaddbegin \DIFadd{Bray-Curtis }\DIFaddend index of 1 means samples do not share any species. It is computed as a proportion through the formula:
\begin{align*}
C_{ij} &= 1 - \frac{2C_{ij}}{S_{i} + S_{j}} \\
\text{where}~C_{ij}&= \text{dissimilarity index bound by [0,1]} \\
S_{i} &= \text{Specimen counts at site i} \\
S_{j} &= \text{Specimen counts at site j} \\
\end{align*}
\subsection{Data preparation}
\DIFaddbegin \DIFadd{The data used comes in the form of a table of counts per operational taxonomic unit per sample, plus a phylogenetic tree. All of our data are derived from 16S rRNA gene tag sequencing experiments, and the data and scripts can be accessed at }\url{https://github.com/ruthgrace/r_scripts} \DIFadd{\mbox{%DIFAUXCMD
\cite{r_scripts}}%DIFAUXCMD
.
}\DIFaddend
\paragraph{Tongue dorsum data set}\mbox{}\\
\DIFaddbegin \DIFadd{The tongue dorsum data set is a collection of 60 microbiome samples taken from the tongues of healthy participants. There were 0.3 million reads across 554 OTUs, and a minimum and maximum of 659 and 17176 reads per sample.
}
\DIFaddend Samples from this experiment were sourced from the Human Microbiome Project \cite{turnbaugh2007human} Qiime Community profiling v35 \DIFdelbegin \DIFdel{otu }\DIFdelend \DIFaddbegin \DIFadd{OTU }\DIFaddend tables (\url{http://hmpdacc.org/HMQCP/}).
\DIFdelbegin \DIFdel{Analysis of the data was conducted on a Late 2011 15 inch MacBook Pro 2.4 GHz i7 with 16GB of 1333 MHz DDR3 RAM. }\DIFdelend \DIFaddbegin
\DIFaddend Rarefaction was conducted through Qiime version 1.8.0-20140103 \DIFaddbegin \DIFadd{to 659 reads (the lowest number of reads for a sample)}\DIFaddend , and generation of the ellipse figures was done in R version 3.2.3 (2015-12-10) "Wooden Christmas-Tree" x86\_64-apple-darwin13.4.0 (64 bit).
A principal coordinate analysis is drawn from each distance matrix per metric, and for the first principal coordinate of each metric, \DIFdelbegin \DIFdel{Vres }\DIFdelend \DIFaddbegin \DIFadd{the resultant value ($V_{res}$) }\DIFaddend is computed per each first principal coordinate as defined by the formula:
\begin{align*}
V_{res} &=\frac{|V_1 - V_i|}{range(V_1, V_i)} \\
\text{where}~V_{res}&= \text{Set of computed PC1s,} \\
V_1 &= \text{Reference PC1 (the first),} \\
V_i &= \text{Each subsequent PC1,} \\
\end{align*}
\paragraph{Tongue dorsum and buccal mucosa data set}\mbox{}\\
\DIFaddbegin \DIFadd{The tongue dorsum and buccal mucosa data set is a collection of 30 microbiome samples taken from the tongues of healthy participants, plus 30 microbiome samples taken from the buccal mucosa (cheek) of a different set of healthy participants. There were 0.4 million reads across 12701 OTUs, and a minimum and maximum of 5028 and 9861 reads per sample. Note that if the OTUs that are less than 1\% abundant in all samples are filtered out, only 179 OTUs remain.
}\DIFaddend
\DIFdelbegin \DIFdel{Thirty }\DIFdelend \DIFaddbegin \DIFadd{To create this data set, thirty }\DIFaddend random samples were selected from the tongue site of the Human Microbiome Project \cite{turnbaugh2007human} and \DIFdelbegin \DIFdel{30 }\DIFdelend \DIFaddbegin \DIFadd{thirty }\DIFaddend random samples from the buccal mucosa site. Samples were filtered so that only samples with 5000 to 10,000 reads were included.
Read counts from the HMP data set were rarefied to the smallest total read count per sample using the vegan R package \cite{oksanen2007vegan} before the unweighted UniFrac distance was calculated. Weighted, information, and \DIFdelbegin \DIFdel{centered log }\DIFdelend \DIFaddbegin \DIFadd{ratio }\DIFaddend UniFrac were calculated on the data set without rarefaction. The resulting distances were plotted for principal coordinate analysis.
\DIFdelbegin \DIFdel{The script used to run this analysis can be referenced at }%DIFDELCMD < \url{https://github.com/ruthgrace/exponentUnifrac/blob/master/tongue_cheek_script.r}%%%
\DIFdel{.
}%DIFDELCMD <
%DIFDELCMD < %%%
\DIFdelend \paragraph{Breast milk data set}\mbox{}\\
\DIFdelbegin %DIFDELCMD <
%DIFDELCMD < %%%
\DIFdelend The breast milk data set \DIFaddbegin \DIFadd{is a collection of 58 microbiome samples taken from lactating Caucasian Canadian women. The breast milk data set }\DIFaddend used here has also been published in \DIFdelbegin \DIFdel{the Microbiome Journal \mbox{%DIFAUXCMD
\cite{urbaniak2016human}}%DIFAUXCMD
. }\DIFdelend \DIFaddbegin \DIFadd{a recent study \mbox{%DIFAUXCMD
\cite{urbaniak2016human}}%DIFAUXCMD
. There were a total of 5.3 million reads across 115 OTUs, and a minimum and maximum of 3072 and 2.8 million reads per sample. Note that the 2.8 million reads came from a sample that was taken from a patient with an infection, and the next largest number of reads per sample was 282485 (ten times less).
}
\DIFaddend The count table was analyzed using our custom UniFrac script\DIFaddbegin \DIFadd{, which can be accessed at }\url{https://github.com/ruthgrace/ruth_unifrac_workshop} \DIFadd{\mbox{%DIFAUXCMD
\cite{ruth_unifrac_workshop}}%DIFAUXCMD
}\DIFaddend . Data was rarefied to the sample with the smallest number of read counts (3072) before the unweighted UniFrac distance matrix was calculated. Non-rarefied data was used for weighted, information, and \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac. Data was plotted using a principal \DIFdelbegin \DIFdel{components or coordinate analysis }\DIFdelend \DIFaddbegin \DIFadd{coordinates or component plot }\DIFaddend as appropriate.
\DIFdelbegin \DIFdel{The script used to run this analysis can be referenced at }%DIFDELCMD < \url{https://github.com/ruthgrace/exponentUnifrac/blob/master/breastmilk_script.r}%%%
\DIFdel{. }\DIFdelend \DIFaddbegin \paragraph{\DIFadd{Monoculture data set}}\DIFadd{\mbox{}}\\
\DIFadd{The monoculture data set is simulated based on the infected sample from the breast milk data set. Each simulated sample has exactly the same counts per taxa as the infected sample, except that the taxa are shuffled. After taxa shuffling, the data was manipulated into two groups. In one set of 20 samples the taxa with the highest count was swapped with }\textit{\DIFadd{Pasteurella}}\DIFadd{, in another set of 20 the taxa with the highest count was swapped with }\textit{\DIFadd{Staphylococcus}}\DIFadd{, and in the last set of 20 the taxa with the highest count was swapped with }\textit{\DIFadd{Pseudomonas}}\DIFadd{. These three taxa were picked because they were the most highly abundant in the original breast milk data set. This process produced three sets of monocultures, dominated by the three different taxa.
}\DIFaddend
% Results and Discussion can be combined.
\section*{Results}
\subsection{Unweighted Unifrac is highly sensitive to rarefaction \DIFdelbegin \DIFdel{variants}\DIFdelend \DIFaddbegin \DIFadd{instance}\DIFaddend }
A commentary by Lozupone et al. 2011 \cite{lozupone2011unifrac} addressed the sensitivity of Unweighted UniFrac to sampling. \DIFdelbegin \DIFdel{They utilized }\DIFdelend \DIFaddbegin \DIFadd{Lozupone's group used }\DIFaddend mean UniFrac values to compute a confidence ellipse \DIFaddbegin \DIFadd{between the first and third quartile}\DIFaddend . However, we observed that this approach under-represented the true variability of unweighted UniFrac as a distance metric by highlighting how individual samples vary. In the absence of true differences and in the presence of uneven sampling, unweighted UniFrac can be sensitive to rarefaction \DIFdelbegin \DIFdel{variants}\DIFdelend \DIFaddbegin \DIFadd{instances}\DIFaddend . We show this by analyzing two rarefactions of the same body site with the rationale that if there is no true difference in the data, separation of these samples should not be observed.
\begin{figure}[h]
\DIFdelbeginFL %DIFDELCMD < \includegraphics[scale=0.7]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/sample_migration.eps}
%DIFDELCMD < \caption{%%%
\DIFdelendFL \DIFaddbeginFL \includegraphics[scale=0.36]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/sample_migration.eps}
\caption[Sample migration in different rarefactions, plotted on principal coordinates, measured with unweighted UniFrac.]{\DIFaddendFL {\bf Sample migration in different rarefactions, plotted on principal coordinates, measured with unweighted UniFrac.}
\DIFdelbeginFL \DIFdelFL{Red }\DIFdelendFL \DIFaddbeginFL \DIFaddFL{The left plot is of the tongue data set while the right plot is the tongue dorsum vs. buccal mucosa data set. On the left panel red }\DIFaddendFL samples have moved from the left cluster to the right cluster between rarefactions. Blue samples have moved from the right cluster to the left. Samples are taken from the tongue dorsum body site from the Human Microbiome Project database. \DIFaddbeginFL \DIFaddFL{If the experiment were run once, one might mistakenly assume that there are two clusters of data, however, the inconsistent sample membership of the two groups between rarefactions proves the clustering irreproducible. The tongue dorsum and buccal mucosa data set is included for comparison, with the tongue samples colored black and the buccal mucosa samples colored red. Note that the variance explained in the tongue data set by the first and second coordinate is merely 16.1\% and 9.8\% respectively, indicating that the data is rather spherical, even though the points on the plot appear to show two separated clusters (compare with 32.6\% and 16.2\% in the tongue dorsum vs. buccal mucosa data set). The variance explained in the first and second coordinate in the 2011 UniFrac commentary \mbox{%DIFAUXCMD
\cite{lozupone2011unifrac} }%DIFAUXCMD
was even smaller, at 8.6\% and 5.6\%.}\DIFaddendFL }
\label{fig2}
\end{figure}
Sixty tongue dorsum subsamples were drawn from the Human Microbiome Project data without replacement\DIFdelbegin \DIFdel{and filtered such that each gene had a minimum sum of }\DIFdelend \DIFaddbegin \DIFadd{. Rare OTUs with less than }\DIFaddend 100 \DIFdelbegin \DIFdel{counts across samples}\DIFdelend \DIFaddbegin \DIFadd{total counts across all the samples were removed}\DIFaddend . The minimum sample count for the subset of 60 we analyzed was 659, therefore we rarefied (subsampled) to the minimum of 659 to normalize the samples\DIFaddbegin \DIFadd{, prior to performing a principal coordinates analysis (PCoA)}\DIFaddend . For Fig.~\ref{fig2}, two independent rarefactions of the data were conducted in order to observe the effect of rarefaction \DIFdelbegin \DIFdel{variants on the metrics}\DIFdelend \DIFaddbegin \DIFadd{instance on the metric}\DIFaddend . The unweighted UniFrac distance was \DIFdelbegin \DIFdel{then }\DIFdelend computed for each rarefaction, and Procrustes adjustment was applied in order to overlay the \DIFaddbegin \DIFadd{PCoA-derived }\DIFaddend second rarefaction onto the first. A \DIFdelbegin \DIFdel{PCA }\DIFdelend \DIFaddbegin \DIFadd{PCoA }\DIFaddend of rarefaction 1 was plotted, and any samples that changed between rarefactions one and two were visualized with red and blue on the plot. If the sample moved from one \DIFdelbegin \DIFdel{cluster to another between the rarefactions}\DIFdelend \DIFaddbegin \DIFadd{side of the first coordinate axis to the other between the rarefaction instances}\DIFaddend , it was indicated with either a blue or a red arrow.
In both rarefactions on Fig.~\ref{fig2}, samples separated distinctly into two clusters on principal coordinate 1. Principal coordinate 1 explains the most variation in the data, and is thus useful to visualize if any associated metadata is behind the sample separation. However, the separation was not explainable by any metadata associated with the HMP experiment, and is thus an undesirable result. When plotting the rarefactions against each other, \DIFdelbegin \DIFdel{various }\DIFdelend \DIFaddbegin \DIFadd{several }\DIFaddend samples are observed \DIFdelbegin \DIFdel{moving between the various clusters}\DIFdelend \DIFaddbegin \DIFadd{to be unstable, exhibiting large differences in location}\DIFaddend . This example demonstrates that samples with little difference can appear to be different through the unweighted UniFrac distance metric \DIFaddbegin \DIFadd{and that rarefaction can lead to misleading and non-reproducible results}\DIFaddend .
\begin{figure}[h]
\DIFdelbeginFL %DIFDELCMD < \includegraphics[scale=0.7]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/ellipses.eps}
%DIFDELCMD < \caption{%%%
\DIFdelendFL \DIFaddbeginFL \includegraphics[scale=0.4]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/ellipses.eps}
\caption[Maxiumum relative deviation of rarefactions versus median deviation for traditional and non-traditional microbiome dissimilarity metrics.]{\DIFaddendFL {\bf Maxiumum relative deviation of rarefactions versus median deviation for traditional and non-traditional microbiome dissimilarity metrics.} Sixty samples from the tongue dorsum were taken from the Human Microbiome Project \cite{turnbaugh2007human}, and rarefied 100 times. The maximum relative deviation was plotted against the median relative deviation of the rarefied data, and ellipses were drawn at the 95\% confidence interval, around the cloud of points for each metric. \DIFaddbeginFL \DIFaddFL{A higher maximum and median devation indicates lower reproducibility of results between rarefaction instances. }\DIFaddendFL Both the maximum relative deviation of rarefied data and the median relative deviation of rarefied data are greater in unweighted UniFrac than in weighted UniFrac, Bray Curtis \DIFdelbeginFL \DIFdelFL{distance}\DIFdelendFL \DIFaddbeginFL \DIFaddFL{dissimilarity}\DIFaddendFL , \DIFdelbeginFL \DIFdelFL{centered }\DIFdelendFL ratio UniFrac, and information UniFrac.}
\label{fig3}
\end{figure}
For the ellipse plot in Fig.~\ref{fig3}, 60 tongue dorsum subsamples were randomly drawn without replacement\DIFdelbegin \DIFdel{and the gene compositions per sample were also filtered to a minimum of 100. }\DIFdelend \DIFaddbegin \DIFadd{. Rare OTUs with less than 100 total counts across all samples were removed. }\DIFaddend A hundred separate rarefactions were conducted on the data to a minimum sampling depth of \DIFdelbegin \DIFdel{378. }\DIFdelend \DIFaddbegin \DIFadd{659. }\DIFaddend For each individual rarefied OTU table, a distance matrix was computed using \DIFdelbegin \DIFdel{the }\DIFdelend \DIFaddbegin \DIFadd{one of }\DIFaddend unweighted Unifrac, weighted UniFrac, Bray-Curtis Dissimilarity, information UniFrac, \DIFdelbegin \DIFdel{and centered }\DIFdelend \DIFaddbegin \DIFadd{or }\DIFaddend ratio UniFrac as \DIFdelbegin \DIFdel{a }\DIFdelend \DIFaddbegin \DIFadd{the }\DIFaddend weighting method. By generating 100 separate datasets for each metric, it is possible to assess the \DIFdelbegin \DIFdel{magnitude of difference each metric has }\DIFdelend \DIFaddbegin \DIFadd{effect of rarefaction instance on each metric }\DIFaddend by analyzing what is essentially the same data. In other words, what does the effect of random sampling (rarefaction) have on the output of each metric? Each distance matrix generated per metric was adjusted with a Procrustes adjustment to overlay the subsequent rarefactions onto the first.
\DIFdelbegin \DIFdel{Thus, given the wide use of unweighted UniFrac in the literature with small principal component 1 and 2 effects, we suggest caution in their interpretation. For example, see the use of unweighted UniFrac in these papers about the human microbiome published in Cell\mbox{%DIFAUXCMD
\cite{hsiao2013microbiota} }%DIFAUXCMD
and Nature \mbox{%DIFAUXCMD
\cite{sonnenburg2016diet}}%DIFAUXCMD
.
}%DIFDELCMD <
%DIFDELCMD < %%%
\DIFdelend The maximum value of Vres for each rarefaction is plotted against the median value per rarefaction in Fig.~\ref{fig3}. This plotting serves to highlight the maximum potential change for an analysis given that there is no difference in the data. Unweighted UniFrac shows by far the highest maximum potential change between rarefactions, compared to weighted, information, and \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac, as well as Bray-Curtis.
\DIFdelbegin \subsection{\DIFdel{Why does Unweighted Unifrac have discrepancies when analyzing rarefied data?}}
%DIFAUXCMD
\addtocounter{subsection}{-1}%DIFAUXCMD
\DIFdel{The UniFrac distance is defined as the sum of unshared branches divided by the sum of all branch lengths \mbox{%DIFAUXCMD
\cite{lozupone2005unifrac}}%DIFAUXCMD
. Samples that are dissimilar will have values closer to 1 as they should have more unshared branches relative to one another. Similar samples have a value closer to 0 since they will have fewer unshared branches. As defined previously, rarefaction serves the purpose of standardizing sample counts to a common denominator, which is usually defined as the lowest sequencing depth(cite rarefaction paper here).
}\DIFdelend \DIFaddbegin \DIFadd{Given the wide use of unweighted UniFrac in the literature with small principal coordinate 1 and 2 effects, we suggest caution in their interpretation. For example, see the use of unweighted UniFrac in these papers about the human microbiome published in Cell\mbox{%DIFAUXCMD
\cite{hsiao2013microbiota}}%DIFAUXCMD
, where the first and second principal coordinates axis explain 14\% and 9.5\% of the variation in Figure 2A, as well as in Nature \mbox{%DIFAUXCMD
\cite{sonnenburg2016diet}}%DIFAUXCMD
, where the first principal coordinate explains 14\% of the variation in Figure 1. In both of these examples, less variance is explained by the first principal coordinate than in our uniform tongue data set.
}
\FloatBarrier
\subsection{\DIFadd{The cause of rarefaction variation by Unweighted Unifrac}}
\DIFaddend One point to note is that rarefaction carries the assumption that microbiota within samples are homogeneous and randomly distributed. However, this assumption is only valid if proper sampling protocols are observed \cite{gorzelak2015methods}. A combination of unevenly sampled \DIFdelbegin \DIFdel{genes }\DIFdelend \DIFaddbegin \DIFadd{OTUs }\DIFaddend and distantly related \DIFdelbegin \DIFdel{genes }\DIFdelend \DIFaddbegin \DIFadd{OTUs }\DIFaddend will contribute to \DIFdelbegin \DIFdel{UniFrac's variability when genes }\DIFdelend \DIFaddbegin \DIFadd{the variability in unweighted UniFrac when OTUs }\DIFaddend are ultimately rarefied. Distance matrices between samples will be affected when rare \DIFdelbegin \DIFdel{genes }\DIFdelend \DIFaddbegin \DIFadd{OTUs }\DIFaddend are left out during the rarefaction processes. It becomes intuitive to see how similar samples may grow dissimilar from each other through unweighted UniFrac on rarefied samples as the number of unshared branches increases as \DIFdelbegin \DIFdel{genes disappear}\DIFdelend \DIFaddbegin \DIFadd{OTUs are removed}\DIFaddend .
\begin{figure}[h]
\includegraphics[scale=0.5]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/tree.eps}
\DIFdelbeginFL %DIFDELCMD < \caption{%%%
\DIFdelendFL \DIFaddbeginFL \caption[Phylogenetic tree with long isolated branches.]{\DIFaddendFL {\bf Phylogenetic tree with long isolated branches.}
Variation in different rarefactions of data in unweighted UniFrac analysis is exacerbated by the presence of long isolated branches in the phylogenetic tree\DIFaddbeginFL \DIFaddFL{, such as the circled OTU in this example}\DIFaddendFL .}
\label{fig4}
\end{figure}
\textsc{\begin{table}[!ht]
% \begin{adjustwidth}{-2.25in}{0in} % Comment out/remove adjustwidth environment if table fits in text column.
\DIFdelbeginFL %DIFDELCMD < \caption{%
{%DIFAUXCMD
%DIFDELCMD < {\bf %%%
\DIFdelFL{Original abundance of taxa and rarefied abundance of taxa.}%DIFDELCMD < }%%%
}
%DIFAUXCMD
\DIFdelendFL \DIFaddbeginFL \caption[Original abundance of taxa and rarefied abundance of taxa.]{
{\bf \DIFaddFL{Original abundance of taxa and rarefied abundance of taxa.}} \DIFaddFL{This data was simulated to demonstrate how rarefaction can change the distances reported by the unweighted UniFrac metric. Originally, sample A contained 1075 counts and sample B contained 221 counts in total. Both samples were rarefied to 221 counts, twice. The OTU in bold has been rarified to a zero count in sample A for one instance and a non zero count in the other instance. In Rarefaction 1, the unweighted UniFrac distance (unshared over total branches) is 0.4175, while in Rarefaction 2 the distance is 1.12.}}
\DIFaddendFL \begin{tabular}{|l|l|l|l|l|l|l|l|}
\hline
\bf{OTU.ID} & \bf{A} & \bf{B} & \bf{\shortstack{A\\R1}} & \bf{\shortstack{B\\R1}} & \bf{\shortstack{A\\R2}} & \bf{\shortstack{B\\R2}}\\ \hline
OTU.16340 & 52 & 1 & 8 & 1 & 12 & 1\\ \hline
OTU.17317 & 17 & 4 & 3 & 4 & 5 & 4\\ \hline
OTU.20 & 70 & 18 & 14 & 18 & 20 & 18\\ \hline
OTU.37867 & 59 & 10 & 9 & 10 & 11 & 10\\ \hline
\DIFdelbeginFL \DIFdelFL{OTU.37990 }\DIFdelendFL \DIFaddbeginFL \textbf{\DIFaddFL{OTU.37990}} \DIFaddendFL & 7 & 59 & \DIFdelbeginFL \DIFdelFL{0 }\DIFdelendFL \DIFaddbeginFL \textbf{\DIFaddFL{0}} \DIFaddendFL & 59 & \DIFdelbeginFL \DIFdelFL{1 }\DIFdelendFL \DIFaddbeginFL \textbf{\DIFaddFL{1}} \DIFaddendFL & 59\\ \hline
OTU.38187 & 646 & 115 & 132 & 115 & 122 & 115\\ \hline
OTU.38446 & 6 & 8 & 0 & 8 & 1 & 8\\ \hline
OTU.45429 & 218 & 6 & 55 & 6 & 49 & 6\\ \hline
\end{tabular}
\DIFaddbeginFL \label{table1}
\DIFaddendFL \begin{flushleft}
\end{flushleft}
\DIFdelbeginFL %DIFDELCMD < \label{table1}
%DIFDELCMD < %%%
\DIFdelendFL % \end{adjustwidth}
\end{table}}
\DIFdelbegin \begin{eqnarray*}
\DIFdel{Distance_{A:B} for Rarefaction 1 }&\\
\DIFdel{Distance_{A:B} }&\DIFdel{=\frac{\sum Unshared Branches}{\sum Total Branches}}\\
&\DIFdel{= \frac{\text{(0.2889 + 0.1706)}}{\text{1.12}}}\\
&\DIFdel{= \frac{\text{0.5281}}{\text{1.12}}}\\
&\DIFdel{= \text{0.4175}
}\end{eqnarray*}
%DIFAUXCMD
%DIFDELCMD <
%DIFDELCMD < %%%
\begin{eqnarray*}
\DIFdel{Distance_{A:B} for Rarefaction 2 }&\\
\DIFdel{Distance_{A:B} }&\DIFdel{=\frac{\sum Unshared Branches}{\sum Total Branches}}\\
&\DIFdel{= \frac{\text{0}}{\text{1.12}}}\\
&\DIFdel{= \text{0}
}\end{eqnarray*}
%DIFAUXCMD
%DIFDELCMD <
%DIFDELCMD < %%%
\DIFdel{With rare genes }\DIFdelend \DIFaddbegin \DIFadd{With rare OTUs }\DIFaddend and long branch lengths in the phylogenetic tree (Fig.~\ref{fig4}), the Unweighted UniFrac distance metric on rarefied data is highly variable, declaring the samples A and B identical (distance of 0) with 1 rarefaction, and different with another (distance of 0.4175), as demonstrated in Table ~\ref{table1} and the calculations above.
While an improvement on unweighted UniFrac, weighted UniFrac can overweight differences between large proportional abundances and underweight differences between small proportional abundances. If one bacterial taxa increased in proportion from 5/1000 to 10/1000 and another taxa increased in proportion from 95/1000 to 100/1000, they would have the same weight in weighted UniFrac. However, the first taxa has doubled in proportion between samples, and this is much more biologically significant than the change in proportional abundance in the second taxa. Additionally, it does not account for how the counts add up to a constrained sum determined by the sequencing machine model. Because the sum is constrained, as \DIFdelbegin \DIFdel{an example}\DIFdelend \DIFaddbegin \DIFadd{with the bacterial vaginosis sample earlier}\DIFaddend , an increase in growth of one taxa can make the data look like there is a decrease in abundance in other taxa, even if in reality the population of the other taxa stayed the same.
Here we explore some alternatives to unweighted and weighted UniFrac, and discuss their merits and shortfalls.
\DIFaddbegin \FloatBarrier
\DIFaddend \subsection{Information UniFrac}
The difference in information content between \DIFaddbegin \DIFadd{taxa with }\DIFaddend low proportional abundances (which make up the bulk of microbiome data) is generally higher than the difference between the proportional abundances themselves, potentially allowing scientists to differentiate \DIFdelbegin \DIFdel{groups }\DIFdelend \DIFaddbegin \DIFadd{samples }\DIFaddend with subtle differences\DIFdelbegin \DIFdel{.
}\DIFdelend \DIFaddbegin \DIFadd{, such as the infected breastmilk sample in Fig.~\ref{fig7}.
}\DIFaddend
\begin{figure}[h]
\includegraphics[scale=0.6]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/weightings.eps}
\DIFdelbeginFL %DIFDELCMD < \caption{%%%
\DIFdelendFL \DIFaddbeginFL \caption[UniFrac weights.]{\DIFaddendFL {\bf \DIFdelbeginFL \DIFdelFL{Unifrac }\DIFdelendFL \DIFaddbeginFL \DIFaddFL{UniFrac }\DIFaddendFL weights. }
Each UniFrac weighting is plotted with the corresponding proportional abundance. The black line is unweighted UniFrac, the red line is weighted UniFrac,and the blue line is information UniFrac. \DIFaddbeginFL \DIFaddFL{From 0 to 0.2 on the x-axis information UniFrac has a higher slope, and therefore more discovery power for smaller changes in abundance. As the x-axis approaches 1, changes in abundance add little discovery power to information Unifrac.}\DIFaddendFL }
\label{fig5}
\end{figure}
\DIFdelbegin \DIFdel{Near the 0, 0 point in }\DIFdelend \DIFaddbegin \DIFadd{For example, }\DIFaddend Fig.~\ref{fig5} \DIFaddbegin \DIFadd{shows the weighting of a taxon in unweighted, weighted, and information UniFrac as a function of the taxon proportional abundance. Near the 0}\DIFaddend , \DIFdelbegin \DIFdel{the proportional }\DIFdelend \DIFaddbegin \DIFadd{0 point the proportional }\DIFaddend abundances are low \DIFaddbegin \DIFadd{and information is ~0. However, small increases in abundance result in large changes in contribution to UniFrac weighting, as shown by the slope of the curve}\DIFaddend . Here there is higher differentiation between weights of different pairs of low proportional abundances for information UniFrac, as shown by the higher slope of the curved graph. The \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac (not depicted) depends on the geometric mean of the taxonomic abundances, and \DIFaddbegin \DIFadd{each sample }\DIFaddend would have a different slope in the weight graph depending on how evenly the abundances were distributed.
\DIFdelbegin \subsection{\DIFdel{Tongue and cheek comparison}}
%DIFAUXCMD
\addtocounter{subsection}{-1}%DIFAUXCMD
\DIFdelend \DIFaddbegin \FloatBarrier
\DIFaddend
\DIFaddbegin \subsection{\DIFadd{Tongue and buccal mucosa comparison}}
\DIFaddend \begin{figure}[h]
\includegraphics[scale=0.8]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/tongue_cheek.eps}
\DIFdelbeginFL %DIFDELCMD < \caption{%%%
\DIFdelendFL \DIFaddbeginFL \caption[Analysis of tongue and buccal mucosa data using different UniFrac weightings.]{\DIFaddendFL {\bf Analysis of tongue and \DIFdelbeginFL \DIFdelFL{cheek }\DIFdelendFL \DIFaddbeginFL \DIFaddFL{buccal mucosa }\DIFaddendFL data using different UniFrac weightings. }
A principal coordinate analysis of a 16S rRNA \DIFaddbeginFL \DIFaddFL{gene tag }\DIFaddendFL experiment done on samples from the tongue and \DIFdelbeginFL \DIFdelFL{cheek}\DIFdelendFL \DIFaddbeginFL \DIFaddFL{buccal mucosa}\DIFaddendFL , selected from the Human Microbiome Project \cite{turnbaugh2007human}. All weightings \DIFaddbeginFL \DIFaddFL{and the Bray-Curtis dissimilarity }\DIFaddendFL show separation between the samples by body site. \DIFaddbeginFL \DIFaddFL{Note that the variance explained by the first and second principal coordinate axis is higher than in the tongue-tongue data set from Figure 2, which had 16.1\% and 9.8\% variance explained, respectively.}\DIFaddendFL }
\label{fig6}
\end{figure}
We next explore two other datasets, one with a defined difference between groups (tongue dorsum compared to buccal mucosa), and one with an outlier that is only apparent when analyzed by certain dissimilarity metrics.
Fig.~\ref{fig6} shows a principal coordinate analysis plot with four different metrics: unweighted UniFrac, weighted UniFrac, information UniFrac, and \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac. We observe that the difference in the microbiome between the human tongue and \DIFdelbegin \DIFdel{cheek }\DIFdelend \DIFaddbegin \DIFadd{buccal mucosa }\DIFaddend are well defined by all metrics (Fig.~\ref{fig6}), since all of the weightings show separation between the samples according to body site. We conclude \DIFaddbegin \DIFadd{from (Fig.~\ref{fig3}) }\DIFaddend that weighted UniFrac, information UniFrac, and \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac do not tend to show spurious separation in uniform data sets to the degree that unweighted UniFrac does\DIFdelbegin \DIFdel{(Fig.~\ref{fig3})}\DIFdelend , while reliably separating samples in data with a defined difference between groups.
\DIFaddbegin \FloatBarrier
\DIFaddend \subsection{Breast milk Data}
\begin{figure}[h]
\includegraphics[scale=0.8]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/breastmilk.eps}
\DIFdelbeginFL %DIFDELCMD < \caption{%%%
\DIFdelendFL \DIFaddbeginFL \caption[Analysis of breast milk data using different UniFrac weightings.]{\DIFaddendFL {\bf Analysis of breast milk data using different UniFrac weightings. }
A principal coordinate analysis of a \DIFaddbeginFL \DIFaddFL{simulated }\DIFaddendFL 16S rRNA \DIFaddbeginFL \DIFaddFL{gene tag }\DIFaddendFL experiment \DIFdelbeginFL \DIFdelFL{done }\DIFdelendFL \DIFaddbeginFL \DIFaddFL{based }\DIFaddendFL on \DIFdelbeginFL \DIFdelFL{samples from a 16S rRNA experiment on }\DIFdelendFL \DIFaddbeginFL \DIFaddFL{the }\DIFaddendFL breast milk \DIFaddbeginFL \DIFaddFL{data}\DIFaddendFL . \DIFdelbeginFL \DIFdelFL{The circled sample is infected with 97}\DIFdelendFL \DIFaddbeginFL \DIFaddFL{Red samples are dominated at 07}\DIFaddendFL \% \DIFdelbeginFL \textit{\DIFdelFL{Pseudomonas}}%DIFAUXCMD
\DIFdelendFL \DIFaddbeginFL \DIFaddFL{by }\textit{\DIFaddFL{Pasteurella}}\DIFaddendFL , \DIFdelbeginFL \DIFdelFL{compared }\DIFdelendFL \DIFaddbeginFL \DIFaddFL{black samples are dominated by }\textit{\DIFaddFL{Staphylococcus}}\DIFaddFL{, and cyan samples are dominated by }\textit{\DIFaddFL{Pseudomonas}}\DIFaddFL{. Note that while information Unifrac appears }\DIFaddendFL to \DIFdelbeginFL \DIFdelFL{15-20\% in }\DIFdelendFL \DIFaddbeginFL \DIFaddFL{separate }\DIFaddendFL the \DIFdelbeginFL \DIFdelFL{other }\DIFdelendFL samples \DIFaddbeginFL \DIFaddFL{reasonably well visually, the amount of variance explained by the first two coordinates is much lower than even weighted UniFrac}\DIFaddendFL .}
\label{fig7}
\end{figure}
Fig.~\ref{fig7} is a principal coordinate analysis of a 16S rRNA gene sequencing experiment done on microbiome samples from breast milk \cite{urbaniak2016human}. Breast milk samples were collected and the V4 region of the 16S rRNA gene was sequenced. One of \DIFdelbegin \DIFdel{these samples was infected (circled), consisting }\DIFdelend \DIFaddbegin \DIFadd{the patients who provided a sample had an active infection, producing a sample that consisted }\DIFaddend of 97\% \DIFdelbegin \DIFdel{Pasteurella}\DIFdelend \DIFaddbegin \textit{\DIFadd{Pasteurella}}\DIFaddend . We noted that this sample was not distinct in unweighted and weighted UniFrac because the distance from the \DIFdelbegin \DIFdel{Pasteurella }\DIFdelend \DIFaddbegin \textit{\DIFadd{Pasteurella}} \DIFaddend branches of the phylogenetic tree to the root of the tree (rooted by midpoint) were not particularly short or long, measuring at just over the 3rd quartile of all root-to-leaf distances. In addition, the \DIFdelbegin \DIFdel{Pasteurella }\DIFdelend \DIFaddbegin \textit{\DIFadd{Pasteurella}} \DIFaddend leaves shared a clade with many other taxa.
The reason the infected sample in the breast milk study is so distinct from the rest of the samples in Information UniFrac and \DIFdelbegin \DIFdel{Centered }\DIFdelend Ratio UniFrac is because of the weighting. The infected sample was 97\% \DIFdelbegin \DIFdel{Pasteurella}\DIFdelend \DIFaddbegin \textit{\DIFadd{Pasteurella}}\DIFaddend , while the other samples generally had 15-20\% each of Staphylococcus and Pseudomonas, and little or no \DIFdelbegin \DIFdel{Pasteurella}\DIFdelend \DIFaddbegin \textit{\DIFadd{Pasteurella}}\DIFaddend . Unweighted UniFrac does not differentiate between high and low abundance. Weighted UniFrac does, placing the infected sample in the bottom right corner of that plot. Information UniFrac weights everything in the infected sample close to zero, as taxa are present in either very high or very low abundance, while weighting Staphylococcus and Pseudomonas in the other samples highly (around 0.4) due to their 15-20\% abundance. \DIFdelbegin \DIFdel{Centered ratio }\DIFdelend \DIFaddbegin \DIFadd{Ratio }\DIFaddend UniFrac recognizes that the infected sample has a taxonomic abundance very far from the geometric mean abundance. For these reasons information and \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac are more adept at picking up outliers with uneven distributions, even if the taxa are shared by other samples.
\DIFaddbegin \FloatBarrier
\subsection{\DIFadd{Monoculture data}}
\begin{figure}[h]
\includegraphics[scale=0.8]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/monoculture.eps}
\caption[Analysis of simulated monocultures using different UniFrac weightings.]{{\bf \DIFaddFL{Analysis of simulated monocultures using different UniFrac weightings. }}
\DIFaddFL{A principal coordinate analysis of a simulated 16S rRNA gene tag experiment based on the breast milk data. Red samples are dominated at 97\% by }\textit{\DIFaddFL{Pasteurella}}\DIFaddFL{, black samples are dominated by }\textit{\DIFaddFL{Pseudomonas}}\DIFaddFL{, and cyan samples are dominated by }\textit{\DIFaddFL{Staphylococcus}}\DIFaddFL{. Note that while information Unifrac appears to separate the samples reasonably well visually, the amount of variance explained by the first two coordinates is much lower than even weighted UniFrac.}}
\label{fig8}
\end{figure}
\DIFadd{Each sample in the monoculture dataset is 97\% dominated by one of three taxa. However, within the remaining 3\% there is variation in the counts.
}
\DIFadd{Unweighted UniFrac, being a binary test, detects only the variation in the remaining 3\% of counts, without showing the difference in the monocultures. Weighted UniFrac detects only the difference in the identity of the monoculture, and the separation is driven by phylogenetic distance - the pairwise distance from }\textit{\DIFadd{Pasteurella}} \DIFadd{to }\textit{\DIFadd{Staphylococcus}} \DIFadd{and }\textit{\DIFadd{Pseudomonas}} \DIFadd{to }\textit{\DIFadd{Staphylococcus}} \DIFadd{is just over 0.9 on the phylogenetic tree while the distance from }\textit{\DIFadd{Pasteurella}} \DIFadd{to }\textit{\DIFadd{Pseudomonas}} \DIFadd{is 0.45. This is in correspondence with the PCoA plot where the first coordinate (which separates the }\textit{\DIFadd{Staphylococcus}} \DIFadd{species from the other two) explains over 90\% of the variance in the data set.
}
\DIFadd{Information UniFrac is known to not perform very well for monocultures, due to taxa with very high and low proportional abundances having uncertainty information values close to zero (Fig.~\ref{fig5}). While the samples separate visually with information UniFrac, the variance explained by the separation is low, and the distance matrix does not separate the three groups by hierarchical clustering. Ratio UniFrac and Bray Curtis both separate the samples by monoculture, and also differentiate the samples by their minor variations, showcasing a more representative perspective of this data set.
}
\DIFadd{If the samples are hierarchically clustered, the three groups separate perfectly with weighted UniFrac, ratio UniFrac, and Bray Curtis dissimilarity, but not with unweighted UniFrac or information UniFrac.
}
\FloatBarrier
\DIFaddend \section*{Discussion}
As shown in the tongue and \DIFdelbegin \DIFdel{cheek }\DIFdelend \DIFaddbegin \DIFadd{buccal mucosa }\DIFaddend data set, unweighted UniFrac is perfectly sufficient for data sets with a notable difference. However, in data sets with no difference or a very small difference between groups such the uniform tongue dorsum data set, unweighted UniFrac is the least reliable and we found that it may produce wildly different results depending on rarefaction and sequencing depth. This can result in spurious groups, or inclusion of samples in the wrong groups.
We found weighted UniFrac, information UniFrac, \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac, and Bray-Curtis methods to be more reliable choices. We suggest that investigators use several methods as they can detect outliers in different circumstances. When an outlier is detected by any metric, an investigation is warranted, as with \DIFdelbegin \DIFdel{our }\DIFdelend \DIFaddbegin \DIFadd{the }\DIFaddend example in the breast milk data set.
\DIFaddbegin \DIFadd{We do not believe that any of these weightings are a perfect model for microbiome data. Each tool is prone to its own set of weaknesses. If the difference in groups is driven by presence/absence then UniFrac is a reasonable choice. If the difference is driven by a linear abundance, then weighted UniFrac is a good choice. Information UniFrac and ratio UniFrac are useful for examining data sets that contain a similar set of taxa between groups. Information and ratio UniFrac are especially useful for examining data sets that have more subtle variations, due to their non linear nature. In any case, inspection should be done to make sure that the tool used accurately represents the data.
}
\DIFaddend In summary, with the addition of information UniFrac and \DIFdelbegin \DIFdel{centered }\DIFdelend ratio UniFrac, biologists have more tools at their disposal to prevent spurious interpretations, detect outliers, and ultimately understand their data better.
\DIFaddbegin \section*{\DIFadd{Supporting Information}}
%DIF > Include only the SI item label in the paragraph heading. Use the \nameref{label} command to cite SI items in the text.
\begin{figure}[h]
\paragraph*{\DIFaddFL{S1 Fig.}}\DIFaddFL{\mbox{}}\\
\includegraphics[scale=0.5]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/gunifrac_0_0_25.eps}
\caption[Principal Coordinate Analysis derived from GUniFrac distance matrices.]{{\bf \DIFaddFL{Principal Coordinate Analysis derived from GUniFrac distance matrices. }} \DIFaddFL{GUniFrac was run with an alpha of 0 and 0.25. Note that GUniFrac, like QIIME, prunes the tree with every pairwise comparison. That is, the phylogenetic tree used for the distance calculation for each pair of samples can be different. The resulting measurements are a dissimilarity, not a distance. Additionally, QIIME gives slightly different values from GUniFrac, but the source of this (likely an additional normalization) is not known.}}
\label{gunifrac_1}
\end{figure}
\begin{figure}[h]
\paragraph*{\DIFaddFL{S2 Fig.}}\DIFaddFL{\mbox{}}\\
\includegraphics[scale=0.5]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/gunifrac_0_5_0_75.eps}
\caption[Principal Coordinate Analysis derived from GUniFrac distance matrices.]{{\bf \DIFaddFL{Principal Coordinate Analysis derived from GUniFrac distance matrices. }} \DIFaddFL{GUniFrac was run with an alpha of 0.5 and 0.75. Note that GUniFrac, like QIIME, prunes the tree with every pairwise comparison. That is, the phylogenetic tree used for the distance calculation for each pair of samples can be different. The resulting measurements are a dissimilarity, not a distance. Additionally, QIIME gives slightly different values from GUniFrac, but the source of this (likely an additional normalization) is not known.}}
\label{gunifrac_2}
\end{figure}
\begin{figure}[h]
\paragraph*{\DIFaddFL{S3 Fig.}}\DIFaddFL{\mbox{}}\\
\includegraphics[scale=0.5]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/gunifrac_1_UW.eps}
\caption[Principal Coordinate Analysis derived from GUniFrac distance matrices.]{{\bf \DIFaddFL{Principal Coordinate Analysis derived from GUniFrac distance matrices. }} \DIFaddFL{GUniFrac was run with an alpha of 1 (equivalent to weighted UniFrac), plus unweighted UniFrac for comparison. Note that GUniFrac, like QIIME, prunes the tree with every pairwise comparison. That is, the phylogenetic tree used for the distance calculation for each pair of samples can be different. The resulting measurements are a dissimilarity, not a distance. Additionally, QIIME gives slightly different values from GUniFrac, but the source of this (likely an additional normalization) is not known.}}
\label{gunifrac_3}
\end{figure}
\begin{figure}[h]
\paragraph*{\DIFaddFL{S4 Fig.}}\DIFaddFL{\mbox{}}\\
\includegraphics[scale=0.7]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/tongue_tongue_sample_migration_no_pruning.eps}
\caption[Principal coordinate analysis derived from tongue dorsum samples using unweighted UniFrac distance matrices with no tree pruning.]{{\bf \DIFaddFL{Principal coordinate analysis derived from tongue dorsum samples using unweighted UniFrac distance matrices with no tree pruning.}}}
\label{S2_Fig}
\end{figure}
\begin{figure}[h]
\paragraph*{\DIFaddFL{S5 Fig.}}\DIFaddFL{\mbox{}}\\
\includegraphics[scale=0.7]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/tongue_cheek_sample_migration_no_pruning.eps}
\caption[Principal coordinate analysis derived from tongue dorsum and buccal mucosa samples using unweighted UniFrac distance matrices with no tree pruning.]{{\bf \DIFaddFL{Principal coordinate analysis derived from tongue dorsum and buccal mucosa samples using unweighted UniFrac distance matrices with no tree pruning.}}}
\label{S3_Fig}
\end{figure}
\begin{figure}[h]
\paragraph*{\DIFaddFL{S6 Fig.}}\DIFaddFL{\mbox{}}\\
\includegraphics[scale=0.7]{/Users/dphamnyghonca/Desktop/gloor/msc_thesis_2016/unifrac_paper/tree_pruning_example.eps}
\caption[Weighted UniFrac is a dissimilarity with tree pruning.]{{\bf \DIFaddFL{Weighted UniFrac is a dissimilarity with tree pruning.}} \DIFaddFL{Here, the weighted UniFrac measurements without tree pruning are: $W_{AB} = \frac{121}{252}$, $W_{BC} = \frac{111}{252}$, and $W_{AC} = \frac{11}{12}$. With tree pruning, the measurements are: $W_{AB} = \frac{121}{252}$, $W_{BC} = \frac{111}{252}$, and $W_{AC} = 1$, which fails the triangle inequality.}}
\label{S4_Fig}
\end{figure}
\FloatBarrier
\DIFaddend \section*{Acknowledgments}
Thanks to \DIFaddbegin \DIFadd{Dr. }\DIFaddend Camilla Urbaniak for providing the data from \DIFdelbegin \DIFdel{her }\DIFdelend \DIFaddbegin \DIFadd{the }\DIFaddend breast milk study \cite{urbaniak2016human}.
\nolinenumbers
%\section*{References}
% Either type in your references using
% \begin{thebibliography}{}
% \bibitem{}
% Text
% \end{thebibliography}
%
% OR
%
% Compile your BiBTeX database using our plos2015.bst
% style file and paste the contents of your .bbl file
% here.
%
\bibliography{unifrac}
\end{document}