Add paper

rrwick · Mar 1, 2019 · d3a0475 · d3a0475
1 parent 9428495
commit d3a0475
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 1 deletion.
diff --git a/badread/version.py b/badread/version.py
@@ -14,4 +14,4 @@
 If not, see <http://www.gnu.org/licenses/>.
 """
 
-__version__ = '0.1.1'
+__version__ = '0.1.3'
diff --git a/paper/codemeta.json b/paper/codemeta.json
@@ -0,0 +1,23 @@
+{
+  "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
+  "@type": "Code",
+  "author": [
+    {
+      "@id": "0000-0001-8349-0778",
+      "@type": "Person",
+      "email": "rrwick@gmail.com",
+      "name": "Ryan Wick",
+      "affiliation": "Department of Infectious Diseases, Central Clinical School, Monash University, Melbourne, Victoria 3004, Australia"
+    }
+  ],
+  "identifier": "",
+  "codeRepository": "https://github.com/rrwick/Badread",
+  "datePublished": "2019-03-01",
+  "dateModified": "2019-03-01",
+  "dateCreated": "2019-03-01",
+  "description": "Badread is a long-read simulator tool that can imitate many kinds of problems one might encounter in real read sets: chimeric reads, low-quality regions, systematic basecalling errors and more.",
+  "keywords": "long-read sequencing, oxford nanopore, pacbio, read simulation",
+  "license": "GPL v3.0",
+  "title": "Badread",
+  "version": "v0.1.3"
+}
diff --git a/paper/paper.bib b/paper/paper.bib
@@ -0,0 +1,94 @@
+@article{Eisenstein2017,
+  author  = {Eisenstein, Michael},
+  title   = {{An ace in the hole for DNA sequencing}},
+  journal = {Nature},
+  year    = {2017},
+  volume  = {550},
+  number  = {7675},
+  pages   = {285--288},
+  doi     = {10.1038/550285a},
+  url     = {https://www.nature.com/articles/550285a},
+}
+
+@article{Phillippy2017,
+  author  = {Phillippy, Adam M.},
+  title   = {{New advances in sequence assembly}},
+  journal = {Genome Research},
+  year    = {2017},
+  volume  = {27},
+  number  = {5},
+  pages   = {xi--xiii},
+  doi     = {10.1101/gr.223057.117},
+  url     = {https://genome.cshlp.org/content/27/5/xi.full},
+}
+
+@article{Koren2017,
+  author  = {Koren, Sergey and Walenz, Brian P. and Berlin, Konstantin and Miller, Jason R. and Phillippy, Adam M.},
+  title   = {{Canu: scalable and accurate long-read assembly via adaptive k-mer weighting and repeat separation}},
+  journal = {Genome Research},
+  year    = {2017},
+  volume  = {27},
+  number  = {5},
+  pages   = {722--736},
+  doi     = {10.1101/gr.215087.116},
+  url     = {https://genome.cshlp.org/content/27/5/722.full},
+}
+
+@article{Heather2016,
+  author  = {Heather, James M. and Chain, Benjamin},
+  title   = {{The sequence of sequencers: The history of sequencing DNA}},
+  journal = {Genomics},
+  year    = {2016},
+  volume  = {107},
+  number  = {1},
+  pages   = {1--8},
+  doi     = {10.1016/j.ygeno.2015.11.003},
+  url     = {https://www.sciencedirect.com/science/article/pii/S0888754315300410},
+}
+
+@article{Huang2012,
+  author  = {Huang, Weichun and Li, Leping and Myers, Jason R. and Marth, Gabor T.},
+  title   = {{ART: A next-generation sequencing read simulator}},
+  journal = {Bioinformatics},
+  year    = {2012},
+  volume  = {28},
+  number  = {4},
+  pages   = {593--594},
+  doi     = {10.1093/bioinformatics/btr708},
+  url     = {https://academic.oup.com/bioinformatics/article/28/4/593/213322},
+}
+
+@article{Ono2013,
+  author  = {Ono, Yukiteru and Asai, Kiyoshi and Hamada, Michiaki},
+  title   = {{PBSIM: PacBio reads simulator—toward accurate genome assembly}},
+  journal = {Bioinformatics},
+  year    = {2013},
+  volume  = {29},
+  number  = {1},
+  pages   = {119--121},
+  doi     = {10.1093/bioinformatics/bts649},
+  url     = {https://academic.oup.com/bioinformatics/article/29/1/119/273243},
+}
+
+@article{Mu2016,
+  author  = {Mu, John C. and Mohiyuddin, Marghoob and Dallett, Carolina and Lau, Bayo and {Bani Asadi}, Narges and Fang, Li Tai and Lam, Hugo Y. K.},
+  title   = {{LongISLND: in silico sequencing of lengthy and noisy datatypes}},
+  journal = {Bioinformatics},
+  year    = {2016},
+  volume  = {32},
+  number  = {24},
+  pages   = {3829--3832},
+  doi     = {10.1093/bioinformatics/btw602},
+  url     = {https://academic.oup.com/bioinformatics/article/32/24/3829/2525710},
+}
+
+@article{Yang2017,
+  author  = {Yang, Chen and Chu, Justin and Warren, Ren{\'{e}} L. and Birol, Inan{\c{c}}},
+  title   = {{NanoSim: Nanopore sequence read simulator based on statistical characterization}},
+  journal = {GigaScience},
+  year    = {2017},
+  volume  = {6},
+  number  = {4},
+  doi     = {10.1093/gigascience/gix010},
+  url     = {https://academic.oup.com/gigascience/article/6/4/gix010/3051934},
+}
diff --git a/paper/paper.md b/paper/paper.md
@@ -0,0 +1,43 @@
+---
+title: 'Badread: simulation of error-prone long reads'
+tags:
+  - long-read sequencing
+  - oxford nanopore
+  - ont
+  - pacific biosciences
+  - pacbio
+authors:
+  - name: Ryan R Wick
+    orcid: 0000-0001-8349-0778
+    affiliation: 1
+affiliations:
+ - name: Department of Infectious Diseases, Central Clinical School, Monash University, Melbourne, Victoria 3004, Australia
+   index: 1
+date: 1 March 2019
+bibliography: paper.bib
+---
+
+
+# Background
+
+DNA sequencing platforms aim to measure the sequence of nucleotides (A, C, G and T) in a sample of DNA. Sequencers made by Illumina have been the dominant technology for much of the past decade, but their platforms generate fragments of sequence ('reads') that are relatively small (~100–300 nucleotides in length). In contrast, Oxford Nanopore Technologies (ONT) and Pacific Biosciences (PacBio) produce 'long-read' sequencers that can generate sequence fragments with tens of thousands of nucleotides or more [@Eisenstein2017]. Long reads from these platforms can be very beneficial for genome assembly and other bioinformatic analyses [@Phillippy2017;@Koren2017]. ONT and PacBio sequencers achieve their long read lengths because they detect nucleotides in individual molecules of DNA, a.k.a. single-molecule sequencing [@Heather2016]. However, the stochastic nature of measuring at the single-molecule scale means that ONT and PacBio reads are 'noisy' – they contain a significant number of errors.
+
+Since sequencing reads from ONT and PacBio platforms are qualitatively different from Illumina reads (long and noisy vs short and accurate), they often require novel methods of analysis. The last few years have seen much research in this space, and one useful technique for evaluating new methods is read simulation: generating fake sequencing reads from a reference nucleotide sequence [@Huang2012]. This approach has some key advantages over using real sequencing data: it can be faster and more affordable, and it allows for a greater number of tests. Additionally, when using simulated reads, the reference nucleotide sequence provides a confident ground truth which may not be otherwise available.
+
+
+
+# Summary
+
+Here we introduce Badread, a software tool for _in silico_ simulation of long reads. Its primary aim is to generate simulated read sets for the purpose of evaluating tools or methods that take long reads as input. Badread differs from existing tools (e.g. PBSIM [@Ono2013], LongISLND [@Mu2016] and NanoSim [@Yang2017]) in two key ways. First, it can simulate types of read errors that other tools cannot. While other long read simulation tools focus on modelling read length and sequencing errors, Badread can additionally include chimeras (a single read which consists of two or more non-contiguous sequences), adapters (additional sequences from the library preparation at the start or end of a read), glitches (localised regions of low accuracy) and junk reads (low-complexity repetitive sequences).
+
+The second way Badread differs from existing tools is that it prioritises control over realism. Using read length as an example, other long read simulation tools may sample read lengths from a real read set, so their simulated reads follow a realistic distribution. Badread instead uses a gamma distribution for read lengths where the user specifies the mean and standard deviation – less realistic but highly tuneable. Users can therefore generate many read sets which quantitatively vary, e.g. mean lengths of 1000, 2000, 3000, etc. Other characteristics of the read set (read accuracy, chimera rate, glitch rate, etc.) can be similarly tuned in Badread, allowing users to systematically evaluate how they affect the performance of a tool or method.
+
+
+
+# Availability
+
+Badread is open-source and available under the GPLv3 license at [github.com/rrwick/Badread](https://github.com/rrwick/Badread).
+
+
+
+# References