Fix for issue #432 in Picard Tools. CreateSequenceDictionary stalls indefinitely with large genomes #744

Merged
merged 8 commits into from Nov 24, 2016
@@ -0,0 +1,114 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package htsjdk.samtools;
+
+import htsjdk.samtools.util.LineReader;
+import java.io.BufferedWriter;
+
+/**
+ * "On the fly" codec SAMSequenceDictionaryCodec.
+ * Encodes each sequence and directly writes it to the Dictionary file.
+ *
+ * This class writes to the BufferedWriter supplied by the caller and never closes it, so the caller must close the writer once encoding is finished.
+ * You can work with this class as shown below.
+ *
+ * Example of using this class:
+ *
+ * List<SAMSequenceRecord> dict = ...;
+ *
+ * //open BufferedWriter and close in try-with-resources
+ * try(BufferedWriter writer = new BufferedWriter(new FileWriter("path/to/file"))) {
+ * SAMSequenceDictionaryCodec codec = new SAMSequenceDictionaryCodec(writer);
+ *
+ * //we have list of sequences, so encode header line and after that encode each sequence
+ * codec.encodeHeaderLine(false);
+ * dict.forEach(codec::encodeSequenceRecord);
+ *}
+ *
+ * or
+ *
+ * SAMSequenceDictionary dict = ...;
+ *
+ * //open BufferedWriter and close in try-with-resources
+ * try(BufferedWriter writer = new BufferedWriter(new FileWriter("path/to/file"))) {
+ * SAMSequenceDictionaryCodec codec = new SAMSequenceDictionaryCodec(writer);
+ *
+ * //we have complete {@link SAMSequenceDictionary}, so just encode it.
+ * codec.encode(dict);
+ *}
+ *
+ * @author Pavel_Silin@epam.com, EPAM Systems, Inc. <www.epam.com>
+ */
+public class SAMSequenceDictionaryCodec {
+
+    /** Header handed to the underlying text codec at construction time. */
+    private static final SAMFileHeader EMPTY_HEADER = new SAMFileHeader();
+
+    /** Text header codec that performs the actual encoding and decoding. */
+    private final SAMTextHeaderCodec codec;
+
+    /**
+     * Creates a dictionary codec on top of a {@link SAMTextHeaderCodec} that is
+     * initialised with an empty header and the given writer.
+     * @param writer destination for the encoded text; this class never closes it.
+     */
+    public SAMSequenceDictionaryCodec(final BufferedWriter writer) {
+        final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
+        headerCodec.setmFileHeader(EMPTY_HEADER);
+        headerCodec.setWriter(writer);
+        this.codec = headerCodec;
+    }
+
+    /**
+     * Writes a single {@link SAMSequenceRecord} through the underlying text header codec.
+     * @param sequenceRecord record to be converted to text.
+     */
+    public void encodeSequenceRecord(final SAMSequenceRecord sequenceRecord) {
+        this.codec.encodeSequenceRecord(sequenceRecord);
+    }
+
+    /**
+     * Writes the header line through the underlying text header codec.
+     * @param keepExistingVersionNumber boolean flag to keep existing version number.
+     */
+    public void encodeHeaderLine(final boolean keepExistingVersionNumber) {
+        this.codec.encodeHeaderLine(keepExistingVersionNumber);
+    }
+
+    /**
+     * Parses a text SAM header and returns the sequence dictionary it contains.
+     * @param reader Where to get header text from.
+     * @param source Name of the input file, for error messages. May be null.
+     * @return the SAMSequenceDictionary of the decoded header.
+     */
+    public SAMSequenceDictionary decode(final LineReader reader, final String source) {
+        final SAMFileHeader decodedHeader = this.codec.decode(reader, source);
+        return decodedHeader.getSequenceDictionary();
+    }
+
+    /**
+     * Writes a complete {@link SAMSequenceDictionary} as text: first the header line
+     * (without keeping an existing version number), then one record per sequence.
+     * @param dictionary object to be converted to text.
+     */
+    public void encode(final SAMSequenceDictionary dictionary) {
+        this.codec.encodeHeaderLine(false);
+        for (final SAMSequenceRecord sequenceRecord : dictionary.getSequences()) {
+            encodeSequenceRecord(sequenceRecord);
+        }
+    }
+
+    /**
+     * Forwards the validation stringency to the underlying text header codec.
+     * @param validationStringency stringency to apply.
+     */
+    public void setValidationStringency(final ValidationStringency validationStringency) {
+        this.codec.setValidationStringency(validationStringency);
+    }
+}
@@ -70,6 +70,14 @@
public static final String COMMENT_PREFIX = HEADER_LINE_START + HeaderRecordType.CO.name() + FIELD_SEPARATOR;
+ /**
+  * Sets the writer that the encode methods of this codec write to.
+  * @param writer destination for the encoded header text.
+  */
+ void setWriter(final BufferedWriter writer) {
+     this.writer = writer;
+ }
+
+ /**
+  * Sets the header this codec currently works with.
+  * Note that {@code decode} replaces this header with a freshly created one.
+  * @param header header to use.
+  */
+ void setmFileHeader(final SAMFileHeader header) {
+     this.mFileHeader = header;
+ }
+
/**
* Reads text SAM header and converts to a SAMFileHeader object.
* @param reader Where to get header text from.
@@ -80,8 +88,8 @@ public SAMFileHeader decode(final LineReader reader, final String source) {
mFileHeader = new SAMFileHeader();
mReader = reader;
mSource = source;
- sequences = new ArrayList<SAMSequenceRecord>();
- readGroups = new ArrayList<SAMReadGroupRecord>();
+ sequences = new ArrayList<>();
+ readGroups = new ArrayList<>();
while (advanceLine() != null) {
final ParsedHeaderLine parsedHeaderLine = new ParsedHeaderLine(mCurrentLine);
@@ -387,6 +395,30 @@ public void encode(final Writer writer, final SAMFileHeader header, final boolea
}
}
+ /**
+  * Writes a single {@link SAMSequenceRecord} as an SQ header line to the current writer.
+  * Intended for {@link SAMSequenceDictionaryCodec}, which uses it to write sequence records
+  * one at a time ("on the fly").
+  * @param sequenceRecord record to be written.
+  * @throws IllegalStateException if writer is null.
+  */
+ void encodeSequenceRecord(final SAMSequenceRecord sequenceRecord) {
+     // Fail with a clear error rather than a NullPointerException when no writer has been set.
+     if (writer == null) {
+         throw new IllegalStateException("writer couldn't be null");
+     }
+     writeSQLine(sequenceRecord);
+ }
+
+ /**
+  * Writes the HD header line to the current writer.
+  * Intended for {@link SAMSequenceDictionaryCodec}, which uses it to write the header line
+  * before writing sequence records one at a time ("on the fly").
+  * @param keepExistingVersionNumber boolean flag to keep existing version number.
+  * @throws IllegalStateException if writer is null.
+  */
+ void encodeHeaderLine(final boolean keepExistingVersionNumber) {
+     // Fail with a clear error rather than a NullPointerException when no writer has been set.
+     if (writer == null) {
+         throw new IllegalStateException("writer couldn't be null");
+     }
+     writeHDLine(keepExistingVersionNumber);
+ }
+
private void println(final String s) {
try {
writer.append(s);
@@ -438,7 +470,7 @@ private void writeHDLine(final boolean keepExistingVersionNumber) {
}
private void writeSQLine(final SAMSequenceRecord sequenceRecord) {
- final int numAttributes =sequenceRecord.getAttributes() != null ? sequenceRecord.getAttributes().size() : 0;
+ final int numAttributes = sequenceRecord.getAttributes() != null ? sequenceRecord.getAttributes().size() : 0;
final String[] fields = new String[3 + numAttributes];
fields[0] = HEADER_LINE_START + HeaderRecordType.SQ;
fields[1] = SAMSequenceRecord.SEQUENCE_NAME_TAG + TAG_KEY_VALUE_SEPARATOR + sequenceRecord.getSequenceName();
@@ -0,0 +1,122 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package htsjdk.samtools;
+
+import htsjdk.samtools.util.LineReader;
+import htsjdk.samtools.util.StringLineReader;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import javax.sound.sampled.Line;
+import java.io.BufferedWriter;
+import java.io.StringWriter;
+import java.util.List;
+import java.util.Random;
+
+import static org.testng.Assert.*;
+
+/**
+ * @author Pavel_Silin@epam.com, EPAM Systems, Inc. <www.epam.com>
+ */
+public class SAMSequenceDictionaryCodecTest {
+
+    // Fixed seed so that a failing run can be reproduced with the same sequence lengths.
+    private static final Random random = new Random(432L);
+    private SAMSequenceDictionary dictionary;
+    private StringWriter writer;
+    private SAMSequenceDictionaryCodec codec;
+    private BufferedWriter bufferedWriter;
+
+    /**
+     * Builds a small dictionary with pseudo-random sequence lengths and a codec that
+     * writes into an in-memory {@link StringWriter}.
+     */
+    @BeforeMethod
+    public void setUp() throws Exception {
+        final String[] seqs = new String[]{"chr1", "chr2", "chr12", "chr16", "chrX"};
+        dictionary = new SAMSequenceDictionary();
+        for (final String seq : seqs) {
+            dictionary.addSequence(new SAMSequenceRecord(seq, random.nextInt(10_000_000)));
+        }
+        writer = new StringWriter();
+        bufferedWriter = new BufferedWriter(writer);
+        codec = new SAMSequenceDictionaryCodec(bufferedWriter);
+    }
+
+    /** Encodes the whole dictionary in one call and verifies the written text. */
+    @Test
+    public void testEncodeDecodeDictionary() throws Exception {
+        codec.encode(dictionary);
+        verifyWrittenDictionary();
+    }
+
+    /** Encodes the header line and then each record separately, and verifies the written text. */
+    @Test
+    public void testEncodeDecodeListOfSeqs() throws Exception {
+        final List<SAMSequenceRecord> sequences = dictionary.getSequences();
+        codec.encodeHeaderLine(false);
+        sequences.forEach(codec::encodeSequenceRecord);
+        verifyWrittenDictionary();
+    }
+
+    /**
+     * Closes the writer (flushing everything that was encoded) and checks that the text
+     * decodes back to {@link #dictionary} and consists of one @HD line followed by one
+     * @SQ line per sequence.
+     */
+    private void verifyWrittenDictionary() throws Exception {
+        bufferedWriter.close();
+        final String encoded = writer.toString();
+
+        // Each reader is created right before its own try block, so the finally clause never
+        // sees a null reader and cannot mask the real failure.
+        final LineReader decodeReader = new StringLineReader(encoded);
+        try {
+            final SAMSequenceDictionary actual = codec.decode(decodeReader, null);
+            assertEquals(actual, dictionary);
+        } finally {
+            decodeReader.close();
+        }
+
+        final LineReader lineReader = new StringLineReader(encoded);
+        try {
+            String line = lineReader.readLine();
+            assertNotNull(line, "encoded text is empty");
+            assertTrue(line.startsWith("@HD"), "first line is not an @HD line: " + line);
+
+            int sqLineCount = 0;
+            for (line = lineReader.readLine(); line != null; line = lineReader.readLine()) {
+                assertTrue(line.startsWith("@SQ"), "unexpected line: " + line);
+                sqLineCount++;
+            }
+            assertEquals(sqLineCount, dictionary.size());
+        } finally {
+            lineReader.close();
+        }
+    }
+}