From 9e0199a0036f05e0c0f8bdb6e959c757d69a462b Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 8 Jun 2017 19:12:28 -0400 Subject: [PATCH 01/23] Fix SAMRecord.getReadLength() so it does not throw an exception if null bases --- src/main/java/htsjdk/samtools/SAMRecord.java | 3 ++- .../java/htsjdk/samtools/SAMRecordUnitTest.java | 24 ++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMRecord.java b/src/main/java/htsjdk/samtools/SAMRecord.java index 8e61d150d..9ba0b9966 100644 --- a/src/main/java/htsjdk/samtools/SAMRecord.java +++ b/src/main/java/htsjdk/samtools/SAMRecord.java @@ -262,7 +262,8 @@ public void setReadBases(final byte[] value) { * @return number of bases in the read. */ public int getReadLength() { - return getReadBases().length; + final byte[] readBases = getReadBases(); + return readBases == null ? 0 : readBases.length; } /** diff --git a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java index 29118197f..acb9a636d 100644 --- a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java +++ b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java @@ -776,7 +776,7 @@ public void testNullHeaderRecordValidation() { } @Test - private void testNullHeaderDeepCopy() { + public void testNullHeaderDeepCopy() { SAMRecord sam = createTestRecordHelper(); sam.setHeader(null); final SAMRecord deepCopy = sam.deepCopy(); @@ -805,13 +805,13 @@ private void testNullHeaderCigar(SAMRecord rec) { } @Test - private void testNullHeadGetCigarSAM() { - SAMRecord sam = createTestRecordHelper(); + public void testNullHeadGetCigarSAM() { + final SAMRecord sam = createTestRecordHelper(); testNullHeaderCigar(sam); } @Test - private void testNullHeadGetCigarBAM() { + public void testNullHeadGetCigarBAM() { SAMRecord sam = createTestRecordHelper(); SAMRecordFactory factory = new DefaultSAMRecordFactory(); BAMRecord bamRec = factory.createBAMRecord( @@ -1039,4 +1039,20 @@ public SAMRecord createTestSamRec() { return(rec); } + + @DataProvider + public Object [][] readBasesGetReadLengthData() { + return new Object[][]{ + { null, 0 }, + { SAMRecord.NULL_SEQUENCE, 0 }, + { new byte[] {'A', 'C'}, 2 } + }; + } + + @Test(dataProvider = "readBasesGetReadLengthData") + public void testNullReadBasesGetReadLength(final byte[] readBases, final int readLength) { + final SAMRecord sam = createTestRecordHelper(); + sam.setReadBases(readBases); + Assert.assertEquals(sam.getReadLength(), readLength); + } } From 7e79bbac3be60eeec86ea705498098e023ee6999 Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Fri, 9 Jun 2017 05:00:25 +0200 Subject: [PATCH 02/23] use Locatable interface (#887) --- src/main/java/htsjdk/samtools/util/IntervalTreeMap.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/IntervalTreeMap.java b/src/main/java/htsjdk/samtools/util/IntervalTreeMap.java index 259308732..ebec2f484 100644 --- a/src/main/java/htsjdk/samtools/util/IntervalTreeMap.java +++ b/src/main/java/htsjdk/samtools/util/IntervalTreeMap.java @@ -165,16 +165,16 @@ public int size() { } /** * Test overlapping interval - * @param key the interval + * @param key the Locatable * @return true if it contains an object overlapping the interval */ - public boolean containsOverlapping(final Interval key) { + public boolean containsOverlapping(final Locatable key) { final IntervalTree tree = mSequenceMap.get(key.getContig()); return tree!=null && tree.overlappers(key.getStart(), key.getEnd()).hasNext(); } - public Collection getOverlapping(final Interval key) { + public Collection getOverlapping(final Locatable key) { final List result = new ArrayList(); final IntervalTree tree = mSequenceMap.get(key.getContig()); if (tree != null) { @@ -187,10 +187,10 @@ public boolean containsOverlapping(final Interval key) { } /** * Test if this contains an object that is contained by 'key' - * @param key the interval + * @param key the Locatable * @return true if it contains an object is contained by 'key' */ - public boolean containsContained(final Interval key) { + public boolean containsContained(final Locatable key) { final IntervalTree tree = mSequenceMap.get(key.getContig()); if(tree==null) return false; final Iterator> iterator = tree.overlappers(key.getStart(), key.getEnd()); @@ -204,7 +204,7 @@ public boolean containsContained(final Interval key) { } - public Collection getContained(final Interval key) { + public Collection getContained(final Locatable key) { final List result = new ArrayList(); final IntervalTree tree = mSequenceMap.get(key.getContig()); if (tree != null) { From 112758bd4a7546dd85e70d5ec4ad55a58374b53d Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 9 Jun 2017 09:01:35 -0400 Subject: [PATCH 03/23] Added test using setReadString() --- src/main/java/htsjdk/samtools/SAMRecord.java | 4 +++- src/main/java/htsjdk/samtools/util/StringUtil.java | 6 ++++++ .../java/htsjdk/samtools/SAMRecordUnitTest.java | 22 +++++++++++++++++++--- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMRecord.java b/src/main/java/htsjdk/samtools/SAMRecord.java index 9ba0b9966..f93b2d72c 100644 --- a/src/main/java/htsjdk/samtools/SAMRecord.java +++ b/src/main/java/htsjdk/samtools/SAMRecord.java @@ -238,7 +238,9 @@ public void setReadString(final String value) { mReadBases = NULL_SEQUENCE; } else { final byte[] bases = StringUtil.stringToBytes(value); - SAMUtils.normalizeBases(bases); + if (bases != null) { + SAMUtils.normalizeBases(bases); + } setReadBases(bases); } } diff --git a/src/main/java/htsjdk/samtools/util/StringUtil.java b/src/main/java/htsjdk/samtools/util/StringUtil.java index 90492533e..a885ba2db 100644 --- a/src/main/java/htsjdk/samtools/util/StringUtil.java +++ b/src/main/java/htsjdk/samtools/util/StringUtil.java @@ -312,6 +312,9 @@ public static String bytesToString(final byte[] buffer, final int offset, final } return byteBuffer; */ + if (s == null) { + return null; + } final byte[] byteBuffer = new byte[s.length()]; s.getBytes(0, byteBuffer.length, byteBuffer, 0); return byteBuffer; @@ -319,6 +322,9 @@ public static String bytesToString(final byte[] buffer, final int offset, final @SuppressWarnings("deprecation") public static byte[] stringToBytes(final String s, final int offset, final int length) { + if (s == null) { + return null; + } final byte[] byteBuffer = new byte[length]; s.getBytes(offset, offset + length, byteBuffer, 0); return byteBuffer; diff --git a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java index acb9a636d..1bfe26345 100644 --- a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java +++ b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java @@ -1041,7 +1041,7 @@ public SAMRecord createTestSamRec() { } @DataProvider - public Object [][] readBasesGetReadLengthData() { + public Object [][] readBasesArrayGetReadLengthData() { return new Object[][]{ { null, 0 }, { SAMRecord.NULL_SEQUENCE, 0 }, @@ -1049,10 +1049,26 @@ public SAMRecord createTestSamRec() { }; } - @Test(dataProvider = "readBasesGetReadLengthData") - public void testNullReadBasesGetReadLength(final byte[] readBases, final int readLength) { + @Test(dataProvider = "readBasesArrayGetReadLengthData") + public void testReadBasesGetReadLength(final byte[] readBases, final int readLength) { final SAMRecord sam = createTestRecordHelper(); sam.setReadBases(readBases); Assert.assertEquals(sam.getReadLength(), readLength); } + + @DataProvider + public Object [][] readBasesStringGetReadLengthData() { + return new Object[][]{ + { null, 0 }, + { SAMRecord.NULL_SEQUENCE_STRING, 0 }, + { "AC", 2 } + }; + } + + @Test(dataProvider = "readBasesStringGetReadLengthData") + public void testReadStringGetReadLength(final String readBases, final int readLength) { + final SAMRecord sam = createTestRecordHelper(); + sam.setReadString(readBases); + Assert.assertEquals(sam.getReadLength(), readLength); + } } From 9b84fe816039aba122c421191b13824f099cefdb Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 9 Jun 2017 11:00:11 -0400 Subject: [PATCH 04/23] Cleaned up SAMUtils (mostly cosmetic) (#888) * Fixing-up SamUtils: - adding more information to some exceptions - fixing java-doc - java8 <> - if/else alignment & braces - removed use of deprecated SAMRecordUtils class - @Deprecated all methods and members of SAMRecordUtils - fixed javadoc @deprecated across the board. --- .../java/htsjdk/samtools/DuplicateSetIterator.java | 2 +- src/main/java/htsjdk/samtools/SAMRecordUtil.java | 27 +- src/main/java/htsjdk/samtools/SAMUtils.java | 302 ++++----- .../htsjdk/samtools/filter/FilteringIterator.java | 2 +- src/main/java/htsjdk/tribble/util/HTTPHelper.java | 3 + .../variantcontext/GenotypeLikelihoods.java | 6 + .../variantcontext/filter/FilteringIterator.java | 2 +- src/main/java/htsjdk/variant/vcf/VCFEncoder.java | 715 +++++++++++---------- 8 files changed, 547 insertions(+), 512 deletions(-) diff --git a/src/main/java/htsjdk/samtools/DuplicateSetIterator.java b/src/main/java/htsjdk/samtools/DuplicateSetIterator.java index f3b9b072a..6e833035b 100644 --- a/src/main/java/htsjdk/samtools/DuplicateSetIterator.java +++ b/src/main/java/htsjdk/samtools/DuplicateSetIterator.java @@ -114,7 +114,7 @@ public DuplicateSetIterator(final CloseableIterator iterator, } @Deprecated - /** Do not use this method as the first duplicate set will not be compared with this scoring strategy. + /** @deprecated Do not use this method as the first duplicate set will not be compared with this scoring strategy. * Instead, provide a comparator to the constructor that has the scoring strategy set. */ public void setScoringStrategy(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy) { this.comparator.setScoringStrategy(scoringStrategy); diff --git a/src/main/java/htsjdk/samtools/SAMRecordUtil.java b/src/main/java/htsjdk/samtools/SAMRecordUtil.java index d778789d7..9435934c5 100644 --- a/src/main/java/htsjdk/samtools/SAMRecordUtil.java +++ b/src/main/java/htsjdk/samtools/SAMRecordUtil.java @@ -23,23 +23,28 @@ */ package htsjdk.samtools; -import htsjdk.samtools.util.SequenceUtil; -import htsjdk.samtools.util.StringUtil; - import java.util.Arrays; import java.util.Collection; import java.util.List; /** * - * Use {@link SAMRecord#reverseComplement()} instead, which defaults to making a copy of attributes for reverse - * complement rather than changing them in-place. - * * @author alecw@broadinstitute.org + * + * @deprecated 10/27/2016 Use {@link SAMRecord} constants and functions */ @Deprecated public class SAMRecordUtil { + /** + * @deprecated 6/5/2017 Use {@link SAMRecord#TAGS_TO_REVERSE_COMPLEMENT} + */ + @Deprecated public static List TAGS_TO_REVERSE_COMPLEMENT = Arrays.asList(SAMTag.E2.name(), SAMTag.SQ.name()); + + /** + * @deprecated 6/5/2017 Use {@link SAMRecord#TAGS_TO_REVERSE} + */ + @Deprecated public static List TAGS_TO_REVERSE = Arrays.asList(SAMTag.OQ.name(), SAMTag.U2.name()); /** @@ -48,7 +53,11 @@ * or attributes. If a copy is needed use {@link #reverseComplement(SAMRecord, boolean)}. * See {@link #TAGS_TO_REVERSE_COMPLEMENT} {@link #TAGS_TO_REVERSE} * for the default set of tags that are handled. + * + * @deprecated 6/5/2017 Use {@link SAMRecord#reverseComplement} but note that the default behavior there is different + * It will default to making a copy, not reverse-complementing in-place! */ + @Deprecated public static void reverseComplement(final SAMRecord rec) { rec.reverseComplement(TAGS_TO_REVERSE_COMPLEMENT, TAGS_TO_REVERSE, true); } @@ -61,7 +70,10 @@ public static void reverseComplement(final SAMRecord rec) { * * @param rec Record to reverse complement. * @param inplace Setting this to false will clone all attributes, bases and qualities before changing the values. + * + * @deprecated 6/5/2017 Use {@link SAMRecord#reverseComplement} */ + @Deprecated public static void reverseComplement(final SAMRecord rec, boolean inplace) { rec.reverseComplement(TAGS_TO_REVERSE_COMPLEMENT, TAGS_TO_REVERSE, inplace); } @@ -70,7 +82,10 @@ public static void reverseComplement(final SAMRecord rec, boolean inplace) { * Reverse complement bases and reverse quality scores. In addition reverse complement any * non-null attributes specified by tagsToRevcomp and reverse and non-null attributes * specified by tagsToReverse. + * + * @deprecated 6/5/2017 Use {@link SAMRecord#reverseComplement} */ + @Deprecated public static void reverseComplement(final SAMRecord rec, final Collection tagsToRevcomp, final Collection tagsToReverse, boolean inplace) { rec.reverseComplement(tagsToRevcomp, tagsToReverse, inplace); } diff --git a/src/main/java/htsjdk/samtools/SAMUtils.java b/src/main/java/htsjdk/samtools/SAMUtils.java index d439a4a83..5b81de979 100644 --- a/src/main/java/htsjdk/samtools/SAMUtils.java +++ b/src/main/java/htsjdk/samtools/SAMUtils.java @@ -43,14 +43,17 @@ import java.util.TreeMap; import java.util.regex.Pattern; - /** * Utilty methods. */ public final class SAMUtils { - /** regex for semicolon, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */ + /** + * regex for semicolon, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} + */ private static final Pattern SEMICOLON_PAT = Pattern.compile("[;]"); - /** regex for comma, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */ + /** + * regex for comma, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} + */ private static final Pattern COMMA_PAT = Pattern.compile("[,]"); // Representation of bases, one for when in low-order nybble, one for when in high-order nybble. @@ -87,27 +90,26 @@ private static final byte COMPRESSED_K_HIGH = (byte) (COMPRESSED_K_LOW << 4); private static final byte COMPRESSED_D_HIGH = (byte) (COMPRESSED_D_LOW << 4); private static final byte COMPRESSED_B_HIGH = (byte) (COMPRESSED_B_LOW << 4); - - private static final byte [] COMPRESSED_LOOKUP_TABLE = - new byte[]{ - '=', - 'A', - 'C', - 'M', - 'G', - 'R', - 'S', - 'V', - 'T', - 'W', - 'Y', - 'H', - 'K', - 'D', - 'B', - 'N' - }; - + + private static final byte[] COMPRESSED_LOOKUP_TABLE = { + '=', + 'A', + 'C', + 'M', + 'G', + 'R', + 'S', + 'V', + 'T', + 'W', + 'Y', + 'H', + 'K', + 'D', + 'B', + 'N' + }; + public static final int MAX_PHRED_SCORE = 93; /** @@ -135,8 +137,8 @@ * Convert from a byte array with bases stored in nybbles, with for example,=, A, C, G, T, N represented as 0, 1, 2, 4, 8, 15, * to a a byte array containing =AaCcGgTtNn represented as ASCII. * - * @param length Number of bases (not bytes) to convert. - * @param compressedBases Bases represented as nybbles, in BAM binary format. + * @param length Number of bases (not bytes) to convert. + * @param compressedBases Bases represented as nybbles, in BAM binary format. * @param compressedOffset Byte offset in compressedBases to start. * @return New byte array with bases as ASCII bytes. */ @@ -215,7 +217,7 @@ private static byte charToCompressedBaseLow(final byte base) { case 'b': return COMPRESSED_B_LOW; default: - throw new IllegalArgumentException("Bad base passed to charToCompressedBaseLow: " + Character.toString((char)base) + "(" + base + ")"); + throw new IllegalArgumentException("Bad base passed to charToCompressedBaseLow: " + Character.toString((char) base) + "(" + base + ")"); } } @@ -279,21 +281,22 @@ private static byte charToCompressedBaseHigh(final byte base) { case 'b': return COMPRESSED_B_HIGH; default: - throw new IllegalArgumentException("Bad base passed to charToCompressedBaseHigh: " + Character.toString((char)base) + "(" + base + ")"); + throw new IllegalArgumentException("Bad base passed to charToCompressedBaseHigh: " + Character.toString((char) base) + "(" + base + ")"); } } - + /** * Returns the byte corresponding to a certain nybble + * * @param base One of COMPRESSED_*_LOW, a low-order nybble encoded base. * @return ASCII base, one of =ACGTNMRSVWYHKDB. * @throws IllegalArgumentException if the base is not one of =ACGTNMRSVWYHKDB. */ - private static byte compressedBaseToByte(byte base){ - try{ + private static byte compressedBaseToByte(byte base) { + try { return COMPRESSED_LOOKUP_TABLE[base]; - }catch(IndexOutOfBoundsException e){ - throw new IllegalArgumentException("Bad base passed to charToCompressedBase: " + Character.toString((char)base) + "(" + base + ")"); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException("Bad base passed to charToCompressedBase: " + Character.toString((char) base) + "(" + base + ")"); } } @@ -304,7 +307,7 @@ private static byte compressedBaseToByte(byte base){ * @return ASCII base, one of ACGTN=. */ private static byte compressedBaseToByteLow(final int base) { - return compressedBaseToByte((byte)(base & 0xf)); + return compressedBaseToByte((byte) (base & 0xf)); } /** @@ -314,13 +317,13 @@ private static byte compressedBaseToByteLow(final int base) { * @return ASCII base, one of ACGTN=. */ private static byte compressedBaseToByteHigh(final int base) { - return compressedBaseToByte((byte)((base >> 4) & 0xf)); + return compressedBaseToByte((byte) ((base >> 4) & 0xf)); } /** * Convert bases in place into canonical form, upper case, and with no-call represented as N. * - * @param bases + * @param bases byte array of bases to "normalize", in place. */ static void normalizeBases(final byte[] bases) { for (int i = 0; i < bases.length; ++i) { @@ -434,11 +437,11 @@ static int reg2bin(final int beg, final int end) { /** * Handle a list of validation errors according to the validation stringency. * - * @param validationErrors List of errors to report, or null if there are no errors. - * @param samRecordIndex Record number of the SAMRecord corresponding to the validation errors, or -1 if - * the record number is not known. + * @param validationErrors List of errors to report, or null if there are no errors. + * @param samRecordIndex Record number of the SAMRecord corresponding to the validation errors, or -1 if + * the record number is not known. * @param validationStringency If STRICT, throw a SAMFormatException. If LENIENT, print the validation - * errors to stderr. If SILENT, do nothing. + * errors to stderr. If SILENT, do nothing. */ public static void processValidationErrors(final List validationErrors, final long samRecordIndex, @@ -464,11 +467,10 @@ public static void processValidationError(final SAMValidationError validationErr } else if (validationStringency == ValidationStringency.LENIENT) { System.err.println("Ignoring SAM validation error: " + validationError); } - } private static final SAMHeaderRecordComparator HEADER_RECORD_COMPARATOR = - new SAMHeaderRecordComparator( + new SAMHeaderRecordComparator<>( SAMReadGroupRecord.PLATFORM_UNIT_TAG, SAMReadGroupRecord.LIBRARY_TAG, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, @@ -476,7 +478,8 @@ public static void processValidationError(final SAMValidationError validationErr SAMReadGroupRecord.SEQUENCING_CENTER_TAG, SAMReadGroupRecord.PLATFORM_TAG, SAMReadGroupRecord.DESCRIPTION_TAG, - SAMReadGroupRecord.READ_GROUP_ID_TAG // We don't actually want to compare with ID but it's suitable + SAMReadGroupRecord.READ_GROUP_ID_TAG + // We don't actually want to compare with ID but it's suitable // "just in case" since it's the only one that's actually required ); @@ -497,11 +500,11 @@ public static String calculateReadGroupRecordChecksum(final File input, final Fi // Sort the read group records by their first final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(input); - final List sortedRecords = new ArrayList(reader.getFileHeader().getReadGroups()); + final List sortedRecords = new ArrayList<>(reader.getFileHeader().getReadGroups()); Collections.sort(sortedRecords, HEADER_RECORD_COMPARATOR); for (final SAMReadGroupRecord rgRecord : sortedRecords) { - final TreeMap sortedAttributes = new TreeMap(); + final TreeMap sortedAttributes = new TreeMap<>(); for (final Map.Entry attributeEntry : rgRecord.getAttributes()) { sortedAttributes.put(attributeEntry.getKey(), attributeEntry.getValue()); } @@ -539,7 +542,7 @@ public static void chainSAMProgramRecord(final SAMFileHeader header, final SAMPr final List pgs = header.getProgramRecords(); if (!pgs.isEmpty()) { - final List referencedIds = new ArrayList(); + final List referencedIds = new ArrayList<>(); for (final SAMProgramRecord pg : pgs) { if (pg.getPreviousProgramGroupId() != null) { referencedIds.add(pg.getPreviousProgramGroupId()); @@ -560,7 +563,7 @@ public static void chainSAMProgramRecord(final SAMFileHeader header, final SAMPr /** * Strip mapping information from a SAMRecord. - * + *

* WARNING: by clearing the secondary and supplementary flags, * this may have the affect of producing multiple distinct records with the * same read name and flags, which may lead to invalid SAM/BAM output. @@ -568,7 +571,7 @@ public static void chainSAMProgramRecord(final SAMFileHeader header, final SAMPr */ public static void makeReadUnmapped(final SAMRecord rec) { if (rec.getReadNegativeStrandFlag()) { - SAMRecordUtil.reverseComplement(rec); + rec.reverseComplement(true); rec.setReadNegativeStrandFlag(false); } rec.setDuplicateReadFlag(false); @@ -622,13 +625,13 @@ public static boolean cigarMapsNoBasesToRef(final Cigar cigar) { /** * Tests if the provided record is mapped entirely beyond the end of the reference (i.e., the alignment start is greater than the * length of the sequence to which the record is mapped). + * * @param record must not have a null SamFileHeader */ public static boolean recordMapsEntirelyBeyondEndOfReference(final SAMRecord record) { if (record.getHeader() == null) { throw new SAMException("A non-null SAMHeader is required to resolve the mapping position: " + record.getReadName()); - } - else { + } else { return record.getHeader().getSequence(record.getReferenceIndex()).getSequenceLength() < record.getAlignmentStart(); } } @@ -646,7 +649,6 @@ public static int compareMapqs(final int mapq1, final int mapq2) { else return mapq1 - mapq2; } - /** * Hokey algorithm for combining two MAPQs into values that are comparable, being cognizant of the fact * that in MAPQ world, 1 > 255 > 0. In this algorithm, 255 is treated as if it were 0.01, so that @@ -655,11 +657,17 @@ public static int compareMapqs(final int mapq1, final int mapq2) { * invocations of this method. */ public static int combineMapqs(int m1, int m2) { - if (m1 == 255) m1 = 1; - else m1 *= 100; + if (m1 == 255) { + m1 = 1; + } else { + m1 *= 100; + } - if (m2 == 255) m2 = 1; - else m2 *= 100; + if (m2 == 255) { + m2 = 1; + } else { + m2 *= 100; + } return m1 + m2; @@ -682,15 +690,15 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) { * reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference) * are not represented in the alignment blocks. * - * @param cigar The cigar containing the alignment information + * @param cigar The cigar containing the alignment information * @param alignmentStart The start (1-based) of the alignment - * @param cigarTypeName The type of cigar passed - for error logging. + * @param cigarTypeName The type of cigar passed - for error logging. * @return List of alignment blocks */ public static List getAlignmentBlocks(final Cigar cigar, final int alignmentStart, final String cigarTypeName) { if (cigar == null) return Collections.emptyList(); - final List alignmentBlocks = new ArrayList(); + final List alignmentBlocks = new ArrayList<>(); int readBase = 1; int refBase = alignmentStart; @@ -721,7 +729,7 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) { refBase += length; break; default: - throw new IllegalStateException("Case statement didn't deal with " + cigarTypeName + " op: " + e.getOperator()); + throw new IllegalStateException("Case statement didn't deal with " + cigarTypeName + " op: " + e.getOperator() + "in CIGAR: " + cigar); } } return Collections.unmodifiableList(alignmentBlocks); @@ -729,7 +737,7 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) { /** * @param alignmentStart The start (1-based) of the alignment - * @param cigar The cigar containing the alignment information + * @param cigar The cigar containing the alignment information * @return the alignment start (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) * then this method will return 96. @@ -753,7 +761,7 @@ public static int getUnclippedStart(final int alignmentStart, final Cigar cigar) /** * @param alignmentEnd The end (1-based) of the alignment - * @param cigar The cigar containing the alignment information + * @param cigar The cigar containing the alignment information * @return the alignment end (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) * then this method will return 107. @@ -791,7 +799,7 @@ public static String getMateCigarString(final SAMRecord rec) { /** * Returns the Mate Cigar or null if there is none. * - * @param rec the SAM record + * @param rec the SAM record * @param withValidation true if we are to validate the mate cigar before returning, false otherwise. * @return Cigar object for the read's mate, or null if there is none. */ @@ -835,11 +843,11 @@ public static int getMateCigarLength(final SAMRecord rec) { */ public static int getMateAlignmentEnd(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) { - throw new RuntimeException("getMateAlignmentEnd called on an unmapped mate."); + throw new RuntimeException("getMateAlignmentEnd called on an unmapped mate: " + rec); } final Cigar mateCigar = SAMUtils.getMateCigar(rec); if (mateCigar == null) { - throw new SAMException("Mate CIGAR (Tag MC) not found."); + throw new SAMException("Mate CIGAR (Tag MC) not found:" + rec); } return CoordMath.getEnd(rec.getMateAlignmentStart(), mateCigar.getReferenceLength()); } @@ -854,15 +862,14 @@ public static int getMateAlignmentEnd(final SAMRecord rec) { */ public static int getMateUnclippedStart(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) - throw new RuntimeException("getMateUnclippedStart called on an unmapped mate."); + throw new RuntimeException("getMateUnclippedStart called on an unmapped mate: " + rec); final Cigar mateCigar = getMateCigar(rec); if (mateCigar == null) { - throw new SAMException("Mate CIGAR (Tag MC) not found."); + throw new SAMException("Mate CIGAR (Tag MC) not found: " + rec); } return SAMUtils.getUnclippedStart(rec.getMateAlignmentStart(), mateCigar); } - /** * @param rec the SAM record * @return the mate alignment end (1-based, inclusive) adjusted for clipped bases. For example if the mate @@ -873,20 +880,20 @@ public static int getMateUnclippedStart(final SAMRecord rec) { */ public static int getMateUnclippedEnd(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) { - throw new RuntimeException("getMateUnclippedEnd called on an unmapped mate."); + throw new RuntimeException("getMateUnclippedEnd called on an unmapped mate: " + rec); } final Cigar mateCigar = SAMUtils.getMateCigar(rec); if (mateCigar == null) { - throw new SAMException("Mate CIGAR (Tag MC) not found."); + throw new SAMException("Mate CIGAR (Tag MC) not found: " + rec); } return SAMUtils.getUnclippedEnd(getMateAlignmentEnd(rec), mateCigar); } /** * @param rec the SAM record - * Returns blocks of the mate sequence that have been aligned directly to the - * reference sequence. Note that clipped portions of the mate and inserted and - * deleted bases (vs. the reference) are not represented in the alignment blocks. + * Returns blocks of the mate sequence that have been aligned directly to the + * reference sequence. Note that clipped portions of the mate and inserted and + * deleted bases (vs. the reference) are not represented in the alignment blocks. */ public static List getMateAlignmentBlocks(final SAMRecord rec) { return getAlignmentBlocks(getMateCigar(rec), rec.getMateAlignmentStart(), "mate cigar"); @@ -896,12 +903,12 @@ public static int getMateUnclippedEnd(final SAMRecord rec) { * Run all validations of the mate's CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. * - * @param rec the SAM record - * @param cigar The cigar containing the alignment information - * @param referenceIndex The reference index + * @param rec the SAM record + * @param cigar The cigar containing the alignment information + * @param referenceIndex The reference index * @param alignmentBlocks The alignment blocks (parsed from the cigar) - * @param recordNumber For error reporting. -1 if not known. - * @param cigarTypeName For error reporting. "Read CIGAR" or "Mate Cigar" + * @param recordNumber For error reporting. -1 if not known. + * @param cigarTypeName For error reporting. "Read CIGAR" or "Mate Cigar" * @return List of errors, or null if no errors. */ @@ -916,16 +923,15 @@ public static int getMateUnclippedEnd(final SAMRecord rec) { if (referenceIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { SAMFileHeader samHeader = rec.getHeader(); if (null == samHeader) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.MISSING_HEADER, cigarTypeName + " A non-null SAMHeader is required to validate cigar elements for: ", rec.getReadName(), recordNumber)); - } - else { + } else { final SAMSequenceRecord sequence = samHeader.getSequence(referenceIndex); final int referenceSequenceLength = sequence.getSequenceLength(); for (final AlignmentBlock alignmentBlock : alignmentBlocks) { if (alignmentBlock.getReferenceStart() + alignmentBlock.getLength() - 1 > referenceSequenceLength) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.CIGAR_MAPS_OFF_REFERENCE, cigarTypeName + " M operator maps off end of reference", rec.getReadName(), recordNumber)); break; @@ -940,7 +946,7 @@ public static int getMateUnclippedEnd(final SAMRecord rec) { * Run all validations of the mate's CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. * - * @param rec the SAM record + * @param rec the SAM record * @param recordNumber For error reporting. -1 if not known. * @return List of errors, or null if no errors. */ @@ -954,7 +960,7 @@ public static int getMateUnclippedEnd(final SAMRecord rec) { } } else { if (getMateCigarString(rec) != null) { - ret = new ArrayList(); + ret = new ArrayList<>(); if (!rec.getReadPairedFlag()) { // If the read is not paired, and the Mate Cigar String (MC Attribute) exists, that is a validation error ret.add(new SAMValidationError(SAMValidationError.Type.MATE_CIGAR_STRING_INVALID_PRESENCE, @@ -984,11 +990,11 @@ public static boolean hasMateCigar(SAMRecord rec) { } /** - * Returns a string that is the the read group ID and read name separated by a colon. This is meant to cannonically + * Returns a string that is the the read group ID and read name separated by a colon. This is meant to canonically * identify a given record within a set of records. * - * @param record - * @return + * @param record SAMRecord for which "canonical" read name is requested + * @return The record's readgroup-id (if non-null) and the read name, separated by a colon, ':' */ public static String getCanonicalRecordName(final SAMRecord record) { String name = record.getStringAttribute(ReservedTagConstants.READ_GROUP_ID); @@ -1002,7 +1008,7 @@ public static String getCanonicalRecordName(final SAMRecord record) { * or the given record's start position is greater than its mate's start position, zero is automatically returned. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * - * @param rec + * @param rec SAMRecord that needs clipping due to overlapping pairs. * @return the number of bases at the end of the read that need to be clipped such that there would be no overlapping bases with its mate. * Read bases include only those from insertion, match, or mismatch Cigar operators. */ @@ -1013,7 +1019,8 @@ public static int getNumOverlappingAlignedBasesToClip(final SAMRecord rec) { // Only clip records that are left-most in genomic order and overlapping. if (rec.getMateAlignmentStart() < rec.getAlignmentStart()) return 0; // right-most, so ignore. - else if (rec.getMateAlignmentStart() == rec.getAlignmentStart() && rec.getFirstOfPairFlag()) return 0; // same start, so pick the first end + else if (rec.getMateAlignmentStart() == rec.getAlignmentStart() && rec.getFirstOfPairFlag()) + return 0; // same start, so pick the first end // Find the number of read bases after the given mate's alignment start. int numBasesToClip = 0; @@ -1026,12 +1033,11 @@ public static int getNumOverlappingAlignedBasesToClip(final SAMRecord rec) { if (refStartPos <= refPos + refBasesLength - 1) { // add to clipped bases if (operator == CigarOperator.MATCH_OR_MISMATCH) { // M if (refStartPos < refPos) numBasesToClip += refBasesLength; // use all of the bases - else numBasesToClip += (refPos + refBasesLength) - refStartPos; // since the mate's alignment start can be in the middle of a cigar element - } - else if (operator == CigarOperator.SOFT_CLIP || operator == CigarOperator.HARD_CLIP || operator == CigarOperator.PADDING || operator == CigarOperator.SKIPPED_REGION) { + else + numBasesToClip += (refPos + refBasesLength) - refStartPos; // since the mate's alignment start can be in the middle of a cigar element + } else if (operator == CigarOperator.SOFT_CLIP || operator == CigarOperator.HARD_CLIP || operator == CigarOperator.PADDING || operator == CigarOperator.SKIPPED_REGION) { // ignore - } - else { // ID + } else { // ID numBasesToClip += operator.consumesReadBases() ? el.getLength() : 0; // clip all the bases in the read from this operator } } @@ -1044,14 +1050,14 @@ else if (operator == CigarOperator.SOFT_CLIP || operator == CigarOperator.HARD_C } /** - * Returns a (possibly new) record that has been clipped if isa mapped paired and has overlapping bases with its mate. + * Returns a (possibly new) record that has been clipped if input is a mapped paired and has overlapping bases with its mate. * See {@link #getNumOverlappingAlignedBasesToClip(SAMRecord)} for how the number of overlapping bases is computed. * NB: this does not properly consider a cigar like: 100M20S10H. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * - * @param record the record from which to clip bases. + * @param record the record from which to clip bases. * @param noSideEffects if true a modified clone of the original record is returned, otherwise we modify the record directly. - * @return + * @return a (possibly new) record that has been clipped */ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, final boolean noSideEffects) { return clipOverlappingAlignedBases(record, getNumOverlappingAlignedBasesToClip(record), noSideEffects); @@ -1063,18 +1069,20 @@ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, fina * NB: this does not properly consider a cigar like: 100M20S10H. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * - * @param record the record from which to clip bases. + * @param record the record from which to clip bases. * @param numOverlappingBasesToClip the number of bases to clip at the end of the read. - * @param noSideEffects if true a modified clone of the original record is returned, otherwise we modify the record directly. - * @return + * @param noSideEffects if true a modified clone of the original record is returned, otherwise we modify the record directly. + * @return Returns a (possibly new) SAMRecord with the given number of bases soft-clipped */ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, final int numOverlappingBasesToClip, final boolean noSideEffects) { // NB: ignores how to handle supplemental records when present for both ends by just using the mate information in the record. - if (numOverlappingBasesToClip <= 0 || record.getReadUnmappedFlag() || record.getMateUnmappedFlag()) return record; + if (numOverlappingBasesToClip <= 0 || record.getReadUnmappedFlag() || record.getMateUnmappedFlag()) { + return record; + } try { - final SAMRecord rec = noSideEffects ? ((SAMRecord)record.clone()) : record; + final SAMRecord rec = noSideEffects ? ((SAMRecord) record.clone()) : record; // watch out for when the second read overlaps all of the first read if (rec.getMateAlignmentStart() <= rec.getAlignmentStart()) { // make it unmapped @@ -1085,7 +1093,7 @@ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, fina // 1-based index of first base in read to clip. int clipFrom = rec.getReadLength() - numOverlappingBasesToClip + 1; // we have to check if the last cigar element is soft-clipping, so we can subtract that from clipFrom - final CigarElement cigarElement = rec.getCigar().getCigarElement(rec.getCigarLength()-1); + final CigarElement cigarElement = rec.getCigar().getCigarElement(rec.getCigarLength() - 1); if (CigarOperator.SOFT_CLIP == cigarElement.getOperator()) clipFrom -= cigarElement.getLength(); // FIXME: does not properly consider a cigar like: 100M20S10H @@ -1111,100 +1119,102 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { * Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined * in the SAM specification. * The name, sequence and qualities, mate data are copied from the original record. + * * @param record must be non null and must have a non-null associated header. * @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing. */ public static List getOtherCanonicalAlignments(final SAMRecord record) { - if( record == null ) throw new IllegalArgumentException("record is null"); - if( record.getHeader() == null ) throw new IllegalArgumentException("record.getHeader() is null"); + if (record == null) throw new IllegalArgumentException("record is null"); + if (record.getHeader() == null) throw new IllegalArgumentException("record.getHeader() is null"); /* extract value of SA tag */ - final Object saValue = record.getAttribute( SAMTagUtil.getSingleton().SA ); - if( saValue == null ) return Collections.emptyList(); - if( ! (saValue instanceof String) ) throw new SAMException( - "Expected a String for attribute 'SA' but got " + saValue.getClass() ); + final Object saValue = record.getAttribute(SAMTagUtil.getSingleton().SA); + if (saValue == null) return Collections.emptyList(); + if (!(saValue instanceof String)) throw new SAMException( + "Expected a String for attribute 'SA' but got " + saValue.getClass() + ". Record: " + record); final SAMRecordFactory samReaderFactory = new DefaultSAMRecordFactory(); /* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a * semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+. * Each element in the list represents a part of the chimeric alignment. - * Conventionally, at a supplementary line, the 1rst element points to the primary line. + * Conventionally, at a supplementary line, the 1st element points to the primary line. */ /* break string using semicolon */ - final String semiColonStrs[] = SEMICOLON_PAT.split((String)saValue); + final String semiColonStrs[] = SEMICOLON_PAT.split((String) saValue); /* the result list */ - final List alignments = new ArrayList<>( semiColonStrs.length ); + final List alignments = new ArrayList<>(semiColonStrs.length); /* base SAM flag */ - int record_flag = record.getFlags() ; + int record_flag = record.getFlags(); record_flag &= ~SAMFlag.PROPER_PAIR.flag; record_flag &= ~SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; record_flag &= ~SAMFlag.READ_REVERSE_STRAND.flag; - - for(int i=0; i< semiColonStrs.length;++i ) { + for (int i = 0; i < semiColonStrs.length; ++i) { final String semiColonStr = semiColonStrs[i]; /* ignore empty string */ - if( semiColonStr.isEmpty() ) continue; + if (semiColonStr.isEmpty()) continue; /* break string using comma */ final String commaStrs[] = COMMA_PAT.split(semiColonStr); - if( commaStrs.length != 6 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr); + if (commaStrs.length != 6) + throw new SAMException("Bad 'SA' attribute in " + semiColonStr + ". Record: " + record); /* create the new record */ - final SAMRecord otherRec = samReaderFactory.createSAMRecord( record.getHeader() ); + final SAMRecord otherRec = samReaderFactory.createSAMRecord(record.getHeader()); /* copy fields from the original record */ - otherRec.setReadName( record.getReadName() ); - otherRec.setReadBases( record.getReadBases() ); - otherRec.setBaseQualities( record.getBaseQualities() ); - if( record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { - otherRec.setMateReferenceIndex( record.getMateReferenceIndex() ); - otherRec.setMateAlignmentStart( record.getMateAlignmentStart() ); + otherRec.setReadName(record.getReadName()); + otherRec.setReadBases(record.getReadBases()); + otherRec.setBaseQualities(record.getBaseQualities()); + if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { + otherRec.setMateReferenceIndex(record.getMateReferenceIndex()); + otherRec.setMateAlignmentStart(record.getMateAlignmentStart()); } /* get reference sequence */ - final int tid = record.getHeader().getSequenceIndex( commaStrs[0] ); - if( tid == -1 ) throw new SAMException("Unknown contig in " + semiColonStr); - otherRec.setReferenceIndex( tid ); + final int tid = record.getHeader().getSequenceIndex(commaStrs[0]); + if (tid == -1) + throw new SAMException("Unknown contig in " + semiColonStr + ". Record: " + record); + otherRec.setReferenceIndex(tid); /* fill POS */ final int alignStart; try { alignStart = Integer.parseInt(commaStrs[1]); - } catch( final NumberFormatException err ) { - throw new SAMException("bad POS in "+semiColonStr, err); + } catch (final NumberFormatException err) { + throw new SAMException("bad POS in " + semiColonStr + ". Record: " + record, err); } - otherRec.setAlignmentStart( alignStart ); + otherRec.setAlignmentStart(alignStart); /* set TLEN */ - if( record.getReadPairedFlag() && - !record.getMateUnmappedFlag() && - record.getMateReferenceIndex() == tid ) { - otherRec.setInferredInsertSize( record.getMateAlignmentStart() - alignStart ); + if (record.getReadPairedFlag() && + !record.getMateUnmappedFlag() && + record.getMateReferenceIndex() == tid) { + otherRec.setInferredInsertSize(record.getMateAlignmentStart() - alignStart); } /* set FLAG */ - int other_flag = record_flag; - other_flag |= (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ; + int other_flag = record_flag; + other_flag |= (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag); /* spec: Conventionally, at a supplementary line, the 1st element points to the primary line */ - if( !( record.getSupplementaryAlignmentFlag() && i==0 ) ) { - other_flag |= SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; - } - otherRec.setFlags(other_flag); + if (!(record.getSupplementaryAlignmentFlag() && i == 0)) { + other_flag |= SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; + } + otherRec.setFlags(other_flag); /* set CIGAR */ - otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) ); + otherRec.setCigar(TextCigarCodec.decode(commaStrs[3])); /* set MAPQ */ try { - otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) ); + otherRec.setMappingQuality(Integer.parseInt(commaStrs[4])); } catch (final NumberFormatException err) { - throw new SAMException("bad MAPQ in "+semiColonStr, err); + throw new SAMException("bad MAPQ in " + semiColonStr + ". Record: " + record, err); } /* fill NM */ @@ -1213,16 +1223,16 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { otherRec.setAttribute(SAMTagUtil.getSingleton().NM, Integer.parseInt(commaStrs[5])); } } catch (final NumberFormatException err) { - throw new SAMException("bad NM in "+semiColonStr, err); + throw new SAMException("bad NM in " + semiColonStr + ". Record: " + record, err); } /* if strand is not the same: reverse-complement */ - if( otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag() ) { - SAMRecordUtil.reverseComplement(otherRec); + if (otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag()) { + otherRec.reverseComplement(true); } /* add the alignment */ - alignments.add( otherRec ); + alignments.add(otherRec); } return alignments; } diff --git a/src/main/java/htsjdk/samtools/filter/FilteringIterator.java b/src/main/java/htsjdk/samtools/filter/FilteringIterator.java index 3ce9f96ce..4cdaebe89 100644 --- a/src/main/java/htsjdk/samtools/filter/FilteringIterator.java +++ b/src/main/java/htsjdk/samtools/filter/FilteringIterator.java @@ -36,7 +36,7 @@ * * @author Kathleen Tibbetts * - * use {@link FilteringSamIterator} instead + * @deprecated use {@link FilteringSamIterator} instead */ @Deprecated /** use {@link FilteringSamIterator} instead **/ diff --git a/src/main/java/htsjdk/tribble/util/HTTPHelper.java b/src/main/java/htsjdk/tribble/util/HTTPHelper.java index 1e89bc26b..cdd6b277e 100644 --- a/src/main/java/htsjdk/tribble/util/HTTPHelper.java +++ b/src/main/java/htsjdk/tribble/util/HTTPHelper.java @@ -101,6 +101,9 @@ public InputStream openInputStream() throws IOException { * @param end end of range ni bytes * @return * @throws IOException + * + * @deprecated since 12/10/14 Will be removed in a future release, as is somewhat fragile + * and not used. */ @Override @Deprecated diff --git a/src/main/java/htsjdk/variant/variantcontext/GenotypeLikelihoods.java b/src/main/java/htsjdk/variant/variantcontext/GenotypeLikelihoods.java index ee3e08d47..605f2985f 100644 --- a/src/main/java/htsjdk/variant/variantcontext/GenotypeLikelihoods.java +++ b/src/main/java/htsjdk/variant/variantcontext/GenotypeLikelihoods.java @@ -183,6 +183,10 @@ public String getAsString() { * If you know you're biallelic, use getGQLog10FromLikelihoods directly. * @param genotype - actually a genotype type (no call, hom ref, het, hom var) * @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best). + * + * @deprecated since 2/5/13 use + * {@link GenotypeLikelihoods#getLog10GQ(Genotype, VariantContext)} or + * {@link GenotypeLikelihoods#getLog10GQ(Genotype, List)} */ @Deprecated public double getLog10GQ(GenotypeType genotype){ @@ -554,6 +558,8 @@ public static synchronized void initializeAnyploidPLIndexToAlleleIndices(final i * * @param PLindex the PL index * @return the allele index pair + * + * @deprecated since 2/5/13 */ @Deprecated public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) { diff --git a/src/main/java/htsjdk/variant/variantcontext/filter/FilteringIterator.java b/src/main/java/htsjdk/variant/variantcontext/filter/FilteringIterator.java index 04609a89b..44362d6c1 100644 --- a/src/main/java/htsjdk/variant/variantcontext/filter/FilteringIterator.java +++ b/src/main/java/htsjdk/variant/variantcontext/filter/FilteringIterator.java @@ -36,7 +36,7 @@ * * @author Yossi Farjoun * - * use {@link FilteringVariantContextIterator} instead + * @deprecated since 2/29/16 use {@link FilteringVariantContextIterator} instead */ @Deprecated diff --git a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java index a90906684..0605b73b9 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java +++ b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java @@ -22,361 +22,362 @@ */ public class VCFEncoder { - /** - * The encoding used for VCF files: ISO-8859-1 - */ - public static final Charset VCF_CHARSET = Charset.forName("ISO-8859-1"); - private static final String QUAL_FORMAT_STRING = "%.2f"; - private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00"; - - private final IntGenotypeFieldAccessors GENOTYPE_FIELD_ACCESSORS = new IntGenotypeFieldAccessors(); - - private VCFHeader header; - - private boolean allowMissingFieldsInHeader = false; - - private boolean outputTrailingFormatFields = false; - - /** - * Prepare a VCFEncoder that will encode records appropriate to the given VCF header, optionally - * allowing missing fields in the header. - */ - public VCFEncoder(final VCFHeader header, final boolean allowMissingFieldsInHeader, final boolean outputTrailingFormatFields) { - if (header == null) throw new NullPointerException("The VCF header must not be null."); - this.header = header; - this.allowMissingFieldsInHeader = allowMissingFieldsInHeader; - this.outputTrailingFormatFields = outputTrailingFormatFields; - } - - /** - * Please see the notes in the default constructor - */ - @Deprecated - public void setVCFHeader(final VCFHeader header) { - this.header = header; - } - - /** - * Please see the notes in the default constructor - */ - @Deprecated - public void setAllowMissingFieldsInHeader(final boolean allow) { - this.allowMissingFieldsInHeader = allow; - } - - public String encode(final VariantContext context) { - if (this.header == null) { - throw new NullPointerException("The header field must be set on the VCFEncoder before encoding records."); - } - - final StringBuilder stringBuilder = new StringBuilder(); - - // CHROM - stringBuilder.append(context.getContig()).append(VCFConstants.FIELD_SEPARATOR) - // POS - .append(String.valueOf(context.getStart())).append(VCFConstants.FIELD_SEPARATOR) - // ID - .append(context.getID()).append(VCFConstants.FIELD_SEPARATOR) - // REF - .append(context.getReference().getDisplayString()).append(VCFConstants.FIELD_SEPARATOR); - - // ALT - if ( context.isVariant() ) { - Allele altAllele = context.getAlternateAllele(0); - String alt = altAllele.getDisplayString(); - stringBuilder.append(alt); - - for (int i = 1; i < context.getAlternateAlleles().size(); i++) { - altAllele = context.getAlternateAllele(i); - alt = altAllele.getDisplayString(); - stringBuilder.append(','); - stringBuilder.append(alt); - } - } else { - stringBuilder.append(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD); - } - - stringBuilder.append(VCFConstants.FIELD_SEPARATOR); - - // QUAL - if ( ! context.hasLog10PError()) stringBuilder.append(VCFConstants.MISSING_VALUE_v4); - else stringBuilder.append(formatQualValue(context.getPhredScaledQual())); - stringBuilder.append(VCFConstants.FIELD_SEPARATOR) - // FILTER - .append(getFilterString(context)).append(VCFConstants.FIELD_SEPARATOR); - - // INFO - final Map infoFields = new TreeMap(); - for (final Map.Entry field : context.getAttributes().entrySet() ) { - if ( ! this.header.hasInfoLine(field.getKey())) fieldIsMissingFromHeaderError(context, field.getKey(), "INFO"); - - final String outputValue = formatVCFField(field.getValue()); - if (outputValue != null) infoFields.put(field.getKey(), outputValue); - } - writeInfoString(infoFields, stringBuilder); - - // FORMAT - final GenotypesContext gc = context.getGenotypes(); - if (gc.isLazyWithData() && ((LazyGenotypesContext) gc).getUnparsedGenotypeData() instanceof String) { - stringBuilder.append(VCFConstants.FIELD_SEPARATOR); - stringBuilder.append(((LazyGenotypesContext) gc).getUnparsedGenotypeData().toString()); - } else { - final List genotypeAttributeKeys = context.calcVCFGenotypeKeys(this.header); - if ( ! genotypeAttributeKeys.isEmpty()) { - for (final String format : genotypeAttributeKeys) - if ( ! this.header.hasFormatLine(format)) - fieldIsMissingFromHeaderError(context, format, "FORMAT"); - - final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); - - stringBuilder.append(VCFConstants.FIELD_SEPARATOR); - stringBuilder.append(genotypeFormatString); - - final Map alleleStrings = buildAlleleStrings(context); - addGenotypeData(context, alleleStrings, genotypeAttributeKeys, stringBuilder); - } - } - - return stringBuilder.toString(); - } - - VCFHeader getVCFHeader() { - return this.header; - } - - boolean getAllowMissingFieldsInHeader() { - return this.allowMissingFieldsInHeader; - } - - private String getFilterString(final VariantContext vc) { - if (vc.isFiltered()) { - for (final String filter : vc.getFilters()) { - if ( ! this.header.hasFilterLine(filter)) fieldIsMissingFromHeaderError(vc, filter, "FILTER"); - } - - return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())); - } - else if (vc.filtersWereApplied()) return VCFConstants.PASSES_FILTERS_v4; - else return VCFConstants.UNFILTERED; - } - - private String formatQualValue(final double qual) { - String s = String.format(QUAL_FORMAT_STRING, qual); - if ( s.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM) ) - s = s.substring(0, s.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length()); - return s; - } - - private void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) { - if ( ! allowMissingFieldsInHeader) - throw new IllegalStateException("Key " + id + " found in VariantContext field " + field - + " at " + vc.getContig() + ":" + vc.getStart() - + " but this key isn't defined in the VCFHeader. We require all VCFs to have" - + " complete VCF headers by default."); - } - - String formatVCFField(final Object val) { - final String result; - if ( val == null ) - result = VCFConstants.MISSING_VALUE_v4; - else if ( val instanceof Double ) - result = formatVCFDouble((Double) val); - else if ( val instanceof Boolean ) - result = (Boolean)val ? "" : null; // empty string for true, null for false - else if ( val instanceof List ) { - result = formatVCFField(((List)val).toArray()); - } else if ( val.getClass().isArray() ) { - final int length = Array.getLength(val); - if ( length == 0 ) - return formatVCFField(null); - final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0))); - for ( int i = 1; i < length; i++) { - sb.append(','); - sb.append(formatVCFField(Array.get(val, i))); - } - result = sb.toString(); - } else - result = val.toString(); - - return result; - } - - /** - * Takes a double value and pretty prints it to a String for display - * - * Large doubles => gets %.2f style formatting - * Doubles < 1 / 10 but > 1/100 => get %.3f style formatting - * Double < 1/100 => %.3e formatting - * @param d - * @return - */ - public static String formatVCFDouble(final double d) { - final String format; - if ( d < 1 ) { - if ( d < 0.01 ) { - if ( Math.abs(d) >= 1e-20 ) - format = "%.3e"; - else { - // return a zero format - return "0.00"; - } - } else { - format = "%.3f"; - } - } else { - format = "%.2f"; - } - - return String.format(format, d); - } - - static int countOccurrences(final char c, final String s) { - int count = 0; - for (int i = 0; i < s.length(); i++) { - count += s.charAt(i) == c ? 1 : 0; - } - return count; - } - - static boolean isMissingValue(final String s) { - // we need to deal with the case that it's a list of missing values - return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); - } - - /* - * Add the genotype data - */ - public void addGenotypeData(final VariantContext vc, final Map alleleMap, final List genotypeFormatKeys, final StringBuilder builder) { - final int ploidy = vc.getMaxPloidy(2); - - for (final String sample : this.header.getGenotypeSamples()) { - builder.append(VCFConstants.FIELD_SEPARATOR); - - Genotype g = vc.getGenotype(sample); - if (g == null) g = GenotypeBuilder.createMissing(sample, ploidy); - - final List attrs = new ArrayList(genotypeFormatKeys.size()); - for (final String field : genotypeFormatKeys) { - if (field.equals(VCFConstants.GENOTYPE_KEY)) { - if ( ! g.isAvailable()) { - throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record"); - } - - writeAllele(g.getAllele(0), alleleMap, builder); - for (int i = 1; i < g.getPloidy(); i++) { - builder.append(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); - writeAllele(g.getAllele(i), alleleMap, builder); - } - continue; - - } else { - final String outputValue; - if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) { - outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4; - } else { - final IntGenotypeFieldAccessors.Accessor accessor = GENOTYPE_FIELD_ACCESSORS.getAccessor(field); - if ( accessor != null ) { - final int[] intValues = accessor.getValues(g); - if ( intValues == null ) - outputValue = VCFConstants.MISSING_VALUE_v4; - else if ( intValues.length == 1 ) // fast path - outputValue = Integer.toString(intValues[0]); - else { - final StringBuilder sb = new StringBuilder(); - sb.append(intValues[0]); - for ( int i = 1; i < intValues.length; i++) { - sb.append(','); - sb.append(intValues[i]); - } - outputValue = sb.toString(); - } - } else { - Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; - - final VCFFormatHeaderLine metaData = this.header.getFormatHeaderLine(field); - if ( metaData != null ) { - final int numInFormatField = metaData.getCount(vc); - if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { - // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. - // For example, if Number=2, the string has to be ".,." - final StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); - for ( int i = 1; i < numInFormatField; i++ ) { - sb.append(','); - sb.append(VCFConstants.MISSING_VALUE_v4); - } - val = sb.toString(); - } - } - - // assume that if key is absent, then the given string encoding suffices - outputValue = formatVCFField(val); - } - } - - if ( outputValue != null ) - attrs.add(outputValue); - } - } - - // strip off trailing missing values - if (!outputTrailingFormatFields) { - for (int i = attrs.size() - 1; i >= 0; i--) { - if (isMissingValue(attrs.get(i))) attrs.remove(i); - else break; - } - } - - for (int i = 0; i < attrs.size(); i++) { - if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY)) { - builder.append(VCFConstants.GENOTYPE_FIELD_SEPARATOR); - } - builder.append(attrs.get(i)); - } - } - } - - /* - * Create the info string; assumes that no values are null - */ - private void writeInfoString(final Map infoFields, final StringBuilder builder) { - if ( infoFields.isEmpty() ) { - builder.append(VCFConstants.EMPTY_INFO_FIELD); - return; - } - - boolean isFirst = true; - for (final Map.Entry entry : infoFields.entrySet()) { - if (isFirst) isFirst = false; - else builder.append(VCFConstants.INFO_FIELD_SEPARATOR); - - builder.append(entry.getKey()); - - if ( ! entry.getValue().equals("")) { - final VCFInfoHeaderLine metaData = this.header.getInfoHeaderLine(entry.getKey()); - if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) { - builder.append('='); - builder.append(entry.getValue()); - } - } - } - } - - public Map buildAlleleStrings(final VariantContext vc) { - final Map alleleMap = new HashMap(vc.getAlleles().size()+1); - alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup - - final List alleles = vc.getAlleles(); - for ( int i = 0; i < alleles.size(); i++ ) { - alleleMap.put(alleles.get(i), String.valueOf(i)); - } - - return alleleMap; - } - - private void writeAllele(final Allele allele, final Map alleleMap, final StringBuilder builder) { - final String encoding = alleleMap.get(allele); - if ( encoding == null ) - throw new RuntimeException("Allele " + allele + " is not an allele in the variant context"); - builder.append(encoding); - } + /** + * The encoding used for VCF files: ISO-8859-1 + */ + public static final Charset VCF_CHARSET = Charset.forName("ISO-8859-1"); + private static final String QUAL_FORMAT_STRING = "%.2f"; + private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00"; + + private final IntGenotypeFieldAccessors GENOTYPE_FIELD_ACCESSORS = new IntGenotypeFieldAccessors(); + + private VCFHeader header; + + private boolean allowMissingFieldsInHeader = false; + + private boolean outputTrailingFormatFields = false; + + /** + * Prepare a VCFEncoder that will encode records appropriate to the given VCF header, optionally + * allowing missing fields in the header. + */ + public VCFEncoder(final VCFHeader header, final boolean allowMissingFieldsInHeader, final boolean outputTrailingFormatFields) { + if (header == null) throw new NullPointerException("The VCF header must not be null."); + this.header = header; + this.allowMissingFieldsInHeader = allowMissingFieldsInHeader; + this.outputTrailingFormatFields = outputTrailingFormatFields; + } + + /** + * @deprecated since 10/24/13 use the constructor + */ + @Deprecated + public void setVCFHeader(final VCFHeader header) { + this.header = header; + } + + /** + * @deprecated since 10/24/13 use the constructor + */ + @Deprecated + public void setAllowMissingFieldsInHeader(final boolean allow) { + this.allowMissingFieldsInHeader = allow; + } + + public String encode(final VariantContext context) { + if (this.header == null) { + throw new NullPointerException("The header field must be set on the VCFEncoder before encoding records."); + } + + final StringBuilder stringBuilder = new StringBuilder(); + + // CHROM + stringBuilder.append(context.getContig()).append(VCFConstants.FIELD_SEPARATOR) + // POS + .append(String.valueOf(context.getStart())).append(VCFConstants.FIELD_SEPARATOR) + // ID + .append(context.getID()).append(VCFConstants.FIELD_SEPARATOR) + // REF + .append(context.getReference().getDisplayString()).append(VCFConstants.FIELD_SEPARATOR); + + // ALT + if (context.isVariant()) { + Allele altAllele = context.getAlternateAllele(0); + String alt = altAllele.getDisplayString(); + stringBuilder.append(alt); + + for (int i = 1; i < context.getAlternateAlleles().size(); i++) { + altAllele = context.getAlternateAllele(i); + alt = altAllele.getDisplayString(); + stringBuilder.append(','); + stringBuilder.append(alt); + } + } else { + stringBuilder.append(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD); + } + + stringBuilder.append(VCFConstants.FIELD_SEPARATOR); + + // QUAL + if (!context.hasLog10PError()) stringBuilder.append(VCFConstants.MISSING_VALUE_v4); + else stringBuilder.append(formatQualValue(context.getPhredScaledQual())); + stringBuilder.append(VCFConstants.FIELD_SEPARATOR) + // FILTER + .append(getFilterString(context)).append(VCFConstants.FIELD_SEPARATOR); + + // INFO + final Map infoFields = new TreeMap<>(); + for (final Map.Entry field : context.getAttributes().entrySet()) { + if (!this.header.hasInfoLine(field.getKey())) + fieldIsMissingFromHeaderError(context, field.getKey(), "INFO"); + + final String outputValue = formatVCFField(field.getValue()); + if (outputValue != null) infoFields.put(field.getKey(), outputValue); + } + writeInfoString(infoFields, stringBuilder); + + // FORMAT + final GenotypesContext gc = context.getGenotypes(); + if (gc.isLazyWithData() && ((LazyGenotypesContext) gc).getUnparsedGenotypeData() instanceof String) { + stringBuilder.append(VCFConstants.FIELD_SEPARATOR); + stringBuilder.append(((LazyGenotypesContext) gc).getUnparsedGenotypeData().toString()); + } else { + final List genotypeAttributeKeys = context.calcVCFGenotypeKeys(this.header); + if (!genotypeAttributeKeys.isEmpty()) { + for (final String format : genotypeAttributeKeys) + if (!this.header.hasFormatLine(format)) + fieldIsMissingFromHeaderError(context, format, "FORMAT"); + + final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); + + stringBuilder.append(VCFConstants.FIELD_SEPARATOR); + stringBuilder.append(genotypeFormatString); + + final Map alleleStrings = buildAlleleStrings(context); + addGenotypeData(context, alleleStrings, genotypeAttributeKeys, stringBuilder); + } + } + + return stringBuilder.toString(); + } + + VCFHeader getVCFHeader() { + return this.header; + } + + boolean getAllowMissingFieldsInHeader() { + return this.allowMissingFieldsInHeader; + } + + private String getFilterString(final VariantContext vc) { + if (vc.isFiltered()) { + for (final String filter : vc.getFilters()) { + if (!this.header.hasFilterLine(filter)) fieldIsMissingFromHeaderError(vc, filter, "FILTER"); + } + + return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())); + } else if (vc.filtersWereApplied()) return VCFConstants.PASSES_FILTERS_v4; + else return VCFConstants.UNFILTERED; + } + + private String formatQualValue(final double qual) { + String s = String.format(QUAL_FORMAT_STRING, qual); + if (s.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM)) + s = s.substring(0, s.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length()); + return s; + } + + private void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) { + if (!allowMissingFieldsInHeader) + throw new IllegalStateException("Key " + id + " found in VariantContext field " + field + + " at " + vc.getContig() + ":" + vc.getStart() + + " but this key isn't defined in the VCFHeader. We require all VCFs to have" + + " complete VCF headers by default."); + } + + String formatVCFField(final Object val) { + final String result; + if (val == null) + result = VCFConstants.MISSING_VALUE_v4; + else if (val instanceof Double) + result = formatVCFDouble((Double) val); + else if (val instanceof Boolean) + result = (Boolean) val ? "" : null; // empty string for true, null for false + else if (val instanceof List) { + result = formatVCFField(((List) val).toArray()); + } else if (val.getClass().isArray()) { + final int length = Array.getLength(val); + if (length == 0) + return formatVCFField(null); + final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0))); + for (int i = 1; i < length; i++) { + sb.append(','); + sb.append(formatVCFField(Array.get(val, i))); + } + result = sb.toString(); + } else + result = val.toString(); + + return result; + } + + /** + * Takes a double value and pretty prints it to a String for display + *

+ * Large doubles => gets %.2f style formatting + * Doubles < 1 / 10 but > 1/100 => get %.3f style formatting + * Double < 1/100 => %.3e formatting + * + * @param d + * @return + */ + public static String formatVCFDouble(final double d) { + final String format; + if (d < 1) { + if (d < 0.01) { + if (Math.abs(d) >= 1e-20) + format = "%.3e"; + else { + // return a zero format + return "0.00"; + } + } else { + format = "%.3f"; + } + } else { + format = "%.2f"; + } + + return String.format(format, d); + } + + static int countOccurrences(final char c, final String s) { + int count = 0; + for (int i = 0; i < s.length(); i++) { + count += s.charAt(i) == c ? 1 : 0; + } + return count; + } + + static boolean isMissingValue(final String s) { + // we need to deal with the case that it's a list of missing values + return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); + } + + /* + * Add the genotype data + */ + public void addGenotypeData(final VariantContext vc, final Map alleleMap, final List genotypeFormatKeys, final StringBuilder builder) { + final int ploidy = vc.getMaxPloidy(2); + + for (final String sample : this.header.getGenotypeSamples()) { + builder.append(VCFConstants.FIELD_SEPARATOR); + + Genotype g = vc.getGenotype(sample); + if (g == null) g = GenotypeBuilder.createMissing(sample, ploidy); + + final List attrs = new ArrayList(genotypeFormatKeys.size()); + for (final String field : genotypeFormatKeys) { + if (field.equals(VCFConstants.GENOTYPE_KEY)) { + if (!g.isAvailable()) { + throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record"); + } + + writeAllele(g.getAllele(0), alleleMap, builder); + for (int i = 1; i < g.getPloidy(); i++) { + builder.append(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); + writeAllele(g.getAllele(i), alleleMap, builder); + } + continue; + + } else { + final String outputValue; + if (field.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { + outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4; + } else { + final IntGenotypeFieldAccessors.Accessor accessor = GENOTYPE_FIELD_ACCESSORS.getAccessor(field); + if (accessor != null) { + final int[] intValues = accessor.getValues(g); + if (intValues == null) + outputValue = VCFConstants.MISSING_VALUE_v4; + else if (intValues.length == 1) // fast path + outputValue = Integer.toString(intValues[0]); + else { + final StringBuilder sb = new StringBuilder(); + sb.append(intValues[0]); + for (int i = 1; i < intValues.length; i++) { + sb.append(','); + sb.append(intValues[i]); + } + outputValue = sb.toString(); + } + } else { + Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; + + final VCFFormatHeaderLine metaData = this.header.getFormatHeaderLine(field); + if (metaData != null) { + final int numInFormatField = metaData.getCount(vc); + if (numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4)) { + // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. + // For example, if Number=2, the string has to be ".,." + final StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); + for (int i = 1; i < numInFormatField; i++) { + sb.append(','); + sb.append(VCFConstants.MISSING_VALUE_v4); + } + val = sb.toString(); + } + } + + // assume that if key is absent, then the given string encoding suffices + outputValue = formatVCFField(val); + } + } + + if (outputValue != null) + attrs.add(outputValue); + } + } + + // strip off trailing missing values + if (!outputTrailingFormatFields) { + for (int i = attrs.size() - 1; i >= 0; i--) { + if (isMissingValue(attrs.get(i))) attrs.remove(i); + else break; + } + } + + for (int i = 0; i < attrs.size(); i++) { + if (i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY)) { + builder.append(VCFConstants.GENOTYPE_FIELD_SEPARATOR); + } + builder.append(attrs.get(i)); + } + } + } + + /* + * Create the info string; assumes that no values are null + */ + private void writeInfoString(final Map infoFields, final StringBuilder builder) { + if (infoFields.isEmpty()) { + builder.append(VCFConstants.EMPTY_INFO_FIELD); + return; + } + + boolean isFirst = true; + for (final Map.Entry entry : infoFields.entrySet()) { + if (isFirst) isFirst = false; + else builder.append(VCFConstants.INFO_FIELD_SEPARATOR); + + builder.append(entry.getKey()); + + if (!entry.getValue().equals("")) { + final VCFInfoHeaderLine metaData = this.header.getInfoHeaderLine(entry.getKey()); + if (metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0) { + builder.append('='); + builder.append(entry.getValue()); + } + } + } + } + + public Map buildAlleleStrings(final VariantContext vc) { + final Map alleleMap = new HashMap(vc.getAlleles().size() + 1); + alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup + + final List alleles = vc.getAlleles(); + for (int i = 0; i < alleles.size(); i++) { + alleleMap.put(alleles.get(i), String.valueOf(i)); + } + + return alleleMap; + } + + private void writeAllele(final Allele allele, final Map alleleMap, final StringBuilder builder) { + final String encoding = alleleMap.get(allele); + if (encoding == null) + throw new RuntimeException("Allele " + allele + " is not an allele in the variant context"); + builder.append(encoding); + } } From 4626fe66c680c4fc65696b462391d695a8e65e71 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 9 Jun 2017 13:24:19 -0400 Subject: [PATCH 05/23] updating artifactory link to point to new artifactory (#892) * the Broad artifactory moved from a self hosted repository at artifactory.broadinstitute.org to a web based artifactory at broadinstitute.jfrog.io/broadinstitute/ this was causing our snapshot uploads to fail * updating our snapshot repository to point to the new link * the old artifactory has a redirect in place to the new artifactory so artifacts relying on the old url should still resolve. For unknown reasons this redirect doesn't work with gradle uploads, probably a gradle bug. --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index a60afe5f5..96811bbeb 100644 --- a/build.gradle +++ b/build.gradle @@ -169,7 +169,7 @@ uploadArchives { authentication(userName: project.findProperty("sonatypeUsername"), password: project.findProperty("sonatypePassword")) } - snapshotRepository(url: "https://artifactory.broadinstitute.org/artifactory/libs-snapshot-local/") { + snapshotRepository(url: "https://broadinstitute.jfrog.io/broadinstitute/libs-snapshot-local/") { authentication(userName: System.env.ARTIFACTORY_USERNAME, password: System.env.ARTIFACTORY_PASSWORD) } From 6383b25e76bc05a29852965472ab0091421365bb Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 9 Jun 2017 14:14:11 -0400 Subject: [PATCH 06/23] Fixed flipped sign when comparing read paired flags (#897) Fixed flipped sign when comparing read paired flags --- .../htsjdk/samtools/DuplicateScoringStrategy.java | 4 ++-- .../samtools/DuplicateScoringStrategyTest.java | 25 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java diff --git a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java index 292ab0654..26c83a584 100644 --- a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java +++ b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java @@ -36,7 +36,7 @@ public enum ScoringStrategy { SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH, - RANDOM, + RANDOM } /** Hash used for the RANDOM scoring strategy. */ @@ -128,7 +128,7 @@ public static int compare(final SAMRecord rec1, final SAMRecord rec2, final Scor int cmp; // always prefer paired over non-paired - if (rec1.getReadPairedFlag() != rec2.getReadPairedFlag()) return rec1.getReadPairedFlag() ? 1 : -1; + if (rec1.getReadPairedFlag() != rec2.getReadPairedFlag()) return rec1.getReadPairedFlag() ? -1 : 1; cmp = computeDuplicateScore(rec2, scoringStrategy, assumeMateCigar) - computeDuplicateScore(rec1, scoringStrategy, assumeMateCigar); diff --git a/src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java b/src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java new file mode 100644 index 000000000..5943535af --- /dev/null +++ b/src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java @@ -0,0 +1,25 @@ +package htsjdk.samtools; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class DuplicateScoringStrategyTest { + + @DataProvider + public Object [][] compareData() { + return new Object[][]{ + {SAMFlag.READ_PAIRED.flag, 0, true, DuplicateScoringStrategy.ScoringStrategy.RANDOM, -1}, + {0, SAMFlag.READ_PAIRED.flag, true, DuplicateScoringStrategy.ScoringStrategy.RANDOM, 1}, + }; + } + + @Test(dataProvider = "compareData") + public static void testCompare(final int samFlag1, final int samFlag2, final boolean assumeMateCigar, final DuplicateScoringStrategy.ScoringStrategy strategy, final int expected) { + final SAMRecord rec1 = new SAMRecordSetBuilder().addFrag("test", 0, 1, false, false, "36M", null, 2); + rec1.setFlags(samFlag1); + final SAMRecord rec2 = new SAMRecordSetBuilder().addFrag("test", 0, 1, true, false, "36M", null, 3); + rec2.setFlags(samFlag2); + Assert.assertEquals(DuplicateScoringStrategy.compare(rec1, rec2, strategy, assumeMateCigar), expected); + } +} \ No newline at end of file From 9f1f19111aaa9502211510b7bc6ddc5a2039f6fa Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Fri, 9 Jun 2017 11:18:40 -0700 Subject: [PATCH 07/23] Standardizes mate info computation in SAMRecordSetBuilder. (#692) --- src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java b/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java index 60aae473a..b55265f71 100644 --- a/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java +++ b/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java @@ -363,13 +363,8 @@ public void addPair(final String name, final int contig, final int start1, final end1.setMappingQuality(255); end1.setReadPairedFlag(true); end1.setProperPairFlag(true); - end1.setMateReferenceIndex(contig); - end1.setAttribute(SAMTag.MC.name(), readLength + "M"); - end1.setMateAlignmentStart(start2); - end1.setMateNegativeStrandFlag(true); end1.setFirstOfPairFlag(end1IsFirstOfPair); end1.setSecondOfPairFlag(!end1IsFirstOfPair); - end1.setInferredInsertSize((int) CoordMath.getLength(start1, CoordMath.getEnd(start2, this.readLength))); end1.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); if (programRecord != null) { end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); @@ -388,13 +383,8 @@ public void addPair(final String name, final int contig, final int start1, final end2.setMappingQuality(255); end2.setReadPairedFlag(true); end2.setProperPairFlag(true); - end2.setMateReferenceIndex(contig); - end2.setAttribute(SAMTag.MC.name(), readLength + "M"); - end2.setMateAlignmentStart(start1); - end2.setMateNegativeStrandFlag(false); end2.setFirstOfPairFlag(!end1IsFirstOfPair); end2.setSecondOfPairFlag(end1IsFirstOfPair); - end2.setInferredInsertSize(end1.getInferredInsertSize()); end2.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); if (programRecord != null) { end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); @@ -404,6 +394,9 @@ public void addPair(final String name, final int contig, final int start1, final } fillInBasesAndQualities(end2); + // set mate info + SamPairUtil.setMateInfo(end1, end2, true); + this.records.add(end1); this.records.add(end2); } @@ -492,7 +485,7 @@ public void addUnmappedPair(final String name) { end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); } if (this.unmappedHasBasesAndQualities) { - fillInBasesAndQualities(end1); + fillInBasesAndQualities(end1); } end2.setReadName(name); @@ -508,7 +501,7 @@ public void addUnmappedPair(final String name) { end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); } if (this.unmappedHasBasesAndQualities) { - fillInBasesAndQualities(end2); + fillInBasesAndQualities(end2); } this.records.add(end1); From cc538e2832310d9a7e1cdada61c3540b92ac275f Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 10 Jun 2017 13:26:23 -0400 Subject: [PATCH 08/23] Typekpb patch 1 (replacing #725) (#883) * seek() checks for negative position argument value * tests --- .../seekablestream/ByteArraySeekableStream.java | 23 ++-- .../ByteArraySeekableStreamTest.java | 116 +++++++++++++++++++++ .../java/htsjdk/samtools/sra/AbstractSRATest.java | 4 +- 3 files changed, 132 insertions(+), 11 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/seekablestream/ByteArraySeekableStreamTest.java diff --git a/src/main/java/htsjdk/samtools/seekablestream/ByteArraySeekableStream.java b/src/main/java/htsjdk/samtools/seekablestream/ByteArraySeekableStream.java index 4f8c322c5..bb3b95af0 100644 --- a/src/main/java/htsjdk/samtools/seekablestream/ByteArraySeekableStream.java +++ b/src/main/java/htsjdk/samtools/seekablestream/ByteArraySeekableStream.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2016 The Broad Institute + * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,13 +24,11 @@ package htsjdk.samtools.seekablestream; -import htsjdk.samtools.seekablestream.SeekableStream; - import java.io.IOException; /** -* Created by vadim on 23/03/2015. -*/ + * Created by vadim on 23/03/2015. + */ public class ByteArraySeekableStream extends SeekableStream { private byte[] bytes; private long position = 0; @@ -51,21 +49,27 @@ public long position() throws IOException { @Override public void seek(long position) throws IOException { - this.position = position; + if (position < 0) { + throw new IllegalArgumentException("Cannot seek to a negative position, position=" + position + "."); + } else { + this.position = position; + } } @Override public int read() throws IOException { - if (position < bytes.length) + if (position < bytes.length) { return 0xFF & bytes[((int) position++)]; - else return -1; + } else { + return -1; + } } @Override public int read(byte[] b, int off, int len) throws IOException { if (b == null) { throw new NullPointerException(); - } else if (off < 0 || len < 0 || len > b.length - off) { + } else if (off < 0 || len < 0 || len + off > b.length) { throw new IndexOutOfBoundsException(); } if (position >= bytes.length) { @@ -85,6 +89,7 @@ public int read(byte[] b, int off, int len) throws IOException { @Override public void close() throws IOException { bytes = null; + position = -1; } @Override diff --git a/src/test/java/htsjdk/samtools/seekablestream/ByteArraySeekableStreamTest.java b/src/test/java/htsjdk/samtools/seekablestream/ByteArraySeekableStreamTest.java new file mode 100644 index 000000000..04a228f94 --- /dev/null +++ b/src/test/java/htsjdk/samtools/seekablestream/ByteArraySeekableStreamTest.java @@ -0,0 +1,116 @@ +/* + * The MIT License + * + * Copyright (c) 2017 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + */ + +package htsjdk.samtools.seekablestream; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.IOException; + +/** + * Created by farjoun on 5/27/17. + */ +public class ByteArraySeekableStreamTest extends HtsjdkTest { + private final byte[] bytes = "ABCDE12345".getBytes(); + + @Test + public void testNormalBehavior() throws IOException { + ByteArraySeekableStream byteArraySeekableStream = new ByteArraySeekableStream(bytes); + + Assert.assertEquals(byteArraySeekableStream.length(), 10); + for (int i = 0; i < 10; i++) { + Assert.assertFalse(byteArraySeekableStream.eof()); + Assert.assertEquals(byteArraySeekableStream.position(), i); + Assert.assertEquals(byteArraySeekableStream.read(), bytes[i]); + } + + Assert.assertTrue(byteArraySeekableStream.eof()); + Assert.assertEquals(byteArraySeekableStream.position(), 10); + Assert.assertEquals(byteArraySeekableStream.read(), -1); + + final long i = 0; + byteArraySeekableStream.seek(i); + + Assert.assertEquals(byteArraySeekableStream.position(), i); + Assert.assertEquals(byteArraySeekableStream.read(), bytes[(int) i]); + + byte[] copy = new byte[10]; + + Assert.assertEquals(byteArraySeekableStream.read(copy), 9); + Assert.assertEquals(byteArraySeekableStream.position(), 10); + + byteArraySeekableStream.seek(0L); + + Assert.assertEquals(byteArraySeekableStream.read(copy), 10); + Assert.assertEquals(byteArraySeekableStream.position(), 10); + + Assert.assertEquals(copy, bytes); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCantSeekNegative() throws IOException { + + ByteArraySeekableStream byteArraySeekableStream = new ByteArraySeekableStream(bytes); + + byteArraySeekableStream.seek(-1L); + + // if allowed to seek, this will throw OutOfBounds + final int f = byteArraySeekableStream.read(); + } + + @Test + public void testCantReadPostEof() throws IOException { + + ByteArraySeekableStream byteArraySeekableStream = new ByteArraySeekableStream(bytes); + byte[] copy = new byte[10]; + + byteArraySeekableStream.seek(10); + Assert.assertEquals(byteArraySeekableStream.read(copy), -1); + Assert.assertEquals(byteArraySeekableStream.read(), -1); + } + + @DataProvider(name = "abnormalReadRequests") + public Object[][] abnormalReadRequestsProvider() { + return new Object[][]{ + {new byte[10], -1, 0}, + {new byte[10], -1, -1}, + {new byte[10], 0, -1}, + {new byte[10], 0, -1}, + {new byte[10], 0, 11}, + {new byte[10], 6, 6}, + {new byte[10], 11, 0}, + }; + } + + @Test(dataProvider = "abnormalReadRequests", expectedExceptions = IndexOutOfBoundsException.class) + public void testAbnormalReadRequest(final byte[] b, final int off, final int length) throws IOException { + + ByteArraySeekableStream byteArraySeekableStream = new ByteArraySeekableStream(bytes); + int i = byteArraySeekableStream.read(b, off, length); + + Assert.assertEquals(i, -2); ///impossible + } +} diff --git a/src/test/java/htsjdk/samtools/sra/AbstractSRATest.java b/src/test/java/htsjdk/samtools/sra/AbstractSRATest.java index 1776a9f91..eeba1d2ea 100644 --- a/src/test/java/htsjdk/samtools/sra/AbstractSRATest.java +++ b/src/test/java/htsjdk/samtools/sra/AbstractSRATest.java @@ -25,14 +25,14 @@ public final void checkIfCanResolve() { canResolveNetworkAccession = SRAAccession.isValid(checkAccession); } - @BeforeMethod + @BeforeMethod(groups = "sra") public final void assertSRAIsSupported() { if(SRAAccession.checkIfInitialized() != null){ throw new SkipException("Skipping SRA Test because SRA native code is unavailable."); } } - @BeforeMethod + @BeforeMethod(groups = "sra") public final void skipIfCantResolve(Method method, Object[] params) { String accession = null; From 04fd13e75a180573c58818f37037b78a49717657 Mon Sep 17 00:00:00 2001 From: Tim Fennell Date: Fri, 16 Jun 2017 17:25:01 -0400 Subject: [PATCH 09/23] A little more caution when checking terminator blocks on close of BlockCompressedOutputStream. (#901) --- src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java b/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java index 4e9a59487..a1fc6c80a 100644 --- a/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java +++ b/src/main/java/htsjdk/samtools/util/BlockCompressedOutputStream.java @@ -28,6 +28,7 @@ import java.io.File; import java.io.IOException; import java.io.OutputStream; +import java.nio.file.Files; import java.util.zip.CRC32; import java.util.zip.Deflater; @@ -282,7 +283,7 @@ public void close() throws IOException { codec.writeBytes(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); codec.close(); // Can't re-open something that is not a regular file, e.g. a named pipe or an output stream - if (this.file == null || !this.file.isFile()) return; + if (this.file == null || !this.file.isFile() || !Files.isRegularFile(this.file.toPath())) return; if (BlockCompressedInputStream.checkTermination(this.file) != BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) { throw new IOException("Terminator block not found after closing BGZF file " + this.file); From 8f89e27588416b8057470d875b87ccbde7313337 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 21 Jun 2017 13:44:59 -0700 Subject: [PATCH 10/23] Sort and group order not updated when using setAttribute. (#905) --- .../htsjdk/samtools/AbstractSAMHeaderRecord.java | 3 +- src/main/java/htsjdk/samtools/SAMFileHeader.java | 39 ++++++++++++- .../java/htsjdk/samtools/SAMFileHeaderTest.java | 64 ++++++++++++++++++++++ 3 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/SAMFileHeaderTest.java diff --git a/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java b/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java index 7078bf1dc..0c3d48420 100644 --- a/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java +++ b/src/main/java/htsjdk/samtools/AbstractSAMHeaderRecord.java @@ -59,8 +59,6 @@ public void setAttribute(final String key, final Object value) { /** * Set the given value for the attribute named 'key'. Replaces an existing value, if any. * If value is null, the attribute is removed. - * Supported types are Character, Integer, Float and String. Byte and Short may also be - * passed in but they will be converted to Integer. * @param key attribute name * @param value attribute value */ @@ -71,6 +69,7 @@ public void setAttribute(final String key, final String value) { mAttributes.put(key, value); } } + /** * Returns the Set of attributes. */ diff --git a/src/main/java/htsjdk/samtools/SAMFileHeader.java b/src/main/java/htsjdk/samtools/SAMFileHeader.java index f2750d4cc..eff595341 100644 --- a/src/main/java/htsjdk/samtools/SAMFileHeader.java +++ b/src/main/java/htsjdk/samtools/SAMFileHeader.java @@ -270,7 +270,7 @@ public SortOrder getSortOrder() { public void setSortOrder(final SortOrder so) { sortOrder = so; - setAttribute(SORT_ORDER_TAG, so.name()); + super.setAttribute(SORT_ORDER_TAG, so.name()); } public GroupOrder getGroupOrder() { @@ -292,7 +292,42 @@ public GroupOrder getGroupOrder() { public void setGroupOrder(final GroupOrder go) { groupOrder = go; - setAttribute(GROUP_ORDER_TAG, go.name()); + super.setAttribute(GROUP_ORDER_TAG, go.name()); + } + + + /** + * Set the given value for the attribute named 'key'. Replaces an existing value, if any. + * If value is null, the attribute is removed. + * Otherwise, the value will be converted to a String with toString. + * @param key attribute name + * @param value attribute value + * @deprecated Use {@link #setAttribute(String, String) instead + */ + @Deprecated + @Override + public void setAttribute(final String key, final Object value) { + if (key.equals(SORT_ORDER_TAG) || key.equals(GROUP_ORDER_TAG)) { + this.setAttribute(key, value.toString()); + } else { + super.setAttribute(key, value); + } + } + + /** + * Set the given value for the attribute named 'key'. Replaces an existing value, if any. + * If value is null, the attribute is removed. + * @param key attribute name + * @param value attribute value + */ + @Override + public void setAttribute(final String key, final String value) { + if (key.equals(SORT_ORDER_TAG)) { + this.sortOrder = null; + } else if (key.equals(GROUP_ORDER_TAG)) { + this.groupOrder = null; + } + super.setAttribute(key, value); } /** diff --git a/src/test/java/htsjdk/samtools/SAMFileHeaderTest.java b/src/test/java/htsjdk/samtools/SAMFileHeaderTest.java new file mode 100644 index 000000000..0723ed9e4 --- /dev/null +++ b/src/test/java/htsjdk/samtools/SAMFileHeaderTest.java @@ -0,0 +1,64 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Nils Homer + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + */ +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class SAMFileHeaderTest extends HtsjdkTest { + + @Test + public void testSortOrder() { + final SAMFileHeader header = new SAMFileHeader(); + + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getAttribute(SAMFileHeader.SORT_ORDER_TAG), SAMFileHeader.SortOrder.coordinate.name()); + + header.setAttribute(SAMFileHeader.SORT_ORDER_TAG, SAMFileHeader.SortOrder.queryname.name()); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.queryname); + Assert.assertEquals(header.getAttribute(SAMFileHeader.SORT_ORDER_TAG), SAMFileHeader.SortOrder.queryname.name()); + + header.setAttribute(SAMFileHeader.SORT_ORDER_TAG, SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.coordinate); + Assert.assertEquals(header.getAttribute(SAMFileHeader.SORT_ORDER_TAG), SAMFileHeader.SortOrder.coordinate.name()); + } + + @Test + public void testGroupOrder() { + final SAMFileHeader header = new SAMFileHeader(); + + header.setGroupOrder(SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getGroupOrder(), SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getAttribute(SAMFileHeader.GROUP_ORDER_TAG), SAMFileHeader.GroupOrder.query.name()); + + header.setAttribute(SAMFileHeader.GROUP_ORDER_TAG, SAMFileHeader.GroupOrder.reference.name()); + Assert.assertEquals(header.getGroupOrder(), SAMFileHeader.GroupOrder.reference); + Assert.assertEquals(header.getAttribute(SAMFileHeader.GROUP_ORDER_TAG), SAMFileHeader.GroupOrder.reference.name()); + + header.setAttribute(SAMFileHeader.GROUP_ORDER_TAG, SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getGroupOrder(), SAMFileHeader.GroupOrder.query); + Assert.assertEquals(header.getAttribute(SAMFileHeader.GROUP_ORDER_TAG), SAMFileHeader.GroupOrder.query.name()); + } +} From c20e0edd361189c3f2bc3718b018dac2ec90530b Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Sun, 25 Jun 2017 22:35:24 -0400 Subject: [PATCH 11/23] Add more checks to SamFileValidator (#907) --- src/main/java/htsjdk/samtools/SAMRecord.java | 85 +++++++++++++++------- .../htsjdk/samtools/SAMSequenceDictionary.java | 50 ++++++++++--- src/main/java/htsjdk/samtools/SAMTestUtil.java | 50 ++++--------- .../java/htsjdk/samtools/SAMValidationError.java | 17 ++++- .../java/htsjdk/samtools/SamFileValidator.java | 42 ++++++----- .../java/htsjdk/samtools/SAMRecordUnitTest.java | 2 +- .../htsjdk/samtools/SAMSequenceDictionaryTest.java | 24 ++++++ .../htsjdk/samtools/SAMSequenceRecordTest.java | 45 +++++++++++- .../java/htsjdk/samtools/ValidateSamFileTest.java | 25 +++++-- .../cram/lossy/QualityScorePreservationTest.java | 2 +- .../htsjdk/samtools/util/SequenceUtilTest.java | 4 +- .../SequenceUtil/upper_and_lowercase_read.sam | 2 +- .../ValidateSamFileTest/seq_qual_len_mismatch.sam | 21 ++++++ 13 files changed, 261 insertions(+), 108 deletions(-) create mode 100644 src/test/resources/htsjdk/samtools/ValidateSamFileTest/seq_qual_len_mismatch.sam diff --git a/src/main/java/htsjdk/samtools/SAMRecord.java b/src/main/java/htsjdk/samtools/SAMRecord.java index f93b2d72c..ec394ca17 100644 --- a/src/main/java/htsjdk/samtools/SAMRecord.java +++ b/src/main/java/htsjdk/samtools/SAMRecord.java @@ -1519,7 +1519,7 @@ public SAMTagAndValue(final String tag, final Object value) { */ public List getAttributes() { SAMBinaryTagAndValue binaryAttributes = getBinaryAttributes(); - final List ret = new ArrayList(); + final List ret = new ArrayList<>(); while (binaryAttributes != null) { ret.add(new SAMTagAndValue(SAMTagUtil.getSingleton().makeStringTag(binaryAttributes.tag), binaryAttributes.value)); @@ -1769,7 +1769,7 @@ protected void eagerDecode() { /** * Run all validations of CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. - * @param recordNumber For error reporting. -1 if not known. + * @param recordNumber For error reporting, the record number in the SAM/BAM file. -1 if not known. * @return List of errors, or null if no errors. */ public List validateCigar(final long recordNumber) { @@ -1878,35 +1878,40 @@ public int hashCode() { ArrayList ret = null; if (!getReadPairedFlag()) { if (getProperPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getMateUnmappedFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_UNMAPPED, "Mate unmapped flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getMateNegativeStrandFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_MATE_NEG_STRAND, "Mate negative strand flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getFirstOfPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_FIRST_OF_PAIR, "First of pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (getSecondOfPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_SECOND_OF_PAIR, "Second of pair flag should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } if (null != getHeader() && getMateReferenceIndex() != NO_ALIGNMENT_REFERENCE_INDEX) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MATE_REF_INDEX, "MRNM should not be set for unpaired read.", getReadName())); if (firstOnly) return ret; } + if (!getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_UNPAIRED_MATE_REFERENCE, "Unpaired read mate reference is " + getMateReferenceName() + " not " + SAMRecord.NO_ALIGNMENT_REFERENCE_NAME + " for unpaired read", getReadName())); + if (firstOnly) return ret; + } } else { final List errors = isValidReferenceIndexAndPosition(mMateReferenceIndex, mMateReferenceName, getMateAlignmentStart(), true, firstOnly); @@ -1937,23 +1942,23 @@ public int hashCode() { */ } if (getInferredInsertSize() > MAX_INSERT_SIZE || getInferredInsertSize() < -MAX_INSERT_SIZE) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INSERT_SIZE, "Insert size out of range", getReadName())); if (firstOnly) return ret; } if (getReadUnmappedFlag()) { if (getNotPrimaryAlignmentFlag()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_NOT_PRIM_ALIGNMENT, "Not primary alignment flag should not be set for unmapped read.", getReadName())); if (firstOnly) return ret; } if (getSupplementaryAlignmentFlag()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_SUPPLEMENTARY_ALIGNMENT, "Supplementary alignment flag should not be set for unmapped read.", getReadName())); if (firstOnly) return ret; } if (getMappingQuality() != 0) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ should be 0 for unmapped read.", getReadName())); if (firstOnly) return ret; } @@ -1962,22 +1967,22 @@ public int hashCode() { TODO: PIC-97 This validation should be enabled, but probably at this point there are too many BAM files that have the proper pair flag set when read or mate is unmapped. if (getProperPairFlagUnchecked()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_PROPER_PAIR, "Proper pair flag should not be set for unmapped read.", getReadName())); } */ } else { if (getMappingQuality() >= 256) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_MAPPING_QUALITY, "MAPQ should be < 256.", getReadName())); if (firstOnly) return ret; } if (getCigarLength() == 0) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR should have > zero elements for mapped read.", getReadName())); /* todo - will uncomment once unit tests are added } else if (getCigar().getReadLength() != getReadLength()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_CIGAR, "CIGAR read length " + getCigar().getReadLength() + " doesn't match read length " + getReadLength(), getReadName())); */ if (firstOnly) return ret; @@ -1988,7 +1993,7 @@ public int hashCode() { if (firstOnly) return ret; } if (!hasReferenceName()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_FLAG_READ_UNMAPPED, "Mapped read should have valid reference name", getReadName())); if (firstOnly) return ret; } @@ -2006,14 +2011,14 @@ public int hashCode() { // Validate the RG ID is found in header final String rgId = (String)getAttribute(SAMTagUtil.getSingleton().RG); if (rgId != null && getHeader() != null && getHeader().getReadGroup(rgId) == null) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.READ_GROUP_NOT_FOUND, "RG ID on SAMRecord not found in header: " + rgId, getReadName())); if (firstOnly) return ret; } final List errors = isValidReferenceIndexAndPosition(mReferenceIndex, mReferenceName, getAlignmentStart(), false); if (errors != null) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.addAll(errors); if (firstOnly) return ret; } @@ -2024,7 +2029,7 @@ public int hashCode() { final String cq = (String)getAttribute(SAMTagUtil.getSingleton().CQ); final String cs = (String)getAttribute(SAMTagUtil.getSingleton().CS); if (cq == null || cq.isEmpty() || cs == null || cs.isEmpty()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, "Zero-length read without FZ, CS or CQ tag", getReadName())); if (firstOnly) return ret; @@ -2038,7 +2043,7 @@ public int hashCode() { } } if (!hasIndel) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.EMPTY_READ, "Colorspace read with zero-length bases but no indel", getReadName())); if (firstOnly) return ret; @@ -2047,7 +2052,7 @@ public int hashCode() { } } if (this.getReadLength() != getBaseQualities().length && !Arrays.equals(getBaseQualities(), NULL_QUALS)) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.MISMATCH_READ_LENGTH_AND_QUALS_LENGTH, "Read length does not match quals length", getReadName())); if (firstOnly) return ret; @@ -2055,13 +2060,39 @@ public int hashCode() { if (this.getAlignmentStart() != NO_ALIGNMENT_START && this.getIndexingBin() != null && this.computeIndexingBin() != this.getIndexingBin()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_INDEXING_BIN, "bin field of BAM record does not equal value computed based on alignment start and end, and length of sequence to which read is aligned", getReadName())); if (firstOnly) return ret; } + if (getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME) && + getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_UNALIGNED_MATE_START, + "The unaligned mate start position is " + getAlignmentStart() + ", should be " + SAMRecord.NO_ALIGNMENT_START, + getReadName())); + if (firstOnly) return ret; + } + + if (getCigar().getReadLength() != 0 && getCigar().getReadLength() != getReadLength()) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError(SAMValidationError.Type.MISMATCH_CIGAR_SEQ_LENGTH, + "CIGAR covers " + getCigar().getReadLength() + " bases but the sequence is " + getReadLength() + " read bases ", + getReadName())); + if (firstOnly) return ret; + } + + if (getBaseQualities().length != 0 && getReadLength() != getBaseQualities().length) { + if (ret == null) ret = new ArrayList<>(); + ret.add(new SAMValidationError( + SAMValidationError.Type.MISMATCH_SEQ_QUAL_LENGTH, + "Read length is " + getReadLength() + " bases but have " + mBaseQualities.length + " qualities ", + getReadName())); + if (firstOnly) return ret; + } + if (ret == null || ret.isEmpty()) { return null; } @@ -2099,13 +2130,13 @@ protected void setFileSource(final SAMFileSource fileSource) { ArrayList ret = null; if (!hasReference) { if (alignmentStart != 0) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should be 0 because reference name = *.", isMate), getReadName())); if (firstOnly) return ret; } } else { if (alignmentStart == 0) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start should != 0 because reference name != *.", isMate), getReadName())); if (firstOnly) return ret; } @@ -2113,12 +2144,12 @@ protected void setFileSource(final SAMFileSource fileSource) { final SAMSequenceRecord sequence = (referenceIndex != null? getHeader().getSequence(referenceIndex): getHeader().getSequence(referenceName)); if (sequence == null) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_REFERENCE_INDEX, buildMessage("Reference sequence not found in sequence dictionary.", isMate), getReadName())); if (firstOnly) return ret; } else { if (alignmentStart > sequence.getSequenceLength()) { - if (ret == null) ret = new ArrayList(); + if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.INVALID_ALIGNMENT_START, buildMessage("Alignment start (" + alignmentStart + ") must be <= reference sequence length (" + sequence.getSequenceLength() + ") on reference " + sequence.getSequenceName(), isMate), getReadName())); if (firstOnly) return ret; diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index b7744d796..86ffa6c9f 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -29,7 +29,6 @@ import java.math.BigInteger; import java.security.MessageDigest; import java.util.*; -import java.util.stream.Collector; import java.util.stream.Collectors; import javax.xml.bind.annotation.XmlElement; @@ -50,8 +49,8 @@ getter because the later wraps the list into an unmodifiable List see http://tech.joshuacummings.com/2010/10/problems-with-defensive-collection.html */ @XmlElement(name="Reference") - private List mSequences = new ArrayList(); - private final Map mSequenceMap = new HashMap(); + private List mSequences = new ArrayList<>(); + private final Map mSequenceMap = new HashMap<>(); public SAMSequenceDictionary() { } @@ -150,7 +149,7 @@ public boolean isEmpty() { private static String DICT_MISMATCH_TEMPLATE = "SAM dictionaries are not the same: %s."; /** * Non-comprehensive {@link #equals(Object)}-assertion: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent - * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call + * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}. * Aliases are ignored. * @@ -161,20 +160,49 @@ public void assertSameDictionary(final SAMSequenceDictionary that) { final Iterator thatSequences = that.mSequences.iterator(); for (final SAMSequenceRecord thisSequence : mSequences) { - if (!thatSequences.hasNext()) + if (!thatSequences.hasNext()) { throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thisSequence + " is present in only one dictionary")); - else { + } else { final SAMSequenceRecord thatSequence = thatSequences.next(); - if(!thatSequence.isSameSequence(thisSequence)) + if(!thatSequence.isSameSequence(thisSequence)) { throw new AssertionError( String.format(DICT_MISMATCH_TEMPLATE, thatSequence + " was found when " + thisSequence + " was expected") ); + } } } if (thatSequences.hasNext()) throw new AssertionError(String.format(DICT_MISMATCH_TEMPLATE, thatSequences.next() + " is present in only one dictionary")); } + /** + * Non-comprehensive {@link #equals(Object)}-validation: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent + * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call + * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}. + * + * @param that {@link SAMSequenceDictionary} to compare against + * @return true if the dictionaries are the same, false otherwise + * + */ + public boolean isSameDictionary(final SAMSequenceDictionary that) { + if (that == null || that.mSequences == null) return false; + if (this == that) return true; + + final Iterator thatSequences = that.mSequences.iterator(); + for (final SAMSequenceRecord thisSequence : mSequences) { + if (!thatSequences.hasNext()) { + return false; + } else { + final SAMSequenceRecord thatSequence = thatSequences.next(); + if (!thatSequence.isSameSequence(thisSequence)) { + return false; + } + } + } + + return !thatSequences.hasNext(); + } + /** returns true if the two dictionaries are the same, aliases are NOT considered */ @Override public boolean equals(Object o) { @@ -183,9 +211,7 @@ public boolean equals(Object o) { SAMSequenceDictionary that = (SAMSequenceDictionary) o; - if (!mSequences.equals(that.mSequences)) return false; - - return true; + return mSequences.equals(that.mSequences); } /** @@ -318,8 +344,8 @@ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionar finalDict.addSequence(sMerged); final Set allTags = new HashSet<>(); - s1.getAttributes().stream().forEach(a -> allTags.add(a.getKey())); - s2.getAttributes().stream().forEach(a -> allTags.add(a.getKey())); + s1.getAttributes().forEach(a -> allTags.add(a.getKey())); + s2.getAttributes().forEach(a -> allTags.add(a.getKey())); for (final String tag : allTags) { final String value1 = s1.getAttribute(tag); diff --git a/src/main/java/htsjdk/samtools/SAMTestUtil.java b/src/main/java/htsjdk/samtools/SAMTestUtil.java index 83766f367..ec85ce2da 100644 --- a/src/main/java/htsjdk/samtools/SAMTestUtil.java +++ b/src/main/java/htsjdk/samtools/SAMTestUtil.java @@ -23,6 +23,8 @@ */ package htsjdk.samtools; +import java.util.List; + /** * Misc methods for SAM-related unit tests. These are in the src tree rather than the tests tree * so that they will be included in sam.jar, and therefore can be used by tests outside of htsjdk.samtools. @@ -55,47 +57,21 @@ public void assertPairValid(final SAMRecord firstEnd, final SAMRecord secondEnd) } /** - * Basic sanity check for a SAMRecord. - * @throws SanityCheckFailedException if the sanity check failed + * Basic sanity check for a SAMRecord. Print errors to screen. + * @param read SAM record + * @throws IllegalArgumentException if read is null + * @throws SanityCheckFailedException if errors */ - public void assertReadValid(final SAMRecord read) throws SanityCheckFailedException { - assertEquals(read.getReadBases().length, read.getBaseQualities().length); - // Note that it is possible to have an unmapped read that has a coordinate - if (read.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { - assertEquals(read.getAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - assertTrue(read.getReadUnmappedFlag()); - } else { - assertNotSame(read.getAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - } - if (read.getReadUnmappedFlag()) { - assertEquals(read.getMappingQuality(), SAMRecord.NO_MAPPING_QUALITY); - assertEquals(read.getCigar().getCigarElements().size(), 0); - } else { - assertNotSame(read.getCigar().getCigarElements(), 0); + public static void assertReadValid(final SAMRecord read) throws SanityCheckFailedException { + if (read == null) { + throw new IllegalArgumentException("SAMRecord is null"); } - if (read.getReadPairedFlag()) { - if (read.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { - assertEquals(read.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - assertTrue(read.getMateUnmappedFlag()); - } else { - // Even if the mate is unmapped, if it has a reference name, it should have a position. - assertNotSame(read.getMateAlignmentStart(), SAMRecord.NO_ALIGNMENT_START); - } - if (read.getReadUnmappedFlag() || read.getMateUnmappedFlag() || - !read.getReferenceName().equals(read.getMateReferenceName())) { - assertEquals(read.getInferredInsertSize(), 0); - } else { - assertNotSame(read.getInferredInsertSize(), 0); - } - if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { - assertNotSame(read.getReadNegativeStrandFlag(), read.getMateNegativeStrandFlag()); - assertNotSame(read.getMateNegativeStrandFlag(), - read.getReadName()); - } - } else { - assertEquals(read.getInferredInsertSize(), 0); + final List errors = read.isValid(false); + if ( errors != null) { + errors.forEach(v -> System.out.println(v.toString())); } + assertTrue(errors.isEmpty()); } private static void assertEquals(T a, T b) { diff --git a/src/main/java/htsjdk/samtools/SAMValidationError.java b/src/main/java/htsjdk/samtools/SAMValidationError.java index 452e92cf5..edd49c13c 100644 --- a/src/main/java/htsjdk/samtools/SAMValidationError.java +++ b/src/main/java/htsjdk/samtools/SAMValidationError.java @@ -208,7 +208,22 @@ MISMATCH_MATE_CIGAR_STRING, /** There is a Cigar String (stored in the MC Tag) for a read whose mate is NOT mapped. */ - MATE_CIGAR_STRING_INVALID_PRESENCE; + MATE_CIGAR_STRING_INVALID_PRESENCE, + + /** The mate reference of the unpaired read should be "*" */ + INVALID_UNPAIRED_MATE_REFERENCE, + + /** The unaligned mate read start position should be 0 */ + INVALID_UNALIGNED_MATE_START, + + /** Mismatch between the number of bases covered by the CIGAR and sequence */ + MISMATCH_CIGAR_SEQ_LENGTH, + + /** Mismatch between the sequence and quality length */ + MISMATCH_SEQ_QUAL_LENGTH, + + /** Mismatch between file and sequence dictionaries */ + MISMATCH_FILE_SEQ_DICT; public final Severity severity; diff --git a/src/main/java/htsjdk/samtools/SamFileValidator.java b/src/main/java/htsjdk/samtools/SamFileValidator.java index d0b745e7f..3e316a235 100644 --- a/src/main/java/htsjdk/samtools/SamFileValidator.java +++ b/src/main/java/htsjdk/samtools/SamFileValidator.java @@ -88,6 +88,7 @@ private Histogram errorsByType; private PairEndInfoMap pairEndInfoByName; private ReferenceSequenceFileWalker refFileWalker; + private SAMSequenceDictionary samSequenceDictionary; private boolean verbose; private int maxVerboseOutput; private SAMSortOrderChecker orderChecker; @@ -154,7 +155,7 @@ public boolean validateSamFileSummary(final SamReader samReader, final Reference for (final Histogram.Bin bin : errorsByType.values()) { errorsAndWarningsByType.increment(bin.getId().getHistogramString(), bin.getValue()); } - final MetricsFile metricsFile = new MetricsFile(); + final MetricsFile metricsFile = new MetricsFile<>(); errorsByType.setBinLabel("Error Type"); errorsByType.setValueLabel("Count"); metricsFile.setHistogram(errorsAndWarningsByType); @@ -180,7 +181,7 @@ public boolean validateSamFileVerbose(final SamReader samReader, final Reference } catch (MaxOutputExceededException e) { out.println("Maximum output of [" + maxVerboseOutput + "] errors reached."); } - boolean result = errorsByType.isEmpty(); + final boolean result = errorsByType.isEmpty(); cleanup(); return result; } @@ -249,13 +250,13 @@ private void validateUnmatchedPairs() { // For the coordinate-sorted map, need to detect mate pairs in which the mateReferenceIndex on one end // does not match the readReference index on the other end, so the pairs weren't united and validated. inMemoryPairMap = new InMemoryPairEndInfoMap(); - CloseableIterator> it = ((CoordinateSortedPairEndInfoMap) pairEndInfoByName).iterator(); + final CloseableIterator> it = pairEndInfoByName.iterator(); while (it.hasNext()) { - Map.Entry entry = it.next(); - PairEndInfo pei = inMemoryPairMap.remove(entry.getValue().readReferenceIndex, entry.getKey()); + final Map.Entry entry = it.next(); + final PairEndInfo pei = inMemoryPairMap.remove(entry.getValue().readReferenceIndex, entry.getKey()); if (pei != null) { // Found a mismatch btw read.mateReferenceIndex and mate.readReferenceIndex - List errors = pei.validateMates(entry.getValue(), entry.getKey()); + final List errors = pei.validateMates(entry.getValue(), entry.getKey()); for (final SAMValidationError error : errors) { addError(error); } @@ -405,10 +406,7 @@ private void validateSecondaryBaseCalls(final SAMRecord record, final long recor } private boolean validateCigar(final SAMRecord record, final long recordNumber) { - if (record.getReadUnmappedFlag()) { - return true; - } - return validateCigar(record, recordNumber, true); + return record.getReadUnmappedFlag() || validateCigar(record, recordNumber, true); } private boolean validateMateCigar(final SAMRecord record, final long recordNumber) { @@ -458,6 +456,7 @@ private void init(final ReferenceSequenceFile reference, final SAMFileHeader hea } if (reference != null) { this.refFileWalker = new ReferenceSequenceFileWalker(reference); + this.samSequenceDictionary = reference.getSequenceDictionary(); } } @@ -525,6 +524,12 @@ private void validateHeader(final SAMFileHeader fileHeader) { } if (fileHeader.getSequenceDictionary().isEmpty()) { sequenceDictionaryEmptyAndNoWarningEmitted = true; + } else { + if (samSequenceDictionary != null) { + if (!fileHeader.getSequenceDictionary().isSameDictionary(samSequenceDictionary)) { + addError(new SAMValidationError(Type.MISMATCH_FILE_SEQ_DICT, "Mismatch between file and sequence dictionary", null)); + } + } } if (fileHeader.getReadGroups().isEmpty()) { addError(new SAMValidationError(Type.MISSING_READ_GROUP, "Read groups is empty", null)); @@ -540,7 +545,7 @@ private void validateHeader(final SAMFileHeader fileHeader) { } final List rgs = fileHeader.getReadGroups(); - final Set readGroupIDs = new HashSet(); + final Set readGroupIDs = new HashSet<>(); for (final SAMReadGroupRecord record : rgs) { final String readGroupID = record.getReadGroupId(); @@ -692,11 +697,10 @@ public PairEndInfo(final SAMRecord record, final long recordNumber) { this.firstOfPairFlag = record.getFirstOfPairFlag(); } - private PairEndInfo(int readAlignmentStart, int readReferenceIndex, boolean readNegStrandFlag, boolean readUnmappedFlag, - String readCigarString, - int mateAlignmentStart, int mateReferenceIndex, boolean mateNegStrandFlag, boolean mateUnmappedFlag, - String mateCigarString, - boolean firstOfPairFlag, long recordNumber) { + private PairEndInfo(final int readAlignmentStart, final int readReferenceIndex, final boolean readNegStrandFlag, final boolean readUnmappedFlag, + final String readCigarString, + final int mateAlignmentStart, final int mateReferenceIndex, final boolean mateNegStrandFlag, final boolean mateUnmappedFlag, + final String mateCigarString, final boolean firstOfPairFlag, final long recordNumber) { this.readAlignmentStart = readAlignmentStart; this.readReferenceIndex = readReferenceIndex; this.readNegStrandFlag = readNegStrandFlag; @@ -712,7 +716,7 @@ private PairEndInfo(int readAlignmentStart, int readReferenceIndex, boolean read } public List validateMates(final PairEndInfo mate, final String readName) { - final List errors = new ArrayList(); + final List errors = new ArrayList<>(); validateMateFields(this, mate, readName, errors); validateMateFields(mate, this, readName, errors); // Validations that should not be repeated on both ends @@ -789,7 +793,7 @@ private void validateMateFields(final PairEndInfo end1, final PairEndInfo end2, private class CoordinateSortedPairEndInfoMap implements PairEndInfoMap { private final CoordinateSortedPairInfoMap onDiskMap = - new CoordinateSortedPairInfoMap(maxTempFiles, new Codec()); + new CoordinateSortedPairInfoMap<>(maxTempFiles, new Codec()); @Override public void put(int mateReferenceIndex, String key, PairEndInfo value) { @@ -877,7 +881,7 @@ public void encode(final String key, final PairEndInfo record) { } private static class InMemoryPairEndInfoMap implements PairEndInfoMap { - private final Map map = new HashMap(); + private final Map map = new HashMap<>(); @Override public void put(int mateReferenceIndex, String key, PairEndInfo value) { diff --git a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java index 1bfe26345..5fa35f3e9 100644 --- a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java +++ b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java @@ -463,7 +463,7 @@ public void test_setAttribute_null_removes_tag() { } private SAMRecord createTestRecordHelper() { - return new SAMRecordSetBuilder().addFrag("test", 0, 1, false, false, "3S9M", null, 2); + return new SAMRecordSetBuilder().addFrag("test", 0, 1, false, false, "3S33M", null, 2); } @Test diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java index 8b1763067..a8e60ed50 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java @@ -39,6 +39,7 @@ import java.io.StringWriter; import java.util.Arrays; import java.util.Collections; +import java.util.List; public class SAMSequenceDictionaryTest extends HtsjdkTest { @Test @@ -142,4 +143,27 @@ public void testMergeDictionaries(final SAMSequenceRecord rec1, final SAMSequenc throw new Exception("Expected to not be able to merge dictionaries, but was able"); } } + + @DataProvider + public Object[][] testIsSameDictionaryData() { + + final SAMSequenceRecord rec1, rec2; + rec1 = new SAMSequenceRecord("chr1", 100); + rec2 = new SAMSequenceRecord("chr2", 101); + + return new Object[][]{ + new Object[]{Arrays.asList(rec1), Arrays.asList(rec1), true}, + new Object[]{Arrays.asList(rec1), Arrays.asList(rec2), false}, + new Object[]{Arrays.asList(rec1, rec2), Arrays.asList(rec1), false} + }; + } + + @Test(dataProvider = "testIsSameDictionaryData") + public void testIsSameDictionary(final List recs1, final List recs2, final boolean isSameDictionary) { + + final SAMSequenceDictionary dict1 = new SAMSequenceDictionary(recs1); + final SAMSequenceDictionary dict2 = new SAMSequenceDictionary(recs2); + + Assert.assertEquals(dict1.isSameDictionary(dict2), isSameDictionary); + } } diff --git a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java index 1035d1b10..e0c73d502 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java @@ -24,8 +24,11 @@ package htsjdk.samtools; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.Arrays; + /** * Test for SAMReadGroupRecordTest */ @@ -33,10 +36,50 @@ @Test public void testGetSAMString() { - SAMSequenceRecord r = new SAMSequenceRecord("chr5_but_without_a_prefix", 271828); + final SAMSequenceRecord r = new SAMSequenceRecord("chr5_but_without_a_prefix", 271828); r.setSpecies("Psephophorus terrypratchetti"); r.setAssembly("GRCt01"); r.setMd5("7a6dd3d307de916b477e7bf304ac22bc"); Assert.assertEquals("@SQ\tSN:chr5_but_without_a_prefix\tLN:271828\tSP:Psephophorus terrypratchetti\tAS:GRCt01\tM5:7a6dd3d307de916b477e7bf304ac22bc", r.getSAMString()); } + + @DataProvider + public Object[][] testIsSameSequenceData() { + final SAMSequenceRecord rec1 = new SAMSequenceRecord("chr1", 100); + final SAMSequenceRecord rec2 = new SAMSequenceRecord("chr2", 101); + final SAMSequenceRecord rec3 = new SAMSequenceRecord("chr3", 0); + final SAMSequenceRecord rec4 = new SAMSequenceRecord("chr1", 100); + + final String md5One = "1"; + final String md5Two = "2"; + final int index1 = 1; + final int index2 = 2; + + return new Object[][]{ + new Object[]{rec1, rec1, md5One, md5One, index1, index1, true}, + new Object[]{rec1, null, md5One, md5One, index1, index1, false}, + new Object[]{rec1, rec4, md5One, md5One, index1, index1, true}, + new Object[]{rec1, rec4, md5One, md5One, index1, index2, false}, + new Object[]{rec1, rec3, md5One, md5Two, index1, index1, false}, + new Object[]{rec1, rec2, md5One, md5Two, index1, index1, false}, + new Object[]{rec1, rec4, md5One, null, index1, index1, true}, + new Object[]{rec1, rec4, null, md5One, index1, index1, true}, + new Object[]{rec1, rec4, md5One, md5One, index1, index2, false} + }; + } + + @Test(dataProvider = "testIsSameSequenceData") + public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceRecord rec2, final String md5One, final String md5Two, + final int index1, final int index2, final boolean isSame) { + if (rec2 != null) { + rec2.setMd5(md5Two); + rec2.setSequenceIndex(index2); + } + + if (rec1 != null) { + rec1.setMd5(md5One); + rec1.setSequenceIndex(index1); + Assert.assertEquals(rec1.isSameSequence(rec2), isSame); + } + } } diff --git a/src/test/java/htsjdk/samtools/ValidateSamFileTest.java b/src/test/java/htsjdk/samtools/ValidateSamFileTest.java index 292758b8c..8aac6e2e3 100644 --- a/src/test/java/htsjdk/samtools/ValidateSamFileTest.java +++ b/src/test/java/htsjdk/samtools/ValidateSamFileTest.java @@ -147,6 +147,7 @@ public void testUnpairedRecords() throws IOException { Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_FLAG_FIRST_OF_PAIR.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_FLAG_SECOND_OF_PAIR.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_MATE_REF_INDEX.getHistogramString()).getValue(), 1.0); + Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_UNPAIRED_MATE_REFERENCE.getHistogramString()).getValue(), 1.0); } @Test @@ -173,6 +174,7 @@ public void testPairedRecords() throws IOException { Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_FLAG_MATE_UNMAPPED.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_MATE_ALIGNMENT_START.getHistogramString()).getValue(), 2.0); Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_MATE_REF_INDEX.getHistogramString()).getValue(), 2.0); + Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_UNALIGNED_MATE_START.getHistogramString()).getValue(), 1.0); } @Test(dataProvider = "missingMateTestCases") @@ -232,6 +234,7 @@ public void testMappedRecords() throws IOException { Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_CIGAR.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.INVALID_FLAG_READ_UNMAPPED.getHistogramString()).getValue(), 1.0); Assert.assertEquals(results.get(SAMValidationError.Type.MISSING_TAG_NM.getHistogramString()).getValue(), 1.0); + Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_CIGAR_SEQ_LENGTH.getHistogramString()).getValue(), 1.0); } @Test @@ -300,11 +303,10 @@ public void testMateCigarScenarios(final String scenario, final String inputFile throws Exception { final SamReader reader = SamReaderFactory.makeDefault().open(new File(TEST_DATA_DIR, inputFile)); final Histogram results = executeValidation(reader, null, IndexValidationStringency.EXHAUSTIVE); - Assert.assertNotNull(results.get(expectedError.getHistogramString())); - Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0); + Assert.assertNotNull(results.get(expectedError.getHistogramString()), scenario); + Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0, scenario); } - @DataProvider(name = "testMateCigarScenarios") public Object[][] testMateCigarScenarios() { return new Object[][]{ @@ -318,8 +320,8 @@ public void testTruncated(final String scenario, final String inputFile, final S throws Exception { final SamReader reader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(new File(TEST_DATA_DIR, inputFile)); final Histogram results = executeValidation(reader, null, IndexValidationStringency.EXHAUSTIVE); - Assert.assertNotNull(results.get(expectedError.getHistogramString())); - Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0); + Assert.assertNotNull(results.get(expectedError.getHistogramString()), scenario); + Assert.assertEquals(results.get(expectedError.getHistogramString()).getValue(), 1.0, scenario); } @DataProvider(name = "testTruncatedScenarios") @@ -400,9 +402,20 @@ public void testRedundantTags() throws Exception { public void testHeaderValidation() throws Exception { final SamReader samReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT) .open(new File(TEST_DATA_DIR, "buggyHeader.sam")); - final Histogram results = executeValidation(samReader, null, IndexValidationStringency.EXHAUSTIVE); + final File referenceFile = new File(TEST_DATA_DIR, "../hg19mini.fasta"); + final ReferenceSequenceFile reference = new FastaSequenceFile(referenceFile, false); + final Histogram results = executeValidation(samReader, reference, IndexValidationStringency.EXHAUSTIVE); Assert.assertEquals(results.get(SAMValidationError.Type.UNRECOGNIZED_HEADER_TYPE.getHistogramString()).getValue(), 3.0); Assert.assertEquals(results.get(SAMValidationError.Type.HEADER_TAG_MULTIPLY_DEFINED.getHistogramString()).getValue(), 1.0); + Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_FILE_SEQ_DICT.getHistogramString()).getValue(), 1.0); + } + + @Test + public void testSeqQualMismatch() throws Exception { + final SamReader samReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT) + .open(new File(TEST_DATA_DIR, "seq_qual_len_mismatch.sam")); + final Histogram results = executeValidation(samReader, null, IndexValidationStringency.EXHAUSTIVE); + Assert.assertEquals(results.get(SAMValidationError.Type.MISMATCH_SEQ_QUAL_LENGTH.getHistogramString()).getValue(), 8.0); } @Test diff --git a/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java b/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java index a33766762..73859a46a 100644 --- a/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java +++ b/src/test/java/htsjdk/samtools/cram/lossy/QualityScorePreservationTest.java @@ -119,7 +119,7 @@ private SAMRecord buildSAMRecord(String seqName, String line) { @Test public void test3() { - String line1 = "98573 0 20 1 10 40M * 0 0 AAAAAAAAAA !!!!!!!!!!"; + String line1 = "98573 0 20 1 10 10M * 0 0 AAAAAAAAAA !!!!!!!!!!"; String seqName = "20"; byte[] ref = new byte[40]; diff --git a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java index e57b8fd08..6a115db9b 100644 --- a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java @@ -262,11 +262,11 @@ public void testCountInsertedAndDeletedBases(final String cigarString, final int @Test(dataProvider = "testKmerGenerationTestCases") public void testKmerGeneration(final int length, final String[] expectedKmers) { - final Set actualSet = new HashSet(); + final Set actualSet = new HashSet<>(); for (final byte[] kmer : SequenceUtil.generateAllKmers(length)) { actualSet.add(StringUtil.bytesToString(kmer)); } - final Set expectedSet = new HashSet(Arrays.asList(expectedKmers)); + final Set expectedSet = new HashSet<>(Arrays.asList(expectedKmers)); Assert.assertTrue(actualSet.equals(expectedSet)); } diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam index 82efe858e..335d8159c 100644 --- a/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam @@ -7,4 +7,4 @@ read1 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0 read2 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0 read3 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:0T2A0T2A0t2a0t2a0 read4 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA NM:i:0 -read5 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2 +read5 0 chr2 1 0 4M1D2M2S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2 diff --git a/src/test/resources/htsjdk/samtools/ValidateSamFileTest/seq_qual_len_mismatch.sam b/src/test/resources/htsjdk/samtools/ValidateSamFileTest/seq_qual_len_mismatch.sam new file mode 100644 index 000000000..3c689b135 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/ValidateSamFileTest/seq_qual_len_mismatch.sam @@ -0,0 +1,21 @@ +@HD VN:1.0 SO:coordinate +@SQ SN:chr1 LN:101 +@SQ SN:chr2 LN:101 +@SQ SN:chr3 LN:101 +@SQ SN:chr4 LN:101 +@SQ SN:chr5 LN:101 +@SQ SN:chr6 LN:101 +@SQ SN:chr7 LN:404 +@SQ SN:chr8 LN:202 +@RG ID:0 SM:Hi,Mom! LB:my-library PL:ILLUMINA +@RG ID:1 SM:Hi,Mom! LB:my-library PL:ILLUMINA +@RG ID:2 SM:Hi,Mom! LB:my-library PL:Illumina +@PG ID:1 PN:Hey! VN:2.0 +both_reads_align_clip_marked 1107 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:0 PG:Z:1 NM:i:0 MQ:i:255 XT:Z:foo OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:1 PG:Z:1 NM:i:3 MQ:i:255 XT:Z:foo OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:2 PG:Z:1 NM:i:8 MQ:i:255 XT:Z:foo2 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:1 PG:Z:1 NM:i:1 MQ:i:255 XT:Z:foo2 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/ RG:Z:1 PG:Z:1 NM:i:1 MQ:i:255 XT:Z:foo2 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1' RG:Z:0 PG:Z:1 NM:i:5 MQ:i:255 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1' RG:Z:2 PG:Z:1 NM:i:6 MQ:i:255 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 +both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1' RG:Z:1 PG:Z:1 From 7d31bc2a5c4f7ad79de7fa804898289d6fd556de Mon Sep 17 00:00:00 2001 From: Tim Fennell Date: Mon, 26 Jun 2017 16:14:08 -0400 Subject: [PATCH 12/23] Adding the gitter badge. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1981fd182..c2af30592 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.samtools/htsjdk/badge.svg)](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.github.samtools%22%20AND%20a%3A%22htsjdk%22) [![License](http://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/samtools/htsjdk) [![Language](http://img.shields.io/badge/language-java-brightgreen.svg)](https://www.java.com/) +[![Join the chat at https://gitter.im/samtools/htsjdk](https://badges.gitter.im/samtools/htsjdk.svg)](https://gitter.im/samtools/htsjdk?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) Status of downstream projects automatically built on top of the current htsjdk master branch. See [gatk-jenkins](https://gatk-jenkins.broadinstitute.org/view/HTSJDK%20Release%20Tests/) for detailed logs. Failure may indicate problems in htsjdk, but may also be due to expected incompatibilities between versions, or unrelated failures in downstream projects. - [Picard](https://github.com/broadinstitute/picard): [![Build Status](https://gatk-jenkins.broadinstitute.org/buildStatus/icon?job=picard-on-htsjdk-master)](https://gatk-jenkins.broadinstitute.org/job/picard-on-htsjdk-master/) From 1cd23dacecd6b837a54ba801d4f08a48006d3215 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Wed, 5 Jul 2017 19:51:35 -0400 Subject: [PATCH 13/23] Make stream seek error message informative (#927) --- .../samtools/util/BlockCompressedInputStream.java | 17 +++++++++++++++-- .../util/BlockCompressedOutputStreamTest.java | 20 +++++++++++++++----- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/BlockCompressedInputStream.java b/src/main/java/htsjdk/samtools/util/BlockCompressedInputStream.java index e108d1bb3..622ca67ac 100755 --- a/src/main/java/htsjdk/samtools/util/BlockCompressedInputStream.java +++ b/src/main/java/htsjdk/samtools/util/BlockCompressedInputStream.java @@ -57,10 +57,12 @@ public final static String INCORRECT_HEADER_SIZE_MSG = "Incorrect header size for file: "; public final static String UNEXPECTED_BLOCK_LENGTH_MSG = "Unexpected compressed block length: "; public final static String PREMATURE_END_MSG = "Premature end of file: "; - public final static String CANNOT_SEEK_STREAM_MSG = "Cannot seek on stream based file "; + public final static String CANNOT_SEEK_STREAM_MSG = "Cannot seek a position for a non-file stream"; + public final static String CANNOT_SEEK_CLOSED_STREAM_MSG = "Cannot seek a position for a closed stream"; public final static String INVALID_FILE_PTR_MSG = "Invalid file pointer: "; private InputStream mStream = null; + private boolean mIsClosed = false; private SeekableStream mFile = null; private byte[] mFileBuffer = null; private DecompressedBlock mCurrentBlock = null; @@ -222,6 +224,9 @@ public void close() throws IOException { // Encourage garbage collection mFileBuffer = null; mCurrentBlock = null; + + // Mark as closed + mIsClosed = true; } /** @@ -344,12 +349,20 @@ public int read(final byte[] buffer, int offset, int length) throws IOException * Seek to the given position in the file. Note that pos is a special virtual file pointer, * not an actual byte offset. * - * @param pos virtual file pointer + * @param pos virtual file pointer position + * @throws IOException if stream is closed or not a file based stream */ public void seek(final long pos) throws IOException { + // Must be before the mFile == null check because mFile == null for closed files and streams + if (mIsClosed) { + throw new IOException(CANNOT_SEEK_CLOSED_STREAM_MSG); + } + + // Cannot seek on streams that are not file based if (mFile == null) { throw new IOException(CANNOT_SEEK_STREAM_MSG); } + // Decode virtual file pointer // Upper 48 bits is the byte offset into the compressed stream of a // block. diff --git a/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java b/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java index a678c8dca..35175cd1d 100644 --- a/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java +++ b/src/test/java/htsjdk/samtools/util/BlockCompressedOutputStreamTest.java @@ -81,6 +81,7 @@ public void testBasic() throws Exception { Assert.assertEquals(bcis2.read(buffer), available, "Should read to end of block"); Assert.assertTrue(bcis2.endOfBlock(), "Should be at end of block"); bcis2.close(); + Assert.assertEquals(bcis2.read(buffer), -1, "Should be end of file"); } @DataProvider(name = "seekReadExceptionsData") @@ -89,24 +90,32 @@ public void testBasic() throws Exception { return new Object[][]{ {HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.truncated.gz", FileTruncatedException.class, BlockCompressedInputStream.PREMATURE_END_MSG + System.getProperty("user.dir") + "/" + - HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.truncated.gz", true, false, 0}, + HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.truncated.gz", true, false, false, 0}, {HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.truncated.hdr.gz", IOException.class, BlockCompressedInputStream.INCORRECT_HEADER_SIZE_MSG + System.getProperty("user.dir") + "/" + - HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.truncated.hdr.gz", true, false, 0}, + HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.truncated.hdr.gz", true, false, false, 0}, {HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.gz", IOException.class, - BlockCompressedInputStream.CANNOT_SEEK_STREAM_MSG, false, true, 0}, + BlockCompressedInputStream.CANNOT_SEEK_STREAM_MSG, false, true, false, 0}, + {HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.gz", IOException.class, + BlockCompressedInputStream.CANNOT_SEEK_CLOSED_STREAM_MSG, false, true, true, 0}, {HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.gz", IOException.class, BlockCompressedInputStream.INVALID_FILE_PTR_MSG + 1000 + " for " + System.getProperty("user.dir") + "/" + - HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.gz", true, true, 1000 } + HTSJDK_TRIBBLE_RESOURCES + "vcfexample.vcf.gz", true, true, false, 1000 } }; } @Test(dataProvider = "seekReadExceptionsData") - public void testSeekReadExceptions(final String filePath, final Class c, final String msg, final boolean isFile, final boolean isSeek, final int pos) throws Exception { + public void testSeekReadExceptions(final String filePath, final Class c, final String msg, final boolean isFile, final boolean isSeek, final boolean isClosed, + final int pos) throws Exception { final BlockCompressedInputStream bcis = isFile ? new BlockCompressedInputStream(new File(filePath)) : new BlockCompressedInputStream(new FileInputStream(filePath)); + + if ( isClosed ) { + bcis.close(); + } + boolean haveException = false; try { if ( isSeek ) { @@ -212,5 +221,6 @@ public Deflater makeDeflater(final int compressionLevel, final boolean gzipCompa } bcis.close(); Assert.assertEquals(deflateCalls[0], 3, "deflate calls"); + Assert.assertEquals(reader.readLine(), null); } } From 8d207ae878f9f5136cb6e6ae628659eddef047d6 Mon Sep 17 00:00:00 2001 From: Tim Fennell Date: Wed, 12 Jul 2017 08:14:04 -0400 Subject: [PATCH 14/23] Updates to readme about documentation, communication & license. (#916) --- README.md | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index c2af30592..afe901e05 100644 --- a/README.md +++ b/README.md @@ -16,11 +16,19 @@ common file formats, such as [SAM][1] and [VCF][2], used for high-throughput sequencing data. There are also an number of useful utilities for manipulating HTS data. -Please see the [HTSJDK Documentation](http://samtools.github.io/htsjdk) for more information. - > **NOTE: _HTSJDK does not currently support the latest Variant Call Format Specification (VCFv4.3 and BCFv2.2)._** -#### Building HTSJDK +### Documentation & Getting Help + +API documentation for all versions of HTSJDK since `1.128` are available through [javadoc.io](http://www.javadoc.io/doc/com.github.samtools/htsjdk). + +If you believe you have found a bug or have an issue with the library please a) search the open and recently closed issues to ensure it has not already been reported, then b) log an issue. + +The project has a [gitter chat room](https://gitter.im/samtools/htsjdk) if you would like to chat with the developers and others involved in the project. + +To receive announcements of releases and other significant project news please subscribe to the [htsjdk-announce](https://groups.google.com/forum/#!forum/htsjdk-announce) google group. + +### Building HTSJDK HTSJDK is now built using [gradle](http://gradle.org/). @@ -74,7 +82,7 @@ Example gradle usage from the htsjdk root directory: ./gradlew tasks ``` -#### Create an HTSJDK project in IntelliJ +### Create an HTSJDK project in IntelliJ To create a project in IntelliJ IDE for htsjdk do the following: 1. Select fom the menu: `File -> New -> Project from Existing Sources` @@ -83,13 +91,17 @@ To create a project in IntelliJ IDE for htsjdk do the following: From time to time if dependencies change in htsjdk you may need to refresh the project from the `View -> Gradle` menu. -#### Licensing Information +### Licensing Information -Not all sub-packages of htsjdk are subject to the same license, so a license notice is included in each source file or sub-package as appropriate. Please check the relevant license notice whenever you start working with a part of htsjdk that you have not previously worked with to avoid any surprises. +Not all sub-packages of htsjdk are subject to the same license, so a license notice is included in each source file or sub-package as appropriate. +Please check the relevant license notice whenever you start working with a part of htsjdk that you have not previously worked with to avoid any surprises. +Broadly speaking the majority of the code is covered under the MIT license with the following notable exceptions: -#### Java Minimum Version Support Policy +* Much of the CRAM code is under the Apache License, Version 2 +* Core `tribble` code (underlying VCF reading/writing amongst other things) is under LGPL +* Code supporting the reading/writing of SRA format is uncopyrighted & public domain -> **NOTE: _Effective November 24th 2015, HTSJDK has ended support of Java 7 and previous versions. Java 8 is now required_.** +### Java Minimum Version Support Policy We will support all Java SE versions supported by Oracle until at least six months after Oracle's Public Updates period has ended ([see this link](http://www.oracle.com/technetwork/java/eol-135779.html)). @@ -97,9 +109,8 @@ Java SE Major Release | End of Java SE Oracle Public Updates | Proposed End of S ---- | ---- | ---- | ---- 6 | Feb 2013 | Aug 2013 | Oct 2015 7 | Apr 2015 | Oct 2015 | Oct 2015 -8* | Mar 2017 | Sep 2017 | Sep 2017 +8 | Jul 2018 | Jul 2018 | TBD -* to be finalized HTSJDK is migrating to semantic versioning (http://semver.org/). We will eventually adhere to it strictly and bump our major version whenever there are breaking changes to our API, but until we more clearly define what constitutes our official API, clients should assume that every release potentially contains at least minor changes to public methods. From 624a3bf314575769a9e50c04d43bd7a41fd8a81d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20G=C3=B3mez-S=C3=A1nchez?= Date: Wed, 19 Jul 2017 16:41:02 +0200 Subject: [PATCH 15/23] Add missing HtsjdkTest marker to test classes (#936) Some tests were being skipped because they weren't marked as HtsjdkTest --- src/test/java/htsjdk/samtools/BAMFileSpanTest.java | 4 +++- src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java | 3 ++- src/test/java/htsjdk/samtools/PathInputResourceTest.java | 4 +++- src/test/java/htsjdk/samtools/SAMProgramRecordTest.java | 3 ++- src/test/java/htsjdk/samtools/SAMReadGroupRecordTest.java | 3 ++- src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java | 3 ++- src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java | 3 ++- src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java | 3 ++- 8 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/test/java/htsjdk/samtools/BAMFileSpanTest.java b/src/test/java/htsjdk/samtools/BAMFileSpanTest.java index 4fc39b294..06d1bc9ab 100644 --- a/src/test/java/htsjdk/samtools/BAMFileSpanTest.java +++ b/src/test/java/htsjdk/samtools/BAMFileSpanTest.java @@ -1,11 +1,13 @@ package htsjdk.samtools; import java.util.Arrays; + +import htsjdk.HtsjdkTest; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -public class BAMFileSpanTest { +public class BAMFileSpanTest extends HtsjdkTest { @Test(dataProvider = "testRemoveContentsBeforeProvider") public void testRemoveContentsBefore(BAMFileSpan originalSpan, BAMFileSpan cutoff, BAMFileSpan expectedSpan) { diff --git a/src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java b/src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java index 5943535af..d86b697a5 100644 --- a/src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java +++ b/src/test/java/htsjdk/samtools/DuplicateScoringStrategyTest.java @@ -1,10 +1,11 @@ package htsjdk.samtools; +import htsjdk.HtsjdkTest; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -public class DuplicateScoringStrategyTest { +public class DuplicateScoringStrategyTest extends HtsjdkTest { @DataProvider public Object [][] compareData() { diff --git a/src/test/java/htsjdk/samtools/PathInputResourceTest.java b/src/test/java/htsjdk/samtools/PathInputResourceTest.java index 1bc2aa161..f82b9a627 100644 --- a/src/test/java/htsjdk/samtools/PathInputResourceTest.java +++ b/src/test/java/htsjdk/samtools/PathInputResourceTest.java @@ -5,10 +5,12 @@ import java.nio.file.Paths; import java.util.HashMap; import java.util.function.Function; + +import htsjdk.HtsjdkTest; import org.testng.Assert; import org.testng.annotations.Test; -public class PathInputResourceTest { +public class PathInputResourceTest extends HtsjdkTest { final String localBam = "src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam"; @Test diff --git a/src/test/java/htsjdk/samtools/SAMProgramRecordTest.java b/src/test/java/htsjdk/samtools/SAMProgramRecordTest.java index 5d9a36893..99a26cc38 100644 --- a/src/test/java/htsjdk/samtools/SAMProgramRecordTest.java +++ b/src/test/java/htsjdk/samtools/SAMProgramRecordTest.java @@ -23,13 +23,14 @@ */ package htsjdk.samtools; +import htsjdk.HtsjdkTest; import org.testng.Assert; import org.testng.annotations.Test; /** * Test for SAMReadGroupRecordTest */ -public class SAMProgramRecordTest { +public class SAMProgramRecordTest extends HtsjdkTest { @Test public void testGetSAMString() { diff --git a/src/test/java/htsjdk/samtools/SAMReadGroupRecordTest.java b/src/test/java/htsjdk/samtools/SAMReadGroupRecordTest.java index c3a7423d3..0801f52a5 100644 --- a/src/test/java/htsjdk/samtools/SAMReadGroupRecordTest.java +++ b/src/test/java/htsjdk/samtools/SAMReadGroupRecordTest.java @@ -23,13 +23,14 @@ */ package htsjdk.samtools; +import htsjdk.HtsjdkTest; import org.testng.Assert; import org.testng.annotations.Test; /** * Test for SAMReadGroupRecordTest */ -public class SAMReadGroupRecordTest { +public class SAMReadGroupRecordTest extends HtsjdkTest { @Test public void testGetSAMString() { diff --git a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java index e0c73d502..89e6121d2 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java @@ -23,6 +23,7 @@ */ package htsjdk.samtools; +import htsjdk.HtsjdkTest; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -32,7 +33,7 @@ /** * Test for SAMReadGroupRecordTest */ -public class SAMSequenceRecordTest { +public class SAMSequenceRecordTest extends HtsjdkTest { @Test public void testGetSAMString() { diff --git a/src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java b/src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java index 72e59cff7..c367397a3 100644 --- a/src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java +++ b/src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java @@ -23,6 +23,7 @@ */ package htsjdk.samtools.fastq; +import htsjdk.HtsjdkTest; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordSetBuilder; import org.testng.Assert; @@ -31,7 +32,7 @@ /** * @author Daniel Gomez-Sanchez (magicDGS) */ -public class FastqEncoderTest { +public class FastqEncoderTest extends HtsjdkTest { @Test public void testAsFastqRecord() throws Exception { diff --git a/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java b/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java index 54cfdedfc..d9b3bf84a 100644 --- a/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java +++ b/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java @@ -1,11 +1,12 @@ package htsjdk.samtools.util; +import htsjdk.HtsjdkTest; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import org.testng.annotations.Test; -public class CigarElementUnitTest { +public class CigarElementUnitTest extends HtsjdkTest { @Test(expectedExceptions = IllegalArgumentException.class) public void testNegativeLengthCheck(){ From cfbbe4b8e0e5ad4f011d6aacec2640eab8ff49a0 Mon Sep 17 00:00:00 2001 From: Vadim Zalunin Date: Thu, 20 Jul 2017 13:21:07 +0100 Subject: [PATCH 16/23] fix for ambiguity and lc match/mismatch bases in CRAM (#889) --- .../htsjdk/samtools/CRAMContainerStreamWriter.java | 9 +- .../htsjdk/samtools/cram/build/CramNormalizer.java | 11 +- .../samtools/cram/build/Sam2CramRecordFactory.java | 136 +++++++-------------- .../cram/encoding/readfeatures/Substitution.java | 15 ++- .../htsjdk/samtools/cram/ref/ReferenceSource.java | 9 +- .../java/htsjdk/samtools/util/SequenceUtil.java | 54 +++++++- .../java/htsjdk/samtools/BAMFileWriterTest.java | 39 ++++++ .../java/htsjdk/samtools/CRAMComplianceTest.java | 13 +- .../java/htsjdk/samtools/CRAMSliceMD5Test.java | 136 +++++++++++++++++++++ .../cram/build/Sam2CramRecordFactoryTest.java | 109 +++++++++++++++++ .../samtools/cram/ref/ReferenceSourceTest.java | 33 +++++ .../htsjdk/samtools/util/SequenceUtilTest.java | 57 +++++++++ .../htsjdk/samtools/cram/amb#amb.2.1.cram | Bin 0 -> 11045 bytes .../htsjdk/samtools/cram/amb#amb.3.0.cram | Bin 0 -> 1210 bytes .../resources/htsjdk/samtools/cram/amb#amb.sam | 57 +++++++++ src/test/resources/htsjdk/samtools/cram/amb.fa | 2 + src/test/resources/htsjdk/samtools/cram/amb.fa.fai | 1 + .../htsjdk/samtools/cram/ambiguityCodes.fasta | 2 + .../htsjdk/samtools/cram/ambiguityCodes.fasta.fai | 1 + .../samtoolsSliceMD5WithAmbiguityCodesTest.cram | Bin 0 -> 1826 bytes 20 files changed, 577 insertions(+), 107 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java create mode 100644 src/test/java/htsjdk/samtools/cram/build/Sam2CramRecordFactoryTest.java create mode 100644 src/test/java/htsjdk/samtools/cram/ref/ReferenceSourceTest.java create mode 100644 src/test/resources/htsjdk/samtools/cram/amb#amb.2.1.cram create mode 100644 src/test/resources/htsjdk/samtools/cram/amb#amb.3.0.cram create mode 100644 src/test/resources/htsjdk/samtools/cram/amb#amb.sam create mode 100644 src/test/resources/htsjdk/samtools/cram/amb.fa create mode 100644 src/test/resources/htsjdk/samtools/cram/amb.fa.fai create mode 100644 src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta create mode 100644 src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai create mode 100644 src/test/resources/htsjdk/samtools/cram/samtoolsSliceMD5WithAmbiguityCodesTest.cram diff --git a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java index 4707b7bcc..c588bdb46 100644 --- a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java +++ b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java @@ -17,6 +17,7 @@ import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.samtools.util.SequenceUtil; import java.io.IOException; import java.io.OutputStream; @@ -437,7 +438,13 @@ protected void flushContainer() throws IllegalArgumentException, IllegalAccessEx final SAMRecord restoredSamRecord = f.create(cramRecords.get(i)); assert (restoredSamRecord.getAlignmentStart() == samRecords.get(i).getAlignmentStart()); assert (restoredSamRecord.getReferenceName().equals(samRecords.get(i).getReferenceName())); - assert (restoredSamRecord.getReadString().equals(samRecords.get(i).getReadString())); + + if (!restoredSamRecord.getReadString().equals(samRecords.get(i).getReadString())) { + // try to fix the original read bases by normalizing them to BAM set: + final byte[] originalReadBases = samRecords.get(i).getReadString().getBytes(); + final String originalReadBasesUpperCaseIupacNoDot = new String(SequenceUtil.toBamReadBasesInPlace(originalReadBases)); + assert (restoredSamRecord.getReadString().equals(originalReadBasesUpperCaseIupacNoDot)); + } assert (restoredSamRecord.getBaseQualityString().equals(samRecords.get(i).getBaseQualityString())); } } diff --git a/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java b/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java index 1be1aa546..b2dd67c44 100644 --- a/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java +++ b/src/main/java/htsjdk/samtools/cram/build/CramNormalizer.java @@ -32,6 +32,7 @@ import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.SubstitutionMatrix; import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.SequenceUtil; import java.util.ArrayList; import java.util.Arrays; @@ -242,7 +243,8 @@ public static void restoreQualityScores(final byte defaultQualityScore, } else System.arraycopy(ref, alignmentStart - refOffsetZeroBased, bases, 0, bases.length); - return bases; + + return SequenceUtil.toBamReadBasesInPlace(bases); } final List variations = record.readFeatures; for (final ReadFeature variation : variations) { @@ -256,6 +258,7 @@ public static void restoreQualityScores(final byte defaultQualityScore, final Substitution substitution = (Substitution) variation; byte refBase = getByteOrDefault(ref, alignmentStart + posInSeq - refOffsetZeroBased, (byte) 'N'); + // substitution requires ACGTN only: refBase = Utils.normalizeBase(refBase); final byte base = substitutionMatrix.base(refBase, substitution.getCode()); substitution.setBase(base); @@ -304,11 +307,7 @@ public static void restoreQualityScores(final byte defaultQualityScore, } } - for (int i = 0; i < bases.length; i++) { - bases[i] = Utils.normalizeBase(bases[i]); - } - - return bases; + return SequenceUtil.toBamReadBasesInPlace(bases); } private static byte getByteOrDefault(final byte[] array, final int pos, diff --git a/src/main/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java b/src/main/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java index b7ffcb13f..6c59e13fd 100644 --- a/src/main/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java +++ b/src/main/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java @@ -17,50 +17,21 @@ */ package htsjdk.samtools.cram.build; -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.*; import htsjdk.samtools.SAMRecord.SAMTagAndValue; -import htsjdk.samtools.SAMTag; import htsjdk.samtools.cram.common.CramVersions; import htsjdk.samtools.cram.common.Version; -import htsjdk.samtools.cram.encoding.readfeatures.BaseQualityScore; -import htsjdk.samtools.cram.encoding.readfeatures.Deletion; -import htsjdk.samtools.cram.encoding.readfeatures.HardClip; -import htsjdk.samtools.cram.encoding.readfeatures.InsertBase; -import htsjdk.samtools.cram.encoding.readfeatures.Padding; -import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature; -import htsjdk.samtools.cram.encoding.readfeatures.RefSkip; -import htsjdk.samtools.cram.encoding.readfeatures.SoftClip; -import htsjdk.samtools.cram.encoding.readfeatures.Substitution; +import htsjdk.samtools.cram.encoding.readfeatures.*; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.ReadTag; import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.SequenceUtil; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; +import java.util.*; public class Sam2CramRecordFactory { - - public static final String UNKNOWN_READ_GROUP_ID = "UNKNOWN"; - public static final String UNKNOWN_READ_GROUP_SAMPLE = "UNKNOWN"; - - private final static byte QS_asciiOffset = 33; - public final static byte unsetQualityScore = 32; - public final static byte ignorePositionsWithQualityScore = -1; - private byte[] refBases; private final Version version; - private byte[] refSNPs; final private SAMFileHeader header; @@ -68,9 +39,6 @@ private final Map readGroupMap = new HashMap(); - private long landedRefMaskScores = 0; - private long landedTotalScores = 0; - public boolean captureAllTags = false; public boolean preserveReadNames = false; public final Set captureTags = new TreeSet(); @@ -151,8 +119,14 @@ public CramCompressionRecord createCramRecord(final SAMRecord record) { } else cramRecord.readFeatures = Collections.emptyList(); cramRecord.readBases = record.getReadBases(); + + /** + * CRAM read bases are limited to ACGTN, see https://github.com/samtools/hts-specs/blob/master/CRAMv3.pdf passage 10.2 on read bases. + * However, BAM format allows upper case IUPAC codes without a dot, so we follow the same approach to reproduce the behaviour of samtools. + */ + // copy read bases to avoid changing the original record: + cramRecord.readBases = SequenceUtil.toBamReadBasesInPlace(Arrays.copyOf(record.getReadBases(), record.getReadLength())); cramRecord.qualityScores = record.getBaseQualities(); - landedTotalScores += cramRecord.readLength; if (version.compatibleWith(CramVersions.CRAM_v3)) cramRecord.setUnknownBases(record.getReadBases() == SAMRecord.NULL_SEQUENCE); @@ -187,7 +161,7 @@ public CramCompressionRecord createCramRecord(final SAMRecord record) { * A wrapper method to provide better diagnostics for ArrayIndexOutOfBoundsException. * * @param cramRecord CRAM record - * @param samRecord SAM record + * @param samRecord SAM record * @return a list of read features created for the given {@link htsjdk.samtools.SAMRecord} */ private List checkedCreateVariations(final CramCompressionRecord cramRecord, final SAMRecord samRecord) { @@ -247,7 +221,7 @@ public CramCompressionRecord createCramRecord(final SAMRecord record) { case M: case X: case EQ: - addSubstitutionsAndMaskedBases(cramRecord, features, zeroBasedPositionInRead, alignmentStartOffset, + addMismatchReadFeatures(cramRecord.alignmentStart, features, zeroBasedPositionInRead, alignmentStartOffset, cigarElementLength, bases, qualityScore); break; default: @@ -291,57 +265,47 @@ private void addInsertion(final List features, final int zeroBasedP } } - private void addSubstitutionsAndMaskedBases(final CramCompressionRecord cramRecord, final List features, final int fromPosInRead, final int + /** + * Processes a stretch of read bases marked as match or mismatch and emits appropriate read features. + * Briefly the algorithm is: + *

  • emit nothing for a read base matching corresponding reference base.
  • + *
  • emit a {@link Substitution} read feature for each ACTGN-ACTGN mismatch.
  • + *
  • emit {@link ReadBase} for a non-ACTGN mismatch. The side effect is the quality score stored twice.
  • + *

    + * IMPORTANT: reference and read bases are always compared for match/mismatch in upper case due to BAM limitations. + * + * @param alignmentStart CRAM record alignment start + * @param features a list of read features to add to + * @param fromPosInRead a zero based position in the read to start with + * @param alignmentStartOffset offset into the reference array + * @param nofReadBases how many read bases to process + * @param bases the read bases array + * @param qualityScore the quality score array + */ + void addMismatchReadFeatures(final int alignmentStart, final List features, final int fromPosInRead, final int alignmentStartOffset, final int nofReadBases, final byte[] bases, final byte[] qualityScore) { - int oneBasedPositionInRead; - final boolean noQS = (qualityScore.length == 0); + int oneBasedPositionInRead = fromPosInRead + 1; + int refIndex = alignmentStart + alignmentStartOffset - 1; - int i; - boolean qualityAdded; byte refBase; - for (i = 0; i < nofReadBases; i++) { - oneBasedPositionInRead = i + fromPosInRead + 1; - final int referenceCoordinates = cramRecord.alignmentStart + i + alignmentStartOffset - 1; - qualityAdded = false; - if (referenceCoordinates >= refBases.length) refBase = 'N'; - else refBase = refBases[referenceCoordinates]; - refBase = Utils.normalizeBase(refBase); - - if (bases[i + fromPosInRead] != refBase) { - final Substitution substitution = new Substitution(); - substitution.setPosition(oneBasedPositionInRead); - substitution.setBase(bases[i + fromPosInRead]); - substitution.setReferenceBase(refBase); - - features.add(substitution); - - if (noQS) continue; - } - - if (noQS) continue; - - if (refSNPs != null) { - final byte snpOrNot = refSNPs[referenceCoordinates]; - if (snpOrNot != 0) { - final byte score = (byte) (QS_asciiOffset + qualityScore[i + fromPosInRead]); - features.add(new BaseQualityScore(oneBasedPositionInRead, score)); - qualityAdded = true; - landedRefMaskScores++; + for (int i = 0; i < nofReadBases; i++, oneBasedPositionInRead++, refIndex++) { + if (refIndex >= refBases.length) refBase = 'N'; + else refBase = refBases[refIndex]; + + final byte readBase = bases[i + fromPosInRead]; + + if (readBase != refBase) { + final boolean isSubstitution = SequenceUtil.isUpperACGTN(readBase) && SequenceUtil.isUpperACGTN(refBase); + if (isSubstitution) { + features.add(new Substitution(oneBasedPositionInRead, readBase, refBase)); + } else { + final byte score = qualityScore[i + fromPosInRead]; + features.add(new ReadBase(oneBasedPositionInRead, readBase, score)); } } - - if (qualityAdded) landedTotalScores++; } } - public long getLandedRefMaskScores() { - return landedRefMaskScores; - } - - public long getLandedTotalScores() { - return landedTotalScores; - } - public byte[] getRefBases() { return refBases; } @@ -350,14 +314,6 @@ public void setRefBases(final byte[] refBases) { this.refBases = refBases; } - public byte[] getRefSNPs() { - return refSNPs; - } - - public void setRefSNPs(final byte[] refSNPs) { - this.refSNPs = refSNPs; - } - public Map getReadGroupMap() { return readGroupMap; } diff --git a/src/main/java/htsjdk/samtools/cram/encoding/readfeatures/Substitution.java b/src/main/java/htsjdk/samtools/cram/encoding/readfeatures/Substitution.java index bc84b5aa8..1747c4474 100644 --- a/src/main/java/htsjdk/samtools/cram/encoding/readfeatures/Substitution.java +++ b/src/main/java/htsjdk/samtools/cram/encoding/readfeatures/Substitution.java @@ -22,6 +22,8 @@ /** * A substitution event captured in read coordinates. It is characterized by position in read, read base and reference base. * The class is also responsible for converting combinations of read base and reference base into a byte value (code). + * + * Both reference and read bases must be ACGTN only. */ public class Substitution implements Serializable, ReadFeature { public static final int NO_CODE = -1; @@ -31,11 +33,11 @@ */ private int position; /** - * The read base (ACGTN) + * The read base, allowed values are ACGTN. */ private byte base = -1; /** - * The reference sequence base matching the position of this substitution. + * The reference sequence base matching the position of this substitution, allowed values are ACGTN. */ private byte referenceBase = -1; /** @@ -43,6 +45,15 @@ */ private byte code = NO_CODE; + public Substitution() { + } + + public Substitution(int position, byte base, byte referenceBase) { + this.position = position; + this.base = base; + this.referenceBase = referenceBase; + } + public byte getCode() { return code; } diff --git a/src/main/java/htsjdk/samtools/cram/ref/ReferenceSource.java b/src/main/java/htsjdk/samtools/cram/ref/ReferenceSource.java index b162c9412..3edcf5dd3 100644 --- a/src/main/java/htsjdk/samtools/cram/ref/ReferenceSource.java +++ b/src/main/java/htsjdk/samtools/cram/ref/ReferenceSource.java @@ -20,6 +20,7 @@ import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.SAMUtils; import htsjdk.samtools.cram.build.Utils; import htsjdk.samtools.cram.io.InputStreamUtils; import htsjdk.samtools.reference.ReferenceSequence; @@ -27,6 +28,7 @@ import htsjdk.samtools.reference.ReferenceSequenceFileFactory; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.SequenceUtil; +import htsjdk.samtools.util.StringUtil; import java.io.File; import java.io.IOException; @@ -124,10 +126,13 @@ public void clearCache() { return null; } - // Upper case and normalize (-> ACGTN) in-place, and add to the cache + // Upper case (in-place), and add to the cache private byte[] addToCache(final String sequenceName, final byte[] bases) { + // Normalize to upper case only. We can't use the cram normalization utility Utils.normalizeBases, since + // we don't want to normalize ambiguity codes, we can't use SamUtils.normalizeBases, since we don't want + // to normalize no-call ('.') bases. for (int i = 0; i < bases.length; i++) { - bases[i] = Utils.normalizeBase(bases[i]); + bases[i] = StringUtil.toUpperCase(bases[i]); } cacheW.put(sequenceName, new WeakReference(bases)); return bases; diff --git a/src/main/java/htsjdk/samtools/util/SequenceUtil.java b/src/main/java/htsjdk/samtools/util/SequenceUtil.java index 7088217da..8e399c17f 100644 --- a/src/main/java/htsjdk/samtools/util/SequenceUtil.java +++ b/src/main/java/htsjdk/samtools/util/SequenceUtil.java @@ -50,6 +50,25 @@ public static final byte[] VALID_BASES_UPPER = new byte[]{A, C, G, T}; public static final byte[] VALID_BASES_LOWER = new byte[]{a, c, g, t}; + private static final byte[] ACGTN_BASES = new byte[]{A, C, G, T, N}; + private static final String IUPAC_CODES_STRING = ".aAbBcCdDgGhHkKmMnNrRsStTvVwWyY"; + /** + * A set of bases supported by BAM in reads, see http://samtools.github.io/hts-specs/SAMv1.pdf chapter 4.2 on 'seq' field. + * Effectively these are upper cased IUPAC codes with equals sign ('=') and without dot ('.'). + */ + private static final byte[] BAM_READ_BASE_SET = "=ABCDGHKMNRSTVWY".getBytes(); + + /** + * A lookup table to find a corresponding BAM read base. + */ + private static final byte[] bamReadBaseLookup = new byte[127]; + static { + Arrays.fill(bamReadBaseLookup, N); + for (final byte base: BAM_READ_BASE_SET) { + bamReadBaseLookup[base] = base; + bamReadBaseLookup[base + 32] = base; + } + } private static final byte A_MASK = 1; private static final byte C_MASK = 2; @@ -57,13 +76,13 @@ private static final byte T_MASK = 8; private static final byte[] bases = new byte[127]; - + private static final byte NON_IUPAC_CODE = 0; /* * Definition of IUPAC codes: * http://www.bioinformatics.org/sms2/iupac.html */ static { - Arrays.fill(bases, (byte) 0); + Arrays.fill(bases, NON_IUPAC_CODE); bases[A] = A_MASK; bases[C] = C_MASK; bases[G] = G_MASK; @@ -142,7 +161,24 @@ private static boolean isValidBase(final byte b, final byte[] validBases) { return false; } - /** Calculates the fraction of bases that are G/C in the sequence. */ + /** + * Check if the given base is one of upper case ACGTN */ + public static boolean isUpperACGTN(final byte base) { + return isValidBase(base, ACGTN_BASES); + } + + + /** Returns all IUPAC codes as a string */ + public static String getIUPACCodesString() { + return IUPAC_CODES_STRING; + } + + /** Checks if the given base is a IUPAC code */ + public static boolean isIUPAC(final byte base) { + return bases[base] != NON_IUPAC_CODE; + } + + /** Calculates the fraction of bases that are G/C in the sequence */ public static double calculateGc(final byte[] bases) { int gcs = 0; for (int i = 0; i < bases.length; ++i) { @@ -153,6 +189,18 @@ public static double calculateGc(final byte[] bases) { return gcs / (double) bases.length; } + /** Check if the given base belongs to BAM read base set '=ABCDGHKMNRSTVWY' */ + public static boolean isBamReadBase(final byte base) { + return isValidBase(base, BAM_READ_BASE_SET); + } + + /** Update and return the given array of bases by upper casing and then replacing all non-BAM read bases with N */ + public static byte[] toBamReadBasesInPlace(final byte[] bases) { + for (int i = 0; i < bases.length; i++) + bases[i] = bamReadBaseLookup[bases[i]]; + return bases; + } + /** * default signature that forces the lists to be the same size * diff --git a/src/test/java/htsjdk/samtools/BAMFileWriterTest.java b/src/test/java/htsjdk/samtools/BAMFileWriterTest.java index b228d76d8..3bb46e6f7 100644 --- a/src/test/java/htsjdk/samtools/BAMFileWriterTest.java +++ b/src/test/java/htsjdk/samtools/BAMFileWriterTest.java @@ -26,11 +26,15 @@ import htsjdk.HtsjdkTest; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; +import htsjdk.samtools.util.SequenceUtil; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; +import java.io.IOException; /** * Test that BAM writing doesn't blow up. For presorted writing, the resulting BAM file is read and contents are @@ -190,4 +194,39 @@ public void testNegativePresorted() throws Exception { testHelper(getRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate), SAMFileHeader.SortOrder.queryname, true); Assert.fail("Exception should be thrown"); } + + + /** + * A test to check that BAM changes read bases according with {@link SequenceUtil#toBamReadBasesInPlace}. + */ + @Test + public void testBAMReadBases() throws IOException { + final SAMFileHeader header = new SAMFileHeader(); + header.addSequence(new SAMSequenceRecord("1", SequenceUtil.getIUPACCodesString().length())); + header.addReadGroup(new SAMReadGroupRecord("rg1")); + + final SAMRecord originalSAMRecord = new SAMRecord(header); + originalSAMRecord.setReadName("test"); + originalSAMRecord.setReferenceIndex(0); + originalSAMRecord.setAlignmentStart(1); + originalSAMRecord.setReadBases(SequenceUtil.getIUPACCodesString().getBytes()); + originalSAMRecord.setCigarString(originalSAMRecord.getReadLength() + "M"); + originalSAMRecord.setBaseQualities(SAMRecord.NULL_QUALS); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (final BAMFileWriter writer = new BAMFileWriter(baos, null)) { + writer.setHeader(header); + writer.addAlignment(originalSAMRecord); + } + + + final BAMFileReader reader = new BAMFileReader(new ByteArrayInputStream(baos.toByteArray()), null, true, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory()); + final CloseableIterator iterator = reader.getIterator(); + iterator.hasNext(); + final SAMRecord recordFromBAM = iterator.next(); + + Assert.assertNotEquals(recordFromBAM.getReadBases(), originalSAMRecord.getReadBases()); + Assert.assertEquals(recordFromBAM.getReadBases(), SequenceUtil.toBamReadBasesInPlace(originalSAMRecord.getReadBases())); + } + } diff --git a/src/test/java/htsjdk/samtools/CRAMComplianceTest.java b/src/test/java/htsjdk/samtools/CRAMComplianceTest.java index 57167279d..5afeecd0c 100644 --- a/src/test/java/htsjdk/samtools/CRAMComplianceTest.java +++ b/src/test/java/htsjdk/samtools/CRAMComplianceTest.java @@ -7,15 +7,14 @@ import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.SequenceUtil; import org.testng.Assert; import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.*; -import java.security.DigestInputStream; -import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.List; @@ -35,6 +34,7 @@ @DataProvider(name = "partialVerification") public Object[][] getPartialVerificationData() { return new Object[][] { + {"amb#amb"}, {"auxf#values"}, // unsigned attributes: https://github.com/samtools/htsjdk/issues/499 {"c1#noseq"}, // unsigned attributes: https://github.com/samtools/htsjdk/issues/499 {"c1#unknown"}, // unsigned attributes: https://github.com/samtools/htsjdk/issues/499 @@ -170,7 +170,10 @@ private void assertSameRecordsPartial(Integer majorVersion, SAMRecord record1, S * https://github.com/samtools/htsjdk/issues/509 */ if (record1.getReadBases() != SAMRecord.NULL_SEQUENCE || majorVersion >= CramVersions.CRAM_v3.major) { - Assert.assertEquals(record2.getReadBases(), record1.getReadBases()); + // BAM and CRAM convert read bases to upper case IUPAC codes + final byte[] originalBases = record1.getReadBases(); + SequenceUtil.toBamReadBasesInPlace(originalBases); + Assert.assertEquals(record2.getReadBases(), originalBases); } Assert.assertEquals(record2.getBaseQualities(), record1.getBaseQualities()); @@ -181,6 +184,10 @@ private void assertSameRecordsPartial(Integer majorVersion, SAMRecord record1, S final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools/cram"); return new Object[][] { + // Test cram file created with samtools using a *reference* that contains ambiguity codes + // 'R' and 'M', a single no call '.', and some lower case bases. + {new File(TEST_DATA_DIR, "samtoolsSliceMD5WithAmbiguityCodesTest.cram"), + new File(TEST_DATA_DIR, "ambiguityCodes.fasta")}, {new File(TEST_DATA_DIR, "NA12878.20.21.1-100.100-SeqsPerSlice.0-unMapped.cram"), new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta")}, {new File(TEST_DATA_DIR, "NA12878.20.21.1-100.100-SeqsPerSlice.1-unMapped.cram"), diff --git a/src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java b/src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java new file mode 100644 index 000000000..40568c426 --- /dev/null +++ b/src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java @@ -0,0 +1,136 @@ +package htsjdk.samtools; + +import htsjdk.samtools.cram.CRAMException; +import htsjdk.samtools.cram.build.CramIO; +import htsjdk.samtools.cram.ref.CRAMReferenceSource; +import htsjdk.samtools.cram.ref.ReferenceSource; +import htsjdk.samtools.cram.structure.Container; +import htsjdk.samtools.cram.structure.ContainerIO; +import htsjdk.samtools.cram.structure.CramHeader; +import htsjdk.samtools.cram.structure.Slice; +import htsjdk.samtools.reference.InMemoryReferenceSequenceFile; +import htsjdk.samtools.util.SequenceUtil; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.util.Arrays; + +/** + * Created by vadim on 03/07/2017. + */ +public class CRAMSliceMD5Test { + + @Test + public void testSliceMD5() throws IOException { + final CramTestCase test = new CramTestCase(); + + // read the CRAM: + final ByteArrayInputStream bais = new ByteArrayInputStream(test.cramData); + final CramHeader cramHeader = CramIO.readCramHeader(bais); + final Container container = ContainerIO.readContainer(cramHeader.getVersion(), bais); + final Slice slice = container.slices[0]; + Assert.assertEquals(slice.alignmentStart, 1); + Assert.assertEquals(slice.alignmentSpan, test.referenceBases.length); + // check the slice MD5 is the MD5 of upper-cased ref bases: + final byte[] ucRefMD5 = SequenceUtil.calculateMD5(test.refBasesFromUCSource, 0, test.refBasesFromUCSource.length); + Assert.assertEquals(slice.refMD5, ucRefMD5); + + // check the CRAM file reads: + final CRAMFileReader reader = new CRAMFileReader(new ByteArrayInputStream(test.cramData), (File) null, test.referenceSourceUpperCased, ValidationStringency.STRICT); + final SAMRecordIterator iterator = reader.getIterator(); + Assert.assertTrue(iterator.hasNext()); + Assert.assertEquals(iterator.next(), test.record); + } + + @Test(expectedExceptions = CRAMException.class) + public void testExceptionWhileReadingWithWrongReference() throws IOException { + final CramTestCase test = new CramTestCase(); + + // try reading the CRAM file with the incorrect ref source that does not upper case bases: + final CRAMFileReader reader = new CRAMFileReader(new ByteArrayInputStream(test.cramData), (File) null, test.referenceSourceMixedCase, ValidationStringency.STRICT); + final SAMRecordIterator iterator = reader.getIterator(); + // expect an exception here due to slice MD5 mismatch: + iterator.hasNext(); + } + + + /** + * A test case to demonstrate the effect of upper casing of reference bases. + * The class contains some assertions in the constructor to stress out reference bases case expectations. + */ + private static class CramTestCase { + private final byte[] referenceBases; + private final byte[] referenceBasesUpperCased; + private final SAMFileHeader samFileHeader; + /** + * An invalid reference source that does not change bases: + */ + private final CRAMReferenceSource referenceSourceMixedCase; + private final InMemoryReferenceSequenceFile memoryReferenceSequenceFile; + /** + * A valid reference source that uppercases reference bases: + */ + private final ReferenceSource referenceSourceUpperCased; + private final byte[] refBasesFromUCSource; + private final byte[] refBasesFromMixedCaseSource; + private final SAMRecord record; + private final byte[] cramData; + + private CramTestCase() { + referenceBases = SequenceUtil.getIUPACCodesString().getBytes(); + referenceBasesUpperCased = SequenceUtil.upperCase(Arrays.copyOf(referenceBases, referenceBases.length)); + + samFileHeader = new SAMFileHeader(); + samFileHeader.addSequence(new SAMSequenceRecord("1", referenceBases.length)); + samFileHeader.addReadGroup(new SAMReadGroupRecord("rg1")); + + // this source does not change ref bases: + referenceSourceMixedCase = (sequenceRecord, tryNameVariants) -> referenceBases; + + memoryReferenceSequenceFile = new InMemoryReferenceSequenceFile(); + // copy ref bases to avoid the original from upper casing: + memoryReferenceSequenceFile.add("1", Arrays.copyOf(referenceBases, referenceBases.length)); + // this is the correct reference source, it upper cases ref bases: + referenceSourceUpperCased = new ReferenceSource(memoryReferenceSequenceFile); + + refBasesFromUCSource = referenceSourceUpperCased.getReferenceBases(samFileHeader.getSequence(0), true); + // check the ref bases from the source are upper cased indeed: + Assert.assertEquals(refBasesFromUCSource, referenceBasesUpperCased); + // check there is no lower case A: + Assert.assertTrue(!new String(refBasesFromUCSource).contains("a")); + + refBasesFromMixedCaseSource = referenceSourceMixedCase.getReferenceBases(samFileHeader.getSequence(0), true); + // check the mixed case source does not change ref base casing: + Assert.assertEquals(refBasesFromMixedCaseSource, referenceBases); + // check the mixed case source contains lower case bases: + Assert.assertTrue(new String(refBasesFromMixedCaseSource).contains("a")); + + final int readLen = referenceBases.length; + final byte[] bases = new byte[readLen]; + Arrays.fill(bases, (byte) 'A'); + final byte[] scores = new byte[readLen]; + Arrays.fill(scores, (byte) '!'); + + record = new SAMRecord(samFileHeader); + record.setReadName("test"); + record.setReferenceIndex(0); + record.setAlignmentStart(1); + record.setCigarString(readLen + "M"); + record.setReadBases(bases); + record.setBaseQualities(scores); + + // write a valid CRAM with a valid reference source: + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (final CRAMFileWriter writer = new CRAMFileWriter(baos, referenceSourceUpperCased, samFileHeader, "test")) { + writer.addAlignment(record); + } + cramData = baos.toByteArray(); + } + } + + +} \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/cram/build/Sam2CramRecordFactoryTest.java b/src/test/java/htsjdk/samtools/cram/build/Sam2CramRecordFactoryTest.java new file mode 100644 index 000000000..088f4f32f --- /dev/null +++ b/src/test/java/htsjdk/samtools/cram/build/Sam2CramRecordFactoryTest.java @@ -0,0 +1,109 @@ +package htsjdk.samtools.cram.build; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMUtils; +import htsjdk.samtools.cram.common.CramVersions; +import htsjdk.samtools.cram.encoding.readfeatures.ReadBase; +import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature; +import htsjdk.samtools.cram.encoding.readfeatures.Substitution; +import htsjdk.samtools.cram.structure.CramCompressionRecord; +import htsjdk.samtools.util.SequenceUtil; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created by vadim on 06/06/2017. + */ +public class Sam2CramRecordFactoryTest { + + /** + * This checks that all read bases returned in the record from {@link Sam2CramRecordFactory#createCramRecord(SAMRecord)} + * are from the BAM read base set. + */ + @Test + public void testReadBaseNormalization() { + final SAMFileHeader header = new SAMFileHeader(); + + final SAMRecord record = new SAMRecord(header); + record.setReadName("test"); + record.setReadUnmappedFlag(true); + record.setReadBases(SequenceUtil.getIUPACCodesString().getBytes()); + record.setBaseQualities(SAMRecord.NULL_QUALS); + + final Sam2CramRecordFactory sam2CramRecordFactory = new Sam2CramRecordFactory(null, header, CramVersions.CRAM_v3); + final CramCompressionRecord cramRecord = sam2CramRecordFactory.createCramRecord(record); + + Assert.assertNotEquals(cramRecord.readBases, record.getReadBases()); + Assert.assertEquals(cramRecord.readBases, SequenceUtil.toBamReadBasesInPlace(record.getReadBases())); + } + + @DataProvider(name = "emptyFeatureListProvider") + public Object[][] testPositive() { + return new Object[][]{ + // a matching base + {"A", "A", "!"}, + // a matching ambiguity base + {"R", "R", "!"}, + }; + } + + @Test(dataProvider = "emptyFeatureListProvider") + public void testAddMismatchReadFeaturesNoReadFeaturesForMatch(final String refBases, final String readBases, final String fastqScores) { + final List readFeatures = buildMatchOrMismatchReadFeatures(refBases, readBases, fastqScores); + Assert.assertTrue(readFeatures.isEmpty()); + } + + /** + * Test the outcome of a ACGTN mismatch. + * The result should always be a {@link Substitution} read feature. + */ + @Test + public void testAddMismatchReadFeaturesSingleSubstitution() { + final List readFeatures = buildMatchOrMismatchReadFeatures("A", "C", "!"); + + Assert.assertEquals(1, readFeatures.size()); + + final ReadFeature rf = readFeatures.get(0); + Assert.assertTrue(rf instanceof Substitution); + final Substitution substitution = (Substitution) rf; + Assert.assertEquals(1, substitution.getPosition()); + Assert.assertEquals('C', substitution.getBase()); + Assert.assertEquals('A', substitution.getReferenceBase()); + } + + /** + * Test the outcome of non-ACGTN ref and read bases mismatching each other. + * The result should be explicit read base and score capture via {@link ReadBase}. + */ + @Test + public void testAddMismatchReadFeaturesAmbiguityMismatch() { + final List readFeatures = buildMatchOrMismatchReadFeatures("R", "F", "1"); + Assert.assertEquals(1, readFeatures.size()); + + final ReadFeature rf = readFeatures.get(0); + Assert.assertTrue(rf instanceof ReadBase); + final ReadBase readBaseFeature = (ReadBase) rf; + Assert.assertEquals(1, readBaseFeature.getPosition()); + Assert.assertEquals('F', readBaseFeature.getBase()); + Assert.assertEquals(SAMUtils.fastqToPhred('1'), readBaseFeature.getQualityScore()); + } + + private List buildMatchOrMismatchReadFeatures(final String refBases, final String readBases, final String scores) { + final SAMFileHeader header = new SAMFileHeader(); + final CramCompressionRecord record = new CramCompressionRecord(); + record.alignmentStart = 1; + final List readFeatures = new ArrayList<>(); + final int fromPosInRead = 0; + final int alignmentStartOffset = 0; + final int nofReadBases = 1; + + final Sam2CramRecordFactory sam2CramRecordFactory = new Sam2CramRecordFactory(refBases.getBytes(), header, CramVersions.CRAM_v3); + sam2CramRecordFactory.addMismatchReadFeatures(record.alignmentStart, readFeatures, fromPosInRead, alignmentStartOffset, nofReadBases, readBases.getBytes(), SAMUtils.fastqToPhred(scores)); + return readFeatures; + } +} diff --git a/src/test/java/htsjdk/samtools/cram/ref/ReferenceSourceTest.java b/src/test/java/htsjdk/samtools/cram/ref/ReferenceSourceTest.java new file mode 100644 index 000000000..34ae95b1d --- /dev/null +++ b/src/test/java/htsjdk/samtools/cram/ref/ReferenceSourceTest.java @@ -0,0 +1,33 @@ +package htsjdk.samtools.cram.ref; + +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.InMemoryReferenceSequenceFile; +import htsjdk.samtools.util.SequenceUtil; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Created by vadim on 29/06/2017. + */ +public class ReferenceSourceTest { + + @Test + public void testReferenceSourceUpperCasesBases() { + final String sequenceName = "1"; + final String nonIupacCharacters = "1=eE"; + final byte[] originalRefBases = (nonIupacCharacters + SequenceUtil.getIUPACCodesString()).getBytes(); + SAMSequenceRecord sequenceRecord = new SAMSequenceRecord(sequenceName, originalRefBases.length); + + InMemoryReferenceSequenceFile memoryReferenceSequenceFile = new InMemoryReferenceSequenceFile(); + memoryReferenceSequenceFile.add(sequenceName, Arrays.copyOf(originalRefBases, originalRefBases.length)); + Assert.assertEquals(memoryReferenceSequenceFile.getSequence(sequenceName).getBases(), originalRefBases); + + ReferenceSource referenceSource = new ReferenceSource(memoryReferenceSequenceFile); + byte[] refBasesFromSource = referenceSource.getReferenceBases(sequenceRecord, false); + + Assert.assertNotEquals(refBasesFromSource, originalRefBases); + Assert.assertEquals(refBasesFromSource, SequenceUtil.upperCase(originalRefBases)); + } +} diff --git a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java index 6a115db9b..81a949094 100644 --- a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java @@ -562,4 +562,61 @@ private void assertEquals(final byte[] actual, final byte[] expected) { Assert.assertEquals(actual[i], expected[i], "Array differ at position " + i); } } + + @Test + public void testIsACGTN() { + for (byte base = Byte.MIN_VALUE; base < Byte.MAX_VALUE; base++) { + if (base == 'A' || base == 'C' || base == 'G' || base == 'T' || base == 'N') { + Assert.assertTrue(SequenceUtil.isUpperACGTN(base)); + } else { + Assert.assertFalse(SequenceUtil.isUpperACGTN(base)); + } + } + } + + @Test + public void testIsIUPAC() { + final String iupacString = ".aAbBcCdDgGhHkKmMnNrRsStTvVwWyY"; + for (byte code=0; code-qPL_ z^S#e`o^zk)bIy;)?zeg<+=%|$GFyu_j3sq`;{3!Y87-|2wv#t&n{;f8*Bpz-!mQvm zhvM-=2cofHB3y0}e5}Bmqlf!~A-0`2@6xg!y*Xku1`RtkMqSvX2{ju-Aww{zZ#MPx zM0V;8VS|R<=Qpc+V+nO4+}o!P_V#oq!u^SUNi{`+vovk4p#7(Bk_27F*z84 z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ8*p9!dGl08;6qoV;9i*pDg@sfC{iU~L{5X zchL-{G4i*QQH7DUgN!8B-$Iw+RnjUj@<=|!#tZ-0_%*+PaDOf!#SqZBN+e1fSLhJKVMZ>B$z*-HOtKyL`;7KpUjDSS``*gD&YQort9@y{ zKCXFndy?eOFkIdEFsdd$9wwI++2gC|?hmj1>%~ls`1(k$m?=YN=e6r-D~GhPwC&!> z1Nm#&ivwRw4cQ+Y&o~E_T@B+K6hwKeo#{8%^ibyDjmd^5{}{0o*4seoG7NDe(h3XneW z@#kpqCFFeGQU3dC3N0Dl##7Vc=54ble%mzq{hjMw-#qQ#ld1?0D%Z6&ZR9R*Fs1Vg zgYPfdDErFGm{A3W`C_4n?mu)*@zgTfdn#F~vb3tXfuHfs@9ySFCs3(nasO^K@kDUI zVpNIO-)VgTO~`g{OKo@e=r(gFhegHBd)3_t%p}<@IjOR6z!sfe Q7t!P>lSd|#&NN1U14TbMPyhe` literal 0 HcmV?d00001 diff --git a/src/test/resources/htsjdk/samtools/cram/amb#amb.3.0.cram b/src/test/resources/htsjdk/samtools/cram/amb#amb.3.0.cram new file mode 100644 index 0000000000000000000000000000000000000000..e683dc858158a41b9ecb70936d7443e65adcec4c GIT binary patch literal 1210 zcmZ<`a`a_p&}F~`-{C?`ObiY4=db?U&&a^gFr}eezMBIg$IN=rk*`@nfWcw?>tvU; zT<kxE*OzAG#Lx1RINW(_-St~nb+_+nIo@`r{m7)R?8@uoPtJRp z@}%^k#`N19mm4lKWVwka|6{+ZCv>G7=$dWPr89w)zdzj7)M2DCXkLqD2AahvXU5PL z!pYchZq=0)`V5Q=4Tl>J3$cW_Ffs)DN`pXB$0|7rHgD*%8P|PnFA?62F7ZMDS3jyf|s&VsU=3thQ zm1AaPVsdk4WR!LbU}ThW3T9-K_w-|BV&HXhWMq_ca$;d)WYl70RP_x6>I(zf2a@Ct z1ghf+@`M-_>_c`oZXRhFIeD<#HcdG<%PWVeBX7pCee>H- z^NGz^d61<1HhLbi?%43osY6FfTo5D9G%{!(f=# z?eu42%A-R|*H@fe*z7FcwaB%eFIJ;Vj2FoA7jf;L2W2-e%DA{dBkD`5Gf>cOA&})3 zxaGxps2-5)e6Vg1OW5_|B6XmCs2&rb5w#BmfP(%&*4{u7T`8a$_v$faXEda(~4X;KDKhVeE1o*(!-wntHCVbYt TH~oPW8yhGWFfzPLX7mOCYTt2? literal 0 HcmV?d00001 diff --git a/src/test/resources/htsjdk/samtools/cram/amb#amb.sam b/src/test/resources/htsjdk/samtools/cram/amb#amb.sam new file mode 100644 index 000000000..0640c90d2 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/cram/amb#amb.sam @@ -0,0 +1,57 @@ +@HD VN:1.4 GO:none SO:coordinate +@SQ SN:iupac LN:31 M5:f88a72084e90c68cc7aa569bbf257e70 +@RG ID:ID SM:foo +read_A 0 iupac 1 86 30M * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ****************************** +read_B 0 iupac 1 86 30M * 0 0 BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB ****************************** +read_C 0 iupac 1 86 30M * 0 0 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC ****************************** +read_D 0 iupac 1 86 30M * 0 0 DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD ****************************** +read_E 4 iupac 1 86 30M * 0 0 EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE ****************************** +read_F 0 iupac 1 86 30M * 0 0 FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF ****************************** +read_G 0 iupac 1 86 30M * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG ****************************** +read_H 0 iupac 1 86 30M * 0 0 HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH ****************************** +read_I 0 iupac 1 86 30M * 0 0 IIIIIIIIIIIIIIIIIIIIIIIIIIIIII ****************************** +read_J 0 iupac 1 86 30M * 0 0 JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ ****************************** +read_K 0 iupac 1 86 30M * 0 0 KKKKKKKKKKKKKKKKKKKKKKKKKKKKKK ****************************** +read_L 0 iupac 1 86 30M * 0 0 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLL ****************************** +read_M 0 iupac 1 86 30M * 0 0 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMM ****************************** +read_N 0 iupac 1 86 30M * 0 0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ****************************** +read_O 0 iupac 1 86 30M * 0 0 OOOOOOOOOOOOOOOOOOOOOOOOOOOOOO ****************************** +read_P 0 iupac 1 86 30M * 0 0 PPPPPPPPPPPPPPPPPPPPPPPPPPPPPP ****************************** +read_Q 0 iupac 1 86 30M * 0 0 QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ ****************************** +read_R 0 iupac 1 86 30M * 0 0 RRRRRRRRRRRRRRRRRRRRRRRRRRRRRR ****************************** +read_S 0 iupac 1 86 30M * 0 0 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSS ****************************** +read_T 0 iupac 1 86 30M * 0 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT ****************************** +read_U 0 iupac 1 86 30M * 0 0 UUUUUUUUUUUUUUUUUUUUUUUUUUUUUU ****************************** +read_V 0 iupac 1 86 30M * 0 0 VVVVVVVVVVVVVVVVVVVVVVVVVVVVVV ****************************** +read_W 0 iupac 1 86 30M * 0 0 WWWWWWWWWWWWWWWWWWWWWWWWWWWWWW ****************************** +read_X 0 iupac 1 86 30M * 0 0 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ****************************** +read_Y 0 iupac 1 86 30M * 0 0 YYYYYYYYYYYYYYYYYYYYYYYYYYYYYY ****************************** +read_Z 0 iupac 1 86 30M * 0 0 ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ ****************************** +read_a 0 iupac 1 86 30M * 0 0 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ****************************** +read_b 0 iupac 1 86 30M * 0 0 bbbbbbbbbbbbbbbbbbbbbbbbbbbbbb ****************************** +read_c 0 iupac 1 86 30M * 0 0 cccccccccccccccccccccccccccccc ****************************** +read_d 0 iupac 1 86 30M * 0 0 dddddddddddddddddddddddddddddd ****************************** +read_e 0 iupac 1 86 30M * 0 0 eeeeeeeeeeeeeeeeeeeeeeeeeeeeee ****************************** +read_f 0 iupac 1 86 30M * 0 0 ffffffffffffffffffffffffffffff ****************************** +read_g 0 iupac 1 86 30M * 0 0 gggggggggggggggggggggggggggggg ****************************** +read_h 0 iupac 1 86 30M * 0 0 hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh ****************************** +read_i 0 iupac 1 86 30M * 0 0 iiiiiiiiiiiiiiiiiiiiiiiiiiiiii ****************************** +read_j 0 iupac 1 86 30M * 0 0 jjjjjjjjjjjjjjjjjjjjjjjjjjjjjj ****************************** +read_k 0 iupac 1 86 30M * 0 0 kkkkkkkkkkkkkkkkkkkkkkkkkkkkkk ****************************** +read_l 0 iupac 1 86 30M * 0 0 llllllllllllllllllllllllllllll ****************************** +read_m 0 iupac 1 86 30M * 0 0 mmmmmmmmmmmmmmmmmmmmmmmmmmmmmm ****************************** +read_n 0 iupac 1 86 30M * 0 0 nnnnnnnnnnnnnnnnnnnnnnnnnnnnnn ****************************** +read_o 0 iupac 1 86 30M * 0 0 oooooooooooooooooooooooooooooo ****************************** +read_p 0 iupac 1 86 30M * 0 0 pppppppppppppppppppppppppppppp ****************************** +read_q 0 iupac 1 86 30M * 0 0 qqqqqqqqqqqqqqqqqqqqqqqqqqqqqq ****************************** +read_r 0 iupac 1 86 30M * 0 0 rrrrrrrrrrrrrrrrrrrrrrrrrrrrrr ****************************** +read_s 0 iupac 1 86 30M * 0 0 ssssssssssssssssssssssssssssss ****************************** +read_t 0 iupac 1 86 30M * 0 0 tttttttttttttttttttttttttttttt ****************************** +read_u 0 iupac 1 86 30M * 0 0 uuuuuuuuuuuuuuuuuuuuuuuuuuuuuu ****************************** +read_v 0 iupac 1 86 30M * 0 0 vvvvvvvvvvvvvvvvvvvvvvvvvvvvvv ****************************** +read_w 0 iupac 1 86 30M * 0 0 wwwwwwwwwwwwwwwwwwwwwwwwwwwwww ****************************** +read_x 0 iupac 1 86 30M * 0 0 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx ****************************** +read_y 0 iupac 1 86 30M * 0 0 yyyyyyyyyyyyyyyyyyyyyyyyyyyyyy ****************************** +read_z 0 iupac 1 86 30M * 0 0 zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz ****************************** +read_dot 0 iupac 1 86 30M * 0 0 .............................. ****************************** +read_equals 0 iupac 1 86 30M * 0 0 ============================== ****************************** diff --git a/src/test/resources/htsjdk/samtools/cram/amb.fa b/src/test/resources/htsjdk/samtools/cram/amb.fa new file mode 100644 index 000000000..040dd1c27 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/cram/amb.fa @@ -0,0 +1,2 @@ +>iupac 31 857152f076709b2c6067edcbaaba65c7 +.aAbBcCdDgGhHkKmMnNrRsStTvVwWyY diff --git a/src/test/resources/htsjdk/samtools/cram/amb.fa.fai b/src/test/resources/htsjdk/samtools/cram/amb.fa.fai new file mode 100644 index 000000000..89701fd82 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/cram/amb.fa.fai @@ -0,0 +1 @@ +iupac 31 49 31 32 diff --git a/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta b/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta new file mode 100644 index 000000000..430b59a1d --- /dev/null +++ b/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta @@ -0,0 +1,2 @@ +>Sheila +GCTAGCRM.gactAAAAAAAAAA diff --git a/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai b/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai new file mode 100644 index 000000000..f3cdedb55 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai @@ -0,0 +1 @@ +Sheila 20 8 20 21 diff --git a/src/test/resources/htsjdk/samtools/cram/samtoolsSliceMD5WithAmbiguityCodesTest.cram b/src/test/resources/htsjdk/samtools/cram/samtoolsSliceMD5WithAmbiguityCodesTest.cram new file mode 100644 index 0000000000000000000000000000000000000000..e72731880c4a4ab58e4b488d37cb160c214cd3fa GIT binary patch literal 1826 zcmcIkeN0+XMC}Ld$|jqAwNIWx|QFaGWQhl&PuYcP%+f&9ijybe)hT|J z-!O2jZimPJhN9-Pfe(k;>^oaa{#@9Wxx3{^{OM{_X5`51-ob6^M@r)!-F zafPh8TGYNkD%yL0^TW{(eekXd49A83@c*@!)}MX6UIu$$6@gSr>^l_q6+4G8@*VUY z^fpt)&(>LzEf%ZA;_p`4{zrAES|UUJk2B8O4BS&J89ZY z(>zUkXymjf2*Z(&Err;7GL~jY3P)jN6)cI7BM)piMu6gI$8+Go+3f-dHs49E!AmeA zVP3c#D|SAxc^GS$e2n86D?~~$Vt^u%c4Dk%^JN55(lix@aXQ8+_9Bd9970f01bBpl zUFL9FA@p!Pn-&0dC^-Z%Y>ROKBon)!kw7Jk07xl;bgK{uCx$?spi3-sg0f;1v|8Z= z1%r_A)360dw*rBlD|4uagwQN;p#~BX=juWzo{W&o15l6^GAPl?9BLq&}NckgEUz$IR><7Fgc{IN#a6pi>aVv#mUgdK{xb+ zQYyF#@!XKEz-WG4y>I0bVzaZ}|R5ea4)rW__w>FI&?d!EQl$34h&Mr$% zxDkI-Go|8xR3=W8e($Qif4`tM{gXH}b;D>KGxojo@lSs}eeASd9=MXRH?SC(OL3&0 zx^}qxwM!+B9w~JBk~GhZbrLyqFPHtCuy2EAZ2ZN0<-YA58S`WE7e4WKLfG_Km`KSAasqYzL);%vRJqj44HuJd}w@Psa21%9+8T zT?pN5ntVqNSv%h9n#NSbI#no(A}1TjEHgp_t>%lu+qwU|@pnu`uO(J9+yMej%tw4m z06cR`Uxld{F|3#jLuYh%Hw*B-{#Wi`Dqak`IRZs#{%H{4k;DU?m`YqH6Or$(s@h4R z+*j>Kg<`yU7#CwoEy>`jTrPyp-QAicWb_%PnlPo2Y~i91zkb{#Y{4$3UylSEta-pz Yz`z_{Gg;SY0hq8b;c3HY+KqGn1wJ5`xBvhE literal 0 HcmV?d00001 From d0f0ba138304eeb3b569f648767a9247075af854 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 25 Jul 2017 13:37:38 -0400 Subject: [PATCH 17/23] Add ability to redirect log to an alternate print Stream (#942) * added a new method which allows the logger to be redirected to a different stream other than stdout. This allows logging to be redirected to a file or other destination. --- src/main/java/htsjdk/samtools/util/Log.java | 35 +++++++++++++++++++-- src/test/java/htsjdk/samtools/util/LogTest.java | 41 +++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/util/LogTest.java diff --git a/src/main/java/htsjdk/samtools/util/Log.java b/src/main/java/htsjdk/samtools/util/Log.java index acbd3c425..dfe758a39 100644 --- a/src/main/java/htsjdk/samtools/util/Log.java +++ b/src/main/java/htsjdk/samtools/util/Log.java @@ -41,13 +41,13 @@ */ public final class Log { /** Enumeration for setting log levels. */ - public static enum LogLevel { ERROR, WARNING, INFO, DEBUG } + public enum LogLevel { ERROR, WARNING, INFO, DEBUG } private static LogLevel globalLogLevel = LogLevel.INFO; + private static PrintStream out = System.err; private final Class clazz; private final String className; - private final PrintStream out = System.err; /** * Private constructor @@ -67,10 +67,41 @@ public static Log getInstance(final Class clazz) { return new Log(clazz); } + /** + * Set the log level. + * + * @param logLevel The log level enumeration + */ public static void setGlobalLogLevel(final LogLevel logLevel) { globalLogLevel = logLevel; } + /** + * Get the log level. + * + * @return The enumeration for setting log levels. + */ + public static LogLevel getGlobalLogLevel() { + return globalLogLevel; + } + + /** + * Set the {@link PrintStream} for writing. + * + * @param stream {@link PrintStream} to write to. + */ + public static void setGlobalPrintStream(final PrintStream stream) { out = stream; } + + /** + * Get the {@link PrintStream} for writing. + * + * @return {@link PrintStream} to write to. + */ + public static PrintStream getGlobalPrintStream() { + return out; + } + + /** Returns true if the specified log level is enabled otherwise false. */ public static final boolean isEnabled(final LogLevel level) { return level.ordinal() <= globalLogLevel.ordinal(); diff --git a/src/test/java/htsjdk/samtools/util/LogTest.java b/src/test/java/htsjdk/samtools/util/LogTest.java new file mode 100644 index 000000000..a9b82b128 --- /dev/null +++ b/src/test/java/htsjdk/samtools/util/LogTest.java @@ -0,0 +1,41 @@ +package htsjdk.samtools.util; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.nio.file.Files; +import java.util.List; + +public class LogTest extends HtsjdkTest { + + private final Log log = Log.getInstance(getClass()); + + @Test + public void testLogToFile() throws IOException { + final File logFile = File.createTempFile(getClass().getSimpleName(), ".tmp"); + logFile.deleteOnExit(); + + final Log.LogLevel originalLogLevel = Log.getGlobalLogLevel(); + final PrintStream originalStream = Log.getGlobalPrintStream(); + + try (final PrintStream stream = new PrintStream(new FileOutputStream(logFile.getPath(), true))) { + Log.setGlobalPrintStream(stream); + Log.setGlobalLogLevel(Log.LogLevel.DEBUG); + final String words = "Hello World"; + log.info(words); + final List list = Files.readAllLines(logFile.toPath()); + Assert.assertEquals(Log.getGlobalLogLevel(), Log.LogLevel.DEBUG); + Assert.assertEquals(list.size(), 1); + Assert.assertTrue(list.get(0).contains(words)); + } finally { + Log.setGlobalLogLevel(originalLogLevel); + Log.setGlobalPrintStream(originalStream); + } + } +} From 1969eb8eea1d2c8ec60e98fb028ff9b4c8c0e392 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 25 Jul 2017 17:01:59 -0400 Subject: [PATCH 18/23] Replace bad .fai test file and create one new CRAMComplianceTest data provider. (#940) --- .../java/htsjdk/samtools/CRAMComplianceTest.java | 31 +++++++++++++++++++++- .../htsjdk/samtools/cram/ambiguityCodes.fasta.fai | 2 +- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/test/java/htsjdk/samtools/CRAMComplianceTest.java b/src/test/java/htsjdk/samtools/CRAMComplianceTest.java index 5afeecd0c..2f6ebd5c3 100644 --- a/src/test/java/htsjdk/samtools/CRAMComplianceTest.java +++ b/src/test/java/htsjdk/samtools/CRAMComplianceTest.java @@ -34,7 +34,6 @@ @DataProvider(name = "partialVerification") public Object[][] getPartialVerificationData() { return new Object[][] { - {"amb#amb"}, {"auxf#values"}, // unsigned attributes: https://github.com/samtools/htsjdk/issues/499 {"c1#noseq"}, // unsigned attributes: https://github.com/samtools/htsjdk/issues/499 {"c1#unknown"}, // unsigned attributes: https://github.com/samtools/htsjdk/issues/499 @@ -86,6 +85,36 @@ public void fullVerificationTest(String name) throws IOException { doComplianceTest(name, (version, expected, actual) -> Assert.assertEquals(expected, actual)); } + // Files that can be subjected to full verification only after read base normalization, because either + // the reference or the reads contain ambiguity codes that are normalized by SequenceUtil.toBamReadBasesInPlace + // during the round-trip process. + @DataProvider(name = "ambiguityCodeVerification") + public Object[][] getAmbiguityCodeVerificationData() { + return new Object[][]{ + {"amb#amb"} + }; + } + + @Test(dataProvider = "ambiguityCodeVerification") + public void ambiguityCodeVerificationTest(String name) throws IOException { + doComplianceTest(name, + (version, expected, actual) -> + { + if (expected.getReadString().equals(actual.getReadString())) { + Assert.assertEquals(expected, actual); + } else { + // tolerate BAM and CRAM conversion of read bases to upper case IUPAC codes by + // creating a deep copy of the expected reads and normalizing (upper case IUPAC) + // the bases; then proceeding with the full compare with the actual + SAMRecord expectedNormalized = actual.deepCopy(); + final byte[] expectedBases = expectedNormalized.getReadBases(); + SequenceUtil.toBamReadBasesInPlace(expectedBases); + Assert.assertEquals(actual, expectedNormalized); + } + } + ); + } + @BeforeTest public void beforeTest() { Log.setGlobalLogLevel(Log.LogLevel.ERROR); diff --git a/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai b/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai index f3cdedb55..d35aa7ed6 100644 --- a/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai +++ b/src/test/resources/htsjdk/samtools/cram/ambiguityCodes.fasta.fai @@ -1 +1 @@ -Sheila 20 8 20 21 +Sheila 23 8 23 24 From 69f31b5a2ccb847c676649dc4b2bc2a337cb6989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20G=C3=B3mez-S=C3=A1nchez?= Date: Thu, 27 Jul 2017 18:05:14 +0200 Subject: [PATCH 19/23] Increase test coverage for Cigar classes (#939) * Full coverage for CigarElement * Full coverage for CigarOperator * Increase Cigar coverage and test explicitly some methods --- .../htsjdk/samtools/CigarOperatorUnitTest.java | 137 +++++++++++++++++++++ src/test/java/htsjdk/samtools/CigarTest.java | 126 ++++++++++++++++++- .../htsjdk/samtools/util/CigarElementUnitTest.java | 28 +++++ 3 files changed, 287 insertions(+), 4 deletions(-) create mode 100644 src/test/java/htsjdk/samtools/CigarOperatorUnitTest.java diff --git a/src/test/java/htsjdk/samtools/CigarOperatorUnitTest.java b/src/test/java/htsjdk/samtools/CigarOperatorUnitTest.java new file mode 100644 index 000000000..21c36d64b --- /dev/null +++ b/src/test/java/htsjdk/samtools/CigarOperatorUnitTest.java @@ -0,0 +1,137 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Daniel Gomez-Sanchez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * @author Daniel Gomez-Sanchez (magicDGS) + */ +public class CigarOperatorUnitTest extends HtsjdkTest { + + @DataProvider + public Object[][] chars() { + return new Object[][] { + {'M', CigarOperator.M}, + {'I', CigarOperator.I}, + {'D', CigarOperator.D}, + {'N', CigarOperator.N}, + {'S', CigarOperator.S}, + {'H', CigarOperator.H}, + {'P', CigarOperator.P}, + {'=', CigarOperator.EQ}, + {'X', CigarOperator.X} + }; + } + + @Test(dataProvider = "chars") + public void testCharacterToEnum(final char c, final CigarOperator op) throws Exception { + Assert.assertEquals(CigarOperator.characterToEnum(c), op); + } + + @Test(dataProvider = "chars") + public void testEnumToCharacter(final char c, final CigarOperator op) throws Exception { + Assert.assertEquals(CigarOperator.enumToCharacter(op), c); + } + + @DataProvider + public Object[][] illegalChars() { + return new Object[][] { + {'A'}, {'E'}, {'O'}, {'U'} + }; + } + + @Test(dataProvider = "illegalChars", expectedExceptions = IllegalArgumentException.class) + public void testIllegalCharacterToEnum(final char c) throws Exception { + CigarOperator.characterToEnum(c); + } + + @DataProvider + public Object[][] binary() { + return new Object[][] { + {0, CigarOperator.M}, + {1, CigarOperator.I}, + {2, CigarOperator.D}, + {3, CigarOperator.N}, + {4, CigarOperator.S}, + {5, CigarOperator.H}, + {6, CigarOperator.P}, + {7, CigarOperator.EQ}, + {8, CigarOperator.X} + }; + } + + @Test(dataProvider = "binary") + public void testBinaryToEnum(final int bin, final CigarOperator op) throws Exception { + Assert.assertEquals(CigarOperator.binaryToEnum(bin), op); + } + + @Test(dataProvider = "binary") + public void testEnumToBinary(final int bin, final CigarOperator op) throws Exception { + Assert.assertEquals(CigarOperator.enumToBinary(op), bin); + } + + @DataProvider + public Object[][] illegalBinary() { + return new Object[][] { + {-1}, {9}, {10} + }; + } + + @Test(dataProvider = "illegalBinary", expectedExceptions = IllegalArgumentException.class) + public void testIllegalBinaryToEnum(final int bin) throws Exception { + CigarOperator.binaryToEnum(bin); + } + + @DataProvider + public Object[][] opStatus() { + return new Object[][] { + // op, isClipping, isIndel, isSkip, isAlignment, isPadding + {CigarOperator.M, false, false, false, true, false}, + {CigarOperator.I, false, true, false, false, false}, + {CigarOperator.D, false, true, false, false, false}, + {CigarOperator.N, false, false, true, false, false}, + {CigarOperator.S, true, false, false, false, false}, + {CigarOperator.H, true, false, false, false, false}, + {CigarOperator.P, false, false, false, false, true}, + {CigarOperator.EQ, false, false, false, true, false}, + {CigarOperator.X, false, false, false, true, false} + }; + } + + @Test(dataProvider = "opStatus") + public void testIsSetOfOperations(final CigarOperator op, final boolean isClipping, + final boolean isIndel,final boolean isSkip, final boolean isAlignment, + final boolean isPadding) throws Exception { + Assert.assertEquals(op.isClipping(), isClipping); + Assert.assertEquals(op.isIndel(), isIndel); + Assert.assertEquals(op.isIndelOrSkippedRegion(), isIndel || isSkip); + Assert.assertEquals(op.isAlignment(), isAlignment); + Assert.assertEquals(op.isPadding(), isPadding); + } +} \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/CigarTest.java b/src/test/java/htsjdk/samtools/CigarTest.java index 07fe34633..c104073a0 100644 --- a/src/test/java/htsjdk/samtools/CigarTest.java +++ b/src/test/java/htsjdk/samtools/CigarTest.java @@ -63,6 +63,9 @@ public void testPositive(final String cigar) { public Object[][] negativeTestsData() { return new Object[][]{ + // CIGAR element with zero length + {"0M", SAMValidationError.Type.INVALID_CIGAR}, + // Cannot have two consecutive insertions (of the same type) {"1M1D1D1M", SAMValidationError.Type.ADJACENT_INDEL_IN_CIGAR}, {"1M1I1I1M", SAMValidationError.Type.ADJACENT_INDEL_IN_CIGAR}, @@ -80,11 +83,15 @@ public void testPositive(final String cigar) { {"1H1S", SAMValidationError.Type.INVALID_CIGAR}, {"1S1H", SAMValidationError.Type.INVALID_CIGAR}, {"1H1H", SAMValidationError.Type.INVALID_CIGAR}, + + // Hard clipping operator not at start or end of CIGAR + {"1M1H1M", SAMValidationError.Type.INVALID_CIGAR}, + + // Padding operator not valid at end of CIGAR + {"1M1P", SAMValidationError.Type.INVALID_CIGAR}, + // Padding operator not between real operators in CIGAR + {"1S1P1M", SAMValidationError.Type.INVALID_CIGAR} }; -/* - // Zero length for an element not allowed. TODO: not sure why this is commented out - {"100M0D10M1D10M", SAMValidationError.Type.INVALID_CIGAR} -*/ } @Test(dataProvider = "negativeTestsData") @@ -116,4 +123,115 @@ public void testMakeCigarFromOperators() { Assert.assertFalse(cigar.isRightClipped()); Assert.assertTrue(cigar.isClipped()); } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMakeCigarFromNullOperator() { + Cigar.fromCigarOperators(null); + } + + @DataProvider + public Object[][] referenceLengthData() { + return new Object[][] { + // consuming reference + {"1M", 1, 1}, + {"1=", 1, 1}, + {"1X", 1, 1}, + {"1N", 1, 1}, + {"1D", 1, 1}, + + // non-consuming reference + {"1S", 0, 0}, + {"1H", 0, 0}, + + // special case: padding + {"1P", 0, 1} + }; + } + + @Test(dataProvider = "referenceLengthData") + public void testGetReferenceLength(final String textCigar, + final int referenceLength, final int paddedReferenceLenght) throws Exception{ + final Cigar cigar = TextCigarCodec.decode(textCigar); + Assert.assertEquals(cigar.getReferenceLength(), referenceLength); + Assert.assertEquals(cigar.getPaddedReferenceLength(), paddedReferenceLenght); + } + + @DataProvider + public Object[][] readLengthData() { + return new Object[][] { + // consuming read bases + {"1M", 1}, + {"2I", 2}, + {"3S", 3}, + {"4X", 4}, + {"5=", 5}, + + // non-consuming reference + {"1D", 0}, + {"2N", 0}, + {"4H", 0}, + {"4P", 0} + }; + } + + @Test(dataProvider = "readLengthData") + public void testGetReadLength(final String textCigar, final int readLength) throws Exception{ + final Cigar cigar = TextCigarCodec.decode(textCigar); + Assert.assertEquals(cigar.getReadLength(), readLength); + } + + @Test + public void testContainsOperator() { + final Cigar cigar = TextCigarCodec.decode("10M1S"); + Assert.assertTrue(cigar.containsOperator(CigarOperator.M)); + Assert.assertTrue(cigar.containsOperator(CigarOperator.S)); + Assert.assertFalse(cigar.containsOperator(CigarOperator.X)); + } + + @DataProvider + public Object[][] firstLastData() { + final CigarElement M_ELEMENT = new CigarElement(1, CigarOperator.M); + final CigarElement S_ELEMENT = new CigarElement(1, CigarOperator.S); + return new Object[][] { + {"*", null, null}, + {"1M", M_ELEMENT, M_ELEMENT}, + {"1M1S", M_ELEMENT, S_ELEMENT}, + {"1S1M", S_ELEMENT, M_ELEMENT}, + {"1S1M1S", S_ELEMENT, S_ELEMENT}, + {"1M1D1M1D1M", M_ELEMENT, M_ELEMENT} + }; + } + + @Test(dataProvider = "firstLastData") + public void testGetFirstOrLastCigarElement(final String textCigar, final CigarElement first, final CigarElement last) { + final Cigar cigar = TextCigarCodec.decode(textCigar); + Assert.assertEquals(cigar.getFirstCigarElement(), first); + Assert.assertEquals(cigar.getLastCigarElement(), last); + } + + @DataProvider + public Object[][] clippedData() { + return new Object[][] { + // no clipped + {"10M", false}, + // wrong place for soft-clip and hard-clip returns false + {"1M1S1M", false}, + {"1M1H1M", false}, + + // clipped + {"1S1M", true}, + {"1M1S", true}, + {"1S1M1S", true}, + {"1H1M", true}, + {"1M1H", true}, + {"1H1M1H", true} + }; + } + + @Test(dataProvider = "clippedData") + public void testIsClipped(final String textCigar, final boolean isClipped) { + // this test is indirectly testing both left and right clipping methods + Assert.assertEquals(TextCigarCodec.decode(textCigar).isClipped(), isClipped); + } + } diff --git a/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java b/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java index d9b3bf84a..23607ac83 100644 --- a/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java +++ b/src/test/java/htsjdk/samtools/util/CigarElementUnitTest.java @@ -4,6 +4,8 @@ import htsjdk.HtsjdkTest; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; public class CigarElementUnitTest extends HtsjdkTest { @@ -12,4 +14,30 @@ public void testNegativeLengthCheck(){ final CigarElement element = new CigarElement(-1, CigarOperator.M); } + + + @DataProvider + public Object[][] elementsForEquals() { + final CigarElement mElement = new CigarElement(10, CigarOperator.M); + return new Object[][] { + // same object + {mElement, mElement, true}, + // different equal objects + {mElement, new CigarElement(mElement.getLength(), mElement.getOperator()), true}, + // different lengths + {mElement, new CigarElement(mElement.getLength() + 1, mElement.getOperator()), false}, + // different operators + {mElement, new CigarElement(mElement.getLength(), CigarOperator.X), false}, + // different class + {mElement, mElement.toString(), false} + }; + } + + @Test(dataProvider = "elementsForEquals") + public void testEqualsAndHashCode(final CigarElement element, final Object other, final boolean isEquals) { + Assert.assertEquals(element.equals(other), isEquals); + if (isEquals) { + Assert.assertEquals(element.hashCode(), other.hashCode()); + } + } } From 3ad79480602b5580cf885aab642175be9000050a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20G=C3=B3mez-S=C3=A1nchez?= Date: Thu, 27 Jul 2017 18:07:32 +0200 Subject: [PATCH 20/23] deprecate SAMTools.java (#937) * Deprecate SAMTools.java because it is confusing and of very limited use, as well as being untested and unmaintained. --- src/main/java/htsjdk/samtools/SAMTools.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/htsjdk/samtools/SAMTools.java b/src/main/java/htsjdk/samtools/SAMTools.java index 551f846d6..911198e0a 100644 --- a/src/main/java/htsjdk/samtools/SAMTools.java +++ b/src/main/java/htsjdk/samtools/SAMTools.java @@ -31,7 +31,10 @@ /** * Command line utility for manipulating SAM/BAM files. + * @deprecated since 07/2017. This class does not add anything to the HTSJDK library except an example of how to iterate over a SAM/BAM file. + * In addition, it is not tested. */ +@Deprecated public class SAMTools { private String mCommand = null; private File mInputFile = null; From 81ac5480569922dbe1c03c697f70393c07bf9ea3 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Thu, 27 Jul 2017 11:54:50 -0700 Subject: [PATCH 21/23] Should write to /dev/stdout as a special case. (#933) --- src/main/java/htsjdk/samtools/util/IOUtil.java | 4 ++-- .../variantcontext/writer/IndexingVariantContextWriter.java | 4 +++- .../variantcontext/writer/VariantContextWriterBuilder.java | 5 +++-- .../writer/VariantContextWriterBuilderUnitTest.java | 8 ++++++++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/IOUtil.java b/src/main/java/htsjdk/samtools/util/IOUtil.java index 373bd5f24..5d8afed73 100644 --- a/src/main/java/htsjdk/samtools/util/IOUtil.java +++ b/src/main/java/htsjdk/samtools/util/IOUtil.java @@ -366,7 +366,7 @@ public static void assertFilesAreReadable(final List files) { * and if it is a file then not a directory and is readable. If any * condition is false then a runtime exception is thrown. * - * @param files the list of files to check for readability + * @param inputs the list of files to check for readability */ public static void assertInputsAreValid(final List inputs) { for (final String input : inputs) assertInputIsValid(input); @@ -465,7 +465,7 @@ else if (!dir.canRead()) { public static void assertFilesEqual(final File f1, final File f2) { try { if (f1.length() != f2.length()) { - throw new SAMException("Files " + f1 + " and " + f2 + " are different lengths."); + throw new SAMException("File " + f1 + " is " + f1.length() + " bytes but file " + f2 + " is " + f2.length() + " bytes."); } final FileInputStream s1 = new FileInputStream(f1); final FileInputStream s2 = new FileInputStream(f2); diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/IndexingVariantContextWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/IndexingVariantContextWriter.java index eece2d1ac..fa3f6ba54 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/IndexingVariantContextWriter.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/IndexingVariantContextWriter.java @@ -60,6 +60,8 @@ private IndexingVariantContextWriter(final String name, final File location, fin this.refDict = refDict; } + static String DEFAULT_READER_NAME = "Reader Name"; + /** * Create a VariantContextWriter with an associated index using the default index creator * @@ -178,6 +180,6 @@ public void add(final VariantContext vc) { * @return */ protected static final String writerName(final File location, final OutputStream stream) { - return location == null ? stream.toString() : location.getAbsolutePath(); + return location == null ? stream == null ? DEFAULT_READER_NAME : stream.toString() : location.getAbsolutePath(); } } diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java index b1d7ca0eb..ddc0d50d5 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilder.java @@ -406,8 +406,9 @@ else if (STREAM_TYPES.contains(this.outType)) typeToBuild = OutputType.BCF_STREAM; } + // If we are writing to a file, or a special file type (ex. pipe) where the stream is not yet open. OutputStream outStreamFromFile = this.outStream; - if (FILE_TYPES.contains(this.outType)) { + if (FILE_TYPES.contains(this.outType) || (STREAM_TYPES.contains(this.outType) && this.outStream == null)) { try { outStreamFromFile = IOUtil.maybeBufferOutputStream(new FileOutputStream(outFile), bufferSize); } catch (final FileNotFoundException e) { @@ -445,7 +446,7 @@ else if (STREAM_TYPES.contains(this.outType)) if (options.contains(Options.INDEX_ON_THE_FLY)) throw new IllegalArgumentException("VCF index creation not supported for stream output."); - writer = createVCFWriter(null, outStream); + writer = createVCFWriter(null, outStreamFromFile); break; case BCF_STREAM: if (options.contains(Options.INDEX_ON_THE_FLY)) diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilderUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilderUnitTest.java index 179c4cb2f..5e33e5c44 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilderUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/VariantContextWriterBuilderUnitTest.java @@ -396,4 +396,12 @@ public void testModifyOption() { Assert.assertFalse(builder.isOptionSet(option)); // has been unset } } + + @Test + public void testStdOut() { + final VariantContextWriter writer = new VariantContextWriterBuilder().setOutputFile("/dev/stdout").clearOptions().build(); + OutputStream s = ((VCFWriter) writer).getOutputStream(); + Assert.assertNotNull(((VCFWriter) writer).getOutputStream()); + Assert.assertNotEquals(((VCFWriter) writer).getStreamName(), IndexingVariantContextWriter.DEFAULT_READER_NAME); + } } From 3ae056ce29d56cccbdf60d7ed6109aef768b4810 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 27 Jul 2017 13:26:47 -0400 Subject: [PATCH 22/23] Updated javadocs for the query method --- src/main/java/htsjdk/variant/vcf/VCFFileReader.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java index b43935880..d13387cd2 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFileReader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFileReader.java @@ -137,7 +137,14 @@ public VCFHeader getFileHeader() { } } - /** Queries for records within the region specified. */ + /** + * Queries for records overlapping the region specified. + * Note that this method requires VCF files with an associated index. If no index exists a TribbleException will be thrown. + * @param chrom the chomosome to query + * @param start query interval start + * @param end query interval end + * @return non-null iterator over VariantContexts + */ public CloseableIterator query(final String chrom, final int start, final int end) { try { return reader.query(chrom, start, end); } catch (final IOException ioe) { From 03e8eaa7ed72a477be81b6f8a56c9e2eb6486d8d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 31 Jul 2017 14:10:45 -0400 Subject: [PATCH 23/23] Broke out the static variables for VCF suffixes so that they can be used separately. --- src/main/java/htsjdk/samtools/util/IOUtil.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/samtools/util/IOUtil.java b/src/main/java/htsjdk/samtools/util/IOUtil.java index 5d8afed73..8e06c039d 100644 --- a/src/main/java/htsjdk/samtools/util/IOUtil.java +++ b/src/main/java/htsjdk/samtools/util/IOUtil.java @@ -86,8 +86,11 @@ public static final long TWO_GBS = 2 * ONE_GB; public static final long FIVE_GBS = 5 * ONE_GB; + public static final String VCF_FILE_EXTENSION = ".vcf"; + public static final String BCF_FILE_EXTENSION = ".bcf"; + public static final String COMPRESSED_VCF_FILE_EXTENSION = ".vcf.gz"; /** Possible extensions for VCF files and related formats. */ - public static final String[] VCF_EXTENSIONS = new String[] {".vcf", ".vcf.gz", ".bcf"}; + public static final String[] VCF_EXTENSIONS = {VCF_FILE_EXTENSION, COMPRESSED_VCF_FILE_EXTENSION, BCF_FILE_EXTENSION}; public static final String INTERVAL_LIST_FILE_EXTENSION = IntervalList.INTERVAL_LIST_FILE_EXTENSION;