Permalink
Browse files

Normalize CRAM reference bases and throw on MD5 validation failure.

  • Loading branch information...
1 parent c8202f2 commit a781afa9597dcdbcde0020bfe464abee269b3b2e @cmnbroad cmnbroad committed Jul 26, 2016
@@ -177,10 +177,14 @@ void nextContainer() throws IOException, IllegalArgumentException,
final Slice slice = container.slices[i];
if (slice.sequenceId < 0)
continue;
- if (validationStringency != ValidationStringency.SILENT && !slice.validateRefMD5(refs)) {
- log.error(String
- .format("Reference sequence MD5 mismatch for slice: seq id %d, start %d, span %d, expected MD5 %s", slice.sequenceId,
- slice.alignmentStart, slice.alignmentSpan, String.format("%032x", new BigInteger(1, slice.refMD5))));
+ if (!slice.validateRefMD5(refs)) {
+ final String msg = String.format(
+ "Reference sequence MD5 mismatch for slice: sequence id %d, start %d, span %d, expected MD5 %s",
+ slice.sequenceId,
+ slice.alignmentStart,
+ slice.alignmentSpan,
+ String.format("%032x", new BigInteger(1, slice.refMD5)));
+ throw new CRAMException(msg);
}
}
@@ -17,7 +17,9 @@
*/
package htsjdk.samtools.cram.build;
-class Utils {
+final public class Utils {
+
+ private Utils() {};
/**
* CRAM operates with upper case bases, so both read and ref bases should be
@@ -15,8 +15,8 @@
* against the reference by using common name variations,
* such as adding or removing a leading "chr" prefix
* from the requested name. if false, use exact match
- * @return the bases representing the requested sequence. or null if the sequence
- * cannot be found
+ * @return the upper cased, normalized (see {@link htsjdk.samtools.cram.build.Utils#normalizeBase})
+ * bases representing the requested sequence, or null if the sequence cannot be found
*/
byte[] getReferenceBases(final SAMSequenceRecord sequenceRecord, final boolean tryNameVariants);
}
@@ -20,6 +20,7 @@
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.cram.build.Utils;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
@@ -123,13 +124,23 @@ public void clearCache() {
return null;
}
+ // Upper case and normalize (-> ACGTN) in-place, and add to the cache
+ private byte[] addToCache(final String sequenceName, final byte[] bases) {
+ for (int i = 0; i < bases.length; i++) {
+ bases[i] = Utils.normalizeBase(bases[i]);
+ }
+ cacheW.put(sequenceName, new WeakReference<byte[]>(bases));
+ return bases;
+ }
+
public synchronized byte[] getReferenceBases(final SAMSequenceRecord record,
final boolean tryNameVariants) {
{ // check cache by sequence name:
final String name = record.getSequenceName();
final byte[] bases = findInCache(name);
- if (bases != null)
+ if (bases != null) {
return bases;
+ }
}
final String md5 = record.getAttribute(SAMSequenceRecord.MD5_TAG);
@@ -152,10 +163,7 @@ public void clearCache() {
{ // try to fetch sequence by name:
bases = findBasesByName(record.getSequenceName(), tryNameVariants);
if (bases != null) {
- SequenceUtil.upperCase(bases);
- cacheW.put(record.getSequenceName(), new WeakReference<byte[]>(
- bases));
- return bases;
+ return addToCache(record.getSequenceName(), bases);
}
}
@@ -169,9 +177,7 @@ public void clearCache() {
}
}
if (bases != null) {
- SequenceUtil.upperCase(bases);
- cacheW.put(md5, new WeakReference<byte[]>(bases));
- return bases;
+ return addToCache(md5, bases);
}
}
}
@@ -30,6 +30,11 @@ public void testCRAIIndexerFromContainer() throws IOException {
refSource,
ValidationStringency.STRICT);
SAMFileHeader samHeader = reader.getFileHeader();
+ Iterator<SAMRecord> it = reader.getIterator();
+ while(it.hasNext()) {
+ SAMRecord samRec = it.next();
+ }
+
reader.close();
FileInputStream fis = new FileInputStream(CRAMFile);
@@ -1,10 +1,19 @@
package htsjdk.samtools.cram.structure;
+import htsjdk.samtools.CRAMFileReader;
+import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.ValidationStringency;
+import htsjdk.samtools.cram.CRAMException;
+import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.util.SequenceUtil;
import org.testng.Assert;
import org.testng.annotations.Test;
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
/**
* Created by vadim on 07/12/2015.
*/
@@ -33,4 +42,29 @@ public void test_validateRef() {
Assert.assertEquals(slice.refMD5, md5);
Assert.assertTrue(slice.validateRefMD5(ref));
}
+
+ @Test(expectedExceptions= CRAMException.class)
+ public void testFailsMD5Check() throws IOException {
+ // auxf.alteredForMD5test.fa has been altered slightly from the original reference
+ // to cause the CRAM md5 check to fail
+ final File CRAMFile = new File("src/test/resources/htsjdk/samtools/cram/auxf#values.3.0.cram");
+ final File refFile = new File("src/test/resources/htsjdk/samtools/cram/auxf.alteredForMD5test.fa");
+ ReferenceSource refSource = new ReferenceSource(refFile);
+ CRAMFileReader reader = null;
+ try {
+ reader = new CRAMFileReader(
+ CRAMFile,
+ null,
+ refSource,
+ ValidationStringency.STRICT);
+ Iterator<SAMRecord> it = reader.getIterator();
+ while (it.hasNext()) {
+ it.next();
+ }
+ } finally {
+ if (reader != null) {
+ reader.close();
+ }
+ }
+ }
}
@@ -0,0 +1,2 @@
+>Sheila
+CTAGCTCAGAAAAAAAAAA
@@ -0,0 +1 @@
+Sheila 19 8 19 20

0 comments on commit a781afa

Please sign in to comment.