From c3d5a884869d4f83b7b8eab37c7153c2576b2c63 Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Thu, 25 Aug 2016 16:30:12 +0200 Subject: [PATCH] added function to extract SA tag and return a list of supplementary alignments (#685) added a new function SAMUtils.getOtherCanonicalAlignments This function is used to extract the 'SA' tag of a SAMRecord as a Listof supplementary alignments. --- src/main/java/htsjdk/samtools/SAMUtils.java | 124 ++++++++++++++++++++++++ src/test/java/htsjdk/samtools/SAMUtilsTest.java | 77 ++++++++++++++- 2 files changed, 200 insertions(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/samtools/SAMUtils.java b/src/main/java/htsjdk/samtools/SAMUtils.java index b31b7718c..c0432acc8 100644 --- a/src/main/java/htsjdk/samtools/SAMUtils.java +++ b/src/main/java/htsjdk/samtools/SAMUtils.java @@ -41,12 +41,18 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; +import java.util.regex.Pattern; /** * Utilty methods. */ public final class SAMUtils { + /** regex for semicolon, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */ + private static final Pattern SEMICOLON_PAT = Pattern.compile("[;]"); + /** regex for comma, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */ + private static final Pattern COMMA_PAT = Pattern.compile("[,]"); + // Representation of bases, one for when in low-order nybble, one for when in high-order nybble. private static final byte COMPRESSED_EQUAL_LOW = 0; private static final byte COMPRESSED_A_LOW = 1; @@ -1096,4 +1102,122 @@ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, fina public static boolean isValidUnsignedIntegerAttribute(long value) { return value >= 0 && value <= BinaryCodec.MAX_UINT; } + + /** + * Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined + * in the SAM specification. + * The name, sequence and qualities, mate data are copied from the original record. + * @param record must be non null and must have a non-null associated header. + * @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing. + */ + public static List getOtherCanonicalAlignments(final SAMRecord record) { + if( record == null ) throw new IllegalArgumentException("record is null"); + if( record.getHeader() == null ) throw new IllegalArgumentException("record.getHeader() is null"); + /* extract value of SA tag */ + final Object saValue = record.getAttribute( SAMTagUtil.getSingleton().SA ); + if( saValue == null ) return Collections.emptyList(); + if( ! (saValue instanceof String) ) throw new SAMException( + "Expected a String for attribute 'SA' but got " + saValue.getClass() ); + + final SAMRecordFactory samReaderFactory = new DefaultSAMRecordFactory(); + + /* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a + * semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+. + * Each element in the list represents a part of the chimeric alignment. + * Conventionally, at a supplementary line, the 1rst element points to the primary line. + */ + + /* break string using semicolon */ + final String semiColonStrs[] = SEMICOLON_PAT.split((String)saValue); + + /* the result list */ + final List alignments = new ArrayList<>( semiColonStrs.length ); + + /* base SAM flag */ + int record_flag = record.getFlags() ; + record_flag &= ~SAMFlag.PROPER_PAIR.flag; + record_flag &= ~SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; + record_flag &= ~SAMFlag.READ_REVERSE_STRAND.flag; + + + for(int i=0; i< semiColonStrs.length;++i ) { + final String semiColonStr = semiColonStrs[i]; + /* ignore empty string */ + if( semiColonStr.isEmpty() ) continue; + + /* break string using comma */ + final String commaStrs[] = COMMA_PAT.split(semiColonStr); + if( commaStrs.length != 6 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr); + + /* create the new record */ + final SAMRecord otherRec = samReaderFactory.createSAMRecord( record.getHeader() ); + + /* copy fields from the original record */ + otherRec.setReadName( record.getReadName() ); + otherRec.setReadBases( record.getReadBases() ); + otherRec.setBaseQualities( record.getBaseQualities() ); + if( record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { + otherRec.setMateReferenceIndex( record.getMateReferenceIndex() ); + otherRec.setMateAlignmentStart( record.getMateAlignmentStart() ); + } + + + /* get reference sequence */ + final int tid = record.getHeader().getSequenceIndex( commaStrs[0] ); + if( tid == -1 ) throw new SAMException("Unknown contig in " + semiColonStr); + otherRec.setReferenceIndex( tid ); + + /* fill POS */ + final int alignStart; + try { + alignStart = Integer.parseInt(commaStrs[1]); + } catch( final NumberFormatException err ) { + throw new SAMException("bad POS in "+semiColonStr, err); + } + + otherRec.setAlignmentStart( alignStart ); + + /* set TLEN */ + if( record.getReadPairedFlag() && + !record.getMateUnmappedFlag() && + record.getMateReferenceIndex() == tid ) { + otherRec.setInferredInsertSize( record.getMateAlignmentStart() - alignStart ); + } + + /* set FLAG */ + int other_flag = record_flag; + other_flag |= (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ; + /* spec: Conventionally, at a supplementary line, the 1st element points to the primary line */ + if( !( record.getSupplementaryAlignmentFlag() && i==0 ) ) { + other_flag |= SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; + } + otherRec.setFlags(other_flag); + + /* set CIGAR */ + otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) ); + + /* set MAPQ */ + try { + otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) ); + } catch (final NumberFormatException err) { + throw new SAMException("bad MAPQ in "+semiColonStr, err); + } + + /* fill NM */ + try { + otherRec.setAttribute( SAMTagUtil.getSingleton().NM , Integer.parseInt(commaStrs[5]) ); + } catch (final NumberFormatException err) { + throw new SAMException("bad NM in "+semiColonStr, err); + } + + /* if strand is not the same: reverse-complement */ + if( otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag() ) { + SAMRecordUtil.reverseComplement(otherRec); + } + + /* add the alignment */ + alignments.add( otherRec ); + } + return alignments; + } } diff --git a/src/test/java/htsjdk/samtools/SAMUtilsTest.java b/src/test/java/htsjdk/samtools/SAMUtilsTest.java index 48baf448d..0fa6b4a69 100644 --- a/src/test/java/htsjdk/samtools/SAMUtilsTest.java +++ b/src/test/java/htsjdk/samtools/SAMUtilsTest.java @@ -26,7 +26,7 @@ import org.testng.Assert; import org.testng.annotations.Test; -import java.util.Arrays; +import java.util.List; public class SAMUtilsTest { @Test @@ -173,4 +173,79 @@ public void testClippingOfRecordWithMateAtSamePosition() { record.setSecondOfPairFlag(true); Assert.assertEquals(SAMUtils.getNumOverlappingAlignedBasesToClip(record), 10); } + + @Test + public void testOtherCanonicalAlignments() { + // setup the record + final SAMFileHeader header = new SAMFileHeader(); + header.addSequence(new SAMSequenceRecord("1", 1000)); + header.addSequence(new SAMSequenceRecord("2", 1000)); + final SAMRecord record = new SAMRecord(header); + record.setReadPairedFlag(true); + record.setFirstOfPairFlag(true); + record.setCigar(TextCigarCodec.decode("10M")); + record.setReferenceIndex(0); + record.setAlignmentStart(1); + record.setMateReferenceIndex(0); + record.setMateAlignmentStart(1); + record.setReadPairedFlag(true); + record.setSupplementaryAlignmentFlag(true);//spec says first 'SA' record will be the primary record + + record.setMateReferenceIndex(0); + record.setMateAlignmentStart(100); + record.setInferredInsertSize(99); + + record.setReadBases("AAAAAAAAAA".getBytes()); + record.setBaseQualities("##########".getBytes()); + // check no alignments if no SA tag */ + Assert.assertEquals(SAMUtils.getOtherCanonicalAlignments(record).size(),0); + + + record.setAttribute(SAMTagUtil.getSingleton().SA, + "2,500,+,3S2=1X2=2S,60,1;" + + "1,191,-,8M2S,60,0;"); + + // extract suppl alignments + final List suppl = SAMUtils.getOtherCanonicalAlignments(record); + Assert.assertNotNull(suppl); + Assert.assertEquals(suppl.size(), 2); + + for(final SAMRecord other: suppl) { + Assert.assertFalse(other.getReadUnmappedFlag()); + Assert.assertTrue(other.getReadPairedFlag()); + Assert.assertFalse(other.getMateUnmappedFlag()); + Assert.assertEquals(other.getMateAlignmentStart(),record.getMateAlignmentStart()); + Assert.assertEquals(other.getMateReferenceName(),record.getMateReferenceName()); + + Assert.assertEquals(other.getReadName(),record.getReadName()); + if( other.getReadNegativeStrandFlag()==record.getReadNegativeStrandFlag()) { + Assert.assertEquals(other.getReadString(),record.getReadString()); + Assert.assertEquals(other.getBaseQualityString(),record.getBaseQualityString()); + } + } + + SAMRecord other = suppl.get(0); + Assert.assertFalse(other.getSupplementaryAlignmentFlag());//1st of suppl and 'record' is supplementary + Assert.assertEquals(other.getReferenceName(),"2"); + Assert.assertEquals(other.getAlignmentStart(),500); + Assert.assertFalse(other.getReadNegativeStrandFlag()); + Assert.assertEquals(other.getMappingQuality(), 60); + Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),1); + Assert.assertEquals(other.getCigarString(),"3S2=1X2=2S"); + Assert.assertEquals(other.getInferredInsertSize(),0); + + + other = suppl.get(1); + Assert.assertTrue(other.getSupplementaryAlignmentFlag()); + Assert.assertEquals(other.getReferenceName(),"1"); + Assert.assertEquals(other.getAlignmentStart(),191); + Assert.assertTrue(other.getReadNegativeStrandFlag()); + Assert.assertEquals(other.getMappingQuality(), 60); + Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),0); + Assert.assertEquals(other.getCigarString(),"8M2S"); + Assert.assertEquals(other.getInferredInsertSize(),-91);//100(mate) - 191(other) + + + } + }