SAMUtils:getOtherCanonicalAlignments extract 'SA' tag and return a list of supplementary alignments #685

Merged
merged 5 commits into from Aug 25, 2016
Jump to file or symbol
Failed to load files and symbols.
+200 −1
Split
@@ -41,12 +41,18 @@
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
+import java.util.regex.Pattern;
/**
* Utilty methods.
*/
public final class SAMUtils {
+ /** regex for semicolon, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */
+ private static final Pattern SEMICOLON_PAT = Pattern.compile("[;]");
+ /** regex for comma, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */
+ private static final Pattern COMMA_PAT = Pattern.compile("[,]");
+
// Representation of bases, one for when in low-order nybble, one for when in high-order nybble.
private static final byte COMPRESSED_EQUAL_LOW = 0;
private static final byte COMPRESSED_A_LOW = 1;
@@ -1096,4 +1102,122 @@ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, fina
public static boolean isValidUnsignedIntegerAttribute(long value) {
return value >= 0 && value <= BinaryCodec.MAX_UINT;
}
+
+ /**
+ * Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined
+ * in the SAM specification.
+ * The name, sequence and qualities, mate data are copied from the original record.
+ * @param record must be non null and must have a non-null associated header.
+ * @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing.
+ */
+ public static List<SAMRecord> getOtherCanonicalAlignments(final SAMRecord record) {
+ if( record == null ) throw new IllegalArgumentException("record is null");
+ if( record.getHeader() == null ) throw new IllegalArgumentException("record.getHeader() is null");
+ /* extract value of SA tag */
+ final Object saValue = record.getAttribute( SAMTagUtil.getSingleton().SA );
+ if( saValue == null ) return Collections.emptyList();
+ if( ! (saValue instanceof String) ) throw new SAMException(
+ "Expected a String for attribute 'SA' but got " + saValue.getClass() );
+
+ final SAMRecordFactory samReaderFactory = new DefaultSAMRecordFactory();
+
+ /* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a
+ * semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+.
+ * Each element in the list represents a part of the chimeric alignment.
+ * Conventionally, at a supplementary line, the 1rst element points to the primary line.
+ */
+
+ /* break string using semicolon */
+ final String semiColonStrs[] = SEMICOLON_PAT.split((String)saValue);
+
+ /* the result list */
+ final List<SAMRecord> alignments = new ArrayList<>( semiColonStrs.length );
+
+ /* base SAM flag */
+ int record_flag = record.getFlags() ;
+ record_flag &= ~SAMFlag.PROPER_PAIR.flag;
+ record_flag &= ~SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag;
+ record_flag &= ~SAMFlag.READ_REVERSE_STRAND.flag;
+
+
+ for(int i=0; i< semiColonStrs.length;++i ) {
+ final String semiColonStr = semiColonStrs[i];
+ /* ignore empty string */
+ if( semiColonStr.isEmpty() ) continue;
+
+ /* break string using comma */
+ final String commaStrs[] = COMMA_PAT.split(semiColonStr);
+ if( commaStrs.length != 6 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr);
+
+ /* create the new record */
+ final SAMRecord otherRec = samReaderFactory.createSAMRecord( record.getHeader() );
+
+ /* copy fields from the original record */
+ otherRec.setReadName( record.getReadName() );
+ otherRec.setReadBases( record.getReadBases() );
+ otherRec.setBaseQualities( record.getBaseQualities() );
@jamesemery

jamesemery Aug 17, 2016

Contributor

Furthermore, what about the mate information?

+ if( record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
+ otherRec.setMateReferenceIndex( record.getMateReferenceIndex() );
+ otherRec.setMateAlignmentStart( record.getMateAlignmentStart() );
+ }
+
+
+ /* get reference sequence */
+ final int tid = record.getHeader().getSequenceIndex( commaStrs[0] );
+ if( tid == -1 ) throw new SAMException("Unknown contig in " + semiColonStr);
+ otherRec.setReferenceIndex( tid );
+
+ /* fill POS */
+ final int alignStart;
+ try {
+ alignStart = Integer.parseInt(commaStrs[1]);
+ } catch( final NumberFormatException err ) {
+ throw new SAMException("bad POS in "+semiColonStr, err);
+ }
+
+ otherRec.setAlignmentStart( alignStart );
+
+ /* set TLEN */
+ if( record.getReadPairedFlag() &&
+ !record.getMateUnmappedFlag() &&
+ record.getMateReferenceIndex() == tid ) {
+ otherRec.setInferredInsertSize( record.getMateAlignmentStart() - alignStart );
+ }
+
+ /* set FLAG */
+ int other_flag = record_flag;
+ other_flag |= (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ;
+ /* spec: Conventionally, at a supplementary line, the 1st element points to the primary line */
+ if( !( record.getSupplementaryAlignmentFlag() && i==0 ) ) {
+ other_flag |= SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag;
+ }
+ otherRec.setFlags(other_flag);
+
+ /* set CIGAR */
+ otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) );
+
+ /* set MAPQ */
+ try {
+ otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) );
+ } catch (final NumberFormatException err) {
+ throw new SAMException("bad MAPQ in "+semiColonStr, err);
+ }
+
+ /* fill NM */
+ try {
+ otherRec.setAttribute( SAMTagUtil.getSingleton().NM , Integer.parseInt(commaStrs[5]) );
+ } catch (final NumberFormatException err) {
+ throw new SAMException("bad NM in "+semiColonStr, err);
+ }
+
+ /* if strand is not the same: reverse-complement */
+ if( otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag() ) {
+ SAMRecordUtil.reverseComplement(otherRec);
+ }
+
+ /* add the alignment */
+ alignments.add( otherRec );
+ }
+ return alignments;
+ }
}
@@ -26,7 +26,7 @@
import org.testng.Assert;
import org.testng.annotations.Test;
-import java.util.Arrays;
+import java.util.List;
public class SAMUtilsTest {
@Test
@@ -173,4 +173,79 @@ public void testClippingOfRecordWithMateAtSamePosition() {
record.setSecondOfPairFlag(true);
Assert.assertEquals(SAMUtils.getNumOverlappingAlignedBasesToClip(record), 10);
}
+
+ @Test
+ public void testOtherCanonicalAlignments() {
+ // setup the record
+ final SAMFileHeader header = new SAMFileHeader();
+ header.addSequence(new SAMSequenceRecord("1", 1000));
+ header.addSequence(new SAMSequenceRecord("2", 1000));
+ final SAMRecord record = new SAMRecord(header);
+ record.setReadPairedFlag(true);
+ record.setFirstOfPairFlag(true);
+ record.setCigar(TextCigarCodec.decode("10M"));
+ record.setReferenceIndex(0);
+ record.setAlignmentStart(1);
+ record.setMateReferenceIndex(0);
+ record.setMateAlignmentStart(1);
+ record.setReadPairedFlag(true);
+ record.setSupplementaryAlignmentFlag(true);//spec says first 'SA' record will be the primary record
+
+ record.setMateReferenceIndex(0);
+ record.setMateAlignmentStart(100);
+ record.setInferredInsertSize(99);
+
+ record.setReadBases("AAAAAAAAAA".getBytes());
+ record.setBaseQualities("##########".getBytes());
+ // check no alignments if no SA tag */
+ Assert.assertEquals(SAMUtils.getOtherCanonicalAlignments(record).size(),0);
+
+
+ record.setAttribute(SAMTagUtil.getSingleton().SA,
+ "2,500,+,3S2=1X2=2S,60,1;" +
+ "1,191,-,8M2S,60,0;");
+
+ // extract suppl alignments
+ final List<SAMRecord> suppl = SAMUtils.getOtherCanonicalAlignments(record);
+ Assert.assertNotNull(suppl);
+ Assert.assertEquals(suppl.size(), 2);
+
+ for(final SAMRecord other: suppl) {
+ Assert.assertFalse(other.getReadUnmappedFlag());
+ Assert.assertTrue(other.getReadPairedFlag());
+ Assert.assertFalse(other.getMateUnmappedFlag());
+ Assert.assertEquals(other.getMateAlignmentStart(),record.getMateAlignmentStart());
+ Assert.assertEquals(other.getMateReferenceName(),record.getMateReferenceName());
+
+ Assert.assertEquals(other.getReadName(),record.getReadName());
+ if( other.getReadNegativeStrandFlag()==record.getReadNegativeStrandFlag()) {
+ Assert.assertEquals(other.getReadString(),record.getReadString());
+ Assert.assertEquals(other.getBaseQualityString(),record.getBaseQualityString());
+ }
+ }
+
+ SAMRecord other = suppl.get(0);
+ Assert.assertFalse(other.getSupplementaryAlignmentFlag());//1st of suppl and 'record' is supplementary
+ Assert.assertEquals(other.getReferenceName(),"2");
+ Assert.assertEquals(other.getAlignmentStart(),500);
+ Assert.assertFalse(other.getReadNegativeStrandFlag());
+ Assert.assertEquals(other.getMappingQuality(), 60);
+ Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),1);
+ Assert.assertEquals(other.getCigarString(),"3S2=1X2=2S");
+ Assert.assertEquals(other.getInferredInsertSize(),0);
+
+
+ other = suppl.get(1);
+ Assert.assertTrue(other.getSupplementaryAlignmentFlag());
+ Assert.assertEquals(other.getReferenceName(),"1");
+ Assert.assertEquals(other.getAlignmentStart(),191);
+ Assert.assertTrue(other.getReadNegativeStrandFlag());
+ Assert.assertEquals(other.getMappingQuality(), 60);
+ Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),0);
+ Assert.assertEquals(other.getCigarString(),"8M2S");
@jamesemery

jamesemery Aug 17, 2016

Contributor

You should at least add a test that a record with no SA tag returns an empty list properly

+ Assert.assertEquals(other.getInferredInsertSize(),-91);//100(mate) - 191(other)
+
+
+ }
+
}