Yf add tag aware merge samdictionary #689

Merged
merged 4 commits into from Oct 27, 2016
@@ -23,20 +23,22 @@
*/
package htsjdk.samtools;
+import htsjdk.samtools.util.Log;
+
import java.io.Serializable;
import java.math.BigInteger;
import java.security.MessageDigest;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
+import java.util.stream.Collector;
+import java.util.stream.Collectors;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.XmlTransient;
+import static htsjdk.samtools.SAMSequenceRecord.*;
+import static java.util.stream.Collectors.toList;
+
/**
* Collection of SAMSequenceRecords.
*/
@@ -64,6 +66,8 @@ public SAMSequenceDictionary(final List<SAMSequenceRecord> list) {
return Collections.unmodifiableList(mSequences);
}
+ private static Log log = Log.getInstance(SAMSequenceDictionary.class);
+
public SAMSequenceRecord getSequence(final String name) {
return mSequenceMap.get(name);
}
@@ -135,7 +139,7 @@ public long getReferenceLength() {
}
return len;
}
-
+
/**
* @return true is the dictionary is empty
*/
@@ -146,15 +150,15 @@ public boolean isEmpty() {
private static String DICT_MISMATCH_TEMPLATE = "SAM dictionaries are not the same: %s.";
/**
* Non-comprehensive {@link #equals(Object)}-assertion: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent
- * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call
+ * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call
* {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}.
* Aliases are ignored.
*
* @throws AssertionError When the dictionaries are not the same, with some human-readable information as to why
*/
public void assertSameDictionary(final SAMSequenceDictionary that) {
if (this == that) return;
-
+
final Iterator<SAMSequenceRecord> thatSequences = that.mSequences.iterator();
for (final SAMSequenceRecord thisSequence : mSequences) {
if (!thatSequences.hasNext())
@@ -189,7 +193,7 @@ public boolean equals(Object o) {
* alternate names fo a given contig. e.g:
* <code>1,chr1,chr01,01,CM000663,NC_000001.10</code> e.g:
* <code>MT,chrM</code>
- *
+ *
* @param originalName
* existing contig name
* @param altName
@@ -219,11 +223,11 @@ public SAMSequenceRecord addSequenceAlias(final String originalName,
/**
* return a MD5 sum for ths dictionary, the checksum is re-computed each
* time this method is called.
- *
+ *
* <pre>
* md5( (seq1.md5_if_available) + ' '+(seq2.name+seq2.length) + ' '+...)
* </pre>
- *
+ *
* @return a MD5 checksum for this dictionary or the empty string if it is
* empty
*/
@@ -266,5 +270,86 @@ public String toString() {
" length:"+ getReferenceLength()+" "+
" md5:"+md5()+")";
}
+
+ public static final List<String> DEFAULT_DICTIONARY_EQUAL_TAG = Arrays.asList(
+ SAMSequenceRecord.MD5_TAG,
+ SAMSequenceRecord.SEQUENCE_LENGTH_TAG);
+
+ /**
+ * Will merge dictionaryTags from two dictionaries into one focusing on merging the tags rather than the sequences.
+ *
+ * Requires that dictionaries have the same SAMSequence records in the same order.
+ * For each sequenceIndex, the union of the tags from both sequences will be added to the new sequence, mismatching
+ * values (for tags that are in both) will generate a warning, and the value from dict1 will be used.
+ * For tags that are in tagsToEquate an unequal value will generate an error (an IllegalArgumentException will
+ * be thrown.) tagsToEquate must include LN and MD.
+ *
+ * @param dict1 first dictionary
+ * @param dict2 first dictionary
+ * @param tagsToMatch list of tags that must be equal if present in both sequence. Must contain MD, and LN
+ * @return dictionary consisting of the same sequences as the two inputs with the merged values of tags.
+ */
+ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionary dict1,
+ final SAMSequenceDictionary dict2,
+ final List<String> tagsToMatch) {
@tfenne

tfenne Sep 20, 2016

Owner

Circling back, I think this parameter should probably go away. Length (if present) and MD5 (if present) should be required to match in all cases, since by definition they should match. Other tags, I'm not sure it matters if they match.

@yfarjoun

yfarjoun Sep 22, 2016

Contributor

I agree about Length and MD5.

Other tags there are two options when they do not match: take the first, or throw an exception.

My idea is that if the tags are in the list, an exception will be thrown, otherwise the value from the first dictionary will be taken. Sounds good?

@tfenne

tfenne Sep 22, 2016

Owner

I guess my thought is that the set of tags on SQ lines is known, so we know which are ok to differ vs. not, and there's no need to supply a list of tags. I don't care that much though.

+
+ // We require MD and LN to match.
+ if (!tagsToMatch.contains(MD5_TAG) || !tagsToMatch.contains(SEQUENCE_LENGTH_TAG)) {
+ throw new IllegalArgumentException("Both " + MD5_TAG + " and " + SEQUENCE_LENGTH_TAG + " must be matched " +
+ "when merging dictionaries. Found: " + String.join(",", tagsToMatch));
+ }
+
+ if (!dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()).equals(
+ dict2.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()))) {
+
+ throw new IllegalArgumentException(String.format("Do not use this function to merge dictionaries with " +
+ "different sequences in them. Sequences must be in the same order as well. Found [%s] and [%s].",
+ String.join(", ", dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(toList())),
+ String.join(", ", dict2.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(toList()))));
+ }
+
+ final SAMSequenceDictionary finalDict = new SAMSequenceDictionary();
+ for (int sequenceIndex = 0; sequenceIndex < dict1.getSequences().size(); sequenceIndex++) {
+ final SAMSequenceRecord s1 = dict1.getSequence(sequenceIndex);
+ final SAMSequenceRecord s2 = dict2.getSequence(sequenceIndex);
+
+ final String sName = s1.getSequenceName();
+ final SAMSequenceRecord sMerged = new SAMSequenceRecord(sName, UNKNOWN_SEQUENCE_LENGTH);
+ finalDict.addSequence(sMerged);
+
+ final Set<String> allTags = new HashSet<>();
+ s1.getAttributes().stream().forEach(a -> allTags.add(a.getKey()));
+ s2.getAttributes().stream().forEach(a -> allTags.add(a.getKey()));
+
+ for (final String tag : allTags) {
+ final String value1 = s1.getAttribute(tag);
+ final String value2 = s2.getAttribute(tag);
+
+ if (value1 != null && value2 != null && !value1.equals(value2)) {
+ String baseMessage = String.format("Found sequence entry for which " +
+ "tags differ: %s and tag %s has the two values: %s and %s.",
+ sName, tag, value1, value2);
+
+ if (tagsToMatch.contains(tag)) {
+ log.error("Cannot merge dictionaries. ", baseMessage);
+ throw new IllegalArgumentException("Cannot merge dictionaries. " + baseMessage);
+ } else {
+ log.warn(baseMessage, " Using ", value1);
+ }
+ }
+ sMerged.setAttribute(tag, value1 == null ? value2 : value1);
+ }
+
+ final int length1 = s1.getSequenceLength();
+ final int length2 = s2.getSequenceLength();
+
+ if (length1 != UNKNOWN_SEQUENCE_LENGTH && length2 != UNKNOWN_SEQUENCE_LENGTH && length1 != length2) {
+ throw new IllegalArgumentException(String.format("Cannot merge the two dictionaries. " +
+ "Found sequence entry for which " + "lengths differ: %s has lengths %s and %s", sName, length1, length2));
+ }
+ sMerged.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1);
+ }
+ return finalDict;
+ }
}
@@ -27,11 +27,15 @@
package htsjdk.samtools;
import org.testng.Assert;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.StringReader;
import java.io.StringWriter;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
@@ -89,4 +93,55 @@ public void testXmlSeralization() throws JAXBException {
Assert.assertEquals(dict1, dict2);
}
+ @DataProvider(name="testMergeDictionariesData")
+ public Object[][] testMergeDictionariesData(){
+
+ final SAMSequenceRecord rec1, rec2, rec3, rec4, rec5;
+ rec1 = new SAMSequenceRecord("chr1", 100);
+ rec2 = new SAMSequenceRecord("chr1", 101);
+ rec2.setMd5("dummy");
+ rec3 = new SAMSequenceRecord("chr1", SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
+ rec3.setMd5("dummy2");
+
+ rec4 = new SAMSequenceRecord("chr1", 100);
+ rec4.setAttribute(SAMSequenceRecord.URI_TAG,"file://some/file/name.ok");
+
+ rec5 = new SAMSequenceRecord("chr2", 200);
+ rec4.setAttribute(SAMSequenceRecord.URI_TAG,"file://some/file/name.ok");
+
+ return new Object[][]{
+ new Object[]{rec1, rec1, true},
+ new Object[]{rec2, rec2, true},
+ new Object[]{rec3, rec3, true},
+ new Object[]{rec4, rec4, true},
+ new Object[]{rec1, rec2, false},//since 100 != 101 in Length
+ new Object[]{rec1, rec3, true},
+ new Object[]{rec1, rec4, true},
+ new Object[]{rec2, rec3, false}, // since MD5 is not equal
+ new Object[]{rec2, rec4, false}, //length differs
+ new Object[]{rec3, rec4, true},
+ new Object[]{rec4, rec5, false}, // different name
+ };
+ }
+
+ @Test(dataProvider = "testMergeDictionariesData", expectedExceptions = IllegalArgumentException.class)
+ public void testMergeDictionaries(final SAMSequenceRecord rec1, final SAMSequenceRecord rec2, boolean canMerge) throws Exception {
+ final SAMSequenceDictionary dict1 = new SAMSequenceDictionary(Collections.singletonList(rec1));
+ final SAMSequenceDictionary dict2 = new SAMSequenceDictionary(Collections.singletonList(rec2));
+
+ try {
+ SAMSequenceDictionary.mergeDictionaries(dict1, dict2, SAMSequenceDictionary.DEFAULT_DICTIONARY_EQUAL_TAG);
+ } catch (final IllegalArgumentException e) {
+ if (canMerge) {
+ throw new Exception("Expected to be able to merge dictionaries, but wasn't:" , e);
+ } else {
+ throw e;
+ }
+ }
+ if (canMerge){
+ throw new IllegalArgumentException("Expected to be able to merge dictionaries, and was indeed able to do so.");
+ } else {
+ throw new Exception("Expected to not be able to merge dictionaries, but was able");
+ }
+ }
}