From 4be2e8e61dc3291ee0add577b201b41ab214e034 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 22 Aug 2016 09:45:20 -0400 Subject: [PATCH 1/4] Adds a function isWithinHammingDistance that checks if two strings are within a given Hamming distance --- .../java/htsjdk/samtools/util/StringUtil.java | 29 +++++++++++++++++ .../htsjdk/samtools/util/StringUtilTest.java | 31 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/main/java/htsjdk/samtools/util/StringUtil.java b/src/main/java/htsjdk/samtools/util/StringUtil.java index ecb1b3f49f..2a37426fac 100644 --- a/src/main/java/htsjdk/samtools/util/StringUtil.java +++ b/src/main/java/htsjdk/samtools/util/StringUtil.java @@ -545,4 +545,33 @@ public static int levenshteinDistance(final String string1, final String string2 return i; } + + /** + * Determines if two strings s1 and s2 are within maxHammingDistance of ecah other using the Hamming Distance metric. + * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if + * the two strings are of different lengths. + * + * @param s1 The first string to compare + * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. + * @param maxHammingDistance The largest Hamming distance the strings can have for this function to return true. + * @return true if the two strings are within maxHammingDistance of each other, false otherwise. + * @throws IllegalArgumentException If the two strings have differing lengths. + */ + public static boolean isWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance) { + if (s1.length() != s2.length()) { + throw new IllegalArgumentException("Attempted to determine if two strings of different length were within a specified edit distance."); + } + int measuredDistance = 0; + for (int i = 0;i < s1.length();i++) { + if (s1.charAt(i) != s2.charAt(i)) { + measuredDistance++; + // If the measuredDistance is larger than the maxHammingDistance we can short circuit and return + // false, there is no need to continue evaluating the distance. + if (measuredDistance > maxHammingDistance) { + return false; + } + } + } + return true; + } } diff --git a/src/test/java/htsjdk/samtools/util/StringUtilTest.java b/src/test/java/htsjdk/samtools/util/StringUtilTest.java index 91e8792f4e..1f8be60297 100644 --- a/src/test/java/htsjdk/samtools/util/StringUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/StringUtilTest.java @@ -67,4 +67,35 @@ public Object[][] splitScenarios() { {"A:BB:C:", new String[]{"A", "BB", "C:"}, true}, }; } + + @DataProvider(name="hammingDistanceProvider") + public Object[][] hammingDistance() { + return new Object[][] { + {"ATAC", "GCAT", 3, true}, + {"ATAC", "GCAT", 2, false}, + {"ATAC", "GCAT", 1, false}, + {"ATAC", "GCAT", 0, false} + }; + } + + @Test(dataProvider = "hammingDistanceProvider") + public void testIsWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) { + Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), expectedResult); + } + + @DataProvider(name="hammingDistanceExceptionProvider") + public Object[][] hammingDistanceException() { + return new Object[][] { + {"ATAC", "GCT", 3, true}, + {"ATAC", "AT", 2, false}, + {"ATAC", "T", 1, false}, + {"", "GCAT", 0, false} + }; + } + + @Test(dataProvider = "hammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class) + public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) { + Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), expectedResult); + } + } From 1991b1102cc1bb5d861c558f9df1673a591db61a Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 22 Aug 2016 10:30:01 -0400 Subject: [PATCH 2/4] Adding hammingDistance function per magicDGS's comment --- .../java/htsjdk/samtools/util/StringUtil.java | 23 +++++++++++++ .../htsjdk/samtools/util/StringUtilTest.java | 32 +++++++++++++++---- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/StringUtil.java b/src/main/java/htsjdk/samtools/util/StringUtil.java index 2a37426fac..8e8d358d09 100644 --- a/src/main/java/htsjdk/samtools/util/StringUtil.java +++ b/src/main/java/htsjdk/samtools/util/StringUtil.java @@ -546,6 +546,29 @@ public static int levenshteinDistance(final String string1, final String string2 return i; } + /** + * Calculates the hamming distance between two strings s1 and s2. + * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if + * the two strings are of different lengths. + * + * @param s1 The first string to compare + * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. + * @return Hamming distance between s1 and s2. + * @throws IllegalArgumentException If the two strings have differing lengths. + */ + public static int hammingDistance(final String s1, final String s2) { + if (s1.length() != s2.length()) { + throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths."); + } + int measuredDistance = 0; + for (int i = 0;i < s1.length();i++) { + if (s1.charAt(i) != s2.charAt(i)) { + measuredDistance++; + } + } + return measuredDistance; + } + /** * Determines if two strings s1 and s2 are within maxHammingDistance of ecah other using the Hamming Distance metric. * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if diff --git a/src/test/java/htsjdk/samtools/util/StringUtilTest.java b/src/test/java/htsjdk/samtools/util/StringUtilTest.java index 1f8be60297..7a6970df97 100644 --- a/src/test/java/htsjdk/samtools/util/StringUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/StringUtilTest.java @@ -68,8 +68,8 @@ public Object[][] splitScenarios() { }; } - @DataProvider(name="hammingDistanceProvider") - public Object[][] hammingDistance() { + @DataProvider(name="withinHammingDistanceProvider") + public Object[][] isWithinHammingDistanceProvider() { return new Object[][] { {"ATAC", "GCAT", 3, true}, {"ATAC", "GCAT", 2, false}, @@ -78,13 +78,13 @@ public Object[][] hammingDistance() { }; } - @Test(dataProvider = "hammingDistanceProvider") + @Test(dataProvider = "withinHammingDistanceProvider") public void testIsWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) { Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), expectedResult); } - @DataProvider(name="hammingDistanceExceptionProvider") - public Object[][] hammingDistanceException() { + @DataProvider(name="withinHammingDistanceExceptionProvider") + public Object[][] isWithinHammingDistanceException() { return new Object[][] { {"ATAC", "GCT", 3, true}, {"ATAC", "AT", 2, false}, @@ -94,8 +94,26 @@ public Object[][] hammingDistanceException() { } @Test(dataProvider = "hammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class) - public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) { - Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), expectedResult); + public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) { + // We assert hammingDistance = 0, and isWithinHammingDistance = true because the values don't matter + // and we are checking to ensure that the IllegalArgumentException is thrown + Assert.assertEquals(StringUtil.hammingDistance(s1, s2), 0); + Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), true); + } + + @DataProvider(name="hammingDistanceProvider") + public Object[][] hammingDistance() { + return new Object[][] { + {"ATAC", "GCAT", 3}, + {"ATAGC", "ATAGC", 0}, + {"ATAC", "atac", 4}, // Hamming distance is case sensitive + {"", "", 0} + }; + } + + @Test(dataProvider = "hammingDistanceProvider") + public void testHammingDistanceExceptions(final String s1, final String s2, final int expectedResult) { + Assert.assertEquals(StringUtil.hammingDistance(s1, s2), expectedResult); } } From 771340436bc89ec02876c63e29cdf31cb5810728 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Thu, 25 Aug 2016 12:24:52 -0400 Subject: [PATCH 3/4] Addressing comments from Louis and Nils --- .../java/htsjdk/samtools/util/StringUtil.java | 13 +++++---- .../htsjdk/samtools/util/StringUtilTest.java | 29 ++++++++++--------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/StringUtil.java b/src/main/java/htsjdk/samtools/util/StringUtil.java index 8e8d358d09..5e85434bc4 100644 --- a/src/main/java/htsjdk/samtools/util/StringUtil.java +++ b/src/main/java/htsjdk/samtools/util/StringUtil.java @@ -547,9 +547,10 @@ public static int levenshteinDistance(final String string1, final String string2 } /** - * Calculates the hamming distance between two strings s1 and s2. + * Calculates the Hamming distance (number of character mismatches) between two strings s1 and s2. * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if - * the two strings are of different lengths. + * the two strings are of different lengths. Hamming distance is case sensitive. Also note that when + * used to compare two DNA strings that contain Ns mathing Ns will not be counted as mismatches. * * @param s1 The first string to compare * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. @@ -558,7 +559,8 @@ public static int levenshteinDistance(final String string1, final String string2 */ public static int hammingDistance(final String s1, final String s2) { if (s1.length() != s2.length()) { - throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths."); + throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths. " + + "The first string has length " + s1.length() + " and the second string has length " + s2.length() + "."); } int measuredDistance = 0; for (int i = 0;i < s1.length();i++) { @@ -570,9 +572,10 @@ public static int hammingDistance(final String s1, final String s2) { } /** - * Determines if two strings s1 and s2 are within maxHammingDistance of ecah other using the Hamming Distance metric. + * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming distance metric. * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if - * the two strings are of different lengths. + * the two strings are of different lengths. Hamming distance is case sensitive. Also note that when + * used to compare two DNA strings that contain Ns mathing Ns will not be counted as mismatches. * * @param s1 The first string to compare * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. diff --git a/src/test/java/htsjdk/samtools/util/StringUtilTest.java b/src/test/java/htsjdk/samtools/util/StringUtilTest.java index 7a6970df97..dbb2a07096 100644 --- a/src/test/java/htsjdk/samtools/util/StringUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/StringUtilTest.java @@ -86,33 +86,36 @@ public void testIsWithinHammingDistance(final String s1, final String s2, final @DataProvider(name="withinHammingDistanceExceptionProvider") public Object[][] isWithinHammingDistanceException() { return new Object[][] { - {"ATAC", "GCT", 3, true}, - {"ATAC", "AT", 2, false}, - {"ATAC", "T", 1, false}, - {"", "GCAT", 0, false} + {"ATAC", "GCT" , 3}, + {"ATAC", "AT" , 2}, + {"ATAC", "T" , 1}, + {"" , "GCAT", 0} }; } - @Test(dataProvider = "hammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class) + @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class) public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) { - // We assert hammingDistance = 0, and isWithinHammingDistance = true because the values don't matter - // and we are checking to ensure that the IllegalArgumentException is thrown - Assert.assertEquals(StringUtil.hammingDistance(s1, s2), 0); - Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), true); + StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance); + } + + @Test(dataProvider = "withinHammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class) + public void testHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) { + StringUtil.hammingDistance(s1, s2); } @DataProvider(name="hammingDistanceProvider") public Object[][] hammingDistance() { return new Object[][] { - {"ATAC", "GCAT", 3}, + {"ATAC" , "GCAT" , 3}, {"ATAGC", "ATAGC", 0}, - {"ATAC", "atac", 4}, // Hamming distance is case sensitive - {"", "", 0} + {"ATAC" , "atac" , 4}, // Hamming distance is case sensitive. + {"" , "" , 0}, // Two empty strings should have Hamming distance of 0. + {"nAGTN", "nAGTN", 0} // Ensure that matching Ns are not counted as mismatches. }; } @Test(dataProvider = "hammingDistanceProvider") - public void testHammingDistanceExceptions(final String s1, final String s2, final int expectedResult) { + public void testHammingDistance(final String s1, final String s2, final int expectedResult) { Assert.assertEquals(StringUtil.hammingDistance(s1, s2), expectedResult); } From bf1adc55a2206446c5fa109907b1a47fea766dc3 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 19 Sep 2016 18:52:59 -0400 Subject: [PATCH 4/4] Responding to Louis' comment, DNA not treated special --- src/main/java/htsjdk/samtools/util/StringUtil.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/StringUtil.java b/src/main/java/htsjdk/samtools/util/StringUtil.java index 5e85434bc4..90492533e1 100644 --- a/src/main/java/htsjdk/samtools/util/StringUtil.java +++ b/src/main/java/htsjdk/samtools/util/StringUtil.java @@ -549,8 +549,8 @@ public static int levenshteinDistance(final String string1, final String string2 /** * Calculates the Hamming distance (number of character mismatches) between two strings s1 and s2. * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if - * the two strings are of different lengths. Hamming distance is case sensitive. Also note that when - * used to compare two DNA strings that contain Ns mathing Ns will not be counted as mismatches. + * the two strings are of different lengths. Hamming distance is case sensitive and does not have + * any special treatment for DNA. * * @param s1 The first string to compare * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. @@ -574,8 +574,8 @@ public static int hammingDistance(final String s1, final String s2) { /** * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming distance metric. * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if - * the two strings are of different lengths. Hamming distance is case sensitive. Also note that when - * used to compare two DNA strings that contain Ns mathing Ns will not be counted as mismatches. + * the two strings are of different lengths. Hamming distance is case sensitive and does not have any + * special treatment for DNA. * * @param s1 The first string to compare * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical.