Join GitHub today
GitHub is home to over 20 million developers working together to host and review code, manage projects, and build software together.
Adds a function isWithinHammingDistance that checks if two strings are within a given Hamming distance #690
Conversation
coveralls
commented
Aug 22, 2016
|
Why not also extract the method for computing the distance, @fleharty? It will be useful for other purposes... |
|
@magicDGS I agree, so I added the hammingDistance function. |
coveralls
commented
Aug 22, 2016
|
I'm wondering if these belong in htsjdk (I know there are similar things like this in the code, like levenshteinDistance, but...) they're not used by htsjdk anywhere, AFAICT and have no dependence on any htsjdk classes. |
|
@cmnbroad @yfarjoun Picard depends on htsjdk and there is a PR now in Picard that will make use of this function. Since htsjdk already had a StringUtil class, I was trying to reduce the possibility of code duplication in the future given that things like levenshteinDistance were there, but I see your point that levenshteinDistance isn't ever called within htsjdk. |
|
I think it's fine to have general-purpose utility functions like this in On Tue, Aug 23, 2016 at 5:40 PM, Mark Fleharty notifications@github.com
|
lbergelson
and 1 other
commented on an outdated diff
Aug 25, 2016
| @@ -545,4 +545,56 @@ public static int levenshteinDistance(final String string1, final String string2 | ||
| return i; | ||
| } | ||
| + | ||
| + /** | ||
| + * Calculates the hamming distance between two strings s1 and s2. |
lbergelson
Contributor
|
lbergelson
and 1 other
commented on an outdated diff
Aug 25, 2016
| + */ | ||
| + public static int hammingDistance(final String s1, final String s2) { | ||
| + if (s1.length() != s2.length()) { | ||
| + throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths."); | ||
| + } | ||
| + int measuredDistance = 0; | ||
| + for (int i = 0;i < s1.length();i++) { | ||
| + if (s1.charAt(i) != s2.charAt(i)) { | ||
| + measuredDistance++; | ||
| + } | ||
| + } | ||
| + return measuredDistance; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming Distance metric. |
|
|
lbergelson
and 1 other
commented on an outdated diff
Aug 25, 2016
| @@ -545,4 +545,56 @@ public static int levenshteinDistance(final String string1, final String string2 | ||
| return i; | ||
| } | ||
| + | ||
| + /** | ||
| + * Calculates the hamming distance between two strings s1 and s2. | ||
| + * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if | ||
| + * the two strings are of different lengths. | ||
| + * | ||
| + * @param s1 The first string to compare | ||
| + * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. | ||
| + * @return Hamming distance between s1 and s2. | ||
| + * @throws IllegalArgumentException If the two strings have differing lengths. | ||
| + */ | ||
| + public static int hammingDistance(final String s1, final String s2) { | ||
| + if (s1.length() != s2.length()) { | ||
| + throw new IllegalArgumentException("Attempted to determine Hamming distance of strings with differing lengths."); |
lbergelson
Contributor
|
lbergelson
and 1 other
commented on an outdated diff
Aug 25, 2016
| + | ||
| + @DataProvider(name="withinHammingDistanceExceptionProvider") | ||
| + public Object[][] isWithinHammingDistanceException() { | ||
| + return new Object[][] { | ||
| + {"ATAC", "GCT", 3, true}, | ||
| + {"ATAC", "AT", 2, false}, | ||
| + {"ATAC", "T", 1, false}, | ||
| + {"", "GCAT", 0, false} | ||
| + }; | ||
| + } | ||
| + | ||
| + @Test(dataProvider = "hammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class) | ||
| + public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) { | ||
| + // We assert hammingDistance = 0, and isWithinHammingDistance = true because the values don't matter | ||
| + // and we are checking to ensure that the IllegalArgumentException is thrown | ||
| + Assert.assertEquals(StringUtil.hammingDistance(s1, s2), 0); |
lbergelson
Contributor
|
|
It sounds like I'm the only one who thinks this is out of place here, so I defer to the community sentiment. |
nh13
commented on the diff
Aug 25, 2016
| @@ -545,4 +545,56 @@ public static int levenshteinDistance(final String string1, final String string2 | ||
| return i; | ||
| } | ||
| + | ||
| + /** | ||
| + * Calculates the hamming distance between two strings s1 and s2. | ||
| + * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if | ||
| + * the two strings are of different lengths. | ||
| + * | ||
| + * @param s1 The first string to compare | ||
| + * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. | ||
| + * @return Hamming distance between s1 and s2. | ||
| + * @throws IllegalArgumentException If the two strings have differing lengths. | ||
| + */ | ||
| + public static int hammingDistance(final String s1, final String s2) { |
nh13
Contributor
|
nh13
commented on the diff
Aug 25, 2016
| + } | ||
| + } | ||
| + return measuredDistance; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Determines if two strings s1 and s2 are within maxHammingDistance of each other using the Hamming Distance metric. | ||
| + * Since Hamming distance is not defined for strings of differing lengths, we throw an exception if | ||
| + * the two strings are of different lengths. | ||
| + * | ||
| + * @param s1 The first string to compare | ||
| + * @param s2 The second string to compare, note that if s1 and s2 are swapped the value returned will be identical. | ||
| + * @param maxHammingDistance The largest Hamming distance the strings can have for this function to return true. | ||
| + * @return true if the two strings are within maxHammingDistance of each other, false otherwise. | ||
| + * @throws IllegalArgumentException If the two strings have differing lengths. | ||
| + */ |
|
|
nh13
and 1 other
commented on an outdated diff
Aug 25, 2016
| + } | ||
| + | ||
| + @Test(dataProvider = "hammingDistanceExceptionProvider", expectedExceptions = IllegalArgumentException.class) | ||
| + public void testIsWithinHammingDistanceExceptions(final String s1, final String s2, final int maxHammingDistance) { | ||
| + // We assert hammingDistance = 0, and isWithinHammingDistance = true because the values don't matter | ||
| + // and we are checking to ensure that the IllegalArgumentException is thrown | ||
| + Assert.assertEquals(StringUtil.hammingDistance(s1, s2), 0); | ||
| + Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), true); | ||
| + } | ||
| + | ||
| + @DataProvider(name="hammingDistanceProvider") | ||
| + public Object[][] hammingDistance() { | ||
| + return new Object[][] { | ||
| + {"ATAC", "GCAT", 3}, | ||
| + {"ATAGC", "ATAGC", 0}, | ||
| + {"ATAC", "atac", 4}, // Hamming distance is case sensitive |
fleharty
Contributor
|
nh13
and 1 other
commented on an outdated diff
Aug 25, 2016
| + {"ATAC", "GCAT", 0, false} | ||
| + }; | ||
| + } | ||
| + | ||
| + @Test(dataProvider = "withinHammingDistanceProvider") | ||
| + public void testIsWithinHammingDistance(final String s1, final String s2, final int maxHammingDistance, final boolean expectedResult) { | ||
| + Assert.assertEquals(StringUtil.isWithinHammingDistance(s1, s2, maxHammingDistance), expectedResult); | ||
| + } | ||
| + | ||
| + @DataProvider(name="withinHammingDistanceExceptionProvider") | ||
| + public Object[][] isWithinHammingDistanceException() { | ||
| + return new Object[][] { | ||
| + {"ATAC", "GCT", 3, true}, | ||
| + {"ATAC", "AT", 2, false}, | ||
| + {"ATAC", "T", 1, false}, | ||
| + {"", "GCAT", 0, false} |
|
|
|
@lbergelson @nh13 Thanks for the comments, back to you. |
coveralls
commented
Aug 25, 2016
lbergelson
was assigned
by droazen
Sep 13, 2016
coveralls
commented
Sep 19, 2016
lbergelson
merged commit 224cfc1
into
samtools:master
Sep 20, 2016
|
|
fleharty commented Aug 22, 2016
•
edited
Description
The motivation for this code is to add functionality to compare two strings and determine
their Hamming distance. Hamming distance is a common metric that is used to compare
strings. The two functions, hammingDistance and isWithinHammingDistance, provide a way
to check the actual Hamming distance, and whether or not two strings are within a given
Hamming distance of each other. isWithinHammingDistance is provided because there are
many cases where it is important to know only if two strings are within a given Hamming distance
and we can terminate early if the two strings are sufficiently dissimilar.
Checklist
…e within a given Hamming distance