From 9c0e3c51706ddde64a0f7f3ff54b277cc7be81c4 Mon Sep 17 00:00:00 2001 From: John Crepezzi Date: Fri, 23 Mar 2012 12:19:42 -0400 Subject: [PATCH] Dice's Coefficient --- README.md | 5 ++ lib/natural/distance/dice_coefficient.js | 69 ++++++++++++++++++++++++ lib/natural/index.js | 3 +- package.json | 2 +- spec/dice_coefficient_spec.js | 26 +++++++++ 5 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 lib/natural/distance/dice_coefficient.js create mode 100644 spec/dice_coefficient_spec.js diff --git a/README.md b/README.md index d63405b56..960f534a9 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,11 @@ Output: 2 0 +And Dice's co-efficient: + + var natural = require('natural'); + console.log(natural.DiceCoefficient('thing', 'things')); + Stemmers -------- diff --git a/lib/natural/distance/dice_coefficient.js b/lib/natural/distance/dice_coefficient.js new file mode 100644 index 000000000..52655d373 --- /dev/null +++ b/lib/natural/distance/dice_coefficient.js @@ -0,0 +1,69 @@ +/* +Copyright (c) 2011, John Crepezzi, Chris Umbel + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// Dice's coefficient string similarity.
//
// Two strings are compared by the adjacent-letter pairs (bigrams) found in
// each of their whitespace-separated words:
//
//   similarity = 2 * |shared pairs| / (|pairs of str1| + |pairs of str2|)
//
// Pairs are treated as a multiset, so a pair that occurs twice in one string
// can only be matched by two occurrences in the other.

// Get all of the pairs of adjacent letters for a single word.
//
// str     - a string (normally one word without whitespace)
// returns - an array of 2-character substrings, in order of appearance;
//           empty when `str` has fewer than two characters
var letterPairs = function (str) {
  var pairs = [];
  // `i + 1 < str.length` (instead of pre-sizing with `new Array(length - 1)`)
  // keeps the empty string from requesting an array of length -1, which
  // throws a RangeError.
  for (var i = 0; i + 1 < str.length; i++) {
    pairs.push(str.substring(i, i + 2));
  }
  return pairs;
};

// Get all of the letter pairs in all of the words of a string.
//
// str     - a string; words are separated by runs of whitespace
// returns - one flat array holding the letter pairs of every word, so pairs
//           never span a word boundary
var wordLetterPairs = function (str) {
  var allPairs = [];
  var words = str.split(/\s+/);
  var pairs, i, j;
  for (i = 0; i < words.length; i++) {
    pairs = letterPairs(words[i]);
    for (j = 0; j < pairs.length; j++) {
      allPairs.push(pairs[j]);
    }
  }
  return allPairs;
};

// Perform some sanitization steps: lower-case the string, strip leading and
// trailing whitespace, and collapse every inner run of whitespace to a single
// space so that strings differing only in spacing normalise to the same value.
//
// str     - the raw input string
// returns - the normalised string
var sanitize = function (str) {
  return str.toLowerCase().replace(/^\s+|\s+$/g, '').replace(/\s+/g, ' ');
};

// Compare two strings, and spit out a number from 0-1.
//
// str1, str2 - the strings to compare (case and surrounding/repeated
//              whitespace are ignored)
// returns    - 1 for strings made of the same letter pairs, 0 when no pair
//              is shared, and a fraction in between otherwise
var compare = function (str1, str2) {
  var clean1 = sanitize(str1);
  var clean2 = sanitize(str2);
  var pairs1 = wordLetterPairs(clean1);
  var pairs2 = wordLetterPairs(clean2);
  var totalPairs = pairs1.length + pairs2.length;
  var remaining, intersection, pair, i;

  // Neither input contains a letter pair (empty strings, single letters).
  // The formula would be 0 / 0 here, so fall back to plain equality of the
  // normalised strings instead of returning NaN.
  if (totalPairs === 0) {
    return clean1 === clean2 ? 1 : 0;
  }

  // Count how many times each pair is still available in the second string.
  // A prototype-less object is used so pair text can never collide with
  // inherited property names.
  remaining = Object.create(null);
  for (i = 0; i < pairs2.length; i++) {
    pair = pairs2[i];
    remaining[pair] = (remaining[pair] || 0) + 1;
  }

  // Each pair of the first string consumes at most one matching occurrence
  // from the second string.
  intersection = 0;
  for (i = 0; i < pairs1.length; i++) {
    pair = pairs1[i];
    if (remaining[pair] > 0) {
      remaining[pair]--;
      intersection++;
    }
  }

  return 2 * intersection / totalPairs;
};

// The guard keeps the file loadable in environments that do not provide the
// CommonJS `module` object; under CommonJS the export is unchanged.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = compare;
}
newline at end of file +exports.LevenshteinDistance = require('./distance/levenshtein_distance'); +exports.DiceCoefficient = require('./distance/dice_coefficient'); diff --git a/package.json b/package.json index c2f25f525..e6f88e339 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "natural", - "description": "General natural language (tokenizing, stemming, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance) facilities for node.", + "description": "General natural language (tokenizing, stemming, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.", "version": "0.1.4", "homepage": "https://github.com/NaturalNode/natural", "engines": { diff --git a/spec/dice_coefficient_spec.js b/spec/dice_coefficient_spec.js new file mode 100644 index 000000000..eea4e87cc --- /dev/null +++ b/spec/dice_coefficient_spec.js @@ -0,0 +1,26 @@ +var dice = require('lib/natural/distance/dice_coefficient'); + +describe('dice', function () { + + it('should handle exact matches', function () { + expect(dice('john', 'john')).toBe(1); + }); + + it('should handle total mis-matches', function () { + expect(dice('john', 'matt')).toBe(0); + }); + + // Example from http://en.wikipedia.org/wiki/Dice's_coefficient + it('should handle a typical case', function () { + expect(dice('night', 'nacht')).toBe(0.25); + }); + + it('should sanitize case', function () { + expect(dice('night', 'NIGHT')).toBe(1); + }); + + it('should sanitize spacing', function () { + expect(dice('the space', 'the space')).toBe(1); + }); + +});