Skip to content
Permalink
The files should all inherit the projects license.
15 contributors

Users who have contributed to this file

@JeffPaine @medecau @josegonzalez @tlaunay @rralf @ojomio @nol13 @grigi @zackkitzmiller @olethanh @xraymemory @Julian
306 lines (236 sloc) 9.37 KB
#!/usr/bin/env python
# encoding: utf-8
from __future__ import unicode_literals
import platform
import warnings
try:
from .StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
if platform.python_implementation() != "PyPy":
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
from difflib import SequenceMatcher
from . import utils
###########################
# Basic Scoring Functions #
###########################
@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def ratio(s1, s2):
s1, s2 = utils.make_type_consistent(s1, s2)
m = SequenceMatcher(None, s1, s2)
return utils.intr(100 * m.ratio())
@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def partial_ratio(s1, s2):
""""Return the ratio of the most similar substring
as a number between 0 and 100."""
s1, s2 = utils.make_type_consistent(s1, s2)
if len(s1) <= len(s2):
shorter = s1
longer = s2
else:
shorter = s2
longer = s1
m = SequenceMatcher(None, shorter, longer)
blocks = m.get_matching_blocks()
# each block represents a sequence of matching characters in a string
# of the form (idx_1, idx_2, len)
# the best partial match will block align with at least one of those blocks
# e.g. shorter = "abcd", longer = XXXbcdeEEE
# block = (1,3,3)
# best score === ratio("abcd", "Xbcd")
scores = []
for block in blocks:
long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
long_end = long_start + len(shorter)
long_substr = longer[long_start:long_end]
m2 = SequenceMatcher(None, shorter, long_substr)
r = m2.ratio()
if r > .995:
return 100
else:
scores.append(r)
return utils.intr(100 * max(scores))
##############################
# Advanced Scoring Functions #
##############################
def _process_and_sort(s, force_ascii, full_process=True):
"""Return a cleaned string with token sorted."""
# pull tokens
ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s
tokens = ts.split()
# sort tokens and join
sorted_string = u" ".join(sorted(tokens))
return sorted_string.strip()
# Sorted Token
# find all alphanumeric tokens in the string
# sort those tokens and take ratio of resulting joined strings
# controls for unordered string elements
@utils.check_for_none
def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process)
sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process)
if partial:
return partial_ratio(sorted1, sorted2)
else:
return ratio(sorted1, sorted2)
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
"""Return a measure of the sequences' similarity between 0 and 100
but sorting the token before comparing.
"""
return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
"""Return the ratio of the most similar substring as a number between
0 and 100 but sorting the token before comparing.
"""
return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
@utils.check_for_none
def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
"""Find all alphanumeric tokens in each string...
- treat them as a set
- construct two strings of the form:
<sorted_intersection><sorted_remainder>
- take ratios of those two strings
- controls for unordered partial matches"""
if not full_process and s1 == s2:
return 100
p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
if not utils.validate_string(p1):
return 0
if not utils.validate_string(p2):
return 0
# pull tokens
tokens1 = set(p1.split())
tokens2 = set(p2.split())
intersection = tokens1.intersection(tokens2)
diff1to2 = tokens1.difference(tokens2)
diff2to1 = tokens2.difference(tokens1)
sorted_sect = " ".join(sorted(intersection))
sorted_1to2 = " ".join(sorted(diff1to2))
sorted_2to1 = " ".join(sorted(diff2to1))
combined_1to2 = sorted_sect + " " + sorted_1to2
combined_2to1 = sorted_sect + " " + sorted_2to1
# strip
sorted_sect = sorted_sect.strip()
combined_1to2 = combined_1to2.strip()
combined_2to1 = combined_2to1.strip()
if partial:
ratio_func = partial_ratio
else:
ratio_func = ratio
pairwise = [
ratio_func(sorted_sect, combined_1to2),
ratio_func(sorted_sect, combined_2to1),
ratio_func(combined_1to2, combined_2to1)
]
return max(pairwise)
def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)
###################
# Combination API #
###################
# q is for quick
def QRatio(s1, s2, force_ascii=True, full_process=True):
"""
Quick ratio comparison between two strings.
Runs full_process from utils on both strings
Short circuits if either of the strings is empty after processing.
:param s1:
:param s2:
:param force_ascii: Allow only ASCII characters (Default: True)
:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
:return: similarity ratio
"""
if full_process:
p1 = utils.full_process(s1, force_ascii=force_ascii)
p2 = utils.full_process(s2, force_ascii=force_ascii)
else:
p1 = s1
p2 = s2
if not utils.validate_string(p1):
return 0
if not utils.validate_string(p2):
return 0
return ratio(p1, p2)
def UQRatio(s1, s2, full_process=True):
"""
Unicode quick ratio
Calls QRatio with force_ascii set to False
:param s1:
:param s2:
:return: similarity ratio
"""
return QRatio(s1, s2, force_ascii=False, full_process=full_process)
# w is for weighted
def WRatio(s1, s2, force_ascii=True, full_process=True):
"""
Return a measure of the sequences' similarity between 0 and 100, using different algorithms.
**Steps in the order they occur**
#. Run full_process from utils on both strings
#. Short circuit if this makes either string empty
#. Take the ratio of the two processed strings (fuzz.ratio)
#. Run checks to compare the length of the strings
* If one of the strings is more than 1.5 times as long as the other
use partial_ratio comparisons - scale partial results by 0.9
(this makes sure only full results can return 100)
* If one of the strings is over 8 times as long as the other
instead scale by 0.6
#. Run the other ratio functions
* if using partial ratio functions call partial_ratio,
partial_token_sort_ratio and partial_token_set_ratio
scale all of these by the ratio based on length
* otherwise call token_sort_ratio and token_set_ratio
* all token based comparisons are scaled by 0.95
(on top of any partial scalars)
#. Take the highest value from these results
round it and return it as an integer.
:param s1:
:param s2:
:param force_ascii: Allow only ascii characters
:type force_ascii: bool
:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
:return:
"""
if full_process:
p1 = utils.full_process(s1, force_ascii=force_ascii)
p2 = utils.full_process(s2, force_ascii=force_ascii)
else:
p1 = s1
p2 = s2
if not utils.validate_string(p1):
return 0
if not utils.validate_string(p2):
return 0
# should we look at partials?
try_partial = True
unbase_scale = .95
partial_scale = .90
base = ratio(p1, p2)
len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))
# if strings are similar length, don't use partials
if len_ratio < 1.5:
try_partial = False
# if one string is much much shorter than the other
if len_ratio > 8:
partial_scale = .6
if try_partial:
partial = partial_ratio(p1, p2) * partial_scale
ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
* unbase_scale * partial_scale
ptser = partial_token_set_ratio(p1, p2, full_process=False) \
* unbase_scale * partial_scale
return utils.intr(max(base, partial, ptsor, ptser))
else:
tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale
return utils.intr(max(base, tsor, tser))
def UWRatio(s1, s2, full_process=True):
"""Return a measure of the sequences' similarity between 0 and 100,
using different algorithms. Same as WRatio but preserving unicode.
"""
return WRatio(s1, s2, force_ascii=False, full_process=full_process)