Skip to content

Commit

Permalink
difflib.py: improve the run time of pathological diffs
Browse files Browse the repository at this point in the history
  • Loading branch information
pulkin committed May 16, 2024
1 parent 4702b7b commit 31e1ed0
Showing 1 changed file with 18 additions and 5 deletions.
23 changes: 18 additions & 5 deletions Lib/difflib.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,7 @@ def _plain_replace(self, a, alo, ahi, b, blo, bhi):
for g in first, second:
yield from g

def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
def _fancy_replace(self, a, alo, ahi, b, blo, bhi, _gravity=1e-6):
r"""
When replacing one block of lines with another, search the blocks
for *similar* lines; the best-matching pair (if any) is used as a
Expand Down Expand Up @@ -918,26 +918,39 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
# search for the pair that matches best without being identical
# (identical lines must be junk lines, & we don't want to synch up
# on junk -- unless we have to)

# for pathological cases with many equal ratios prefer to split
# closer to the middle of a, b chunks such that the resulting
# branching is more optimal (bisect-like)
def _drag_to_center(i, lo, hi):
# any convex function with a maximum at (lo + hi - 1) / 2
# this one is zero at edges lo, hi - 1 and _gravity in the middle
return _gravity * (1 - ((2 * i - lo - hi + 1) / (hi - lo - 1)) ** 2)
# with the weight above, the best_ratio becomes slightly bigger
# which means that the real cutoff is slightly smaller than 0.75

for j in range(blo, bhi):
bj = b[j]
cruncher.set_seq2(bj)
weight_b = _drag_to_center(j, blo, bhi)
for i in range(alo, ahi):
ai = a[i]
if ai == bj:
if eqi is None:
eqi, eqj = i, j
continue
cruncher.set_seq1(ai)
weight_ab = weight_b + _drag_to_center(i, alo, ahi)
# computing similarity is expensive, so use the quick
# upper bounds first -- have seen this speed up messy
# compares by a factor of 3.
# note that ratio() is only expensive to compute the first
# time it's called on a sequence pair; the expensive part
# of the computation is cached by cruncher
if cruncher.real_quick_ratio() > best_ratio and \
cruncher.quick_ratio() > best_ratio and \
cruncher.ratio() > best_ratio:
best_ratio, best_i, best_j = cruncher.ratio(), i, j
if cruncher.real_quick_ratio() + weight_ab > best_ratio and \
cruncher.quick_ratio() + weight_ab > best_ratio and \
cruncher.ratio() + weight_ab > best_ratio:
best_ratio, best_i, best_j = cruncher.ratio() + weight_ab, i, j
if best_ratio < cutoff:
# no non-identical "pretty close" pair
if eqi is None:
Expand Down

0 comments on commit 31e1ed0

Please sign in to comment.