Skip to content

Commit

Permalink
hexdiff: 2 algorithms, doc
Browse files Browse the repository at this point in the history
  • Loading branch information
gpotter2 committed Dec 2, 2023
1 parent cb6dcf4 commit e80b3d4
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 36 deletions.
128 changes: 97 additions & 31 deletions scapy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@
hex_bytes,
bytes_encode,
)
from scapy.error import log_runtime, Scapy_Exception, warning
from scapy.error import (
log_interactive,
log_runtime,
Scapy_Exception,
warning,
)
from scapy.pton_ntop import inet_pton

# Typing imports
Expand Down Expand Up @@ -392,50 +397,111 @@ def repr_hex(s):


@conf.commands.register
def hexdiff(a, b, autojunk=False):
# type: (Union[Packet, AnyStr], Union[Packet, AnyStr], bool) -> None
def hexdiff(
a: Union['Packet', AnyStr],
b: Union['Packet', AnyStr],
algo: Optional[str] = None,
autojunk: bool = False,
) -> None:
"""
Show differences between 2 binary strings, Packets...
For the autojunk parameter, see
https://docs.python.org/3.8/library/difflib.html#difflib.SequenceMatcher
Available algorithms:
- wagnerfischer: Use the Wagner and Fischer algorithm to compute the
Levenstein distance between the strings then backtrack.
- difflib: Use the difflib.SequenceMatcher implementation. This based on a
modified version of the Ratcliff and Obershelp algorithm.
This is much faster, but far less accurate.
https://docs.python.org/3.8/library/difflib.html#difflib.SequenceMatcher
:param a:
:param b: The binary strings, packets... to compare
:param autojunk: Setting it to True will likely increase the comparison
speed a lot on big byte strings, but will reduce accuracy (will tend
to miss insertion and see replacements instead for instance).
:param algo: Force the algo to be 'wagnerfischer' or 'difflib'.
By default, this is chosen depending on the complexity, optimistically
preferring wagnerfischer unless really necessary.
:param autojunk: (difflib only) See difflib documentation.
"""

# Compare the strings using difflib

xb = bytes_encode(a)
yb = bytes_encode(b)

sm = difflib.SequenceMatcher(a=xb, b=yb, autojunk=autojunk)
xarr = [xb[i:i + 1] for i in range(len(xb))]
yarr = [yb[i:i + 1] for i in range(len(yb))]
if algo is None:
# Choose the best algorithm
complexity = len(xb) * len(yb)
if complexity < 1e7:
# Comparing two (non-jumbos) Ethernet packets is ~2e6 which is manageable.
# Anything much larger than this shouldn't be attempted by default.
algo = "wagnerfischer"
if complexity > 1e6:
log_interactive.info(
"Complexity is a bit high. hexdiff will take a few seconds."
)
else:
algo = "difflib"

backtrackx = []
backtracky = []
for opcode in sm.get_opcodes():
typ, x0, x1, y0, y1 = opcode
if typ == 'delete':
backtrackx += xarr[x0:x1]
backtracky += [b''] * (x1 - x0)
elif typ == 'insert':
backtrackx += [b''] * (y1 - y0)
backtracky += yarr[y0:y1]
elif typ in ['equal', 'replace']:
backtrackx += xarr[x0:x1]
backtracky += yarr[y0:y1]

if autojunk:

if algo == "wagnerfischer":
xb = xb[::-1]
yb = yb[::-1]

# costs for the 3 operations
INSERT = 1
DELETE = 1
SUBST = 1

# Typically, d[i,j] will hold the distance between
# the first i characters of xb and the first j characters of yb.
# We change the Wagner Fischer to also store pointers to all
# the intermediate steps taken while calculating the Levenstein distance.
d = {(-1, -1): (0, (-1, -1))}
for j in range(len(yb)):
d[-1, j] = (j + 1) * INSERT, (-1, j - 1)
for i in range(len(xb)):
d[i, -1] = (i + 1) * INSERT + 1, (i - 1, -1)

# Compute the Levenstein distance between the two strings, but
# store all the steps to be able to backtrack at the end.
for j in range(len(yb)):
for i in range(len(xb)):
d[i, j] = min(
(d[i - 1, j - 1][0] + SUBST * (xb[i] != yb[j]), (i - 1, j - 1)),
(d[i - 1, j][0] + DELETE, (i - 1, j)),
(d[i, j - 1][0] + INSERT, (i, j - 1)),
)

# Iterate through the steps backwards to create the diff
i = len(xb) - 1
j = len(yb) - 1
while not (i == j == -1):
i2, j2 = d[i, j][1]
backtrackx.append(xb[i2 + 1:i + 1])
backtracky.append(yb[j2 + 1:j + 1])
i, j = i2, j2
elif algo == "difflib":
sm = difflib.SequenceMatcher(a=xb, b=yb, autojunk=autojunk)
xarr = [xb[i:i + 1] for i in range(len(xb))]
yarr = [yb[i:i + 1] for i in range(len(yb))]
# Iterate through opcodes to build the backtrack
for opcode in sm.get_opcodes():
typ, x0, x1, y0, y1 = opcode
if typ == 'delete':
backtrackx += xarr[x0:x1]
backtracky += [b''] * (x1 - x0)
elif typ == 'insert':
backtrackx += [b''] * (y1 - y0)
backtracky += yarr[y0:y1]
elif typ in ['equal', 'replace']:
backtrackx += xarr[x0:x1]
backtracky += yarr[y0:y1]
# Some lines may have been considered as junk. Check the sizes
lbx = len(backtrackx)
lby = len(backtracky)
backtrackx += [b''] * (max(lbx, lby) - lbx)
backtracky += [b''] * (max(lbx, lby) - lby)
if autojunk:
lbx = len(backtrackx)
lby = len(backtracky)
backtrackx += [b''] * (max(lbx, lby) - lbx)
backtracky += [b''] * (max(lbx, lby) - lby)
else:
raise ValueError("Unknown algorithm '%s'" % algo)

# Print the diff

Expand Down
10 changes: 5 additions & 5 deletions test/regression.uts
Original file line number Diff line number Diff line change
Expand Up @@ -867,11 +867,11 @@ assert fletcher16_checkbytes(b"\x28\x07", 1) == b"\xaf("

= Test hexdiff function
~ not_pypy
def test_hexdiff(a, b, autojunk=False):
def test_hexdiff(a, b, algo=None, autojunk=False):
conf_color_theme = conf.color_theme
conf.color_theme = BlackAndWhite()
with ContextManagerCaptureOutput() as cmco:
hexdiff(a, b, autojunk=autojunk)
hexdiff(a, b, algo=algo, autojunk=autojunk)
result_hexdiff = cmco.get_output()
conf.interactive = True
conf.color_theme = conf_color_theme
Expand Down Expand Up @@ -901,12 +901,12 @@ expected += "0010 7F 00 00 01 ....
expected += " 0010 7F 00 00 02 ....\n"
assert result_hexdiff == expected

# Compare using autojunk
# Compare using difflib

a = "A" * 1000 + "findme" + "B" * 1000
b = "A" * 1000 + "B" * 1000
ret1 = test_hexdiff(a, b)
ret2 = test_hexdiff(a, b, autojunk=True)
ret1 = test_hexdiff(a, b, algo="difflib")
ret2 = test_hexdiff(a, b, algo="difflib", autojunk=True)

expected_ret1 = """
03d0 03d0 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 AAAAAAAAAAAAAAAA
Expand Down

0 comments on commit e80b3d4

Please sign in to comment.