From ff6ff87c6860e901f805d9e6dd3b7233e9f0e8a1 Mon Sep 17 00:00:00 2001 From: Seperman Date: Fri, 17 May 2024 15:50:13 -0700 Subject: [PATCH] adding use_log_scale and log_scale_similarity_threshold --- deepdiff/diff.py | 20 ++++++----- deepdiff/distance.py | 74 ++++++++++++++++++++++++++++++++------ deepdiff/helper.py | 1 + tests/test_cache.py | 3 +- tests/test_delta.py | 6 ++-- tests/test_diff_text.py | 38 +++++++++++++++++--- tests/test_ignore_order.py | 28 +++++++++++---- tests/test_operators.py | 2 +- 8 files changed, 138 insertions(+), 34 deletions(-) diff --git a/deepdiff/diff.py b/deepdiff/diff.py index 9322f31b..660f64cf 100755 --- a/deepdiff/diff.py +++ b/deepdiff/diff.py @@ -27,7 +27,7 @@ np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS, PydanticBaseModel, Opcode, SetOrdered) from deepdiff.serialization import SerializationMixin -from deepdiff.distance import DistanceMixin +from deepdiff.distance import DistanceMixin, logarithmic_similarity from deepdiff.model import ( RemapDict, ResultDict, TextResult, TreeResult, DiffLevel, DictRelationship, AttributeRelationship, REPORT_KEYS, @@ -157,7 +157,9 @@ def __init__(self, progress_logger: Callable=logger.info, report_repetition: bool=False, significant_digits: Optional[int]=None, - threshold_to_diff_deeper: float = 0, + use_log_scale: bool=False, + log_scale_similarity_threshold: int=0.1, + threshold_to_diff_deeper: float = 0.33, truncate_datetime: Optional[str]=None, use_enum_value: bool=False, verbose_level: int=1, @@ -178,7 +180,7 @@ def __init__(self, "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, " "cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, " "math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, " - "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, " + "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, 
use_log_scale, log_scale_similarity_threshold, " "_parameters and _shared_parameters.") % ', '.join(kwargs.keys())) if _parameters: @@ -196,6 +198,8 @@ def __init__(self, if strings == ignore_type_in_groups or strings in ignore_type_in_groups: ignore_string_type_changes = True self.use_enum_value = use_enum_value + self.log_scale_similarity_threshold = log_scale_similarity_threshold + self.use_log_scale = use_log_scale self.threshold_to_diff_deeper = threshold_to_diff_deeper self.ignore_string_type_changes = ignore_string_type_changes self.ignore_type_in_groups = self.get_ignore_types_in_groups( @@ -583,9 +587,8 @@ def _diff_dict( t_keys_union = t2_keys | t1_keys t_keys_added = t2_keys - t_keys_intersect t_keys_removed = t1_keys - t_keys_intersect - if self.threshold_to_diff_deeper: - if len(t_keys_union) and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper: + if len(t_keys_union) > 1 and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper: self._report_result('values_changed', level, local_tree=local_tree) return @@ -1145,7 +1148,6 @@ def defaultdict_orderedset(): pairs = dict_() pre_calced_distances = None - if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1: # pre-calculates distances ONLY for 1D arrays whether an _original_type # was explicitly passed or a homogeneous array is detected. 
@@ -1233,7 +1235,6 @@ def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None, else: t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed} t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added} - if self._stats[PASSES_COUNT] < self.max_passes and get_pairs: self._stats[PASSES_COUNT] += 1 pairs = self._get_most_in_common_pairs_in_iterables( @@ -1403,7 +1404,10 @@ def _diff_numbers(self, level, local_tree=None, report_type_change=True): else: t1_type = t2_type = '' - if self.math_epsilon is not None: + if self.use_log_scale: + if not logarithmic_similarity(level.t1, level.t2, threshold=self.log_scale_similarity_threshold): + self._report_result('values_changed', level, local_tree=local_tree) + elif self.math_epsilon is not None: if not is_close(level.t1, level.t2, abs_tol=self.math_epsilon): self._report_result('values_changed', level, local_tree=local_tree) elif self.significant_digits is None: diff --git a/deepdiff/distance.py b/deepdiff/distance.py index 55144fb7..2c5ae912 100644 --- a/deepdiff/distance.py +++ b/deepdiff/distance.py @@ -1,3 +1,5 @@ +import numpy as np +import math import datetime from deepdiff.deephash import DeepHash from deepdiff.helper import ( @@ -31,7 +33,7 @@ def _get_rough_distance(self): """ _distance = get_numeric_types_distance( - self.t1, self.t2, max_=self.cutoff_distance_for_pairs) + self.t1, self.t2, max_=self.cutoff_distance_for_pairs, use_log_scale=self.use_log_scale, log_scale_similarity_threshold=self.log_scale_similarity_threshold) if _distance is not not_found: return _distance @@ -122,7 +124,10 @@ def _precalculate_numpy_arrays_distance( distances = _get_numpy_array_distance( pairs_transposed[0], pairs_transposed[1], - max_=self.cutoff_distance_for_pairs) + max_=self.cutoff_distance_for_pairs, + use_log_scale=self.use_log_scale, + log_scale_similarity_threshold=self.log_scale_similarity_threshold, + ) i = 0 for added_hash in hashes_added: @@ -186,7 
+191,7 @@ def _get_item_length(item, parents_ids=frozenset([])): return length -def _get_numbers_distance(num1, num2, max_=1): +def _get_numbers_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1): """ Get the distance of 2 numbers. The output is a number between 0 to the max. The reason is the @@ -194,6 +199,11 @@ def _get_numbers_distance(num1, num2, max_=1): """ if num1 == num2: return 0 + if use_log_scale: + distance = logarithmic_distance(num1, num2) + if distance < log_scale_similarity_threshold: + return 0 + return distance if not isinstance(num1, float): num1 = float(num1) if not isinstance(num2, float): @@ -218,8 +228,42 @@ def _numpy_div(a, b, replace_inf_with=1): result[a == b] = 0 return result +# To deal with numbers close to zero +MATH_LOG_OFFSET = 1e-10 + +def numpy_apply_log_keep_sign(array, offset=MATH_LOG_OFFSET): + # Calculate the absolute value and add the offset + abs_plus_offset = np.abs(array) + offset + + # Calculate the logarithm + log_values = np.log(abs_plus_offset) + + # Apply the original signs to the log values + signed_log_values = np.copysign(log_values, array) + + return signed_log_values + -def _get_numpy_array_distance(num1, num2, max_=1): +def logarithmic_similarity(a: numbers, b: numbers, threshold: float=0.1): + """ + A threshold of 0.1 translates to about 10.5% difference. + A threshold of 0.5 translates to about 65% difference. + A threshold of 0.05 translates to about 5.1% difference. + """ + return logarithmic_distance(a, b) < threshold + + +def logarithmic_distance(a: numbers, b: numbers): + # Apply logarithm to the absolute values and consider the sign + a = float(a) + b = float(b) + log_a = math.copysign(math.log(abs(a) + MATH_LOG_OFFSET), a) + log_b = math.copysign(math.log(abs(b) + MATH_LOG_OFFSET), b) + + return abs(log_a - log_b) + + +def _get_numpy_array_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1): """ Get the distance of 2 numbers. 
The output is a number between 0 to the max. The reason is the @@ -229,24 +273,32 @@ def _get_numpy_array_distance(num1, num2, max_=1): # getting the pairs of items during the ingore_order=True # calculations, we need to make the divisor of comparison very big # so that any 2 numbers can be chosen as pairs. + if use_log_scale: + num1 = numpy_apply_log_keep_sign(num1) + num2 = numpy_apply_log_keep_sign(num2) + divisor = (num1 + num2) / max_ result = _numpy_div((num1 - num2), divisor, replace_inf_with=max_) - return np.clip(np.absolute(result), 0, max_) + + distance_array = np.clip(np.absolute(result), 0, max_) + if use_log_scale: + distance_array[distance_array < log_scale_similarity_threshold] = 0 + return distance_array -def _get_datetime_distance(date1, date2, max_): +def _get_datetime_distance(date1, date2, max_, use_log_scale, log_scale_similarity_threshold): return _get_numbers_distance(date1.timestamp(), date2.timestamp(), max_) -def _get_date_distance(date1, date2, max_): +def _get_date_distance(date1, date2, max_, use_log_scale, log_scale_similarity_threshold): return _get_numbers_distance(date1.toordinal(), date2.toordinal(), max_) -def _get_timedelta_distance(timedelta1, timedelta2, max_): +def _get_timedelta_distance(timedelta1, timedelta2, max_, use_log_scale, log_scale_similarity_threshold): return _get_numbers_distance(timedelta1.total_seconds(), timedelta2.total_seconds(), max_) -def _get_time_distance(time1, time2, max_): +def _get_time_distance(time1, time2, max_, use_log_scale, log_scale_similarity_threshold): return _get_numbers_distance(time_to_seconds(time1), time_to_seconds(time2), max_) @@ -259,8 +311,8 @@ def _get_time_distance(time1, time2, max_): ] -def get_numeric_types_distance(num1, num2, max_): +def get_numeric_types_distance(num1, num2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1): for type_, func in TYPES_TO_DIST_FUNC: if isinstance(num1, type_) and isinstance(num2, type_): - return func(num1, num2, max_) + return 
func(num1, num2, max_, use_log_scale, log_scale_similarity_threshold) return not_found diff --git a/deepdiff/helper.py b/deepdiff/helper.py index 1fe053fd..7913c43f 100644 --- a/deepdiff/helper.py +++ b/deepdiff/helper.py @@ -1,6 +1,7 @@ import sys import re import os +import math import datetime import uuid import logging diff --git a/tests/test_cache.py b/tests/test_cache.py index 31c9938b..b4e22124 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -74,7 +74,8 @@ def test_cache_deeply_nested_b(self, nested_b_t1, nested_b_t2, nested_b_result): 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False } - assert expected_stats == stats + stats_diff = DeepDiff(expected_stats, stats, use_log_scale=True, log_scale_similarity_threshold=0.15) + assert not stats_diff assert nested_b_result == diff diff_of_diff = DeepDiff(nested_b_result, diff.to_dict(), ignore_order=False) diff --git a/tests/test_delta.py b/tests/test_delta.py index 0f22ab1f..217dc4d4 100644 --- a/tests/test_delta.py +++ b/tests/test_delta.py @@ -448,7 +448,7 @@ def test_delta_dict_items_added_retain_order(self): } } - diff = DeepDiff(t1, t2) + diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0) delta_dict = diff._to_delta_dict() assert expected_delta_dict == delta_dict delta = Delta(diff, bidirectional=False, raise_errors=True) @@ -828,9 +828,9 @@ def compare_func(item1, item2, level=None): 'delta_case14b_threshold_to_diff_deeper': { 't1': picklalbe_obj_without_item, 't2': PicklableClass(11), - 'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.33}, + 'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.5}, 'to_delta_kwargs': {}, - 'expected_delta_dict': {'values_changed': {'root': {'new_value': PicklableClass(11)}}} + 'expected_delta_dict': {'attribute_added': {'root.item': 11}} }, 'delta_case15_diffing_simple_numbers': { 't1': 1, diff --git a/tests/test_diff_text.py b/tests/test_diff_text.py index 93f0bb9a..f41fff38 100755 --- a/tests/test_diff_text.py +++ b/tests/test_diff_text.py @@ 
-104,7 +104,7 @@ def test_value_change(self): def test_item_added_and_removed(self): t1 = {1: 1, 2: 2, 3: [3], 4: 4} t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6} - ddiff = DeepDiff(t1, t2) + ddiff = DeepDiff(t1, t2, threshold_to_diff_deeper=0) result = { 'dictionary_item_added': ["root[5]", "root[6]"], 'dictionary_item_removed': ["root[4]"], @@ -1023,7 +1023,7 @@ def test_dictionary_with_string_keys1(self): t1 = {"veggie": "carrots"} t2 = {"meat": "carrots"} - diff = DeepDiff(t1, t2) + diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0) assert {'dictionary_item_added': ["root['meat']"], 'dictionary_item_removed': ["root['veggie']"]} == diff @@ -1037,9 +1037,12 @@ def test_dictionary_with_string_keys_threshold_to_diff_deeper(self): def test_dictionary_with_numeric_keys(self): t1 = {Decimal('10.01'): "carrots"} t2 = {10.01: "carrots"} - diff = DeepDiff(t1, t2) + diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0) assert {'dictionary_item_added': ["root[10.01]"], 'dictionary_item_removed': ["root[Decimal('10.01')]"]} == diff + diff2 = DeepDiff(t1, t2) + assert {'values_changed': {'root': {'new_value': {10.01: 'carrots'}, 'old_value': {Decimal('10.01'): 'carrots'}}}} == diff2 + def test_loop(self): class LoopTest: def __init__(self, a): @@ -1331,6 +1334,33 @@ def test_decimal_digits(self, t1, t2, significant_digits, expected_result): ddiff = DeepDiff(t1, t2, ignore_numeric_type_changes=True, ignore_string_type_changes=True, significant_digits=significant_digits) assert expected_result == ddiff + @pytest.mark.parametrize('test_num, t1, t2, log_scale_similarity_threshold, expected', [ + ( + 1, + {'foo': 110, 'bar': 306}, # t1 + {'foo': 140, 'bar': 298}, # t2 + 0.01, # threshold + {'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}, "root['bar']": {'new_value': 298, 'old_value': 306}}}, # expected + ), + ( + 2, + {'foo': 110, 'bar': 306}, # t1 + {'foo': 140, 'bar': 298}, # t2 + 0.1, # threshold + {'values_changed': {"root['foo']": {'new_value': 140, 
'old_value': 110}}}, # expected + ), + ( + 3, + {'foo': 110, 'bar': 306}, # t1 + {'foo': 140, 'bar': 298}, # t2 + 0.3, # threshold + {}, # expected + ), + ]) + def test_log_scale(self, test_num, t1, t2, log_scale_similarity_threshold, expected): + diff = DeepDiff(t1, t2, use_log_scale=True, log_scale_similarity_threshold=log_scale_similarity_threshold) + assert expected == diff, f"test_log_scale #{test_num} failed." + def test_ignore_type_in_groups(self): t1 = [1, 2, 3] t2 = [1.0, 2.0, 3.0] @@ -1348,7 +1378,7 @@ def test_ignore_type_in_groups3(self): t1 = {Decimal('10.01'): "carrots"} t2 = {10.01: "carrots"} - diff1 = DeepDiff(t1, t2) + diff1 = DeepDiff(t1, t2, threshold_to_diff_deeper=0) diff2 = DeepDiff(t1, t2, ignore_numeric_type_changes=True) diff --git a/tests/test_ignore_order.py b/tests/test_ignore_order.py index e01e2fad..c0c3b692 100644 --- a/tests/test_ignore_order.py +++ b/tests/test_ignore_order.py @@ -28,7 +28,7 @@ def test_type_change_numeric_ignored(self, t1, t2, significant_digits, ignore_or ({"a": Decimal(10), "b": 12, 11.0: None}, {b"b": 12, "a": 10.0, Decimal(11): None}, {}), ]) def test_type_change_numeric_when_ignore_order(self, t1, t2, expected_result): - ddiff = DeepDiff(t1, t2, ignore_order=True, ignore_numeric_type_changes=True, ignore_string_type_changes=True) + ddiff = DeepDiff(t1, t2, ignore_order=True, ignore_numeric_type_changes=True, ignore_string_type_changes=True, threshold_to_diff_deeper=0) assert expected_result == ddiff def test_ignore_order_depth1(self): @@ -318,7 +318,7 @@ def test_list_of_unhashable_difference_ignore_order_report_repetition( self): t1 = [1, {"a": 2}, {"a": 2}, {"b": [3, 4, {1: 1}]}, "B"] t2 = [{"b": [3, 4, {1: 1}]}, {1: 1}] - ddiff = DeepDiff(t1, t2, ignore_order=True, report_repetition=True) + ddiff = DeepDiff(t1, t2, ignore_order=True, report_repetition=True, threshold_to_diff_deeper=0) result = { 'iterable_item_added': { 'root[1]': { @@ -567,6 +567,22 @@ def test_decimal_ignore_order(self): result = {} 
assert result == ddiff + @pytest.mark.parametrize('log_scale_similarity_threshold, expected', [ + ( + 0.1, + {} + ), + ( + 0.01, + {'values_changed': {'root[1][2]': {'new_value': Decimal('268'), 'old_value': Decimal('290.2')}}} + ), + ]) + def test_decimal_log_scale_ignore_order1(self, log_scale_similarity_threshold, expected): + t1 = [{1: Decimal('10.143')}, {2: Decimal('290.2')}] + t2 = [{2: Decimal('268')}, {1: Decimal('10.23')}] + ddiff = DeepDiff(t1, t2, ignore_order=True, use_log_scale=True, log_scale_similarity_threshold=log_scale_similarity_threshold, cutoff_intersection_for_pairs=1) + assert expected == ddiff + @pytest.mark.parametrize("t1, t2, significant_digits, ignore_order", [ (100000, 100021, 3, False), ([10, 12, 100000], [50, 63, 100021], 3, False), @@ -674,7 +690,7 @@ def test_ignore_order_max_passes(self, max_passes, expected): }, ] - ddiff = DeepDiff(t1, t2, ignore_order=True, max_passes=max_passes, verbose_level=2, cache_size=5000, cutoff_intersection_for_pairs=1) + ddiff = DeepDiff(t1, t2, ignore_order=True, max_passes=max_passes, verbose_level=2, cache_size=5000, cutoff_intersection_for_pairs=1, threshold_to_diff_deeper=0) assert expected == ddiff @pytest.mark.parametrize('max_diffs, expected', [ @@ -1123,7 +1139,7 @@ def test_ignore_order_with_compare_func_can_throw_cannot_compare(self): } } - ddiff = DeepDiff(t1, t2, cutoff_intersection_for_pairs=1, cutoff_distance_for_pairs=1, ignore_order=True) + ddiff = DeepDiff(t1, t2, cutoff_intersection_for_pairs=1, cutoff_distance_for_pairs=1, ignore_order=True, threshold_to_diff_deeper=0) assert expected == ddiff def compare_func(x, y, level=None): @@ -1132,7 +1148,7 @@ def compare_func(x, y, level=None): except Exception: raise CannotCompare() from None - ddiff2 = DeepDiff(t1, t2, ignore_order=True, cutoff_intersection_for_pairs=1, cutoff_distance_for_pairs=1, iterable_compare_func=compare_func) + ddiff2 = DeepDiff(t1, t2, ignore_order=True, cutoff_intersection_for_pairs=1, 
cutoff_distance_for_pairs=1, iterable_compare_func=compare_func, threshold_to_diff_deeper=0) assert expected_with_compare_func == ddiff2 assert ddiff != ddiff2 @@ -1307,7 +1323,7 @@ def test_ignore_order_func(self): def ignore_order_func(level): return "order_does_not_matter" in level.path() - ddiff = DeepDiff(t1, t2, cutoff_intersection_for_pairs=1, cutoff_distance_for_pairs=1, ignore_order_func=ignore_order_func) + ddiff = DeepDiff(t1, t2, cutoff_intersection_for_pairs=1, cutoff_distance_for_pairs=1, ignore_order_func=ignore_order_func, threshold_to_diff_deeper=0) expected = { 'type_changes': { diff --git a/tests/test_operators.py b/tests/test_operators.py index 90fd31d0..d3ba07b2 100644 --- a/tests/test_operators.py +++ b/tests/test_operators.py @@ -164,7 +164,7 @@ def give_up_diffing(self, level, diff_instance): assert {} == ddiff - ddiff2 = DeepDiff(custom2, custom3, custom_operators=[ + ddiff2 = DeepDiff(custom2, custom3, threshold_to_diff_deeper=0, custom_operators=[ ListMatchOperator(types=[CustomClass]) ])