adding use_log_scale and log_scale_similarity_threshold

seperman · May 17, 2024 · ff6ff87 · ff6ff87
1 parent a739a50
commit ff6ff87
Show file tree

Hide file tree

Showing 8 changed files with 138 additions and 34 deletions.
diff --git a/deepdiff/diff.py b/deepdiff/diff.py
@@ -27,7 +27,7 @@
                              np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS,
                              PydanticBaseModel, Opcode, SetOrdered)
 from deepdiff.serialization import SerializationMixin
-from deepdiff.distance import DistanceMixin
+from deepdiff.distance import DistanceMixin, logarithmic_similarity
 from deepdiff.model import (
     RemapDict, ResultDict, TextResult, TreeResult, DiffLevel,
     DictRelationship, AttributeRelationship, REPORT_KEYS,
@@ -157,7 +157,9 @@ def __init__(self,
                  progress_logger: Callable=logger.info,
                  report_repetition: bool=False,
                  significant_digits: Optional[int]=None,
-                 threshold_to_diff_deeper: float = 0,
+                 use_log_scale: bool=False,
+                 log_scale_similarity_threshold: float=0.1,
+                 threshold_to_diff_deeper: float = 0.33,
                  truncate_datetime: Optional[str]=None,
                  use_enum_value: bool=False,
                  verbose_level: int=1,
@@ -178,7 +180,7 @@ def __init__(self,
                 "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
                 "cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, "
                 "math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, "
-                "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, "
+                "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold, "
                 "_parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
 
         if _parameters:
@@ -196,6 +198,8 @@ def __init__(self,
             if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
                 ignore_string_type_changes = True
             self.use_enum_value = use_enum_value
+            self.log_scale_similarity_threshold = log_scale_similarity_threshold
+            self.use_log_scale = use_log_scale
             self.threshold_to_diff_deeper = threshold_to_diff_deeper
             self.ignore_string_type_changes = ignore_string_type_changes
             self.ignore_type_in_groups = self.get_ignore_types_in_groups(
@@ -583,9 +587,8 @@ def _diff_dict(
         t_keys_union = t2_keys | t1_keys
         t_keys_added = t2_keys - t_keys_intersect
         t_keys_removed = t1_keys - t_keys_intersect
-
         if self.threshold_to_diff_deeper:
-            if len(t_keys_union) and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper:
+            if len(t_keys_union) > 1 and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper:
                 self._report_result('values_changed', level, local_tree=local_tree)
                 return
 
@@ -1145,7 +1148,6 @@ def defaultdict_orderedset():
         pairs = dict_()
 
         pre_calced_distances = None
-
         if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1:
             # pre-calculates distances ONLY for 1D arrays whether an _original_type
             # was explicitly passed or a homogeneous array is detected.
@@ -1233,7 +1235,6 @@ def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None,
         else:
             t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed}
             t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added}
-
         if self._stats[PASSES_COUNT] < self.max_passes and get_pairs:
             self._stats[PASSES_COUNT] += 1
             pairs = self._get_most_in_common_pairs_in_iterables(
@@ -1403,7 +1404,10 @@ def _diff_numbers(self, level, local_tree=None, report_type_change=True):
         else:
             t1_type = t2_type = ''
 
-        if self.math_epsilon is not None:
+        if self.use_log_scale:
+            if not logarithmic_similarity(level.t1, level.t2, threshold=self.log_scale_similarity_threshold):
+                self._report_result('values_changed', level, local_tree=local_tree)
+        elif self.math_epsilon is not None:
             if not is_close(level.t1, level.t2, abs_tol=self.math_epsilon):
                 self._report_result('values_changed', level, local_tree=local_tree)
         elif self.significant_digits is None:

diff --git a/deepdiff/distance.py b/deepdiff/distance.py
@@ -1,3 +1,5 @@
+import numpy as np
+import math
 import datetime
 from deepdiff.deephash import DeepHash
 from deepdiff.helper import (
@@ -31,7 +33,7 @@ def _get_rough_distance(self):
         """
 
         _distance = get_numeric_types_distance(
-            self.t1, self.t2, max_=self.cutoff_distance_for_pairs)
+            self.t1, self.t2, max_=self.cutoff_distance_for_pairs, use_log_scale=self.use_log_scale, log_scale_similarity_threshold=self.log_scale_similarity_threshold)
 
         if _distance is not not_found:
             return _distance
@@ -122,7 +124,10 @@ def _precalculate_numpy_arrays_distance(
 
         distances = _get_numpy_array_distance(
             pairs_transposed[0], pairs_transposed[1],
-            max_=self.cutoff_distance_for_pairs)
+            max_=self.cutoff_distance_for_pairs,
+            use_log_scale=self.use_log_scale,
+            log_scale_similarity_threshold=self.log_scale_similarity_threshold,
+        )
 
         i = 0
         for added_hash in hashes_added:
@@ -186,14 +191,19 @@ def _get_item_length(item, parents_ids=frozenset([])):
     return length
 
 
-def _get_numbers_distance(num1, num2, max_=1):
+def _get_numbers_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
     """
     Get the distance of 2 numbers. The output is a number between 0 to the max.
     The reason is the
     When max is returned means the 2 numbers are really far, and 0 means they are equal.
     """
     if num1 == num2:
         return 0
+    if use_log_scale:
+        distance = logarithmic_distance(num1, num2)
+        # Compare against the similarity threshold (not the function object):
+        # log distances below the threshold count as "no difference".
+        if distance < log_scale_similarity_threshold:
+            return 0
+        return distance
     if not isinstance(num1, float):
         num1 = float(num1)
     if not isinstance(num2, float):
@@ -218,8 +228,42 @@ def _numpy_div(a, b, replace_inf_with=1):
     result[a == b] = 0
     return result
 
+# To deal with numbers close to zero
+MATH_LOG_OFFSET = 1e-10
+
+def numpy_apply_log_keep_sign(array, offset=MATH_LOG_OFFSET):
+    """
+    Return the element-wise natural log of ``|array| + offset``, re-signed
+    with the sign of the matching element of ``array``.
+
+    A positive offset keeps the argument of the logarithm above zero for
+    zero-valued entries.
+
+    NOTE(review): np.copysign keeps only the magnitude of the log value, so
+    entries where ``|x| + offset < 1`` (negative log) lose that sign and
+    mirror their reciprocal - confirm this is intended.
+    """
+    shifted_magnitudes = np.abs(array) + offset
+    return np.copysign(np.log(shifted_magnitudes), array)
+
 
-def _get_numpy_array_distance(num1, num2, max_=1):
+def logarithmic_similarity(a: numbers, b: numbers, threshold: float=0.1):
+    """
+    Tell whether a and b are similar on a logarithmic scale, i.e. whether
+    logarithmic_distance(a, b) is strictly below the threshold.
+
+    A threshold of 0.1 translates to about 10.5% difference.
+    A threshold of 0.5 translates to about 65% difference.
+    A threshold of 0.05 translates to about 5.1% difference.
+    """
+    distance = logarithmic_distance(a, b)
+    is_similar = distance < threshold
+    return is_similar
+
+
+def logarithmic_distance(a: numbers, b: numbers):
+    """
+    Absolute difference between the sign-carrying logarithms of a and b.
+
+    Both inputs are converted to float; MATH_LOG_OFFSET is added to each
+    absolute value before taking the natural log.
+
+    NOTE(review): math.copysign keeps only the magnitude of the log, so a
+    value whose ``abs(x) + MATH_LOG_OFFSET`` is below 1 (negative log) maps
+    to the same result as its reciprocal - confirm this is intended.
+    """
+    signed_logs = [
+        math.copysign(math.log(abs(value) + MATH_LOG_OFFSET), value)
+        for value in (float(a), float(b))
+    ]
+    return abs(signed_logs[0] - signed_logs[1])
+
+
+def _get_numpy_array_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
     """
     Get the distance of 2 numbers. The output is a number between 0 to the max.
     The reason is the
@@ -229,24 +273,32 @@ def _get_numpy_array_distance(num1, num2, max_=1):
     # getting the pairs of items during the ingore_order=True
     # calculations, we need to make the divisor of comparison very big
     # so that any 2 numbers can be chosen as pairs.
+    if use_log_scale:
+        num1 = numpy_apply_log_keep_sign(num1)
+        num2 = numpy_apply_log_keep_sign(num2)
+
     divisor = (num1 + num2) / max_
     result = _numpy_div((num1 - num2), divisor, replace_inf_with=max_)
-    return np.clip(np.absolute(result), 0, max_)
+
+    distance_array = np.clip(np.absolute(result), 0, max_)
+    if use_log_scale:
+        distance_array[distance_array < log_scale_similarity_threshold] = 0
+    return distance_array
 
 
-def _get_datetime_distance(date1, date2, max_):
+def _get_datetime_distance(date1, date2, max_, use_log_scale, log_scale_similarity_threshold):
+    # Distance between the two timestamp() values. The log-scale arguments are
+    # accepted but not forwarded to _get_numbers_distance.
     return _get_numbers_distance(date1.timestamp(), date2.timestamp(), max_)
 
 
-def _get_date_distance(date1, date2, max_):
+def _get_date_distance(date1, date2, max_, use_log_scale, log_scale_similarity_threshold):
+    # Distance between the two toordinal() values. The log-scale arguments are
+    # accepted but not forwarded to _get_numbers_distance.
     return _get_numbers_distance(date1.toordinal(), date2.toordinal(), max_)
 
 
-def _get_timedelta_distance(timedelta1, timedelta2, max_):
+def _get_timedelta_distance(timedelta1, timedelta2, max_, use_log_scale, log_scale_similarity_threshold):
+    # Distance between the two total_seconds() values. The log-scale arguments
+    # are accepted but not forwarded to _get_numbers_distance.
     return _get_numbers_distance(timedelta1.total_seconds(), timedelta2.total_seconds(), max_)
 
 
-def _get_time_distance(time1, time2, max_):
+def _get_time_distance(time1, time2, max_, use_log_scale, log_scale_similarity_threshold):
+    # Distance between the two times after time_to_seconds() conversion. The
+    # log-scale arguments are accepted but not forwarded to _get_numbers_distance.
     return _get_numbers_distance(time_to_seconds(time1), time_to_seconds(time2), max_)
 
 
@@ -259,8 +311,8 @@ def _get_time_distance(time1, time2, max_):
 ]
 
 
-def get_numeric_types_distance(num1, num2, max_):
+def get_numeric_types_distance(num1, num2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
+    # Dispatch to the first entry of TYPES_TO_DIST_FUNC whose type matches both
+    # operands, passing the log-scale settings positionally; return not_found
+    # when no entry matches.
     for type_, func in TYPES_TO_DIST_FUNC:
         if isinstance(num1, type_) and isinstance(num2, type_):
-            return func(num1, num2, max_)
+            return func(num1, num2, max_, use_log_scale, log_scale_similarity_threshold)
     return not_found
diff --git a/deepdiff/helper.py b/deepdiff/helper.py
@@ -1,6 +1,7 @@
 import sys
 import re
 import os
+import math
 import datetime
 import uuid
 import logging

diff --git a/tests/test_cache.py b/tests/test_cache.py
@@ -74,7 +74,8 @@ def test_cache_deeply_nested_b(self, nested_b_t1, nested_b_t2, nested_b_result):
             'MAX PASS LIMIT REACHED': False,
             'MAX DIFF LIMIT REACHED': False
         }
-        assert expected_stats == stats
+        stats_diff = DeepDiff(expected_stats, stats, use_log_scale=True, log_scale_similarity_threshold=0.15)
+        assert not stats_diff
         assert nested_b_result == diff
 
         diff_of_diff = DeepDiff(nested_b_result, diff.to_dict(), ignore_order=False)

diff --git a/tests/test_delta.py b/tests/test_delta.py
@@ -448,7 +448,7 @@ def test_delta_dict_items_added_retain_order(self):
             }
         }
 
-        diff = DeepDiff(t1, t2)
+        diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
         delta_dict = diff._to_delta_dict()
         assert expected_delta_dict == delta_dict
         delta = Delta(diff, bidirectional=False, raise_errors=True)
@@ -828,9 +828,9 @@ def compare_func(item1, item2, level=None):
     'delta_case14b_threshold_to_diff_deeper': {
         't1': picklalbe_obj_without_item,
         't2': PicklableClass(11),
-        'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.33},
+        'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.5},
         'to_delta_kwargs': {},
-        'expected_delta_dict': {'values_changed': {'root': {'new_value': PicklableClass(11)}}}
+        'expected_delta_dict': {'attribute_added': {'root.item': 11}}
     },
     'delta_case15_diffing_simple_numbers': {
         't1': 1,

diff --git a/tests/test_diff_text.py b/tests/test_diff_text.py
@@ -104,7 +104,7 @@ def test_value_change(self):
     def test_item_added_and_removed(self):
         t1 = {1: 1, 2: 2, 3: [3], 4: 4}
         t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6}
-        ddiff = DeepDiff(t1, t2)
+        ddiff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
         result = {
             'dictionary_item_added': ["root[5]", "root[6]"],
             'dictionary_item_removed': ["root[4]"],
@@ -1023,7 +1023,7 @@ def test_dictionary_with_string_keys1(self):
         t1 = {"veggie": "carrots"}
         t2 = {"meat": "carrots"}
 
-        diff = DeepDiff(t1, t2)
+        diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
         assert {'dictionary_item_added': ["root['meat']"],
                 'dictionary_item_removed': ["root['veggie']"]} == diff
 
@@ -1037,9 +1037,12 @@ def test_dictionary_with_string_keys_threshold_to_diff_deeper(self):
     def test_dictionary_with_numeric_keys(self):
         t1 = {Decimal('10.01'): "carrots"}
         t2 = {10.01: "carrots"}
-        diff = DeepDiff(t1, t2)
+        diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
         assert {'dictionary_item_added': ["root[10.01]"], 'dictionary_item_removed': ["root[Decimal('10.01')]"]} == diff
 
+        # Without an explicit threshold_to_diff_deeper the same inputs are
+        # expected to collapse into a single root-level values_changed entry.
+        diff2 = DeepDiff(t1, t2)
+        assert {'values_changed': {'root': {'new_value': {10.01: 'carrots'}, 'old_value': {Decimal('10.01'): 'carrots'}}}} == diff2
+
     def test_loop(self):
         class LoopTest:
             def __init__(self, a):
@@ -1331,6 +1334,33 @@ def test_decimal_digits(self, t1, t2, significant_digits, expected_result):
         ddiff = DeepDiff(t1, t2, ignore_numeric_type_changes=True, ignore_string_type_changes=True, significant_digits=significant_digits)
         assert expected_result == ddiff
 
+    @pytest.mark.parametrize('test_num, t1, t2, log_scale_similarity_threshold, expected', [
+        (
+            1,
+            {'foo': 110, 'bar': 306},  # t1
+            {'foo': 140, 'bar': 298},  # t2
+            0.01,  # threshold
+            {'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}, "root['bar']": {'new_value': 298, 'old_value': 306}}},  # expected
+        ),
+        (
+            2,
+            {'foo': 110, 'bar': 306},  # t1
+            {'foo': 140, 'bar': 298},  # t2
+            0.1,  # threshold
+            {'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}}},  # expected
+        ),
+        (
+            3,
+            {'foo': 110, 'bar': 306},  # t1
+            {'foo': 140, 'bar': 298},  # t2
+            0.3,  # threshold
+            {},  # expected
+        ),
+    ])
+    def test_log_scale(self, test_num, t1, t2, log_scale_similarity_threshold, expected):
+        """Raising log_scale_similarity_threshold reports fewer numeric changes.
+
+        test_num must be unique per case so the failure message identifies it.
+        """
+        diff = DeepDiff(t1, t2, use_log_scale=True, log_scale_similarity_threshold=log_scale_similarity_threshold)
+        assert expected == diff, f"test_log_scale #{test_num} failed."
+
     def test_ignore_type_in_groups(self):
         t1 = [1, 2, 3]
         t2 = [1.0, 2.0, 3.0]
@@ -1348,7 +1378,7 @@ def test_ignore_type_in_groups3(self):
         t1 = {Decimal('10.01'): "carrots"}
         t2 = {10.01: "carrots"}
 
-        diff1 = DeepDiff(t1, t2)
+        diff1 = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
 
         diff2 = DeepDiff(t1, t2, ignore_numeric_type_changes=True)