From caeb09e866483fc2de6765ffa4e9b8956f0a63a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 17 Oct 2023 14:25:17 +0200 Subject: [PATCH] FIX Make decision tree pickles deterministic (#27580) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.3.rst | 17 +++++++++++++++++ sklearn/tree/_tree.pyx | 4 +++- sklearn/tree/tests/test_tree.py | 13 +++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index ddb6a2ebe0016..1a445b7436201 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -2,6 +2,23 @@ .. currentmodule:: sklearn +.. _changes_1_3_2: + +Version 1.3.2 +============= + +**October 2023** + +Changelog +--------- + +:mod:`sklearn.tree` +................... + +- |Fix| Do not leak data via non-initialized memory in decision tree pickle files and make + the generation of those files deterministic. :pr:`27580` by :user:`Loïc Estève `. + + .. _changes_1_3_1: Version 1.3.1 diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index b4ce56a4d2a0b..ef3a99a69ac1f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -908,11 +908,13 @@ cdef class Tree: safe_realloc(&self.nodes, capacity) safe_realloc(&self.value, capacity * self.value_stride) - # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: + # value memory is initialised to 0 to enable classifier argmax memset((self.value + self.capacity * self.value_stride), 0, (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) + # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct) + memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 2543a2f2a39ad..f876738ebba2b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2601,3 +2601,16 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +def test_deterministic_pickle(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/27268 + # Uninitialised memory would lead to the two pickle strings being different. + tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) + tree2 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) + + pickle1 = pickle.dumps(tree1) + pickle2 = pickle.dumps(tree2) + + assert pickle1 == pickle2