From 9c3e674c2571f946c7a6ea2dc828b4e00b81980f Mon Sep 17 00:00:00 2001 From: Florian Gardin Date: Thu, 1 Mar 2018 15:42:24 +0100 Subject: [PATCH 1/2] Add Free name choice of variables --- skrules/__init__.py | 2 +- skrules/rule.py | 6 ++++++ skrules/skope_rules.py | 31 ++++++++++++++++++++++++------- skrules/tests/test_rule.py | 12 +++++++++++- 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/skrules/__init__.py b/skrules/__init__.py index 7789da2..eeca603 100644 --- a/skrules/__init__.py +++ b/skrules/__init__.py @@ -1,4 +1,4 @@ from .skope_rules import SkopeRules -from .rule import Rule +from .rule import Rule, replace_feature_name __all__ = ['SkopeRules', 'Rule'] diff --git a/skrules/rule.py b/skrules/rule.py index 8e4413a..c50c11b 100644 --- a/skrules/rule.py +++ b/skrules/rule.py @@ -1,3 +1,8 @@ +def replace_feature_name(rule, replace_dict): + for key, value in replace_dict.items(): + rule = rule.replace(key, value) + return rule + class Rule: """ An object modelizing a logical rule and add factorization methods. It is used to simplify rules and deduplicate them. @@ -56,3 +61,4 @@ def __repr__(self): [feature, symbol, str(self.agg_dict[(feature, symbol)])]) for feature, symbol in sorted(self.agg_dict.keys()) ]) + diff --git a/skrules/skope_rules.py b/skrules/skope_rules.py index 11e8e0b..aa11045 100644 --- a/skrules/skope_rules.py +++ b/skrules/skope_rules.py @@ -12,10 +12,10 @@ from sklearn.externals import six from sklearn.tree import _tree -from .rule import Rule +from .rule import Rule, replace_feature_name INTEGER_TYPES = (numbers.Integral, np.integer) - +BASE_FEATURE_NAME = "__C__" class SkopeRules(BaseEstimator): """ An easy-interpretable classifier optimizing simple logical rules. @@ -249,11 +249,17 @@ def fit(self, X, y, sample_weight=None): self.estimators_samples_ = [] self.estimators_features_ = [] - # default columns names of the form ['c0', 'c1', ...]: - feature_names_ = (self.feature_names if self.feature_names is not None - else ['c' + x for x in - np.arange(X.shape[1]).astype(str)]) + # default columns names : + feature_names_ = [BASE_FEATURE_NAME + x for x in + np.arange(X.shape[1]).astype(str)] + if self.feature_names is not None: + self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat + for i, feat in enumerate(self.feature_names)} + else: + self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat + for i, feat in enumerate(feature_names_)} self.feature_names_ = feature_names_ + clfs = [] regs = [] @@ -356,6 +362,10 @@ def fit(self, X, y, sample_weight=None): for rule in [Rule(r, args=args) for r, args in rules_]] + + + + # keep only rules verifying precision_min and recall_min: for rule, score in rules_: if score[0] >= self.precision_min and score[1] >= self.recall_min: @@ -377,7 +387,14 @@ def fit(self, X, y, sample_weight=None): # Deduplicate the rule using semantic tree if self.max_depth_duplication is not None: self.rules_ = self.deduplicate(self.rules_) + self.rules_ = sorted(self.rules_, key=lambda x: - self.f1_score(x)) + self.rules_without_feature_names_ = self.rules_ + + # Replace generic feature names by real feature names + self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf) + for rule, perf in self.rules_] + return self def predict(self, X): @@ -432,7 +449,7 @@ def decision_function(self, X): % (X.shape[1], self.n_features_)) df = pandas.DataFrame(X, columns=self.feature_names_) - selected_rules = self.rules_ + selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) for (r, w) in selected_rules: diff --git a/skrules/tests/test_rule.py b/skrules/tests/test_rule.py index 9b8e28a..ab1048a 100644 --- a/skrules/tests/test_rule.py +++ b/skrules/tests/test_rule.py @@ -1,6 +1,6 @@ from sklearn.utils.testing import assert_equal, assert_not_equal -from skrules import Rule +from skrules import Rule, replace_feature_name def test_rule(): @@ -53,3 +53,13 @@ def test_equals_rule(): rule3 = "a < 3.0 and a == a" assert_equal(rule3, str(Rule(rule3))) + + +def test_replace_feature_name(): + rule = "__C__0 <= 3 and __C__1 > 4" + real_rule = "$b <= 3 and c(4) > 4" + replace_dict = { + "__C__0": "$b", + "__C__1": "c(4)" + } + assert_equal(replace_feature_name(rule, replace_dict=replace_dict), real_rule) From 63d3528a67cc46fbeeefbe2d3442d8b4ff883807 Mon Sep 17 00:00:00 2001 From: Florian Gardin Date: Thu, 1 Mar 2018 16:38:06 +0100 Subject: [PATCH 2/2] Fix bugs when there is more than 10 features --- skrules/rule.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/skrules/rule.py b/skrules/rule.py index c50c11b..7c11979 100644 --- a/skrules/rule.py +++ b/skrules/rule.py @@ -1,6 +1,11 @@ +import re + def replace_feature_name(rule, replace_dict): - for key, value in replace_dict.items(): - rule = rule.replace(key, value) + def replace(match): + return replace_dict[match.group(0)] + + rule = re.sub('|'.join(r'\b%s\b' % re.escape(s) for s in replace_dict), + replace, rule) return rule class Rule: