Merge pull request #82 from rodrigo-arenas/0.8.x

Max Features in GAFeatureSelectionCV
rodrigo-arenas · Dec 8, 2021 · 40b1366 · 40b1366
2 parents a2c6c29 + c428d6f
commit 40b1366
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 29 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 rodrigoarenas456
+Copyright (c) 2020 rodrigo-arenas
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/docs/release_notes.rst b/docs/release_notes.rst
@@ -3,6 +3,30 @@ Release Notes
 
 Some notes on new features in various releases
 
+What's new in 0.8.0dev0
+-----------------------
+
+^^^^^^^^^
+Features:
+^^^^^^^^^
+
+* :class:`~sklearn_genetic.GAFeatureSelectionCV` now has a parameter called `max_features` (int, default=None).
+  If it is not None, individuals with more features than `max_features` are penalized, which puts a "soft" upper bound
+  on the number of features to be selected.
+
+^^^^^^^^^^^^
+API Changes:
+^^^^^^^^^^^^
+
+* The default values of the following parameters changed so that the search creates
+  more extensive and varied models with better results:
+
+  - population_size from 10 to 50
+
+  - generations from 40 to 80
+
+  - mutation_probability from 0.1 to 0.2
+
 
 What's new in 0.7.0
 -------------------

diff --git a/sklearn_genetic/_version.py b/sklearn_genetic/_version.py
@@ -1 +1 @@
-__version__ = "0.7.0"
+__version__ = "0.8.0dev0"
diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py
@@ -191,10 +191,10 @@ def __init__(
         cv=3,
         param_grid=None,
         scoring=None,
-        population_size=10,
-        generations=40,
+        population_size=50,
+        generations=80,
         crossover_probability=0.8,
-        mutation_probability=0.1,
+        mutation_probability=0.2,
         tournament_size=3,
         elitism=True,
         verbose=True,
@@ -719,6 +719,9 @@ class GAFeatureSelectionCV(BaseSearchCV):
     elitism : bool, default=True
         If True takes the *tournament_size* best solution to the next generation.
 
+    max_features : int, default=None
+        The upper bound on the number of features to be selected.
+
     scoring : str or callable, default=None
         A str (see model evaluation documentation) or
         a scorer callable object / function with signature
@@ -823,12 +826,13 @@ def __init__(
         estimator,
         cv=3,
         scoring=None,
-        population_size=10,
-        generations=40,
+        population_size=50,
+        generations=80,
         crossover_probability=0.8,
-        mutation_probability=0.1,
+        mutation_probability=0.2,
         tournament_size=3,
         elitism=True,
+        max_features=None,
         verbose=True,
         keep_top_k=1,
         criteria="max",
@@ -851,6 +855,7 @@ def __init__(
         self.mutation_probability = mutation_probability
         self.tournament_size = tournament_size
         self.elitism = elitism
+        self.max_features = max_features
         self.verbose = verbose
         self.keep_top_k = keep_top_k
         self.criteria = criteria
@@ -1026,6 +1031,12 @@ def evaluate(self, individual):
         # Log the features and the cv-score
         self.logbook.record(parameters=current_generation_features)
 
+        # Penalize individuals exceeding max_features or selecting no features
+        if self.max_features and (
+            n_selected_features > self.max_features or n_selected_features == 0
+        ):
+            score = -self.criteria_sign * 10000
+
         return [score, n_selected_features]
 
     @if_delegate_has_method(delegate="estimator")

diff --git a/sklearn_genetic/plots.py b/sklearn_genetic/plots.py
@@ -78,7 +78,9 @@ def plot_search_space(estimator, height=2, s=25, features: list = None):
     """
 
     if isinstance(estimator, GAFeatureSelectionCV):
-        raise TypeError("Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance")
+        raise TypeError(
+            "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
+        )
 
     sns.set_style("white")
 
@@ -137,7 +139,9 @@ def plot_parallel_coordinates(estimator, features: list = None):
     """
 
     if isinstance(estimator, GAFeatureSelectionCV):
-        raise TypeError("Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance")
+        raise TypeError(
+            "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
+        )
 
     df = logbook_to_pandas(estimator.logbook)
     param_grid = estimator.space.param_grid

diff --git a/sklearn_genetic/tests/test_feature_selection.py b/sklearn_genetic/tests/test_feature_selection.py
@@ -320,3 +320,67 @@ def test_wrong_algorithm():
         str(excinfo.value)
         == "The algorithm genetic is not supported, please select one from ['eaSimple', 'eaMuPlusLambda', 'eaMuCommaLambda']"
     )
+
+
+def test_expected_ga_max_features():
+    clf = SGDClassifier(loss="log", fit_intercept=True)
+    generations = 10
+    max_features = 6
+    evolved_estimator = GAFeatureSelectionCV(
+        clf,
+        cv=3,
+        scoring="accuracy",
+        population_size=6,
+        generations=generations,
+        tournament_size=3,
+        elitism=False,
+        keep_top_k=4,
+        max_features=max_features,
+        verbose=False,
+        algorithm="eaSimple",
+        n_jobs=-1,
+        return_train_score=True,
+    )
+
+    evolved_estimator.fit(X_train, y_train)
+    features = evolved_estimator.best_features_
+
+    assert check_is_fitted(evolved_estimator) is None
+    assert features.shape[0] == X.shape[1]
+    assert sum(features) <= max_features
+    assert len(evolved_estimator) == generations + 1  # +1 random initial population
+    assert len(evolved_estimator.predict(X_test[:, features])) == len(X_test)
+    assert evolved_estimator.score(X_train[:, features], y_train) >= 0
+    assert len(evolved_estimator.decision_function(X_test[:, features])) == len(X_test)
+    assert len(evolved_estimator.predict_proba(X_test[:, features])) == len(X_test)
+    assert len(evolved_estimator.predict_log_proba(X_test[:, features])) == len(X_test)
+    assert evolved_estimator.score(X_test[:, features], y_test) == accuracy_score(
+        y_test, evolved_estimator.predict(X_test[:, features])
+    )
+    assert bool(evolved_estimator.get_params())
+    assert len(evolved_estimator.hof) == evolved_estimator.keep_top_k
+    assert "gen" in evolved_estimator[0]
+    assert "fitness_max" in evolved_estimator[0]
+    assert "fitness" in evolved_estimator[0]
+    assert "fitness_std" in evolved_estimator[0]
+    assert "fitness_min" in evolved_estimator[0]
+
+    cv_results_ = evolved_estimator.cv_results_
+    cv_result_keys = set(cv_results_.keys())
+
+    assert "split0_test_score" in cv_result_keys
+    assert "split1_test_score" in cv_result_keys
+    assert "split2_test_score" in cv_result_keys
+    assert "split0_train_score" in cv_result_keys
+    assert "split1_train_score" in cv_result_keys
+    assert "split2_train_score" in cv_result_keys
+    assert "mean_test_score" in cv_result_keys
+    assert "std_test_score" in cv_result_keys
+    assert "rank_test_score" in cv_result_keys
+    assert "mean_train_score" in cv_result_keys
+    assert "std_train_score" in cv_result_keys
+    assert "rank_train_score" in cv_result_keys
+    assert "std_fit_time" in cv_result_keys
+    assert "mean_score_time" in cv_result_keys
+    assert "rank_n_features" in cv_result_keys
+    assert "features" in cv_result_keys
diff --git a/sklearn_genetic/tests/test_plots.py b/sklearn_genetic/tests/test_plots.py
@@ -48,9 +48,9 @@ def test_plot_evolution():
         plot = plot_fitness_evolution(evolved_estimator, metric="accuracy")
 
     assert (
-            str(excinfo.value)
-            == "metric must be one of ['fitness', 'fitness_std', 'fitness_max', 'fitness_min'], "
-               "but got accuracy instead"
+        str(excinfo.value)
+        == "metric must be one of ['fitness', 'fitness_std', 'fitness_max', 'fitness_min'], "
+        "but got accuracy instead"
     )
 
 
@@ -70,32 +70,22 @@ def test_plot_parallel():
 
 
 def test_wrong_estimator_space():
-    estimator = GAFeatureSelectionCV(
-        clf,
-        cv=3,
-        scoring="accuracy",
-        population_size=6
-    )
+    estimator = GAFeatureSelectionCV(clf, cv=3, scoring="accuracy", population_size=6)
     with pytest.raises(Exception) as excinfo:
         plot = plot_search_space(estimator)
 
     assert (
-            str(excinfo.value)
-            == "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
+        str(excinfo.value)
+        == "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
     )
 
 
 def test_wrong_estimator_parallel():
-    estimator = GAFeatureSelectionCV(
-        clf,
-        cv=3,
-        scoring="accuracy",
-        population_size=6
-    )
+    estimator = GAFeatureSelectionCV(clf, cv=3, scoring="accuracy", population_size=6)
     with pytest.raises(Exception) as excinfo:
         plot = plot_parallel_coordinates(estimator)
 
     assert (
-            str(excinfo.value)
-            == "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
+        str(excinfo.value)
+        == "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
     )