Skip to content

Commit

Permalink
Merge pull request #82 from rodrigo-arenas/0.8.x
Browse files Browse the repository at this point in the history
Max Features in GAFeatureSelectionCV
  • Loading branch information
rodrigo-arenas committed Dec 8, 2021
2 parents a2c6c29 + c428d6f commit 40b1366
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 29 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020 rodrigoarenas456
Copyright (c) 2020 rodrigo-arenas

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
24 changes: 24 additions & 0 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,30 @@ Release Notes

Some notes on new features in various releases

What's new in 0.8.0dev0
-----------------------

^^^^^^^^^
Features:
^^^^^^^^^

* :class:`~sklearn_genetic.GAFeatureSelectionCV` now has a parameter called `max_features`, int, default=None.
If it is not None, individuals that select more features than `max_features` are penalized, which puts a "soft"
upper bound on the number of features to be selected.

^^^^^^^^^^^^
API Changes:
^^^^^^^^^^^^

* The default values of the following parameters changed so that the search explores
a larger and more diverse set of models, which tends to give better results:

- population_size from 10 to 50

- generations from 40 to 80

- mutation_probability from 0.1 to 0.2


What's new in 0.7.0
-------------------
Expand Down
2 changes: 1 addition & 1 deletion sklearn_genetic/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.0"
__version__ = "0.8.0dev0"
23 changes: 17 additions & 6 deletions sklearn_genetic/genetic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,10 @@ def __init__(
cv=3,
param_grid=None,
scoring=None,
population_size=10,
generations=40,
population_size=50,
generations=80,
crossover_probability=0.8,
mutation_probability=0.1,
mutation_probability=0.2,
tournament_size=3,
elitism=True,
verbose=True,
Expand Down Expand Up @@ -719,6 +719,9 @@ class GAFeatureSelectionCV(BaseSearchCV):
elitism : bool, default=True
If True, takes the *tournament_size* best solutions to the next generation.
max_features : int, default=None
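Soft upper bound on the number of features to be selected: individuals that
select more features than this value are penalized. If None, no bound is applied.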
scoring : str or callable, default=None
A str (see model evaluation documentation) or
a scorer callable object / function with signature
Expand Down Expand Up @@ -823,12 +826,13 @@ def __init__(
estimator,
cv=3,
scoring=None,
population_size=10,
generations=40,
population_size=50,
generations=80,
crossover_probability=0.8,
mutation_probability=0.1,
mutation_probability=0.2,
tournament_size=3,
elitism=True,
max_features=None,
verbose=True,
keep_top_k=1,
criteria="max",
Expand All @@ -851,6 +855,7 @@ def __init__(
self.mutation_probability = mutation_probability
self.tournament_size = tournament_size
self.elitism = elitism
self.max_features = max_features
self.verbose = verbose
self.keep_top_k = keep_top_k
self.criteria = criteria
Expand Down Expand Up @@ -1026,6 +1031,12 @@ def evaluate(self, individual):
# Log the features and the cv-score
self.logbook.record(parameters=current_generation_features)

# Penalize individuals with more features than the max_features parameter
if self.max_features and (
n_selected_features > self.max_features or n_selected_features == 0
):
score = -self.criteria_sign * 10000

return [score, n_selected_features]

@if_delegate_has_method(delegate="estimator")
Expand Down
8 changes: 6 additions & 2 deletions sklearn_genetic/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ def plot_search_space(estimator, height=2, s=25, features: list = None):
"""

if isinstance(estimator, GAFeatureSelectionCV):
raise TypeError("Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance")
raise TypeError(
"Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
)

sns.set_style("white")

Expand Down Expand Up @@ -137,7 +139,9 @@ def plot_parallel_coordinates(estimator, features: list = None):
"""

if isinstance(estimator, GAFeatureSelectionCV):
raise TypeError("Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance")
raise TypeError(
"Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
)

df = logbook_to_pandas(estimator.logbook)
param_grid = estimator.space.param_grid
Expand Down
64 changes: 64 additions & 0 deletions sklearn_genetic/tests/test_feature_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,67 @@ def test_wrong_algorithm():
str(excinfo.value)
== "The algorithm genetic is not supported, please select one from ['eaSimple', 'eaMuPlusLambda', 'eaMuCommaLambda']"
)


def test_expected_ga_max_features():
    """Fit GAFeatureSelectionCV with ``max_features`` set and check the fitted API.

    Checks that the best feature mask selects at most ``max_features``
    columns, that the prediction and scoring methods work on the selected
    columns, and that the logbook entries and ``cv_results_`` expose the
    expected keys.
    """
    feature_cap = 6
    n_generations = 10
    classifier = SGDClassifier(loss="log", fit_intercept=True)
    selector = GAFeatureSelectionCV(
        classifier,
        cv=3,
        scoring="accuracy",
        population_size=6,
        generations=n_generations,
        tournament_size=3,
        elitism=False,
        keep_top_k=4,
        max_features=feature_cap,
        verbose=False,
        algorithm="eaSimple",
        n_jobs=-1,
        return_train_score=True,
    )

    selector.fit(X_train, y_train)
    mask = selector.best_features_

    # Fitted state and shape of the selected-feature mask.
    assert check_is_fitted(selector) is None
    assert mask.shape[0] == X.shape[1]
    assert sum(mask) <= feature_cap
    assert len(selector) == n_generations + 1  # +1 random initial population

    # Prediction-style methods must return one row per test sample.
    n_test_rows = len(X_test)
    assert len(selector.predict(X_test[:, mask])) == n_test_rows
    assert selector.score(X_train[:, mask], y_train) >= 0
    assert len(selector.decision_function(X_test[:, mask])) == n_test_rows
    assert len(selector.predict_proba(X_test[:, mask])) == n_test_rows
    assert len(selector.predict_log_proba(X_test[:, mask])) == n_test_rows

    expected_accuracy = accuracy_score(y_test, selector.predict(X_test[:, mask]))
    assert selector.score(X_test[:, mask], y_test) == expected_accuracy

    assert bool(selector.get_params())
    assert len(selector.hof) == selector.keep_top_k

    # Every logbook entry is expected to carry these statistics.
    for logbook_key in (
        "gen",
        "fitness_max",
        "fitness",
        "fitness_std",
        "fitness_min",
    ):
        assert logbook_key in selector[0]

    reported_keys = set(selector.cv_results_.keys())

    for result_key in (
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "mean_test_score",
        "std_test_score",
        "rank_test_score",
        "mean_train_score",
        "std_train_score",
        "rank_train_score",
        "std_fit_time",
        "mean_score_time",
        "rank_n_features",
        "features",
    ):
        assert result_key in reported_keys
28 changes: 9 additions & 19 deletions sklearn_genetic/tests/test_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def test_plot_evolution():
plot = plot_fitness_evolution(evolved_estimator, metric="accuracy")

assert (
str(excinfo.value)
== "metric must be one of ['fitness', 'fitness_std', 'fitness_max', 'fitness_min'], "
"but got accuracy instead"
str(excinfo.value)
== "metric must be one of ['fitness', 'fitness_std', 'fitness_max', 'fitness_min'], "
"but got accuracy instead"
)


Expand All @@ -70,32 +70,22 @@ def test_plot_parallel():


def test_wrong_estimator_space():
estimator = GAFeatureSelectionCV(
clf,
cv=3,
scoring="accuracy",
population_size=6
)
estimator = GAFeatureSelectionCV(clf, cv=3, scoring="accuracy", population_size=6)
with pytest.raises(Exception) as excinfo:
plot = plot_search_space(estimator)

assert (
str(excinfo.value)
== "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
str(excinfo.value)
== "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
)


def test_wrong_estimator_parallel():
estimator = GAFeatureSelectionCV(
clf,
cv=3,
scoring="accuracy",
population_size=6
)
estimator = GAFeatureSelectionCV(clf, cv=3, scoring="accuracy", population_size=6)
with pytest.raises(Exception) as excinfo:
plot = plot_parallel_coordinates(estimator)

assert (
str(excinfo.value)
== "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
str(excinfo.value)
== "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
)

0 comments on commit 40b1366

Please sign in to comment.