In [2]:
# Install DEAP (only needed once per session)
!pip install deap

import random
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from deap import base, creator, tools, algorithms

# Load scikit-learn's bundled breast cancer dataset.
data = load_breast_cancer()
X = data.data            # feature matrix (rows = samples, columns = features)
y = data.target          # class label for each sample
n_features = X.shape[1]  # number of feature columns; also the chromosome length

# --- Genetic Algorithm Setup ---
# creator.create() adds the new class to the global `deap.creator` module.
# Calling it again when this cell is re-run would overwrite the class and emit
# a RuntimeWarning, so each class is only created if it does not exist yet.
# (Restart the kernel if the definitions below are ever changed.)
if not hasattr(creator, "FitnessMax"):
    # Single-objective fitness; the positive weight means "maximise".
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    # An individual is a plain list of genes that carries a FitnessMax.
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Gene: a random integer in {0, 1}; 1 means "keep this feature".
toolbox.register("attr_bool", random.randint, 0, 1)
# Individual: one gene per feature column of X.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=n_features)
# Population: a list of individuals; its size is passed at call time (n=...).
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def eval_features(individual):
    """Fitness function: score the feature subset encoded by ``individual``.

    Args:
        individual: Sequence of 0/1 genes, one per column of the module-level
            feature matrix ``X``; truthy genes mark the columns to keep.

    Returns:
        A one-element tuple (the form DEAP expects for a fitness) holding the
        mean 5-fold cross-validation score of a ``DecisionTreeClassifier``
        trained on the selected columns, or ``(0,)`` when no column is
        selected.
    """
    selected = np.asarray(individual, dtype=bool)
    if selected.any():
        # Fixed random_state keeps the score of a given subset reproducible.
        tree = DecisionTreeClassifier(random_state=42)
        fold_scores = cross_val_score(tree, X[:, selected], y, cv=5)
        return (fold_scores.mean(),)
    # Empty feature set: nothing to train on, so give it the worst score.
    return (0,)

# Wire the fitness function and the genetic operators into the toolbox.
# Each entry is (toolbox alias, function, fixed keyword arguments).
_OPERATORS = (
    ("evaluate", eval_features, {}),                    # fitness function
    ("mate", tools.cxTwoPoint, {}),                     # two-point crossover
    ("mutate", tools.mutFlipBit, {"indpb": 0.1}),       # flip each gene with probability 0.1
    ("select", tools.selTournament, {"tournsize": 3}),  # tournaments of 3 individuals
)
for _alias, _operator, _kwargs in _OPERATORS:
    toolbox.register(_alias, _operator, **_kwargs)

# --- Run GA ---
# Seed Python's RNG: population initialisation and the variation/selection
# operators all draw from the `random` module, so this makes the run repeatable.
random.seed(42)
pop = toolbox.population(n=30)
NGEN, CXPB, MUTPB = 20, 0.7, 0.2  # generations, crossover prob., mutation prob.

# Tournament selection does not guarantee that the best individual survives
# into the next generation, so keep a copy of the best one ever evaluated.
hof = tools.HallOfFame(1)

print("Starting feature selection using Gene Expression–style GA...\n")
for gen in range(NGEN):
    # varAnd works on clones of `pop`; crossover and mutation invalidate the
    # fitness of every clone they modify.
    offspring = algorithms.varAnd(pop, toolbox, CXPB, MUTPB)

    # Only individuals without a valid fitness need the (expensive)
    # cross-validation. Unmodified clones keep their parent's score, which is
    # still correct because the evaluation is deterministic.
    invalid = [ind for ind in offspring if not ind.fitness.valid]
    for ind, fit in zip(invalid, map(toolbox.evaluate, invalid)):
        ind.fitness.values = fit

    # Record the best individual before selection can discard it.
    hof.update(offspring)

    pop = toolbox.select(offspring, k=len(pop))
    top_ind = tools.selBest(pop, 1)[0]
    print(f"Gen {gen+1}: Best Accuracy = {top_ind.fitness.values[0]:.4f}")

# Report the best individual seen over the whole run, not merely the best of
# the final population.
best_ind = hof[0]
selected_features = [data.feature_names[i] for i, bit in enumerate(best_ind) if bit == 1]

print("\nBest feature subset found:", selected_features)
# The stored fitness is the cross-validation score; no need to recompute it.
print("Best cross-validation accuracy:", best_ind.fitness.values[0])


Collecting deap
  Downloading deap-1.4.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading deap-1.4.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.0/136.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap
Successfully installed deap-1.4.3
Starting feature selection using Gene Expression–style GA...

Gen 1: Best Accuracy = 0.9456
Gen 2: Best Accuracy = 0.9473
Gen 3: Best Accuracy = 0.9508
Gen 4: Best Accuracy = 0.9561
Gen 5: Best Accuracy = 0.9561
Gen 6: Best Accuracy = 0.9561
Gen 7: Best Accuracy = 0.9561
Gen 8: Best Accuracy = 0.9561
Gen 9: Best Accuracy = 0.9561
Gen 10: Best Accuracy = 0.9561
Gen 11: Best Accuracy = 0.9561
Gen 12: Best Accuracy = 0.9561
Gen 13: Best Accuracy = 0.9561
Gen 14: Best Accuracy = 0.9561
Gen 15: Best Accuracy = 0.9561
