In [None]:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

# -------------------- Helper functions --------------------
def initialize_population(pop_size, n_features, init_prob=0.5):
    """Draw a random binary population of feature masks.

    Args:
        pop_size: Number of chromosomes (rows) to generate.
        n_features: Number of genes (columns) per chromosome.
        init_prob: Probability that any single gene is drawn as 1.

    Returns:
        Array of shape ``(pop_size, n_features)`` holding only 0s and 1s.
    """
    gene_values = [0, 1]
    gene_probs = [1 - init_prob, init_prob]
    shape = (pop_size, n_features)
    return np.random.choice(gene_values, size=shape, p=gene_probs)

def fitness_of(chromosome, X, y, clf, cv):
    """Score one chromosome by cross-validated accuracy on its feature subset.

    Args:
        chromosome: 1-D array of 0/1 genes; columns of ``X`` whose gene
            equals 1 are used.
        X: 2-D feature matrix, indexed as ``X[:, mask]``.
        y: Targets passed through to ``cross_val_score``.
        clf: Estimator passed through to ``cross_val_score``.
        cv: Cross-validation splitter passed through to ``cross_val_score``.

    Returns:
        Mean accuracy over the CV folds, or ``0.0`` when the chromosome's
        genes sum to zero (no features selected).
    """
    chosen_columns = chromosome == 1
    # An empty feature subset cannot be fitted, so it gets the worst score.
    if chromosome.sum() == 0:
        return 0.0
    fold_accuracies = cross_val_score(
        clf, X[:, chosen_columns], y, cv=cv, scoring='accuracy'
    )
    return fold_accuracies.mean()

def evaluate_population(pop, X, y, clf, cv):
    """Compute the fitness of every chromosome in a population.

    Args:
        pop: Iterable of chromosomes (rows of a 2-D 0/1 array).
        X, y, clf, cv: Forwarded unchanged to ``fitness_of`` for each row.

    Returns:
        1-D array with one fitness value per chromosome, in population order.
    """
    scores = []
    for individual in pop:
        scores.append(fitness_of(individual, X, y, clf, cv))
    return np.array(scores)

def tournament_selection(pop, fitness, k=3):
    """Pick parents by repeated k-way tournaments.

    One tournament is run per slot in the population: ``k`` indices are
    drawn uniformly with replacement and the drawn individual with the
    highest fitness wins (the first drawn wins ties, per ``np.argmax``).

    Args:
        pop: 2-D array of chromosomes.
        fitness: 1-D array of fitness values aligned with ``pop``.
        k: Number of competitors drawn per tournament.

    Returns:
        Array of winners with the same number of rows as ``pop``.
    """
    n_individuals = len(pop)

    def run_tournament():
        entrants = np.random.randint(0, n_individuals, size=k)
        strongest = entrants[np.argmax(fitness[entrants])]
        return pop[strongest]

    return np.array([run_tournament() for _ in range(n_individuals)])

def one_point_crossover(parents, crossover_rate=0.8):
    """Recombine consecutive parent pairs with single-point crossover.

    Parents are paired as rows (0, 1), (2, 3), ...; with an odd number of
    rows the last parent is copied through unchanged. For each pair, with
    probability ``crossover_rate``, a cut point is drawn in
    ``[1, n_features)`` and the two tails from that point on are swapped.

    Args:
        parents: 2-D array of chromosomes, shape ``(pop_size, n_features)``.
        crossover_rate: Probability that a given pair is recombined.

    Returns:
        New array of children with the same shape as ``parents``; the input
        array is not modified.
    """
    pop_size, n_features = parents.shape
    children = parents.copy()

    # A cut point needs at least one gene on each side. With fewer than two
    # genes np.random.randint(1, n_features) would raise ValueError, so the
    # parents are passed through as-is.
    if n_features < 2:
        return children

    # Stopping at pop_size - 1 skips the unpaired last row of an odd-sized
    # population.
    for i in range(0, pop_size - 1, 2):
        if np.random.rand() < crossover_rate:
            pt = np.random.randint(1, n_features)
            children[i, pt:] = parents[i + 1, pt:]
            children[i + 1, pt:] = parents[i, pt:]
    return children

def mutation(pop, mutation_rate=0.01):
    """Flip each gene independently with probability ``mutation_rate``.

    The population array is modified in place and the same object is
    returned.

    Args:
        pop: Array of 0/1 genes.
        mutation_rate: Per-gene probability of flipping 0 <-> 1.

    Returns:
        ``pop`` itself, after the flips have been applied.
    """
    flip = np.random.rand(*pop.shape) < mutation_rate
    # 1 - gene turns 0 into 1 and 1 into 0; unflagged genes keep their value.
    pop[...] = np.where(flip, 1 - pop, pop)
    return pop

# -------------------- Main GA function --------------------
def genetic_feature_selection(X, y, pop_size=20, generations=10, crossover_rate=0.8, mutation_rate=0.02):
    """Search for a good feature subset with a simple genetic algorithm.

    Fitness is the mean accuracy of a liblinear logistic regression under
    3-fold stratified cross-validation (shuffled, ``random_state=0``). Each
    generation applies tournament selection (k=3), one-point crossover and
    bit-flip mutation, then keeps the ``pop_size`` fittest individuals out
    of the old population and the new children combined. A progress line is
    printed after every generation.

    Args:
        X: 2-D feature matrix; one gene per column.
        y: Target labels.
        pop_size: Number of chromosomes kept per generation.
        generations: Number of generations to run.
        crossover_rate: Probability of recombining a parent pair.
        mutation_rate: Per-gene flip probability.

    Returns:
        Tuple ``(mask, score)``: boolean mask of the best chromosome found
        and its fitness.
    """
    clf = LogisticRegression(max_iter=1000, solver='liblinear')
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

    # Starting population and its scores.
    population = initialize_population(pop_size, X.shape[1], init_prob=0.5)
    scores = evaluate_population(population, X, y, clf, cv)

    for gen in range(1, generations + 1):
        mating_pool = tournament_selection(population, scores, k=3)
        offspring = mutation(
            one_point_crossover(mating_pool, crossover_rate=crossover_rate),
            mutation_rate=mutation_rate,
        )
        offspring_scores = evaluate_population(offspring, X, y, clf, cv)

        # Elitist survival: old and new individuals compete together and
        # only the top pop_size by fitness move on.
        candidates = np.vstack([population, offspring])
        candidate_scores = np.concatenate([scores, offspring_scores])
        survivors = np.argsort(candidate_scores)[-pop_size:]
        population, scores = candidates[survivors], candidate_scores[survivors]

        print(f"Generation {gen}: Best Accuracy = {scores.max():.4f}, Avg = {scores.mean():.4f}")

    champion = np.argmax(scores)
    return population[champion].astype(bool), scores[champion]

# -------------------- Run example --------------------
# Load the scikit-learn breast cancer dataset and pull out the pieces used below.
data = load_breast_cancer()
X, y, feature_names = data.data, data.target, data.feature_names

print("Dataset: Breast Cancer (sklearn)")
print("Samples:", X.shape[0], "Features:", X.shape[1])

# Run the GA; best_chrom is a boolean mask over the feature columns.
best_chrom, best_score = genetic_feature_selection(X, y, pop_size=20, generations=10)
selected_features = feature_names[best_chrom]

# Report the winning subset.
print("\n=== Final Result ===")
print("Best Accuracy:", best_score)
print("Number of selected features:", best_chrom.sum())
print("Selected features:")
for f in selected_features:
    print("-", f)


Dataset: Breast Cancer (sklearn)
Samples: 569 Features: 30
Generation 1: Best Accuracy = 0.9561, Avg = 0.9471
Generation 2: Best Accuracy = 0.9578, Avg = 0.9527
Generation 3: Best Accuracy = 0.9578, Avg = 0.9565
Generation 4: Best Accuracy = 0.9578, Avg = 0.9577
Generation 5: Best Accuracy = 0.9578, Avg = 0.9578
Generation 6: Best Accuracy = 0.9578, Avg = 0.9578
Generation 7: Best Accuracy = 0.9578, Avg = 0.9578
Generation 8: Best Accuracy = 0.9578, Avg = 0.9578
Generation 9: Best Accuracy = 0.9578, Avg = 0.9578
Generation 10: Best Accuracy = 0.9578, Avg = 0.9578

=== Final Result ===
Best Accuracy: 0.9578483245149911
Number of selected features: 19
Selected features:
- mean radius
- mean texture
- mean area
- mean smoothness
- mean compactness
- area error
- smoothness error
- compactness error
- concavity error
- symmetry error
- fractal dimension error
- worst radius
- worst perimeter
- worst smoothness
- worst compactness
- worst concavity
- worst concave points
- worst symmetry
- 