In [14]:
# ===================== Standard Library =====================
import queue
import random
import threading

# ===================== GUI =====================
import tkinter as tk
from tkinter import ttk, messagebox

# ===================== Data & Math =====================
import numpy as np
import pandas as pd

# ===================== Plotting =====================
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

# ===================== Machine Learning =====================
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)



In [15]:
# ---------------------- Load Data ----------------------
# Location of the dataset; change this one constant when running on another machine.
DATA_PATH = r'C:\Users\sch\OneDrive\Desktop\ML\breast_cancer.csv'
df = pd.read_csv(DATA_PATH)
target_variable = 'target'
# Build the feature matrix and the feature names from the same frame so that
# column i of X always corresponds to feature_names[i], even when the target
# column is not the last column of the CSV.
features_df = df.drop(target_variable, axis=1)
X = features_df.values
y = df[target_variable].values
feature_names = features_df.columns

In [16]:
# ---------------------- GA Functions ----------------------
# Single seed shared by both random number generators used in this notebook.
SEED = 42
# Seed Python's `random` module (used e.g. by tournament selection and the
# single/two-point crossover operators).
random.seed(SEED)
# Seed NumPy's global generator (used e.g. by population initialisation,
# uniform crossover and mutation).
np.random.seed(SEED)

def initialize_population(pop_size, num_features, fixed_population=None):
    """Create the starting population of binary feature masks.

    Args:
        pop_size: Number of chromosomes to generate.
        num_features: Number of genes (one per feature) in each chromosome.
        fixed_population: Optional pre-built population; when given, a copy of
            it is returned and the two size arguments are ignored.

    Returns:
        A 2-D integer array of 0/1 genes in which every row has at least one 1
        (or a copy of ``fixed_population``).
    """
    if fixed_population is not None:
        return fixed_population.copy()

    chromosomes = np.random.randint(0, 2, (pop_size, num_features))
    # Repair degenerate rows: an all-zero mask would select no feature at all.
    for chromosome in chromosomes:
        if not chromosome.any():
            chromosome[np.random.randint(0, num_features)] = 1
    return chromosomes

def fitness_function(individual, X, y, alpha=0.6, beta=0.1, gamma=0.3, random_state=42):
    """Score a feature mask by cross-validated accuracy, recall and compactness.

    The score is ``alpha * accuracy - beta * feature_ratio + gamma * recall``
    where accuracy and recall are 5-fold cross-validation means of a decision
    tree trained on the selected columns and ``feature_ratio`` is the fraction
    of columns that were selected.

    Args:
        individual: Sequence of 0/1 genes, one per column of ``X``.
        X: 2-D feature matrix.
        y: Target vector.
        alpha: Weight of the mean accuracy.
        beta: Penalty weight of the selected-feature ratio.
        gamma: Weight of the mean recall.
        random_state: Seed for the decision tree, so that the same mask always
            receives the same fitness.

    Returns:
        The fitness value, or 0.0 when no feature is selected.
    """
    selected_idx = np.flatnonzero(np.asarray(individual) == 1)
    if selected_idx.size == 0:
        return 0.0

    X_sub = X[:, selected_idx]
    model = DecisionTreeClassifier(random_state=random_state)
    # One cross-validation pass yields both metrics from the same fitted
    # models instead of fitting every fold twice.
    scores = cross_validate(model, X_sub, y, cv=5, scoring=("accuracy", "recall"))
    accuracy = scores["test_accuracy"].mean()
    recall = scores["test_recall"].mean()
    feature_ratio = selected_idx.size / X.shape[1]
    return (alpha * accuracy) - (beta * feature_ratio) + (gamma * recall)

def tournament_selection(fitness_values, num_parents, tournament_size=3):
    """Pick parents by repeated tournaments.

    For every parent, ``tournament_size`` distinct individuals are drawn at
    random and the one with the highest fitness wins.

    Args:
        fitness_values: Fitness of each individual, indexed by position.
        num_parents: Number of parent indices to return.
        tournament_size: Competitors per tournament; clamped to the range
            ``[1, len(fitness_values)]`` so small populations do not fail.

    Returns:
        List of ``num_parents`` indices into ``fitness_values``.

    Raises:
        ValueError: If parents are requested from an empty population.
    """
    n = len(fitness_values)
    if num_parents > 0 and n == 0:
        raise ValueError("Cannot select parents from an empty population")

    # random.sample cannot draw more distinct competitors than individuals exist.
    competitors_per_round = max(1, min(tournament_size, n))
    selected = []
    for _ in range(num_parents):
        competitors = random.sample(range(n), competitors_per_round)
        selected.append(max(competitors, key=lambda idx: fitness_values[idx]))
    return selected

def roulette_wheel_selection(fitness_values, num_parents):
    """Pick parents with probability proportional to their fitness.

    Args:
        fitness_values: Fitness of each individual, indexed by position.
        num_parents: Number of parent indices to return.

    Returns:
        List of ``num_parents`` indices into ``fitness_values``.

    Raises:
        ValueError: If parents are requested from an empty population.
    """
    if num_parents <= 0:
        return []
    fitness = np.asarray(fitness_values, dtype=float)
    n = len(fitness)
    if n == 0:
        raise ValueError("Cannot select parents from an empty population")

    # Negative fitness would produce invalid probabilities; shift the values so
    # the worst individual sits at zero while the ordering is preserved.
    lowest = fitness.min()
    if lowest < 0:
        fitness = fitness - lowest

    total_fitness = fitness.sum()
    if total_fitness == 0:
        # No individual is better than another: fall back to a uniform draw.
        if num_parents <= n:
            return random.sample(range(n), num_parents)
        # More parents than individuals were requested, so repeats are required.
        return [random.randrange(n) for _ in range(num_parents)]

    wheel = np.cumsum(fitness)
    parents = []
    for _ in range(num_parents):
        # Spin within the wheel's actual total so rounding in the cumulative
        # sum can never leave a draw without a matching slot.
        spin = random.random() * wheel[-1]
        slot = int(np.searchsorted(wheel, spin, side="right"))
        parents.append(min(slot, n - 1))
    return parents

def rank_based_selection(fitness_values, num_parents):
    """Pick parents with probability inversely proportional to their rank.

    The best individual has rank 1; individuals with equal fitness share the
    best rank of their group. Individual ``i`` is drawn with probability
    ``(1 / rank_i) / sum_j(1 / rank_j)``.

    Args:
        fitness_values: Fitness of each individual, indexed by position.
        num_parents: Number of parent indices to return.

    Returns:
        List of ``num_parents`` indices into ``fitness_values``.

    Raises:
        ValueError: If parents are requested from an empty population.
    """
    if num_parents <= 0:
        return []
    n = len(fitness_values)
    if n == 0:
        raise ValueError("Cannot select parents from an empty population")

    # Map every distinct fitness to the first (best) position it occupies in
    # the descending ordering; one pass replaces a list.index call per item.
    rank_of = {}
    for position, fit in enumerate(sorted(fitness_values, reverse=True), start=1):
        rank_of.setdefault(fit, position)

    weights = np.array([1.0 / rank_of[fit] for fit in fitness_values])
    wheel = np.cumsum(weights)

    selected = []
    for _ in range(num_parents):
        # Spin within the wheel's actual total and clamp the slot so rounding
        # can never leave a draw without a selected individual.
        spin = random.random() * wheel[-1]
        slot = int(np.searchsorted(wheel, spin, side="right"))
        selected.append(min(slot, n - 1))
    return selected

def selection(fitness_values, method="tournament", num_parents=None, tournament_size=3):
    """Dispatch to one of the parent-selection strategies.

    Args:
        fitness_values: Fitness of each individual, indexed by position.
        method: ``"tournament"``, ``"roulette"`` or ``"rank"``.
        num_parents: Number of parents to pick; defaults to one per individual.
        tournament_size: Competitors per tournament (tournament method only).

    Returns:
        List of selected indices into ``fitness_values``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    parent_count = len(fitness_values) if num_parents is None else num_parents

    if method not in ("tournament", "roulette", "rank"):
        raise ValueError("Unknown selection method")
    if method == "rank":
        return rank_based_selection(fitness_values, parent_count)
    if method == "roulette":
        return roulette_wheel_selection(fitness_values, parent_count)
    return tournament_selection(fitness_values, parent_count, tournament_size)

In [17]:
# Crossover
def single_point_crossover(p1, p2):
    """Swap the tails of two parents at one random cut point.

    Args:
        p1: First parent chromosome (1-D array).
        p2: Second parent chromosome, same length as ``p1``.

    Returns:
        Tuple ``(c1, c2)`` of child arrays. Chromosomes shorter than two genes
        have no interior cut point, so copies of the parents are returned.
    """
    n_genes = len(p1)
    if n_genes < 2:
        # random.randint(1, 0) would raise; there is nothing to exchange anyway.
        return np.array(p1), np.array(p2)

    point = random.randint(1, n_genes - 1)
    c1 = np.concatenate([p1[:point], p2[point:]])
    c2 = np.concatenate([p2[:point], p1[point:]])
    return c1, c2

def two_point_crossover(p1, p2):
    """Exchange the segment between two random cut points of two parents.

    Args:
        p1: First parent chromosome (1-D array).
        p2: Second parent chromosome, same length as ``p1``.

    Returns:
        Tuple ``(c1, c2)`` of child arrays: ``c1`` keeps the ends of ``p1``
        with the middle of ``p2`` and ``c2`` is the mirror image. Chromosomes
        shorter than three genes cannot hold two distinct interior cut points,
        so copies of the parents are returned.
    """
    n_genes = len(p1)
    if n_genes < 3:
        return np.array(p1), np.array(p2)

    # Cut points range over 1 .. n_genes - 1, the same interior positions the
    # single-point operator uses; both are distinct so the middle is non-empty.
    lo, hi = sorted(random.sample(range(1, n_genes), 2))
    c1 = np.concatenate([p1[:lo], p2[lo:hi], p1[hi:]])
    c2 = np.concatenate([p2[:lo], p1[lo:hi], p2[hi:]])
    return c1, c2

def uniform_crossover(p1, p2, p_swap=0.5):
    """Build two children by choosing each gene's source parent at random.

    Args:
        p1: First parent chromosome.
        p2: Second parent chromosome, same length as ``p1``.
        p_swap: Probability that a gene of the first child comes from ``p1``
            (and the matching gene of the second child from ``p2``).

    Returns:
        Tuple ``(c1, c2)`` of complementary child arrays.
    """
    take_from_first = np.random.rand(len(p1)) < p_swap
    child_a = np.where(take_from_first, p1, p2)
    # The second child takes, at every position, the parent the first did not.
    child_b = np.where(~take_from_first, p1, p2)
    return child_a, child_b

In [18]:
# Mutation
def mutate(pop, mutation_pop_rate, mutation_chromo_rate):
    """Return a mutated copy of a population of binary chromosomes.

    Each chromosome is chosen for mutation with probability
    ``mutation_pop_rate``; within a chosen chromosome every gene is flipped
    with probability ``mutation_chromo_rate``. A chosen chromosome that ends up
    all-zero gets one randomly placed 1.

    Args:
        pop: Population to mutate; it is copied via ``pop.copy()``.
        mutation_pop_rate: Per-chromosome mutation probability.
        mutation_chromo_rate: Per-gene flip probability.

    Returns:
        The mutated copy.
    """
    mutated = pop.copy()
    for chromosome in mutated:
        if not (np.random.rand() < mutation_pop_rate):
            continue  # this chromosome is left as it is
        for gene in range(len(chromosome)):
            if np.random.rand() < mutation_chromo_rate:
                chromosome[gene] = 1 - chromosome[gene]
        # Never leave a mutated chromosome without any selected feature.
        if np.sum(chromosome) == 0:
            chromosome[np.random.randint(0, len(chromosome))] = 1
    return mutated

In [19]:
def genetic_algorithm(
        X, y,
        pop_size=20,
        generations=30,
        selection_method="tournament",
        crossover_method="single",
        mutation_pop_rate=0.5,
        mutation_chromo_rate=0.1,
        fixed_population=None
    ):
    """Search for a good feature subset with a genetic algorithm.

    Every generation evaluates the population, remembers the best individual
    seen so far, selects parents, recombines them pairwise, mutates the
    children and builds the next population from the best-so-far individual
    plus randomly drawn children.

    Args:
        X: 2-D feature matrix.
        y: Target vector.
        pop_size: Number of individuals per generation (at least 1).
        generations: Number of generations to run (at least 1).
        selection_method: Name passed on to ``selection``.
        crossover_method: ``"single"``, ``"two"`` or ``"uniform"``.
        mutation_pop_rate: Probability that a child is mutated.
        mutation_chromo_rate: Per-gene flip probability of a mutated child.
        fixed_population: Optional initial population used instead of a
            random one.

    Returns:
        Tuple ``(best_individual, best_fitness, convergence_curve)`` where the
        curve holds the best fitness of each generation.

    Raises:
        ValueError: If ``crossover_method`` is unknown or ``pop_size`` /
            ``generations`` is smaller than 1.
    """
    # Fail fast on bad arguments instead of silently running a different
    # operator or returning no individual at all.
    if crossover_method not in ("single", "two", "uniform"):
        raise ValueError("Unknown crossover method")
    if pop_size < 1:
        raise ValueError("pop_size must be at least 1")
    if generations < 1:
        raise ValueError("generations must be at least 1")

    crossover = {
        "single": single_point_crossover,
        "two": two_point_crossover,
        "uniform": uniform_crossover,
    }[crossover_method]

    num_features = X.shape[1]
    population = initialize_population(pop_size, num_features, fixed_population=fixed_population)
    # -inf guarantees the first generation's best individual is always recorded.
    best_global_fitness = float("-inf")
    best_global_individual = None
    convergence_curve = []

    for gen in range(generations):
        fitness_values = [fitness_function(ind, X, y) for ind in population]
        best_idx = np.argmax(fitness_values)
        best_fit = fitness_values[best_idx]
        if best_fit > best_global_fitness:
            best_global_fitness = best_fit
            best_global_individual = population[best_idx].copy()
        convergence_curve.append(best_fit)

        parent_indices = selection(fitness_values, method=selection_method, num_parents=pop_size)
        parents = population[parent_indices]
        # Crossover consumes parents in pairs; pad an odd count with a random parent.
        if len(parents) % 2 != 0:
            parents = np.vstack([parents, parents[random.randint(0, len(parents)-1)]])

        children = []
        for i in range(0, len(parents), 2):
            c1, c2 = crossover(parents[i], parents[i+1])
            children.append(c1)
            children.append(c2)
        children = mutate(np.array(children), mutation_pop_rate, mutation_chromo_rate)

        # Elitism: the best individual found so far always survives; the rest
        # of the next generation is drawn at random (with repeats) from the children.
        new_population = [best_global_individual]
        while len(new_population) < pop_size:
            new_population.append(children[random.randint(0, len(children)-1)])
        population = np.array(new_population)
    return best_global_individual, best_global_fitness, convergence_curve

# ---------------------- Baseline ----------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

def baseline_all_features(X, y, test_size=0.3, random_state=42):
    """Train and score a decision tree on all features with a hold-out split.

    Args:
        X: 2-D feature matrix.
        y: Target vector.
        test_size: Fraction of the samples held out for testing.
        random_state: Seed used for both the train/test split and the
            decision tree, so one argument controls the whole baseline.

    Returns:
        Tuple ``(accuracy, recall, confusion_matrix)`` measured on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Honour the caller's seed instead of a hard-coded one (the default is unchanged).
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return acc, rec, cm

In [None]:
# ---------------------- Comparison Table ----------------------
def add_baseline_vs_ga_table(frame, X, y, ga_accuracy, ga_recall, ga_feature_count):
    """Add a table comparing the all-features baseline with the GA result.

    Args:
        frame: Parent Tk widget that receives the table.
        X: Feature matrix used to compute the baseline.
        y: Target vector used to compute the baseline.
        ga_accuracy: Accuracy obtained with the GA-selected features.
        ga_recall: Recall obtained with the GA-selected features.
        ga_feature_count: Number of features the GA selected.
    """
    baseline_acc, baseline_rec, _ = baseline_all_features(X, y)

    table_frame = tk.Frame(frame, bg="#2e2e2e", bd=1, relief="solid")
    table_frame.pack(pady=10, fill="x", padx=5)

    header_style = {"font": ("Arial", 12, "bold"), "bg": "#4e8cff", "fg": "white"}
    body_style = {"font": ("Arial", 12), "bg": "#1e1e1e", "fg": "#87CEFA"}
    # Row 0 is the header; every following row is one metric.
    table_rows = [
        (header_style, ["Metric", "Baseline (All Features)", "GA Selected Features"]),
        (body_style, ["Accuracy", f"{baseline_acc:.4f}", f"{ga_accuracy:.4f}"]),
        (body_style, ["Recall", f"{baseline_rec:.4f}", f"{ga_recall:.4f}"]),
        (body_style, ["Selected Feature Count", f"{X.shape[1]}", f"{ga_feature_count}"]),
    ]

    for row_idx, (cell_style, cells) in enumerate(table_rows):
        for col_idx, cell_text in enumerate(cells):
            cell = tk.Label(table_frame, text=cell_text, borderwidth=1, relief="solid",
                            padx=5, pady=5, **cell_style)
            cell.grid(row=row_idx, column=col_idx, sticky="nsew")

    # Let all columns share the available width equally.
    column_count = len(table_rows[0][1])
    for col_idx in range(column_count):
        table_frame.grid_columnconfigure(col_idx, weight=1)

# ---------------------- GUI ----------------------
window = tk.Tk()
window.title("GA Feature Selection - Breast Cancer")
window.geometry("1150x780")
window.configure(bg="#1e1e1e")

title = tk.Label(
    window,
    text="Genetic Algorithm Feature Selection",
    font=("Arial", 22, "bold"),
    bg="#1e1e1e",
    fg="white",
)
title.pack(pady=10)

# Input Frame
frame_inputs = tk.Frame(window, bg="#1e1e1e")
frame_inputs.pack(pady=10)


def _add_input_caption(row, caption):
    """Place a right-aligned caption in column 0 of the input grid."""
    caption_label = tk.Label(frame_inputs, text=caption, font=("Arial", 12), bg="#1e1e1e", fg="white")
    caption_label.grid(row=row, column=0, padx=5, pady=5, sticky="e")


def _add_input_entry(row, caption, default):
    """Create a captioned text entry pre-filled with ``default`` and return it."""
    _add_input_caption(row, caption)
    entry = tk.Entry(frame_inputs, font=("Arial", 12))
    entry.grid(row=row, column=1, padx=5, pady=5)
    entry.insert(0, default)
    return entry


def _add_input_choice(row, caption, options):
    """Create a captioned read-only combobox; the first option is preselected.

    Returns the backing ``StringVar`` and the combobox widget.
    """
    _add_input_caption(row, caption)
    variable = tk.StringVar(value=options[0])
    menu = ttk.Combobox(frame_inputs, textvariable=variable, values=options, state="readonly", font=("Arial", 12))
    menu.grid(row=row, column=1, padx=5, pady=5)
    return variable, menu


pop_entry = _add_input_entry(0, "Population Size:", "20")
gen_entry = _add_input_entry(1, "Generations:", "30")
mut_pop_entry = _add_input_entry(2, "Mutation Pop Rate:", "0.5")
mut_chromo_entry = _add_input_entry(3, "Mutation Chromo Rate:", "0.1")

sel_method_var, sel_method_menu = _add_input_choice(4, "Selection Method:", ["tournament", "roulette", "rank"])
cross_method_var, cross_method_menu = _add_input_choice(5, "Crossover Method:", ["single", "two", "uniform"])

# Tabs
tabs = ttk.Notebook(window)
tabs.pack(fill="both", expand=True, padx=10, pady=10)

# ---------------------- Run GA Function ----------------------
def _compute_ga_run(pop_size, generations, selection_method, crossover_method,
                    mutation_pop_rate, mutation_chromo_rate):
    """Run the GA and score the selected features on a hold-out split.

    This function touches no Tk object, so it can run on a worker thread.

    Returns:
        dict with the keys ``best_fit``, ``curve``, ``selected_idx``,
        ``selected_feature_names``, ``acc``, ``rec`` and ``cm``.
    """
    best_ind, best_fit, curve = genetic_algorithm(
        X, y,
        pop_size=pop_size,
        generations=generations,
        selection_method=selection_method,
        crossover_method=crossover_method,
        mutation_pop_rate=mutation_pop_rate,
        mutation_chromo_rate=mutation_chromo_rate
    )

    selected_idx = np.where(best_ind == 1)[0]
    selected_feature_names = [feature_names[i] for i in selected_idx]

    X_selected = X[:, selected_idx]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    # Same seed as the baseline tree so the comparison table is reproducible.
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "best_fit": best_fit,
        "curve": curve,
        "selected_idx": selected_idx,
        "selected_feature_names": selected_feature_names,
        "acc": accuracy_score(y_test, y_pred),
        "rec": recall_score(y_test, y_pred),
        "cm": confusion_matrix(y_test, y_pred),
    }


def _show_ga_run(result):
    """Render one finished GA run in a new tab; call from the Tk main thread."""
    best_fit = result["best_fit"]
    curve = result["curve"]
    selected_idx = result["selected_idx"]
    selected_feature_names = result["selected_feature_names"]
    acc = result["acc"]
    rec = result["rec"]
    cm = result["cm"]

    # New Tab. The run number is fixed now so that saving later always uses
    # this tab's own number, however many runs were added afterwards.
    run_number = len(tabs.tabs()) + 1
    run_tab = ttk.Frame(tabs)
    tabs.add(run_tab, text=f"Run {run_number}")

    # Scrollable Frame
    canvas = tk.Canvas(run_tab, bg="#1e1e1e", highlightthickness=0)
    scrollbar = tk.Scrollbar(run_tab, orient="vertical", command=canvas.yview)
    scrollable_frame = tk.Frame(canvas, bg="#1e1e1e")
    scrollable_frame.bind(
        "<Configure>",
        lambda e: canvas.configure(scrollregion=canvas.bbox("all"))
    )
    canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
    canvas.configure(yscrollcommand=scrollbar.set)
    canvas.pack(side="left", fill="both", expand=True)
    scrollbar.pack(side="right", fill="y")

    # Results Text
    result_text = f"Fitness: {best_fit:.4f}\nAccuracy: {acc:.4f}\nRecall: {rec:.4f}\nSelected Features ({len(selected_idx)}): {selected_feature_names}"
    result_label_tab = tk.Label(scrollable_frame, text=result_text, font=("Arial", 12),
                                bg="#1e1e1e", fg="#87CEFA", justify="left", wraplength=1050)
    result_label_tab.pack(pady=10)

    # Add comparison table
    add_baseline_vs_ga_table(scrollable_frame, X, y, ga_accuracy=acc, ga_recall=rec, ga_feature_count=len(selected_idx))

    # Charts Frame
    frame_charts_tab = tk.Frame(scrollable_frame, bg="#1e1e1e")
    frame_charts_tab.pack(pady=10, fill="both", expand=True)

    # Convergence Curve
    fig1 = plt.Figure(figsize=(6,4), dpi=100)
    ax1 = fig1.add_subplot(111)
    ax1.plot(curve, color="blue")
    ax1.set_title("GA Convergence Curve", fontsize=12)
    ax1.set_xlabel("Generation", fontsize=10)
    ax1.set_ylabel("Best Fitness", fontsize=10)
    ax1.tick_params(axis='x', rotation=45)
    fig1.tight_layout()
    canvas1 = FigureCanvasTkAgg(fig1, master=frame_charts_tab)
    canvas1.get_tk_widget().grid(row=0, column=0, padx=5, pady=5)
    canvas1.draw()

    # Confusion Matrix - Colored
    fig2 = plt.Figure(figsize=(6,4), dpi=100)
    ax2 = fig2.add_subplot(111)
    ax2.matshow(cm, cmap=plt.cm.Blues)
    for (i, j), val in np.ndenumerate(cm):
        # White text on dark cells, black text on light cells.
        ax2.text(j, i, f"{val}", ha='center', va='center',
                 color="white" if val>cm.max()/2 else "black", fontsize=12)
    ax2.set_xticks(range(cm.shape[1]))
    ax2.set_yticks(range(cm.shape[0]))
    ax2.set_xticklabels(["Pred 0","Pred 1"])
    ax2.set_yticklabels(["True 0","True 1"])
    # Default title colour: a white title would be invisible on the figure's
    # light default background.
    ax2.set_title("Confusion Matrix", fontsize=12)
    fig2.tight_layout()
    canvas2 = FigureCanvasTkAgg(fig2, master=frame_charts_tab)
    canvas2.get_tk_widget().grid(row=0, column=1, padx=5, pady=5)
    canvas2.draw()

    # Save Button
    def save_results():
        """Write this run's feature list, metrics and both figures to disk."""
        rows = list(zip(selected_idx, selected_feature_names))
        rows += [("", ""), ("Fitness", best_fit), ("Accuracy", acc), ("Recall", rec)]
        df_results = pd.DataFrame(rows, columns=["Selected Feature Index", "Selected Feature Name"])
        csv_file = f"GA_Run_{run_number}.csv"
        convergence_png = f"GA_Run_{run_number}_Convergence.png"
        confusion_png = f"GA_Run_{run_number}_ConfusionMatrix.png"
        df_results.to_csv(csv_file, index=False)
        fig1.savefig(convergence_png)
        fig2.savefig(confusion_png)
        messagebox.showinfo("Saved", f"Results saved as:\n{csv_file}\n{convergence_png}\n{confusion_png}")

    save_btn = tk.Button(scrollable_frame, text="Save Results", font=("Arial", 12, "bold"), bg="#4e8cff", fg="white", command=save_results)
    save_btn.pack(pady=10)


def run_ga_gui():
    """Read the GA parameters, run the GA in the background and show the result.

    All Tk interaction (reading inputs, dialogs, building the result tab)
    happens on the main thread. Only the computation runs on a worker thread,
    which hands its result back through a queue that the main thread polls.
    """
    try:
        pop_size = int(pop_entry.get())
        generations = int(gen_entry.get())
        mutation_pop_rate = float(mut_pop_entry.get())
        mutation_chromo_rate = float(mut_chromo_entry.get())
    except ValueError:
        messagebox.showerror("Error", "Please enter valid GA parameters")
        return

    rates_ok = 0.0 <= mutation_pop_rate <= 1.0 and 0.0 <= mutation_chromo_rate <= 1.0
    if pop_size < 1 or generations < 1 or not rates_ok:
        messagebox.showerror("Error", "Please enter valid GA parameters")
        return

    selection_method = sel_method_var.get()
    crossover_method = cross_method_var.get()

    outcome = queue.Queue(maxsize=1)

    def task():
        """Worker thread body: compute only, then report success or failure."""
        try:
            payload = _compute_ga_run(
                pop_size, generations, selection_method, crossover_method,
                mutation_pop_rate, mutation_chromo_rate
            )
        except Exception as exc:  # report any failure to the user instead of losing it
            outcome.put(("error", exc))
        else:
            outcome.put(("ok", payload))

    def poll_outcome():
        """Main-thread poller: render the result as soon as the worker is done."""
        try:
            status, payload = outcome.get_nowait()
        except queue.Empty:
            window.after(100, poll_outcome)
            return
        if status == "error":
            messagebox.showerror("Error", f"GA run failed: {payload}")
        else:
            _show_ga_run(payload)

    # Daemon thread: an unfinished run must not keep the process alive after
    # the window has been closed.
    threading.Thread(target=task, daemon=True).start()
    window.after(100, poll_outcome)

# Run Button
run_btn = tk.Button(
    frame_inputs,
    text="Run GA",
    font=("Arial", 14, "bold"),
    bg="#4e8cff",
    fg="white",
    command=run_ga_gui,
)
run_btn.grid(row=6, column=0, columnspan=2, pady=10)

# Hand control to Tk's event loop; this call returns once the window is closed.
window.mainloop()