In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the file inside the Kaggle dataset to load.
# Check the dataset page on Kaggle for the list of available files.
file_path = "train.csv"

# Load the latest version of the file as a pandas DataFrame.
# `dataset_load` is the current name of this API; the older `load_dataset`
# spelling is deprecated and emits a warning when called.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "munumbutt/superconductor-dataset",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'superconductor-dataset' dataset.
First 5 records:    number_of_elements  mean_atomic_mass  wtd_mean_atomic_mass  \
0                   4         88.944468             57.862692   
1                   5         92.729214             58.518416   
2                   4         88.944468             57.885242   
3                   4         88.944468             57.873967   
4                   4         88.944468             57.840143   

   gmean_atomic_mass  wtd_gmean_atomic_mass  entropy_atomic_mass  \
0          66.361592              36.116612             1.181795   
1          73.132787              36.396602             1.449309   
2          66.361592              36.122509             1.181795   
3          66.361592              36.119560             1.181795   
4          66.361592              36.110716             1.181795   

   wtd_entropy_atomic_mass  range_atomic_mass  wtd_range_atomic_mass  \
0                 1.062396        

### Preparation

In [None]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader


from dataclasses import dataclass


In [None]:
# -------------------------------------------------
# 2) Separate the feature matrix from the target
# -------------------------------------------------
# "critical_temp" is the usual label column for this dataset; change
# `target_col` if your copy of the data names it differently.
target_col = "critical_temp"
assert target_col in df.columns, f"Target column '{target_col}' not found. Columns: {df.columns.tolist()}"

y = df[target_col].values.astype(np.float32).reshape(-1, 1)
X = df.drop(columns=[target_col]).values.astype(np.float32)

print("X shape:", X.shape, "| y shape:", y.shape)


# -------------------------------------------------
# 3) 80 / 10 / 10 split into train / validation / test
# -------------------------------------------------
# First hold out 20% of the rows, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


# -------------------------------------------------
# 4) Standardize features (statistics come from the training split only)
# -------------------------------------------------
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split).astype(np.float32)
    for split in (X_train, X_val, X_test)
)


# -------------------------------------------------
# 5) Wrap each split in a TensorDataset / DataLoader
# -------------------------------------------------
batch_size = 256

train_ds, val_ds, test_ds = (
    TensorDataset(torch.tensor(features), torch.tensor(targets))
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader reshuffles; validation/test keep a fixed order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(ds, batch_size=batch_size, shuffle=False)
    for ds in (val_ds, test_ds)
)

X shape: (21263, 81) | y shape: (21263, 1)
Train: (17010, 81) Val: (2126, 81) Test: (2127, 81)


### 1. NSGA-II



In [None]:

# =========================
# 1) Chromosome Definition
# =========================
# Activation names a chromosome may carry; get_activation() turns each into a torch module.
ACTIVATIONS = ["relu", "gelu", "tanh", "leaky_relu"]

@dataclass
class Chromosome:
    """One candidate solution: a feature-selection mask plus MLP architecture and optimizer settings."""

    # Feature selection: 1/True keeps the column, 0/False drops it.
    feature_mask: np.ndarray  # shape (81,), dtype=bool or int {0,1}

    # Architecture
    num_layers: int           # 1..4
    hidden_units: np.ndarray  # shape (4,), only first num_layers used
    activation: str           # in ACTIVATIONS

    # Optional training hyperparameters (nice for performance)
    dropout: float = 0.1
    lr: float = 1e-3
    weight_decay: float = 1e-5


def random_chromosome(n_features=81) -> Chromosome:
    """
    Sample a random chromosome.

    Args:
        n_features: total number of input features (length of the mask).

    Returns:
        A Chromosome with 8-40 active features (fewer when ``n_features`` is
        smaller than that), 1-4 layers, widths in 16..512, a random activation,
        dropout in [0, 0.4), lr in [1e-4, 1e-3) and weight decay in [0, 1e-3).
    """
    # Start with 8-40 active features so the initial population is not
    # degenerate. The bounds are capped at n_features because sampling without
    # replacement fails when asked for more indices than exist.
    k_low = min(8, n_features)
    k_high = min(40, n_features)
    k = np.random.randint(k_low, k_high + 1)

    mask = np.zeros(n_features, dtype=np.int32)
    idx = np.random.choice(n_features, size=k, replace=False)
    mask[idx] = 1

    num_layers = np.random.randint(1, 5)  # 1..4
    hidden_units = np.random.randint(16, 513, size=(4,))  # 16..512

    # np.random.choice returns a numpy string; store a plain str as declared.
    activation = str(np.random.choice(ACTIVATIONS))

    dropout = np.random.uniform(0.0, 0.4)
    lr = 10 ** np.random.uniform(-4, -3)  # log-uniform, 1e-4 to 1e-3
    wd = np.random.uniform(0.0, 1e-3)

    return Chromosome(
        feature_mask=mask,
        num_layers=num_layers,
        hidden_units=hidden_units,
        activation=activation,
        dropout=float(dropout),
        lr=float(lr),
        weight_decay=float(wd),
    )


# =========================
# 2) Decode -> Build PyTorch model
# =========================
def get_activation(name: str):
    """
    Build a new activation module for ``name``.

    Supported names: "relu", "gelu", "tanh", "leaky_relu" (negative slope 0.1).
    Raises ValueError for anything else.
    """
    factories = {
        "relu": nn.ReLU,
        "gelu": nn.GELU,
        "tanh": nn.Tanh,
        "leaky_relu": lambda: nn.LeakyReLU(0.1),
    }
    # Non-string inputs can never match a known name (and may be unhashable).
    build = factories.get(name) if isinstance(name, str) else None
    if build is None:
        raise ValueError(f"Unknown activation: {name}")
    return build()


class EvoMLP(nn.Module):
    """
    Fully connected regressor assembled from chromosome genes.

    The network is ``num_layers`` hidden blocks followed by a linear head with
    a single output. Each hidden block is Linear -> activation, plus Dropout
    when ``dropout > 0``. Only the first ``num_layers`` entries of
    ``hidden_units`` are read.
    """

    def __init__(self, input_dim, num_layers, hidden_units, activation, dropout):
        super().__init__()
        fan_in = input_dim
        # A single activation module instance is reused by every hidden block.
        shared_act = get_activation(activation)

        stack = []
        for layer_idx in range(num_layers):
            fan_out = int(hidden_units[layer_idx])
            stack += [nn.Linear(fan_in, fan_out), shared_act]
            if dropout > 0:
                stack.append(nn.Dropout(dropout))
            fan_in = fan_out

        # Regression head: one scalar prediction per sample.
        stack.append(nn.Linear(fan_in, 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.net(x)


def count_params(model: nn.Module) -> int:
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# =========================
# 3) Evaluate Chromosome -> 3 objectives
#    f1: RMSE (val)
#    f2: #params
#    f3: #selected features
# =========================
def evaluate_chromosome(
    chrom: Chromosome,
    X_train, y_train, X_val, y_val,
    epochs=30,
    batch_size=256,
    device=None,
    min_features=5
):
    """
    Train the MLP described by ``chrom`` and score it on the validation split.

    Args:
        chrom: chromosome providing the feature mask, architecture and
            optimizer settings.
        X_train, y_train: training arrays; columns of ``X_train`` are selected
            with the chromosome's feature mask.
        X_val, y_val: validation arrays, masked the same way.
        epochs: number of full passes over the training data.
        batch_size: mini-batch size for training.
        device: torch device; defaults to CUDA when available, else CPU.
        min_features: chromosomes selecting fewer features are infeasible.

    Returns:
        (rmse, n_params, n_features_selected), all to be minimized.
        Infeasible chromosomes, and models whose validation predictions are
        not finite, get the penalty value 1e9 in ALL three positions so that
        every feasible solution dominates them.
    """
    # Penalize every objective. Leaving the small feature count in place would
    # make an infeasible individual the best on that axis, so nothing could
    # dominate it and it would stay on the Pareto front.
    penalty = (1e9, 1e9, 1e9)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Feature mask ---
    mask = chrom.feature_mask.astype(bool)
    n_selected = int(mask.sum())

    # Constraint: avoid too few features
    if n_selected < min_features:
        return penalty

    Xtr = X_train[:, mask]
    Xva = X_val[:, mask]

    # Torch tensors
    Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
    ytr_t = torch.tensor(y_train, dtype=torch.float32)
    Xva_t = torch.tensor(Xva, dtype=torch.float32)
    yva_t = torch.tensor(y_val, dtype=torch.float32)

    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(Xtr_t, ytr_t),
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
    )

    model = EvoMLP(
        input_dim=Xtr.shape[1],
        num_layers=chrom.num_layers,
        hidden_units=chrom.hidden_units,
        activation=chrom.activation,
        dropout=chrom.dropout,
    ).to(device)

    n_params = count_params(model)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=chrom.lr,
        weight_decay=chrom.weight_decay,
    )
    loss_fn = nn.MSELoss()

    # --- training (fixed small budget) ---
    model.train()
    for _ in range(epochs):
        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            optimizer.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()

    # --- validation RMSE ---
    model.eval()
    with torch.no_grad():
        preds = model(Xva_t.to(device)).cpu().numpy().reshape(-1)
    trues = yva_t.cpu().numpy().reshape(-1)

    # A diverged run yields NaN/inf predictions, which mean_squared_error
    # rejects with an exception. Treat it as a failed candidate instead of
    # aborting the whole search.
    if not np.isfinite(preds).all():
        return penalty

    rmse = float(np.sqrt(mean_squared_error(trues, preds)))
    return (rmse, n_params, n_selected)


# =========================
# 4) Genetic Operators
# =========================
def uniform_crossover(mask1, mask2, p=0.5):
    """
    Uniform crossover of two equal-shape masks.

    Each position is exchanged between the two copies with probability ``p``.
    The inputs are left untouched; two new arrays are returned.
    """
    assert mask1.shape == mask2.shape
    # Positions whose genes trade places between the two children.
    exchanged = np.flatnonzero(np.random.rand(mask1.shape[0]) < p)

    child1, child2 = mask1.copy(), mask2.copy()
    child1[exchanged] = mask2[exchanged]
    child2[exchanged] = mask1[exchanged]
    return child1, child2


def crossover(parent1: Chromosome, parent2: Chromosome) -> tuple[Chromosome, Chromosome]:
    """
    Recombine two parents into two children.

    - feature mask: bitwise uniform crossover (p=0.5)
    - num_layers / activation: each child independently picks one parent's value
    - hidden_units: each of the 4 genes is swapped with probability 0.5
    - dropout / lr / weight_decay: both children receive the parents' mean
    """
    def coin_flip():
        return np.random.rand() < 0.5

    mask_a, mask_b = uniform_crossover(parent1.feature_mask, parent2.feature_mask, p=0.5)

    # Depth: one independent coin per child.
    depth_a = parent1.num_layers if coin_flip() else parent2.num_layers
    depth_b = parent2.num_layers if coin_flip() else parent1.num_layers

    # Layer widths: per-gene exchange between the two copies.
    widths_a = parent1.hidden_units.copy()
    widths_b = parent2.hidden_units.copy()
    for gene in range(4):
        if coin_flip():
            widths_a[gene], widths_b[gene] = widths_b[gene], widths_a[gene]

    # Activation: one independent coin per child.
    act_a = parent1.activation if coin_flip() else parent2.activation
    act_b = parent2.activation if coin_flip() else parent1.activation

    # Continuous genes: arithmetic mean, shared by both children.
    dropout_mid = float((parent1.dropout + parent2.dropout) / 2.0)
    lr_mid = float((parent1.lr + parent2.lr) / 2.0)
    wd_mid = float((parent1.weight_decay + parent2.weight_decay) / 2.0)

    first_child = Chromosome(mask_a, depth_a, widths_a, act_a, dropout_mid, lr_mid, wd_mid)
    second_child = Chromosome(mask_b, depth_b, widths_b, act_b, dropout_mid, lr_mid, wd_mid)
    return (first_child, second_child)


def mutate(chrom: Chromosome, p_mask=0.03, p_arch=0.2, n_features=81) -> Chromosome:
    """
    Return a mutated copy of ``chrom``; the input is not modified.

    Feature bits flip independently with probability ``p_mask / n_features * 81``
    (equal to ``p_mask`` when ``n_features`` is 81). With probability ``p_arch``
    the architecture and optimizer genes are also perturbed.
    """
    # Work on private copies of every gene.
    mask = chrom.feature_mask.copy()
    depth = int(chrom.num_layers)
    widths = chrom.hidden_units.copy()
    act = str(chrom.activation)
    dropout = float(chrom.dropout)
    lr = float(chrom.lr)
    wd = float(chrom.weight_decay)

    # --- Feature mask: independent bit flips ---
    flip_rate = p_mask / n_features * 81  # scaled
    to_flip = np.random.rand(n_features) < flip_rate
    mask[to_flip] = 1 - mask[to_flip]

    # Hard safety net: never return an all-zero mask.
    if mask.sum() == 0:
        mask[np.random.randint(0, n_features)] = 1

    # --- Architecture / optimizer genes ---
    if np.random.rand() < p_arch:
        # Depth moves by one step, clamped to 1..4.
        if np.random.rand() < 0.5:
            depth = int(np.clip(depth + np.random.choice([-1, 1]), 1, 4))

        # Each width is independently reset to a fresh random value.
        for gene in range(4):
            if np.random.rand() < 0.3:
                widths[gene] = np.random.randint(16, 513)

        if np.random.rand() < 0.2:
            act = np.random.choice(ACTIVATIONS)

        # Dropout and weight decay get additive noise; lr gets multiplicative noise.
        if np.random.rand() < 0.3:
            dropout = float(np.clip(dropout + np.random.normal(0, 0.05), 0.0, 0.5))
        if np.random.rand() < 0.3:
            lr = float(np.clip(lr * (10 ** np.random.normal(0, 0.15)), 1e-4, 3e-3))
        if np.random.rand() < 0.3:
            wd = float(np.clip(wd + np.random.normal(0, 2e-4), 0.0, 1e-3))

    return Chromosome(
        feature_mask=mask,
        num_layers=depth,
        hidden_units=widths,
        activation=act,
        dropout=dropout,
        lr=lr,
        weight_decay=wd,
    )


In [None]:
import numpy as np
from typing import List, Tuple, Dict

# -------------------------
# You already have these from earlier:
# - Chromosome
# - random_chromosome()
# - crossover()
# - mutate()
# - evaluate_chromosome()
# -------------------------


# ============================================================
# 1) NSGA-II Core Utilities
# ============================================================
def dominates(obj_a: Tuple[float, ...], obj_b: Tuple[float, ...]) -> bool:
    """
    Pareto dominance test for minimization.

    Args:
        obj_a, obj_b: objective vectors of equal length (any number of
            objectives, not only three).

    Returns:
        True if A is no worse than B in every objective and strictly better
        in at least one.

    Raises:
        ValueError: if the two vectors have different lengths.
    """
    if len(obj_a) != len(obj_b):
        raise ValueError(
            f"Objective vectors must have equal length, got {len(obj_a)} and {len(obj_b)}"
        )
    pairs = list(zip(obj_a, obj_b))
    no_worse = all(a <= b for a, b in pairs)
    strictly_better = any(a < b for a, b in pairs)
    return no_worse and strictly_better


def fast_non_dominated_sort(objs: List[Tuple[float, float, float]]) -> List[List[int]]:
    """
    NSGA-II fast non-dominated sorting.

    Returns the fronts as a list of index lists: front 0 holds the
    non-dominated individuals, front 1 those dominated only by front 0, etc.
    An empty input yields an empty list.
    """
    total = len(objs)
    beaten_by = [[] for _ in range(total)]  # beaten_by[p]: indices that p dominates
    pending = [0] * total                   # pending[p]: how many still dominate p

    for p, obj_p in enumerate(objs):
        for q, obj_q in enumerate(objs):
            if q == p:
                continue
            if dominates(obj_p, obj_q):
                beaten_by[p].append(q)
            elif dominates(obj_q, obj_p):
                pending[p] += 1

    fronts = []
    current = [p for p in range(total) if pending[p] == 0]
    # Peel fronts one at a time: removing a front releases the individuals
    # whose only remaining dominators were in it.
    while current:
        fronts.append(current)
        released = []
        for p in current:
            for q in beaten_by[p]:
                pending[q] -= 1
                if pending[q] == 0:
                    released.append(q)
        current = released

    return fronts


def crowding_distance(front: List[int], objs: List[Tuple[float, ...]]) -> Dict[int, float]:
    """
    Compute the NSGA-II crowding distance for one front. Higher is better.

    Args:
        front: indices (into ``objs``) of the individuals in this front.
        objs: objective vectors for the whole population; the number of
            objectives is taken from the vectors themselves.

    Returns:
        Mapping index -> distance. Boundary individuals of every objective get
        infinity, as does every member of a front with at most two individuals.
    """
    dist = {idx: 0.0 for idx in front}
    if len(front) <= 2:
        for idx in front:
            dist[idx] = float("inf")
        return dist

    # Derive the objective count from the data instead of hard-coding 3.
    n_objectives = len(objs[front[0]])
    for m in range(n_objectives):
        front_sorted = sorted(front, key=lambda i, m=m: objs[i][m])
        dist[front_sorted[0]] = float("inf")
        dist[front_sorted[-1]] = float("inf")

        f_min = objs[front_sorted[0]][m]
        f_max = objs[front_sorted[-1]][m]
        if f_max == f_min:
            # Every individual ties on this objective: it adds no spread.
            continue

        span = f_max - f_min
        for k in range(1, len(front_sorted) - 1):
            prev_i = front_sorted[k - 1]
            next_i = front_sorted[k + 1]
            dist[front_sorted[k]] += (objs[next_i][m] - objs[prev_i][m]) / span

    return dist


def tournament_select(
    pop_indices: List[int],
    rank: Dict[int, int],
    crowd: Dict[int, float],
) -> int:
    """
    Binary tournament between two distinct, randomly drawn individuals.

    The lower (better) rank wins. On equal rank the larger crowding distance
    wins, and a complete tie goes to the second contestant drawn.
    """
    first, second = np.random.choice(pop_indices, 2, replace=False)

    if rank[first] != rank[second]:
        return first if rank[first] < rank[second] else second

    # Same front: prefer the individual in the less crowded region.
    if crowd[first] > crowd[second]:
        return first
    return second


# ============================================================
# 2) Evaluation Cache (speed!)
# ============================================================
def chrom_key(chrom) -> Tuple:
    """
    Build a hashable cache key that identifies a chromosome's genes.

    Floats are rounded (dropout to 4 decimals, lr and weight decay to 8) so
    that negligible numeric differences map to the same key.
    """
    mask_signature = chrom.feature_mask.astype(np.uint8).tobytes()
    widths = tuple(map(int, chrom.hidden_units.tolist()))
    dropout_r = round(float(chrom.dropout), 4)
    lr_r = round(float(chrom.lr), 8)
    wd_r = round(float(chrom.weight_decay), 8)
    return (
        mask_signature,
        int(chrom.num_layers),
        widths,
        str(chrom.activation),
        dropout_r,
        lr_r,
        wd_r,
    )


def evaluate_population(
    population,
    cache: Dict[Tuple, Tuple[float, float, float]],
    X_train, y_train, X_val, y_val,
    epochs=20,
):
    """
    Score every chromosome in ``population``, training only on cache misses.

    ``cache`` is updated in place with any newly computed objectives.
    Returns the objective tuples in the same order as ``population``.
    """
    scores = []
    for individual in population:
        key = chrom_key(individual)
        try:
            fitness = cache[key]
        except KeyError:
            # First time this gene combination is seen: train and remember it.
            fitness = cache[key] = evaluate_chromosome(
                individual, X_train, y_train, X_val, y_val,
                epochs=epochs,
                batch_size=256,
                min_features=5,
            )
        scores.append(fitness)
    return scores


# ============================================================
# 3) Main NSGA-II Loop
# ============================================================
def nsga2_optimize(
    X_train, y_train, X_val, y_val,
    pop_size=60,
    generations=30,
    crossover_prob=0.9,
    mutation_prob=0.9,
    eval_epochs=20,
    seed=42,
    verbose=True,
):
    """
    Run NSGA-II over chromosomes (feature mask + MLP hyperparameters).

    Args:
        X_train, y_train, X_val, y_val: data passed through to the evaluator.
        pop_size: number of individuals kept per generation.
        generations: number of generations to evolve.
        crossover_prob: probability that a selected pair is recombined.
        mutation_prob: probability (per child) that mutation is applied.
        eval_epochs: training epochs used for each fitness evaluation.
        seed: seed for both the NumPy and the torch global RNGs.
        verbose: print a one-line summary after every generation.

    Returns:
        (population, objs, pareto_solutions) where ``objs[i]`` holds the
        objectives of ``population[i]`` and ``pareto_solutions`` is the list
        of (chromosome, objectives) pairs on the final first front.
    """
    # Seed both RNGs: NumPy drives the genetic operators, while torch drives
    # weight initialisation, dropout and batch shuffling during evaluation.
    # Seeding only NumPy leaves the fitness values irreproducible.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # --- init population ---
    n_features = X_train.shape[1]
    population = [random_chromosome(n_features=n_features) for _ in range(pop_size)]
    cache = {}

    # --- initial eval ---
    objs = evaluate_population(population, cache, X_train, y_train, X_val, y_val, epochs=eval_epochs)

    for gen in range(1, generations + 1):
        # ----------------------------
        # A) Non-dominated sorting
        # ----------------------------
        fronts = fast_non_dominated_sort(objs)

        # rank and crowding distance dict for selection
        rank = {}
        crowd = {}
        for r, front in enumerate(fronts):
            for idx in front:
                rank[idx] = r
            cd = crowding_distance(front, objs)
            crowd.update(cd)

        # ----------------------------
        # B) Generate offspring
        # ----------------------------
        pop_indices = list(range(pop_size))
        offspring = []

        while len(offspring) < pop_size:
            p1_idx = tournament_select(pop_indices, rank, crowd)
            p2_idx = tournament_select(pop_indices, rank, crowd)
            parent1 = population[p1_idx]
            parent2 = population[p2_idx]

            # crossover (otherwise the parents pass through unchanged)
            if np.random.rand() < crossover_prob:
                c1, c2 = crossover(parent1, parent2)
            else:
                c1, c2 = parent1, parent2

            # mutation (mutate() returns a copy, so parents are never altered)
            if np.random.rand() < mutation_prob:
                c1 = mutate(c1, p_mask=0.05, p_arch=0.25, n_features=n_features)
            if np.random.rand() < mutation_prob:
                c2 = mutate(c2, p_mask=0.05, p_arch=0.25, n_features=n_features)

            offspring.append(c1)
            # Drop the second child when the first one already filled the quota.
            if len(offspring) < pop_size:
                offspring.append(c2)

        offspring_objs = evaluate_population(offspring, cache, X_train, y_train, X_val, y_val, epochs=eval_epochs)

        # ----------------------------
        # C) Elitist survival selection
        # ----------------------------
        combined_pop = population + offspring
        combined_objs = objs + offspring_objs

        combined_fronts = fast_non_dominated_sort(combined_objs)

        new_population = []
        new_objs = []

        for front in combined_fronts:
            if len(new_population) + len(front) <= pop_size:
                # Whole front fits.
                for idx in front:
                    new_population.append(combined_pop[idx])
                    new_objs.append(combined_objs[idx])
            else:
                # partial fill using crowding distance (least crowded first)
                cd = crowding_distance(front, combined_objs)
                sorted_front = sorted(front, key=lambda i: cd[i], reverse=True)

                remaining = pop_size - len(new_population)
                for idx in sorted_front[:remaining]:
                    new_population.append(combined_pop[idx])
                    new_objs.append(combined_objs[idx])
                break

        population = new_population
        objs = new_objs

        # ----------------------------
        # D) Logging
        # ----------------------------
        # Pareto front = rank 0 in current population
        current_fronts = fast_non_dominated_sort(objs)
        pareto = current_fronts[0]

        best_rmse = min(objs[i][0] for i in pareto)
        best_sparse = min(objs[i][2] for i in pareto)
        best_params = min(objs[i][1] for i in pareto)

        if verbose:
            print(
                f"Gen {gen:03d} | Pareto size={len(pareto)} | "
                f"Best RMSE={best_rmse:.4f} | Min Feats={best_sparse} | Min Params={int(best_params)}"
            )

    # final pareto
    final_fronts = fast_non_dominated_sort(objs)
    pareto_idx = final_fronts[0]

    pareto_solutions = [(population[i], objs[i]) for i in pareto_idx]
    return population, objs, pareto_solutions


# ============================================================
# 4) Run NSGA-II
# ============================================================
# Example usage:
# population, objs, pareto = nsga2_optimize(
#     X_train, y_train, X_val, y_val,
#     pop_size=60,
#     generations=20,
#     eval_epochs=15,
#     verbose=True,
# )
#
# print("\n=== Final Pareto Solutions (first 5) ===")
# for chrom, (rmse, n_params, n_feats) in pareto[:5]:
#     print(f"RMSE={rmse:.4f}, Params={int(n_params)}, Features={n_feats}, L={chrom.num_layers}, Act={chrom.activation}")


In [None]:
# Run the NSGA-II search with a reduced budget (20 generations, 15 epochs per evaluation).
population, objs, pareto = nsga2_optimize(
    X_train, y_train, X_val, y_val,
    pop_size=60,
    generations=20,
    eval_epochs=15,
    verbose=True,
)

# Show a handful of the non-dominated solutions that were found.
print("\n=== Final Pareto Solutions (first 5) ===")
for chrom, (rmse, n_params, n_feats) in pareto[:5]:
    print(f"RMSE={rmse:.4f}, Params={int(n_params)}, Features={n_feats}, L={chrom.num_layers}, Act={chrom.activation}")


Gen 001 | Pareto size=33 | Best RMSE=13.9332 | Min Feats=8 | Min Params=239
Gen 002 | Pareto size=42 | Best RMSE=13.6918 | Min Feats=8 | Min Params=239
Gen 003 | Pareto size=55 | Best RMSE=13.5814 | Min Feats=8 | Min Params=239
Gen 004 | Pareto size=60 | Best RMSE=13.1769 | Min Feats=8 | Min Params=239
Gen 005 | Pareto size=60 | Best RMSE=13.1769 | Min Feats=8 | Min Params=239
Gen 006 | Pareto size=60 | Best RMSE=13.1769 | Min Feats=8 | Min Params=239
Gen 007 | Pareto size=60 | Best RMSE=13.1769 | Min Feats=8 | Min Params=239
Gen 008 | Pareto size=60 | Best RMSE=13.1769 | Min Feats=8 | Min Params=239
Gen 009 | Pareto size=60 | Best RMSE=13.1769 | Min Feats=8 | Min Params=239
Gen 010 | Pareto size=60 | Best RMSE=13.1769 | Min Feats=8 | Min Params=239
Gen 011 | Pareto size=60 | Best RMSE=13.0545 | Min Feats=8 | Min Params=222
Gen 012 | Pareto size=60 | Best RMSE=12.7487 | Min Feats=8 | Min Params=222
Gen 013 | Pareto size=60 | Best RMSE=12.7487 | Min Feats=8 | Min Params=222
Gen 014 | Pa

### 2. Random Search

In [None]:
def random_search(
    X_train, y_train, X_val, y_val,
    n_trials=200,
    eval_epochs=20,
    seed=42,
    verbose=True,
):
    """
    Baseline: evaluate ``n_trials`` independently sampled random chromosomes.

    Args:
        X_train, y_train, X_val, y_val: data passed through to the evaluator.
        n_trials: number of random chromosomes to evaluate.
        eval_epochs: training epochs per evaluation.
        seed: seed for both the NumPy and the torch global RNGs.
        verbose: print a progress line every 20 trials.

    Returns:
        List of (chromosome, (rmse, params, feats)) tuples, one per trial.
    """
    # NumPy drives chromosome sampling; torch drives weight init, dropout and
    # batch shuffling during training. Seed both so that `seed` really pins
    # the results.
    np.random.seed(seed)
    torch.manual_seed(seed)
    n_features = X_train.shape[1]

    results = []  # (chrom, (rmse, params, feats))

    for t in range(1, n_trials + 1):
        chrom = random_chromosome(n_features=n_features)

        obj = evaluate_chromosome(
            chrom,
            X_train, y_train,
            X_val, y_val,
            epochs=eval_epochs,
            batch_size=256,
            min_features=5,
        )

        results.append((chrom, obj))

        if verbose and t % 20 == 0:
            rmse, params, feats = obj
            print(f"[RandomSearch] Trial {t:04d}/{n_trials} | RMSE={rmse:.4f} | Params={int(params)} | Feats={feats}")

    return results


In [None]:
rs_results = random_search(X_train, y_train, X_val, y_val, n_trials=200, eval_epochs=15)

# Single-objective view: the trial with the lowest validation RMSE.
best = min(rs_results, key=lambda trial: trial[1][0])
_, (rs_rmse, rs_params, rs_feats) = best
print("Best RandomSearch RMSE:", rs_rmse, "Features:", rs_feats, "Params:", int(rs_params))


[RandomSearch] Trial 0020/200 | RMSE=21.8696 | Params=239 | Feats=12
[RandomSearch] Trial 0040/200 | RMSE=37.2296 | Params=1081 | Feats=28
[RandomSearch] Trial 0060/200 | RMSE=15.6968 | Params=41690 | Feats=33
[RandomSearch] Trial 0080/200 | RMSE=15.2213 | Params=135464 | Feats=25
[RandomSearch] Trial 0100/200 | RMSE=14.9517 | Params=242958 | Feats=23
[RandomSearch] Trial 0120/200 | RMSE=17.0529 | Params=91254 | Feats=10
[RandomSearch] Trial 0140/200 | RMSE=18.9131 | Params=13295 | Feats=32
[RandomSearch] Trial 0160/200 | RMSE=21.4045 | Params=273863 | Feats=20
[RandomSearch] Trial 0180/200 | RMSE=40.9771 | Params=29439 | Feats=37
[RandomSearch] Trial 0200/200 | RMSE=31.8832 | Params=2738 | Feats=21
Best RandomSearch RMSE: 13.801971980191533 Features: 26 Params: 234034


### 3. Grid Search

In [None]:
import numpy as np

def make_chromosome(
    n_total_features: int,
    k_features: int,
    num_layers: int,
    hidden_units: list,
    activation: str,
    dropout: float = 0.1,
    lr: float = 1e-3,
    weight_decay: float = 1e-5,
) -> Chromosome:
    """
    Build a Chromosome with fixed hyperparameters and a random feature subset.

    ``k_features`` distinct columns out of ``n_total_features`` are switched on
    at random. ``hidden_units`` must have length 4; only the first
    ``num_layers`` entries are used by the model.
    """
    # Random k-of-n feature subset (wrapper-style selection).
    feature_mask = np.zeros(n_total_features, dtype=np.int32)
    chosen = np.random.choice(n_total_features, size=k_features, replace=False)
    feature_mask[chosen] = 1

    widths = np.array(hidden_units, dtype=np.int32)
    assert widths.shape[0] == 4, "hidden_units must have length 4"

    genes = dict(
        feature_mask=feature_mask,
        num_layers=int(num_layers),
        hidden_units=widths,
        activation=str(activation),
        dropout=float(dropout),
        lr=float(lr),
        weight_decay=float(weight_decay),
    )
    return Chromosome(**genes)


In [None]:
import itertools

def grid_search(
    X_train, y_train, X_val, y_val,
    k_feature_grid=(8, 12, 20, 30, 40, 60, 81),
    layer_grid=(1, 2, 3),
    width_grid=((64,), (128,), (256,), (256,128), (256,128,64)),
    activation_grid=("relu", "gelu"),
    repeats_per_setting=2,        # repeat each grid point with different random feature subsets
    eval_epochs=20,
    seed=42,
    verbose=True,
):
    """
    Baseline: exhaustive sweep over a hyperparameter grid.

    Args:
        X_train, y_train, X_val, y_val: data passed through to the evaluator.
        k_feature_grid: numbers of (randomly chosen) features to try.
        layer_grid: numbers of hidden layers to try.
        width_grid: tuples of layer widths; a combination is only valid when
            the layer count does not exceed the number of widths given.
        activation_grid: activation names to try.
        repeats_per_setting: evaluations per grid point, each with a fresh
            random feature subset.
        eval_epochs: training epochs per evaluation.
        seed: seed for both the NumPy and the torch global RNGs.
        verbose: print a progress line every 20 trials.

    Returns:
        List of (chromosome, (rmse, params, feats)) tuples, one per trial.
    """
    # NumPy drives the feature-subset sampling; torch drives weight init,
    # dropout and batch shuffling during training. Seed both.
    np.random.seed(seed)
    torch.manual_seed(seed)
    n_total_features = X_train.shape[1]

    results = []  # (chrom, (rmse, params, feats))

    def pad_widths(widths):
        """Pad ``widths`` with 16-unit fillers up to the fixed gene length of 4."""
        # Fillers are never read: only the first num_layers entries are used,
        # and combinations with more layers than widths are filtered out below.
        padded = list(widths) + [16] * (4 - len(widths))
        return padded[:4]

    # Keep only valid combinations (can't have more layers than widths), so the
    # progress denominator counts trials that will actually run.
    grid = [
        combo
        for combo in itertools.product(k_feature_grid, layer_grid, width_grid, activation_grid)
        if combo[1] <= len(combo[2])
    ]

    total_trials = len(grid) * repeats_per_setting
    trial = 0

    for (k, L, widths, act) in grid:
        hidden_4 = pad_widths(widths)

        for _ in range(repeats_per_setting):
            trial += 1

            chrom = make_chromosome(
                n_total_features=n_total_features,
                k_features=int(k),
                num_layers=int(L),
                hidden_units=hidden_4,
                activation=act,
                dropout=0.1,
                lr=1e-3,
                weight_decay=1e-5
            )

            obj = evaluate_chromosome(
                chrom,
                X_train, y_train,
                X_val, y_val,
                epochs=eval_epochs,
                batch_size=256,
                min_features=5,
            )

            results.append((chrom, obj))

            if verbose and trial % 20 == 0:
                rmse, params, feats = obj
                print(f"[GridSearch] Trial {trial:04d}/{total_trials} | RMSE={rmse:.4f} | Params={int(params)} | Feats={feats}")

    return results


In [None]:
# Sweep a reduced grid: 5 feature counts x valid (layers, widths) pairs x 2 activations.
gs_results = grid_search(
    X_train, y_train, X_val, y_val,
    k_feature_grid=(8, 12, 20, 40, 81),
    layer_grid=(1, 2, 3),
    width_grid=((64,), (128,), (256,), (256,128), (256,128,64)),
    activation_grid=("relu", "gelu"),
    repeats_per_setting=2,
    eval_epochs=15,
)

# Single-objective view: the trial with the lowest validation RMSE.
best_gs = min(gs_results, key=lambda trial: trial[1][0])
_, (gs_rmse, gs_params, gs_feats) = best_gs
print("Best GridSearch RMSE:", gs_rmse, "Features:", gs_feats, "Params:", int(gs_params))


[GridSearch] Trial 0020/300 | RMSE=21.3228 | Params=2561 | Feats=8
[GridSearch] Trial 0040/300 | RMSE=20.7464 | Params=1793 | Feats=12
[GridSearch] Trial 0060/300 | RMSE=17.9287 | Params=36353 | Feats=12
[GridSearch] Trial 0080/300 | RMSE=18.2716 | Params=5633 | Feats=20
[GridSearch] Trial 0100/300 | RMSE=17.1573 | Params=2689 | Feats=40
[GridSearch] Trial 0120/300 | RMSE=15.2492 | Params=43521 | Feats=40
[GridSearch] Trial 0140/300 | RMSE=15.3428 | Params=21249 | Feats=81
[GridSearch] Trial 0160/300 | RMSE=13.9287 | Params=62209 | Feats=81
Best GridSearch RMSE: 13.92567003374243 Features: 81 Params: 62209
