In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# ---- Create a simple school-related dataset ----
np.random.seed(42)
n = 200

data = pd.DataFrame({
    'studytime': np.random.randint(1, 5, n),        # 1–4 hours of study per day
    'failures': np.random.randint(0, 4, n),         # past class failures
    'absences': np.random.randint(0, 20, n),        # total absences
    'health': np.random.randint(1, 6, n),           # health rating (1–5)
    'goout': np.random.randint(1, 6, n),            # how often they go out
    'G1': np.random.randint(5, 20, n),              # first period grade
    'G2': np.random.randint(5, 20, n)               # second period grade
})

# Simulated final grade (depends on studytime, G2, absences, etc.)
data['G3'] = (
    0.3 * data['G2'] +
    0.2 * data['G1'] +
    0.4 * data['studytime'] -
    0.3 * data['failures'] -
    0.2 * data['absences'] / 10 +
    np.random.randn(n)
).round().clip(0, 20)

# ---- Split the dataset ----
X = data.drop('G3', axis=1)
y = data['G3']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ---- Grey Wolf Optimization for Feature Selection ----
def fitness(solution):
    mask = solution > 0.5
    if np.sum(mask) == 0:
        return 1e6
    X_train_sel = X_train.iloc[:, mask]
    X_test_sel = X_test.iloc[:, mask]
    model = RandomForestRegressor(n_estimators=30, random_state=42)
    model.fit(X_train_sel, y_train)
    preds = model.predict(X_test_sel)
    mse = mean_squared_error(y_test, preds)
    return mse + 0.01 * (np.sum(mask) / len(solution))  # penalty for more features

def gwo(N=8, T_max=15):
    dim = X_train.shape[1]
    wolves = np.random.rand(N, dim)
    fitness_vals = np.array([fitness(w) for w in wolves])

    idx = np.argsort(fitness_vals)
    alpha, beta, delta = wolves[idx[0]], wolves[idx[1]], wolves[idx[2]]
    alpha_score = fitness_vals[idx[0]]

    for t in range(T_max):
        a = 2 - 2 * t / T_max
        for i in range(N):
            for d in range(dim):
                r1, r2 = np.random.rand(), np.random.rand()
                A1, C1 = 2*a*r1 - a, 2*r2
                Dα = abs(C1 * alpha[d] - wolves[i][d])
                X1 = alpha[d] - A1 * Dα

                r1, r2 = np.random.rand(), np.random.rand()
                A2, C2 = 2*a*r1 - a, 2*r2
                Dβ = abs(C2 * beta[d] - wolves[i][d])
                X2 = beta[d] - A2 * Dβ

                r1, r2 = np.random.rand(), np.random.rand()
                A3, C3 = 2*a*r1 - a, 2*r2
                Dδ = abs(C3 * delta[d] - wolves[i][d])
                X3 = delta[d] - A3 * Dδ

                wolves[i][d] = np.clip((X1 + X2 + X3) / 3, 0, 1)

        fitness_vals = np.array([fitness(w) for w in wolves])
        idx = np.argsort(fitness_vals)
        alpha, beta, delta = wolves[idx[0]], wolves[idx[1]], wolves[idx[2]]
        alpha_score = fitness_vals[idx[0]]

        print(f"Iteration {t+1}/{T_max} - Best MSE: {alpha_score:.4f}")

    return alpha, alpha_score

best_sol, best_score = gwo()
selected = best_sol > 0.5
print("\n=== Final Result ===")
print("Selected Features:")
print(list(X.columns[selected]))
print("Total Selected:", np.sum(selected), "/", X.shape[1])


Iteration 1/15 - Best MSE: 1.3733
Iteration 2/15 - Best MSE: 1.1735
Iteration 3/15 - Best MSE: 1.1735
Iteration 4/15 - Best MSE: 1.1735
Iteration 5/15 - Best MSE: 1.1735
Iteration 6/15 - Best MSE: 1.1735
Iteration 7/15 - Best MSE: 1.2273
Iteration 8/15 - Best MSE: 1.1735
Iteration 9/15 - Best MSE: 1.2273
Iteration 10/15 - Best MSE: 1.1735
Iteration 11/15 - Best MSE: 1.1735
Iteration 12/15 - Best MSE: 1.1735
Iteration 13/15 - Best MSE: 1.1735
Iteration 14/15 - Best MSE: 1.1735
Iteration 15/15 - Best MSE: 1.1735

=== Final Result ===
Selected Features:
['studytime', 'failures', 'absences', 'G1', 'G2']
Total Selected: 5 / 7
