In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [5]:
print("cwd:", os.getcwd())

# Locate the project root: use the working directory when it contains the raw
# data file, otherwise fall back to its parent.
ROOT = Path.cwd()
if not (ROOT / "data" / "raw" / "secom.data").exists():
    ROOT = ROOT.parent  # handles running from notebooks/

x_path = ROOT / "data" / "raw" / "secom.data"
y_path = ROOT / "data" / "raw" / "secom_labels.data"

# Both files are whitespace-delimited with no header row.
X = pd.read_csv(x_path, sep=r"\s+", header=None, na_values=["NaN", "nan"])
L = pd.read_csv(y_path, sep=r"\s+", header=None)

# Handle label/timestamp format variants
if L.shape[1] == 3:
    # Date and time arrived as separate columns. The 2-column branch below
    # declares a day-first layout; parse this variant day-first as well so an
    # ambiguous date such as 01/08/2008 is not read month-first (or coerced to
    # NaT when later rows contradict an inferred month-first format).
    # NOTE(review): assumes both variants share the day-first ordering - confirm.
    L.columns = ["y", "date", "time"]
    L["timestamp"] = pd.to_datetime(
        L["date"] + " " + L["time"], dayfirst=True, errors="coerce"
    )
elif L.shape[1] == 2:
    L.columns = ["y", "timestamp"]
    L["timestamp"] = pd.to_datetime(L["timestamp"], format="%d/%m/%Y %H:%M:%S", errors="coerce")
else:
    raise ValueError(f"Unexpected label file shape: {L.shape}")

# Sanity report: shapes, label balance, and the share of timestamps that parsed
# (errors="coerce" turns unparseable values into NaT, which shows up here).
print("X shape:", X.shape)
print("L shape:", L.shape)
print("Row counts match:", len(X) == len(L))
print("Feature count:", X.shape[1])
print("Unique labels:", sorted(L["y"].dropna().unique().tolist()))
print("Label counts:\n", L["y"].value_counts(dropna=False))
print("Timestamp parse success:", L["timestamp"].notna().mean())
print("Head labels+time:\n", L[["y", "timestamp"]].head())

cwd: e:\GitHub\Mini-Projects\secom-yield-monitoring\notebooks
X shape: (1567, 590)
L shape: (1567, 2)
Row counts match: True
Feature count: 590
Unique labels: [-1, 1]
Label counts:
 y
-1    1463
 1     104
Name: count, dtype: int64
Timestamp parse success: 1.0
Head labels+time:
    y           timestamp
0 -1 2008-07-19 11:55:00
1 -1 2008-07-19 12:32:00
2  1 2008-07-19 13:17:00
3 -1 2008-07-19 14:43:00
4 -1 2008-07-19 15:22:00


In [6]:
# Binary target derived from the raw label column.
y = L["y"].eq(1).astype(int)  # 1=fail, 0=pass

# Per-feature fraction of missing values.
miss = X.isna().mean()
print("Missingness summary:")
print(miss.describe())

# How many columns exceed each missingness threshold.
for t in (0.2, 0.4, 0.6, 0.8, 0.95):
    print(f"cols with >{int(t*100)}% missing: {(miss > t).sum()}")

# Missing-value rate per feature, split by outcome class, plus the gap.
na_mask = X.isna()
miss_by_class = pd.DataFrame({
    "pass_missing": na_mask[y == 0].mean(),
    "fail_missing": na_mask[y == 1].mean(),
}).assign(
    delta_fail_minus_pass=lambda d: d["fail_missing"] - d["pass_missing"]
)

print("\nTop 15 features where missingness differs by class:")
# Ranked by magnitude; the printed values are absolute, so the sign is dropped.
abs_delta = miss_by_class["delta_fail_minus_pass"].abs()
print(abs_delta.sort_values(ascending=False).head(15))

Missingness summary:
count    590.000000
mean       0.045375
std        0.154340
min        0.000000
25%        0.001276
50%        0.003829
75%        0.005743
max        0.911934
dtype: float64
cols with >20% missing: 32
cols with >40% missing: 32
cols with >60% missing: 24
cols with >80% missing: 8
cols with >95% missing: 0

Top 15 features where missingness differs by class:
72     0.171960
73     0.171960
345    0.171960
346    0.171960
385    0.148858
112    0.148858
519    0.148858
247    0.148858
111    0.066289
109    0.066289
382    0.066289
110    0.066289
516    0.066289
244    0.066289
246    0.066289
Name: delta_fail_minus_pass, dtype: float64


In [7]:
# Labels plus a 0/1 fail flag, ordered by timestamp.
df = L.assign(fail=L["y"].eq(1).astype(int)).sort_values("timestamp")

print("time range:", df["timestamp"].min(), "->", df["timestamp"].max())

# Weekly bins: number of rows and number of fails per week.
fail_by_time = df.set_index("timestamp")["fail"]
weekly = (
    fail_by_time
    .resample("W")
    .agg(["count", "sum"])
    .rename(columns={"sum": "fails"})
)  # type: ignore
weekly["fail_rate"] = weekly["fails"] / weekly["count"]

print("\nweekly fail-rate summary:")
print(weekly["fail_rate"].describe())

# Same table sorted both ways: worst weeks first, then best weeks first.
for title, ascending in (
    ("\nTop 10 highest-fail weeks:", False),
    ("\nTop 10 lowest-fail weeks:", True),
):
    print(title)
    print(weekly.sort_values("fail_rate", ascending=ascending).head(10))

time range: 2008-07-19 11:55:00 -> 2008-10-17 06:07:00

weekly fail-rate summary:
count    14.000000
mean      0.087106
std       0.069633
min       0.010638
25%       0.034706
50%       0.073364
75%       0.123665
max       0.230769
Name: fail_rate, dtype: float64

Top 10 highest-fail weeks:
            count  fails  fail_rate
timestamp                          
2008-07-20     13      3   0.230769
2008-08-03     48     10   0.208333
2008-08-17     51      7   0.137255
2008-08-10    108     14   0.129630
2008-08-24    208     22   0.105769
2008-07-27     21      2   0.095238
2008-10-05    169     15   0.088757
2008-10-12    138      8   0.057971
2008-09-14     95      4   0.042105
2008-08-31    169      7   0.041420

Top 10 lowest-fail weeks:
            count  fails  fail_rate
timestamp                          
2008-10-19     94      1   0.010638
2008-09-07    133      2   0.015038
2008-09-28    166      4   0.024096
2008-09-21    154      5   0.032468
2008-08-31    169      7   0.041420

In [11]:
# X already loaded
n = len(X)


def _dominant_share(col):
    """Return the relative frequency of a column's most common non-NaN value.

    A column with no observed values is reported as 1.0.
    """
    if not col.notna().any():
        return 1.0
    return col.value_counts(dropna=True, normalize=True).iloc[0]


# 1) Constant / near-constant
nunique = X.nunique(dropna=True)
const_cols = nunique.index[nunique <= 1].tolist()

# near-constant by dominant value frequency (ignoring NaN); a constant column
# has a dominant share of 1.0, so it also appears in near_const_cols
dom_frac = X.apply(_dominant_share)
near_const_cols = dom_frac.index[dom_frac >= 0.995].tolist()

print("constant features:", len(const_cols))
print("near-constant (>=99.5% same value):", len(near_const_cols))

# 2) Correlation redundancy (after median impute only for this audit)
Xi = X.fillna(X.median(numeric_only=True))

corr = Xi.corr().abs()
# Keep only the strict upper triangle so each unordered pair is counted once.
strict_upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(strict_upper)
high_corr_pairs = upper.stack().sort_values(ascending=False)

print("\n# pairs with |corr| >= 0.95:", int((high_corr_pairs >= 0.95).sum()))
print("# pairs with |corr| >= 0.90:", int((high_corr_pairs >= 0.90).sum()))
print("\nTop 20 absolute-correlation pairs:")
print(high_corr_pairs.head(20))

constant features: 116
near-constant (>=99.5% same value): 122

# pairs with |corr| >= 0.95: 316
# pairs with |corr| >= 0.90: 397

Top 20 absolute-correlation pairs:
209  347    1.000000
     342    1.000000
     478    1.000000
74   478    1.000000
     209    1.000000
     342    1.000000
     347    1.000000
342  347    1.000000
347  478    1.000000
206  209    1.000000
     347    1.000000
     478    1.000000
74   206    1.000000
206  342    1.000000
342  478    1.000000
34   36     1.000000
140  275    1.000000
172  174    1.000000
307  309    0.999999
152  287    0.999997
dtype: float64


In [13]:
# Build `final_keep`: the feature columns that survive removal of constant,
# near-constant, and (near-)perfectly correlated duplicate columns.
y_bin = (L["y"] == 1).astype(int)

# base masks
miss = X.isna().mean()
# share of the most frequent non-NaN value per column (1.0 when all-NaN)
dom = X.apply(lambda s: s.value_counts(dropna=True, normalize=True).iloc[0] if s.notna().any() else 1.0)
nunique = X.nunique(dropna=True)

drop_const = set(X.columns[nunique <= 1])
drop_near_const = set(X.columns[dom >= 0.995])

base_keep = [c for c in X.columns if c not in drop_const and c not in drop_near_const]
# median-impute the surviving columns; Xi is only used for the correlation
# graph and the representative ranking below
Xi = X[base_keep].copy().fillna(X[base_keep].median())

# high-corr duplicate graph: one edge per column pair with |corr| >= 0.9999
corr = Xi.corr().abs()
# strict upper triangle so each unordered pair appears once
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
pairs = upper.stack()
dup_pairs = pairs[pairs >= 0.9999].reset_index()
dup_pairs.columns = ["a", "b", "abs_corr"]

# adjacency: column -> set of columns it duplicates
adj = {c: set() for c in Xi.columns}
for a, b, _ in dup_pairs.itertuples(index=False):
    adj[a].add(b)
    adj[b].add(a)

# connected components via an explicit-stack depth-first walk; columns with
# no duplicate edge are skipped and therefore never form a component
# NOTE: the loop variable `n` rebinds the notebook-level name `n`, which the
# earlier audit cell sets to len(X).
seen, comps = set(), []
for n in Xi.columns:
    if n in seen or not adj[n]:
        continue
    stack, comp = [n], set()
    while stack:
        v = stack.pop()
        if v in seen:
            continue
        seen.add(v)
        comp.add(v)
        stack.extend(adj[v] - seen)
    comps.append(comp)

# representative chooser: keep exactly one column per duplicate component
var = Xi.var()
yc = y_bin - y_bin.mean()  # centered target for the correlation tie-breaker
rep_keep, rep_drop = set(), set()

for comp in comps:
    comp = list(comp)
    # rank: lower missing, then higher variance, then higher |corr with y|
    # NOTE(review): the last tie-breaker reads the labels of every row, i.e.
    # before any train/test split; later cross-validation cells reuse
    # final_keep, so this is a (small) source of label leakage - confirm
    # whether that is acceptable.
    cxy = {}
    for c in comp:
        xc = Xi[c] - Xi[c].mean()
        # absolute cosine between centered feature and centered target;
        # the 1e-12 keeps the division defined for zero-norm vectors
        cxy[c] = abs((xc @ yc) / (np.linalg.norm(xc) * np.linalg.norm(yc) + 1e-12))
    best = sorted(comp, key=lambda c: (miss[c], -var[c], -cxy[c]))[0]
    rep_keep.add(best)
    rep_drop.update(set(comp) - {best})

# everything dropped for any reason, and the surviving columns in X's order
final_drop = drop_const | drop_near_const | rep_drop
final_keep = [c for c in X.columns if c not in final_drop]

print("drop_const:", len(drop_const))
print("drop_near_const:", len(drop_near_const))
print("duplicate_components:", len(comps))
print("drop_from_duplicates:", len(rep_drop))
print("final_keep_count:", len(final_keep))

drop_const: 116
drop_near_const: 122
duplicate_components: 10
drop_from_duplicates: 12
final_keep_count: 456


In [23]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif, r_regression

In [15]:
X_base = X[final_keep].copy()
y_bin = (L["y"] == 1).astype(int).values  # 1=fail, 0=pass

def eval_baseline(add_indicator=True):
    """Cross-validate the no-selection baseline on the notebook globals X_base / y_bin.

    The model is median imputation -> standardization -> class-balanced
    logistic regression, scored over a shuffled 10-fold stratified split.

    Parameters
    ----------
    add_indicator : bool
        Forwarded to ``SimpleImputer(add_indicator=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per fold with columns ``fold``, ``BER``, ``True+`` and
        ``True-``. The mean and std of the three metrics are also printed.
    """
    steps = [
        ("imputer", SimpleImputer(strategy="median", add_indicator=add_indicator)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=42,
        )),
    ]
    model = Pipeline(steps)
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    records = []
    fold_no = 0
    for train_idx, test_idx in splitter.split(X_base, y_bin):
        fold_no += 1

        # Refit the whole pipeline on the training part of this fold only.
        model.fit(X_base.iloc[train_idx], y_bin[train_idx])
        y_hat = model.predict(X_base.iloc[test_idx])

        cm = confusion_matrix(y_bin[test_idx], y_hat, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        sensitivity = tp / (tp + fn + 1e-12)   # True+
        specificity = tn / (tn + fp + 1e-12)   # True-

        records.append({
            "fold": fold_no,
            "BER": 1.0 - 0.5 * (sensitivity + specificity),
            "True+": sensitivity,
            "True-": specificity,
        })

    out = pd.DataFrame(records)
    metric_cols = ["BER", "True+", "True-"]
    print(f"\nadd_indicator={add_indicator}")
    print(out[metric_cols].mean().rename("mean"))
    print(out[metric_cols].std().rename("std"))
    return out

res_no_ind = eval_baseline(add_indicator=False)
res_with_ind = eval_baseline(add_indicator=True)


add_indicator=False
BER      0.418409
True+    0.269091
True-    0.894092
Name: mean, dtype: float64
BER      0.092275
True+    0.179234
True-    0.022404
Name: std, dtype: float64

add_indicator=True
BER      0.410133
True+    0.289091
True-    0.890644
Name: mean, dtype: float64
BER      0.081925
True+    0.159130
True-    0.018448
Name: std, dtype: float64


In [18]:
X_base = X[final_keep].copy()
y_bin = (L["y"] == 1).astype(int).values

class S2NSelector(BaseEstimator, TransformerMixin):
    """Keep the k columns with the largest signal-to-noise score.

    The score of a column is ``|mean(y==1) - mean(y==0)|`` divided by the sum
    of the two per-class standard deviations (plus ``eps``), computed with
    NaN-ignoring statistics.
    """

    def __init__(self, k=40, eps=1e-12):
        self.k = k
        self.eps = eps

    def fit(self, X, y):
        """Score every column and remember the positions of the top ``k``."""
        data = np.asarray(X, dtype=float)
        labels = np.asarray(y)

        fail_rows = data[labels == 1]
        pass_rows = data[labels == 0]

        mean_gap = np.abs(np.nanmean(fail_rows, axis=0) - np.nanmean(pass_rows, axis=0))
        spread = np.nanstd(fail_rows, axis=0, ddof=0) + np.nanstd(pass_rows, axis=0, ddof=0)

        # Non-finite scores are zeroed before ranking.
        ratio = mean_gap / (spread + self.eps)
        ratio = np.nan_to_num(ratio, nan=0.0, posinf=0.0, neginf=0.0)

        self.scores_ = ratio
        descending = np.argsort(ratio)[::-1]
        self.idx_ = descending[: self.k]
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``, as a NumPy array."""
        as_array = np.asarray(X)
        return as_array[:, self.idx_]

def run_s2n(k=40, random_state=42, n_splits=10):
    """Cross-validate signal-to-noise top-k selection + balanced logistic regression.

    Uses the notebook globals ``X_base`` and ``y_bin``. Each fold refits the
    full pipeline (median impute with indicators -> scale -> S2NSelector ->
    logistic regression) on the training part only.

    Parameters
    ----------
    k : int
        Number of columns kept by the selector.
    random_state : int
        Seed for the fold shuffle and the classifier (default matches the
        previously hard-coded 42).
    n_splits : int
        Number of stratified folds (default matches the previously
        hard-coded 10).

    Returns
    -------
    (out, freq)
        ``out`` is a per-fold DataFrame of ``BER`` / ``True+`` / ``True-``.
        ``freq`` is a Series giving, for each selected column position, the
        fraction of folds in which it was selected. Positions refer to the
        matrix that reaches the selector; the imputer may append
        missing-indicator columns, so positions can exceed the width of
        ``X_base``.
    """
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("s2n", S2NSelector(k=k)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rows = []
    selected_counts = {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_bin), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_bin[tr], y_bin[te]

        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xte)

        tn, fp, fn, tp = confusion_matrix(yte, pred, labels=[0,1]).ravel()
        # The tiny constant keeps the ratios defined if a class is absent.
        tpr = tp / (tp + fn + 1e-12)
        tnr = tn / (tn + fp + 1e-12)
        ber = 1 - 0.5 * (tpr + tnr)

        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        idx = pipe.named_steps["s2n"].idx_
        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(out[["BER","True+","True-"]].mean().rename("mean"))
    print(out[["BER","True+","True-"]].std().rename("std"))

    # Normalize by the actual fold count instead of a literal 10.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))
    return out, freq

res_s2n, freq_s2n = run_s2n(k=40)

BER      0.365012
True+    0.559091
True-    0.710884
Name: mean, dtype: float64
BER      0.072750
True+    0.134721
True-    0.038923
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
52     1.0
93     1.0
119    1.0
25     1.0
392    1.0
114    1.0
18     1.0
258    1.0
335    1.0
244    1.0
115    1.0
101    1.0
120    1.0
112    1.0
117    1.0
281    0.9
353    0.9
165    0.9
111    0.9
520    0.8
dtype: float64


In [19]:
# assumes these already exist from prior cells:
# X_base = X[final_keep].copy()
# y_bin = (L["y"] == 1).astype(int).values

class WelchTSelector(BaseEstimator, TransformerMixin):
    """Keep the k columns with the largest absolute Welch t statistic.

    The statistic compares rows with ``y == 1`` against rows with ``y == 0``
    using unpooled sample variances (``ddof=1``). Plain (not NaN-aware) mean
    and variance are used, so the input is expected to be free of NaN.
    """

    def __init__(self, k=40, eps=1e-12):
        self.k = k
        self.eps = eps

    def fit(self, X, y):
        """Score every column and remember the positions of the top ``k``."""
        data = np.asarray(X, dtype=float)
        labels = np.asarray(y)

        grp_fail = data[labels == 1]
        grp_pass = data[labels == 0]

        # Group sizes, floored at 1 so the divisions below stay defined.
        size_fail = max(grp_fail.shape[0], 1)
        size_pass = max(grp_pass.shape[0], 1)

        mean_gap = np.abs(np.mean(grp_fail, axis=0) - np.mean(grp_pass, axis=0))
        std_err = np.sqrt(
            np.var(grp_fail, axis=0, ddof=1) / size_fail
            + np.var(grp_pass, axis=0, ddof=1) / size_pass
        )

        # Non-finite statistics are zeroed before ranking.
        stat = mean_gap / (std_err + self.eps)
        stat = np.nan_to_num(stat, nan=0.0, posinf=0.0, neginf=0.0)

        self.scores_ = stat
        descending = np.argsort(stat)[::-1]
        self.idx_ = descending[: self.k]
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``, as a NumPy array."""
        as_array = np.asarray(X)
        return as_array[:, self.idx_]


def run_t(k=40, random_state=42, n_splits=10):
    """Cross-validate Welch-t top-k selection + balanced logistic regression.

    Uses the notebook globals ``X_base`` and ``y_bin``. Each fold refits the
    full pipeline (median impute with indicators -> scale -> WelchTSelector ->
    logistic regression) on the training part only.

    Parameters
    ----------
    k : int
        Number of columns kept by the selector.
    random_state : int
        Seed for the fold shuffle and the classifier.
    n_splits : int
        Number of stratified folds (default matches the previously
        hard-coded 10).

    Returns
    -------
    (out, freq)
        ``out`` is a per-fold DataFrame of ``BER`` / ``True+`` / ``True-``.
        ``freq`` is a Series giving, for each selected column position, the
        fraction of folds in which it was selected. Positions refer to the
        matrix that reaches the selector; the imputer may append
        missing-indicator columns, so positions can exceed the width of
        ``X_base``.
    """
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("tsel", WelchTSelector(k=k)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rows = []
    selected_counts = {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_bin), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_bin[tr], y_bin[te]

        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xte)

        tn, fp, fn, tp = confusion_matrix(yte, pred, labels=[0, 1]).ravel()
        # The tiny constant keeps the ratios defined if a class is absent.
        tpr = tp / (tp + fn + 1e-12)  # True+
        tnr = tn / (tn + fp + 1e-12)  # True-
        ber = 1.0 - 0.5 * (tpr + tnr)

        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        idx = pipe.named_steps["tsel"].idx_
        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))

    # Normalize by the actual fold count instead of a literal 10.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))

    return out, freq


res_t, freq_t = run_t(k=40, random_state=42)

BER      0.316597
True+    0.665455
True-    0.701351
Name: mean, dtype: float64
BER      0.091090
True+    0.175486
True-    0.032003
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
52     1.0
119    1.0
93     0.9
715    0.9
801    0.8
803    0.8
714    0.8
527    0.8
528    0.8
526    0.8
717    0.8
716    0.8
800    0.8
799    0.8
636    0.8
634    0.8
633    0.8
635    0.8
524    0.8
806    0.7
dtype: float64


In [22]:
def run_f(k=40, random_state=42, n_splits=10):
    """Cross-validate ANOVA-F top-k selection + balanced logistic regression.

    Uses the notebook globals ``X_base`` and ``y_bin``. Each fold refits the
    full pipeline (median impute with indicators -> scale ->
    ``SelectKBest(f_classif)`` -> logistic regression) on the training part
    only.

    Parameters
    ----------
    k : int
        Number of columns kept by the selector.
    random_state : int
        Seed for the fold shuffle and the classifier.
    n_splits : int
        Number of stratified folds (default matches the previously
        hard-coded 10).

    Returns
    -------
    (out, freq)
        ``out`` is a per-fold DataFrame of ``BER`` / ``True+`` / ``True-``.
        ``freq`` is a Series giving, for each selected column position, the
        fraction of folds in which it was selected. Positions refer to the
        matrix that reaches the selector; the imputer may append
        missing-indicator columns, so positions can exceed the width of
        ``X_base``.
    """
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("fsel", SelectKBest(score_func=f_classif, k=k)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rows, selected_counts = [], {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_bin), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_bin[tr], y_bin[te]

        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xte)

        tn, fp, fn, tp = confusion_matrix(yte, pred, labels=[0, 1]).ravel()
        # The tiny constant keeps the ratios defined if a class is absent.
        tpr = tp / (tp + fn + 1e-12)
        tnr = tn / (tn + fp + 1e-12)
        ber = 1.0 - 0.5 * (tpr + tnr)
        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        idx = pipe.named_steps["fsel"].get_support(indices=True)
        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))

    # Normalize by the actual fold count instead of a literal 10.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))
    return out, freq

res_f, freq_f = run_f(k=40, random_state=42)

BER      0.314160
True+    0.616364
True-    0.755316
Name: mean, dtype: float64
BER      0.057419
True+    0.132906
True-    0.043669
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
18     1.0
25     1.0
93     1.0
52     1.0
334    1.0
335    1.0
151    1.0
119    1.0
340    1.0
281    1.0
244    1.0
392    1.0
186    1.0
339    1.0
338    1.0
114    0.9
241    0.9
148    0.9
111    0.9
115    0.8
dtype: float64


In [24]:
def run_pearson(k=40, random_state=42, n_splits=10):
    """Cross-validate |Pearson r| top-k selection + balanced logistic regression.

    Uses the notebook globals ``X_base`` and ``y_bin``. Each fold refits the
    full pipeline (median impute with indicators -> scale ->
    ``SelectKBest`` scored by ``|r_regression|`` -> logistic regression) on
    the training part only.

    Note: the recorded outputs of this run are identical to those of the
    F-test run, and the comparison cell reports the same selected set. For a
    two-class target the F statistic is presumably a monotone function of
    |r|, which would make the two rankings coincide - treat this run as a
    cross-check rather than an independent method.

    Parameters
    ----------
    k : int
        Number of columns kept by the selector.
    random_state : int
        Seed for the fold shuffle and the classifier.
    n_splits : int
        Number of stratified folds (default matches the previously
        hard-coded 10).

    Returns
    -------
    (out, freq)
        ``out`` is a per-fold DataFrame of ``BER`` / ``True+`` / ``True-``.
        ``freq`` is a Series giving, for each selected column position, the
        fraction of folds in which it was selected. Positions refer to the
        matrix that reaches the selector; the imputer may append
        missing-indicator columns, so positions can exceed the width of
        ``X_base``.
    """
    def abs_pearson(X, y):
        """Score each column by the absolute Pearson correlation with y."""
        return np.abs(r_regression(X, y))

    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("psel", SelectKBest(score_func=abs_pearson, k=k)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rows, selected_counts = [], {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_bin), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_bin[tr], y_bin[te]

        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xte)

        tn, fp, fn, tp = confusion_matrix(yte, pred, labels=[0, 1]).ravel()
        # The tiny constant keeps the ratios defined if a class is absent.
        tpr = tp / (tp + fn + 1e-12)
        tnr = tn / (tn + fp + 1e-12)
        ber = 1.0 - 0.5 * (tpr + tnr)
        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        idx = pipe.named_steps["psel"].get_support(indices=True)
        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))

    # Normalize by the actual fold count instead of a literal 10.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))
    return out, freq

res_p, freq_p = run_pearson(k=40, random_state=42)

BER      0.314160
True+    0.616364
True-    0.755316
Name: mean, dtype: float64
BER      0.057419
True+    0.132906
True-    0.043669
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
18     1.0
25     1.0
93     1.0
52     1.0
334    1.0
335    1.0
151    1.0
119    1.0
340    1.0
281    1.0
244    1.0
392    1.0
186    1.0
339    1.0
338    1.0
114    0.9
241    0.9
148    0.9
111    0.9
115    0.8
dtype: float64


In [25]:
# Column positions that each run selected in at least one fold (the index of
# the frequency Series), compared as plain sets.
set_f = set(freq_f.index)
set_p = set(freq_p.index)
shared = set_f.intersection(set_p)
combined = set_f.union(set_p)
print("same selected set:", set_f == set_p)
print("Jaccard:", len(shared) / len(combined))

same selected set: True
Jaccard: 1.0


In [26]:
from skrebate import ReliefF

class ReliefFSelector(BaseEstimator, TransformerMixin):
    """Keep the k columns with the largest ReliefF importance.

    Wraps ``skrebate.ReliefF``: the fitted estimator is stored on ``rf_`` and
    its ``feature_importances_`` are ranked here to pick the columns.
    """

    def __init__(self, k=40, n_neighbors=10):
        self.k = k
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Fit ReliefF and remember the positions of the top ``k`` columns."""
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y)

        relief = ReliefF(n_features_to_select=self.k, n_neighbors=self.n_neighbors)
        self.rf_ = relief
        relief.fit(features, labels)

        # NaN importances are pushed to -inf so they rank last.
        weights = np.asarray(relief.feature_importances_, dtype=float)
        weights = np.nan_to_num(weights, nan=-np.inf)

        self.scores_ = weights
        descending = np.argsort(weights)[::-1]
        self.idx_ = descending[: self.k]
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``, as a NumPy array."""
        as_array = np.asarray(X)
        return as_array[:, self.idx_]


def run_relieff(k=40, n_neighbors=10, random_state=42, n_splits=10):
    """Cross-validate ReliefF top-k selection + balanced logistic regression.

    Uses the notebook globals ``X_base`` and ``y_bin``. Each fold refits the
    full pipeline (median impute with indicators -> scale -> ReliefFSelector
    -> logistic regression) on the training part only.

    Parameters
    ----------
    k : int
        Number of columns kept by the selector.
    n_neighbors : int
        Forwarded to ``ReliefFSelector``.
    random_state : int
        Seed for the fold shuffle and the classifier. It is not passed to the
        selector.
    n_splits : int
        Number of stratified folds (default matches the previously
        hard-coded 10).

    Returns
    -------
    (out, freq)
        ``out`` is a per-fold DataFrame of ``BER`` / ``True+`` / ``True-``.
        ``freq`` is a Series giving, for each selected column position, the
        fraction of folds in which it was selected. Positions refer to the
        matrix that reaches the selector; the imputer may append
        missing-indicator columns, so positions can exceed the width of
        ``X_base``.
    """
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("rsel", ReliefFSelector(k=k, n_neighbors=n_neighbors)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rows = []
    selected_counts = {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_bin), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_bin[tr], y_bin[te]

        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xte)

        tn, fp, fn, tp = confusion_matrix(yte, pred, labels=[0, 1]).ravel()
        # The tiny constant keeps the ratios defined if a class is absent.
        tpr = tp / (tp + fn + 1e-12)  # True+
        tnr = tn / (tn + fp + 1e-12)  # True-
        ber = 1.0 - 0.5 * (tpr + tnr)

        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        idx = pipe.named_steps["rsel"].idx_
        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))

    # Normalize by the actual fold count instead of a literal 10.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))

    return out, freq

res_relief, freq_relief = run_relieff(k=40, n_neighbors=10, random_state=42)

BER      0.305183
True+    0.628182
True-    0.761453
Name: mean, dtype: float64
BER      0.096087
True+    0.171122
True-    0.049545
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
58     1.0
311    1.0
52     1.0
405    1.0
217    1.0
57     1.0
71     1.0
218    1.0
55     1.0
406    1.0
54     1.0
93     1.0
122    1.0
281    1.0
312    1.0
59     1.0
48     1.0
356    1.0
168    1.0
62     0.9
dtype: float64


In [27]:
class GramSchmidtSelector(BaseEstimator, TransformerMixin):
    """Greedy forward selection by Gram-Schmidt orthogonalization.

    At each step the column with the largest absolute cosine to the current
    target residual is selected; its direction is then projected out of the
    residual and out of every remaining column.

    The target is centered inside ``fit`` but the columns are not, so the
    input is assumed to be already centered (as it is after the
    ``StandardScaler`` step in ``run_gram_schmidt``).

    Attributes set by ``fit``
    -------------------------
    idx_ : positions of the selected columns; greedy picks first, followed by
        lowest-index unselected columns if the greedy loop stopped early.
    scores_ : absolute cosine of each greedy pick at the time it was chosen
        (shorter than ``idx_`` when padding occurred).
    """

    def __init__(self, k=40, eps=1e-12):
        self.k = k
        self.eps = eps

    def fit(self, X, y):
        """Run the greedy selection and store ``idx_`` / ``scores_``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        _, p = X.shape
        k = min(self.k, p)

        # Work copies: Xw is deflated in place, r is the target residual.
        Xw = X.copy()
        r = y - y.mean()

        remaining = list(range(p))
        selected = []
        scores = []

        for _ in range(k):
            r_norm = np.linalg.norm(r)
            if r_norm < self.eps or not remaining:
                break

            # Score remaining columns by absolute cosine with the residual.
            # Columns whose norm has collapsed below eps are not candidates.
            best_j = None
            best_score = -np.inf
            best_norm = 0.0

            for j in remaining:
                xj = Xw[:, j]
                x_norm = np.linalg.norm(xj)
                if x_norm < self.eps:
                    continue
                s = abs(np.dot(xj, r)) / (x_norm * r_norm + self.eps)
                # Strict '>' keeps the lowest-index column on ties.
                if s > best_score:
                    best_score = s
                    best_j = j
                    best_norm = x_norm

            if best_j is None or not np.isfinite(best_score):
                break

            selected.append(best_j)
            scores.append(best_score)
            remaining.remove(best_j)

            # Unit direction of the chosen column. Its norm is >= eps by
            # construction, so no separate zero-norm branch is needed here.
            q = Xw[:, best_j] / best_norm

            # Remove that direction from the residual and remaining columns.
            r = r - np.dot(r, q) * q
            for j in remaining:
                Xw[:, j] = Xw[:, j] - np.dot(Xw[:, j], q) * q

        self.scores_ = np.array(scores, dtype=float)

        # Pad if the greedy loop stopped early (rare): fill with the
        # lowest-index columns that were not selected.
        if len(selected) < k:
            chosen = set(selected)
            leftovers = [j for j in range(p) if j not in chosen]
            selected = selected + leftovers[: k - len(selected)]

        self.idx_ = np.array(selected, dtype=int)
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``, as a NumPy array."""
        return np.asarray(X)[:, self.idx_]


def run_gram_schmidt(k=40, random_state=42, n_splits=10):
    """Cross-validate Gram-Schmidt top-k selection + balanced logistic regression.

    Uses the notebook globals ``X_base`` and ``y_bin``. Each fold refits the
    full pipeline (median impute with indicators -> scale ->
    GramSchmidtSelector -> logistic regression) on the training part only.

    Parameters
    ----------
    k : int
        Number of columns kept by the selector.
    random_state : int
        Seed for the fold shuffle and the classifier.
    n_splits : int
        Number of stratified folds (default matches the previously
        hard-coded 10).

    Returns
    -------
    (out, freq)
        ``out`` is a per-fold DataFrame of ``BER`` / ``True+`` / ``True-``.
        ``freq`` is a Series giving, for each selected column position, the
        fraction of folds in which it was selected. Positions refer to the
        matrix that reaches the selector; the imputer may append
        missing-indicator columns, so positions can exceed the width of
        ``X_base``.
    """
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("gsel", GramSchmidtSelector(k=k)),
        ("clf", LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rows = []
    selected_counts = {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_bin), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_bin[tr], y_bin[te]

        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xte)

        tn, fp, fn, tp = confusion_matrix(yte, pred, labels=[0, 1]).ravel()
        # The tiny constant keeps the ratios defined if a class is absent.
        tpr = tp / (tp + fn + 1e-12)  # True+
        tnr = tn / (tn + fp + 1e-12)  # True-
        ber = 1.0 - 0.5 * (tpr + tnr)

        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        idx = pipe.named_steps["gsel"].idx_
        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))

    # Normalize by the actual fold count instead of a literal 10.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))

    return out, freq


res_gs, freq_gs = run_gram_schmidt(k=40, random_state=42)

BER      0.393663
True+    0.432727
True-    0.779946
Name: mean, dtype: float64
BER      0.069747
True+    0.131684
True-    0.028437
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
52     1.0
18     1.0
57     1.0
341    1.0
66     1.0
89     1.0
119    0.9
104    0.9
190    0.8
282    0.8
473    0.8
143    0.8
281    0.8
501    0.7
58     0.7
441    0.7
393    0.7
11     0.7
428    0.6
111    0.6
dtype: float64


In [28]:
def _metrics_from_pred(y_true, y_pred):
    """Return ``(BER, True+, True-)`` for binary labels coded 0/1.

    True+ is the recall of class 1, True- the recall of class 0, and BER is
    ``1 - (True+ + True-) / 2``. A class with no true samples contributes a
    rate of 0.0.

    The rates are exact ratios: no smoothing constant is added to the
    denominators, so a perfect recall is reported as exactly 1.0.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    n_pos = tp + fn
    n_neg = tn + fp
    tpr = tp / n_pos if n_pos else 0.0  # True+
    tnr = tn / n_neg if n_neg else 0.0  # True-
    ber = 1.0 - 0.5 * (tpr + tnr)
    return ber, tpr, tnr


def run_l1(k=40, C=0.15, random_state=42, n_splits=10):
    """
    Embedded selection via L1 logistic.
    Top-k selected each fold by |coef| (on preprocessed train fold).

    Uses the notebook globals ``X_base`` and ``y_bin``. Per fold: fit the
    saga logistic model on the imputed + scaled training data, keep the ``k``
    columns with the largest absolute coefficient, then refit a plain
    balanced lbfgs logistic regression on those columns and score it on the
    test part.

    Parameters
    ----------
    k : int
        Number of columns kept per fold. If the penalized model has fewer
        than ``k`` non-zero coefficients, the remainder of the top-k is filled
        with zero-coefficient columns in argsort order.
    C : float
        Inverse regularization strength of the selecting model.
    random_state : int
        Seed for the fold shuffle and both classifiers.
    n_splits : int
        Number of stratified folds (default matches the previously
        hard-coded 10).

    Returns
    -------
    (out, freq)
        ``out`` is a per-fold DataFrame of ``BER`` / ``True+`` / ``True-``.
        ``freq`` is a Series giving, for each selected column position, the
        fraction of folds in which it was selected. Positions refer to the
        imputed + scaled matrix; the imputer may append missing-indicator
        columns, so positions can exceed the width of ``X_base``.
    """
    # NOTE(review): no `penalty` argument is passed. Whether `l1_ratio=1.0`
    # alone produces an L1 fit depends on the installed scikit-learn version;
    # confirm, otherwise this ranks the coefficients of a differently
    # penalized model.
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            solver="saga",
            C=C,
            l1_ratio=1.0,  # new sklearn style for pure L1
            class_weight="balanced",
            max_iter=8000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rows = []
    selected_counts = {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_bin), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_bin[tr], y_bin[te]

        pipe.fit(Xtr, ytr)

        # rank by absolute coefficient, keep top-k
        coef = np.abs(pipe.named_steps["clf"].coef_[0])
        idx = np.argsort(coef)[::-1][:k]

        # transform train/test through preprocessors only, then subset columns
        preprocess = pipe[:-1]
        Xtr_t = preprocess.transform(Xtr)[:, idx]
        Xte_t = preprocess.transform(Xte)[:, idx]

        # refit logistic on selected columns only
        clf2 = LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=4000,
            random_state=random_state,
        )
        clf2.fit(Xtr_t, ytr)
        pred = clf2.predict(Xte_t)

        ber, tpr, tnr = _metrics_from_pred(yte, pred)
        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(f"L1 (C={C}, k={k})")
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))

    # Normalize by the actual fold count instead of a literal 10.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))
    return out, freq


def run_elasticnet(k=40, C=0.15, l1_ratio=0.4, random_state=42, n_splits=10):
    """
    Embedded selection via Elastic Net logistic.
    Top-k selected each fold by |coef| (on preprocessed train fold).

    Reads the notebook globals ``X_base`` (feature DataFrame) and ``y_bin``
    (binary labels) and uses the notebook helper ``_metrics_from_pred``.

    Parameters
    ----------
    k : int
        Number of transformed columns kept per fold (largest |coef| first).
    C : float
        Inverse regularisation strength of the selecting logistic model.
    l1_ratio : float
        Elastic-net mixing value passed to the selecting logistic model.
    random_state : int
        Seed for the CV shuffle and for both logistic models.
    n_splits : int
        Number of stratified folds. It also serves as the denominator of the
        selection frequency, so the two can no longer drift apart.

    Returns
    -------
    out : pandas.DataFrame
        One row per fold with columns ``fold``, ``BER``, ``True+``, ``True-``.
    freq : pandas.Series
        Fraction of folds in which each transformed column index (imputed
        values followed by missing-indicator columns) was selected, sorted
        descending.
    """
    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            solver="saga",
            C=C,
            l1_ratio=l1_ratio,
            class_weight="balanced",
            max_iter=8000,
            random_state=random_state,
        )),
    ])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # The splitter yields positional indices; a plain array guarantees that
    # y[tr] is positional even if y_bin is a Series with a non-default index.
    y_arr = np.asarray(y_bin)

    rows = []
    selected_counts = {}

    for fold, (tr, te) in enumerate(cv.split(X_base, y_arr), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_arr[tr], y_arr[te]

        # Selection model is fitted on the training fold only (no test leakage).
        pipe.fit(Xtr, ytr)

        # rank by absolute coefficient, keep top-k
        coef = np.abs(pipe.named_steps["clf"].coef_[0])
        idx = np.argsort(coef)[::-1][:k]

        # transform train/test through preprocessors only, then subset columns
        Xtr_t = pipe[:-1].transform(Xtr)[:, idx]
        Xte_t = pipe[:-1].transform(Xte)[:, idx]

        # refit logistic on selected columns only
        clf2 = LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=4000,
            random_state=random_state,
        )
        clf2.fit(Xtr_t, ytr)
        pred = clf2.predict(Xte_t)

        ber, tpr, tnr = _metrics_from_pred(yte, pred)
        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

        for j in idx:
            selected_counts[j] = selected_counts.get(j, 0) + 1

    out = pd.DataFrame(rows)
    print(f"Elastic Net (C={C}, l1_ratio={l1_ratio}, k={k})")
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))

    # Frequency = selections / number of folds actually configured above.
    freq = pd.Series(selected_counts).sort_values(ascending=False) / float(n_splits)
    print("\nTop selected transformed columns (frequency across folds):")
    print(freq.head(20))
    return out, freq

# Example usage: run the L1 embedded-selection experiment. The call returns the
# per-fold metric table and the per-column selection frequency across folds.
res_l1, freq_l1 = run_l1(k=40, C=0.15, random_state=42)

L1 (C=0.15, k=40)
BER      0.417631
True+    0.382727
True-    0.782010
Name: mean, dtype: float64
BER      0.091526
True+    0.176355
True-    0.027663
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
52     1.0
119    1.0
49     1.0
57     1.0
122    1.0
89     1.0
143    1.0
190    1.0
380    1.0
18     0.9
7      0.9
228    0.9
393    0.9
66     0.8
60     0.8
121    0.8
327    0.8
120    0.8
36     0.8
101    0.7
dtype: float64


In [29]:
# Elastic Net counterpart of the L1 run above: same k, C and seed, with the
# penalty mix set by l1_ratio. Returns per-fold metrics and selection frequency.
res_en, freq_en = run_elasticnet(k=40, C=0.15, l1_ratio=0.4, random_state=42)

Elastic Net (C=0.15, l1_ratio=0.4, k=40)
BER      0.402885
True+    0.421818
True-    0.772412
Name: mean, dtype: float64
BER      0.055037
True+    0.117222
True-    0.029515
Name: std, dtype: float64

Top selected transformed columns (frequency across folds):
52     1.0
49     1.0
119    1.0
122    1.0
57     1.0
143    1.0
7      0.9
36     0.9
89     0.9
210    0.9
380    0.9
299    0.9
228    0.8
60     0.8
192    0.8
297    0.8
18     0.8
393    0.8
92     0.7
121    0.7
dtype: float64


In [30]:
from collections import Counter
from itertools import combinations

In [31]:
# ---------- Selectors ----------
class WelchTSelector(TransformerMixin, BaseEstimator):
    """Keep the ``k`` columns with the largest absolute Welch t-statistic.

    The statistic compares rows labelled 1 against rows labelled 0, so ``y``
    must use 0/1 coding. Input is expected to be free of NaN (in this notebook
    the selector always runs after imputation and scaling).

    Base classes are ordered mixin-first, BaseEstimator-last, which is the
    order scikit-learn's developer guide asks for so that mixin behaviour is
    not shadowed in the MRO.

    Parameters
    ----------
    k : int
        Number of columns to keep.
    eps : float
        Added to the denominator so zero-variance columns do not divide by 0.
    """

    def __init__(self, k=40, eps=1e-12):
        self.k = k
        self.eps = eps

    def fit(self, X, y):
        """Score every column on (X, y) and store the top-k indices in ``idx_``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        pos = X[y == 1]
        neg = X[y == 0]

        m1 = np.mean(pos, axis=0)
        m0 = np.mean(neg, axis=0)
        # Unbiased (ddof=1) per-class variances, as used by Welch's statistic.
        v1 = np.var(pos, axis=0, ddof=1)
        v0 = np.var(neg, axis=0, ddof=1)

        # Guard against an empty class so the division below stays defined.
        n1 = max(pos.shape[0], 1)
        n0 = max(neg.shape[0], 1)

        t = np.abs(m1 - m0) / (np.sqrt(v1 / n1 + v0 / n0) + self.eps)
        # Undefined scores (e.g. a class with <2 rows) rank last instead of first.
        t = np.nan_to_num(t, nan=0.0, posinf=0.0, neginf=0.0)

        self.idx_ = np.argsort(t)[::-1][: self.k]
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``, best score first."""
        return np.asarray(X)[:, self.idx_]


class ReliefFSelector(TransformerMixin, BaseEstimator):
    """Keep the ``k`` columns with the highest ReliefF importance.

    Wraps the ``ReliefF`` estimator already imported by the notebook so it can
    be used as a pipeline step that exposes the chosen column indices as
    ``idx_``.

    Base classes are ordered mixin-first, BaseEstimator-last, which is the
    order scikit-learn's developer guide asks for so that mixin behaviour is
    not shadowed in the MRO.

    Parameters
    ----------
    k : int
        Number of columns to keep.
    n_neighbors : int
        Neighbour count forwarded to ``ReliefF``.
    """

    def __init__(self, k=40, n_neighbors=10):
        self.k = k
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Fit ReliefF on (X, y) and store the top-k column indices in ``idx_``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.rf_ = ReliefF(n_features_to_select=self.k, n_neighbors=self.n_neighbors)
        self.rf_.fit(X, y)

        imp = np.asarray(self.rf_.feature_importances_, dtype=float)
        # argsort places NaN last, so after the reversal NaN would rank first;
        # mapping NaN to -inf pushes undefined scores to the bottom instead.
        imp = np.nan_to_num(imp, nan=-np.inf)
        self.idx_ = np.argsort(imp)[::-1][: self.k]
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``, most important first."""
        return np.asarray(X)[:, self.idx_]


# ---------- Helpers ----------
def ber_tpr_tnr(y_true, y_pred):
    """Return ``(BER, TPR, TNR)`` for 0/1 labels.

    TPR is reported as "True+" and TNR as "True-" elsewhere in the notebook.
    BER is one minus the mean of the two rates. The 1e-12 in each denominator
    keeps a rate defined (as 0) when that class has no samples.
    """
    (true_neg, false_pos), (false_neg, true_pos) = confusion_matrix(
        y_true, y_pred, labels=[0, 1]
    )
    sensitivity = true_pos / (true_pos + false_neg + 1e-12)
    specificity = true_neg / (true_neg + false_pos + 1e-12)
    balanced_error = 1.0 - 0.5 * (sensitivity + specificity)
    return balanced_error, sensitivity, specificity


def selected_indices(selector):
    """Return the column indices chosen by a fitted selector.

    Selectors exposing ``get_support`` are asked for integer indices;
    otherwise the selector's ``idx_`` attribute is returned.
    """
    try:
        support_fn = selector.get_support
    except AttributeError:
        return selector.idx_
    return support_fn(indices=True)


def selected_keys_from_pipe(pipe, selector_step, n_raw_features):
    """Translate a fitted pipeline's selected columns into readable keys.

    Positions below ``n_raw_features`` become ``"X<pos>"`` (an imputed value
    column). Later positions are looked up in the imputer's missing-indicator
    feature list and become ``"M<raw_pos>"``. Anything out of range becomes
    ``"UNK<pos>"``.

    NOTE(review): this assumes the imputer's output keeps exactly
    ``n_raw_features`` value columns ahead of the indicator columns - confirm
    that no column is dropped by the imputer for being entirely missing in a
    fold.

    Returns
    -------
    set of str
    """
    steps = pipe.named_steps
    positions = np.asarray(selected_indices(steps[selector_step]), dtype=int)

    indicator = getattr(steps["imputer"], "indicator_", None)
    indicator_sources = [] if indicator is None else list(indicator.features_)

    def _key_for(pos):
        # Value columns come first in the imputer output.
        if pos < n_raw_features:
            return f"X{pos}"  # raw feature index in X_base
        offset = pos - n_raw_features
        if 0 <= offset < len(indicator_sources):
            return f"M{int(indicator_sources[offset])}"  # missing-indicator for raw feature
        return f"UNK{pos}"

    return {_key_for(pos) for pos in positions}


def mean_pairwise_jaccard(sets):
    """Average Jaccard similarity over every unordered pair in ``sets``.

    Returns NaN when fewer than two sets are supplied. A pair of two empty
    sets counts as similarity 1.0.
    """
    if len(sets) < 2:
        return np.nan
    similarities = [
        (len(left & right) / len(left | right)) if (left | right) else 1.0
        for left, right in combinations(sets, 2)
    ]
    return float(np.mean(similarities))


# ---------- Main runner ----------
def run_finalists_repeated(
    X_base,
    y_bin,
    k=40,
    seeds=(11, 22, 33, 44, 55),   # 5 repeats x 10 folds = 50 folds/method
    relief_neighbors=10,
    n_splits=10,
):
    """Compare F-test, Welch-t and ReliefF selection under repeated stratified CV.

    For every method, seed and fold, a fresh pipeline (median imputation with
    missing indicators -> scaling -> selector -> balanced logistic regression)
    is fitted on the training fold and scored on the held-out fold.

    Parameters
    ----------
    X_base : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y_bin : array-like
        Binary labels aligned row-for-row with ``X_base``.
    k : int
        Number of columns each selector keeps.
    seeds : sequence of int
        One shuffled CV repetition is run per seed.
    relief_neighbors : int
        Neighbour count forwarded to the ReliefF selector.
    n_splits : int
        Folds per repetition (previously hard-coded as 10 in two places).

    Returns
    -------
    detail : pandas.DataFrame
        One row per (method, seed, fold) with BER / True+ / True-.
    summary : pandas.DataFrame
        Per-method mean and std of the metrics plus the mean pairwise Jaccard
        similarity of the selected feature sets, sorted by ``BER_mean``.
    top_freq : dict of str -> pandas.Series
        For each method, the fraction of fitted folds in which each feature
        key was selected, sorted descending.
    """
    # Local import: clone() gives every fold an unfitted copy of the selector.
    from sklearn.base import clone

    methods = {
        "F-test": ("sel", SelectKBest(score_func=f_classif, k=k)),
        "Welch-t": ("sel", WelchTSelector(k=k)),
        "ReliefF": ("sel", ReliefFSelector(k=k, n_neighbors=relief_neighbors)),
    }

    # The splitter yields positional indices; a plain array guarantees that
    # y[tr] is positional even if y_bin is a Series with a non-default index.
    y_arr = np.asarray(y_bin)

    n_raw = X_base.shape[1]
    rows = []
    selected_sets = {m: [] for m in methods}
    selected_counter = {m: Counter() for m in methods}

    for method_name, (sel_name, sel_obj) in methods.items():
        for seed in seeds:
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

            for fold, (tr, te) in enumerate(cv.split(X_base, y_arr), start=1):
                Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
                ytr, yte = y_arr[tr], y_arr[te]

                pipe = Pipeline([
                    ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
                    ("scaler", StandardScaler()),
                    # Template selector is cloned so folds never share fitted state.
                    (sel_name, clone(sel_obj)),
                    ("clf", LogisticRegression(
                        solver="lbfgs",
                        class_weight="balanced",
                        max_iter=3000,
                        random_state=seed,
                    )),
                ])

                pipe.fit(Xtr, ytr)
                pred = pipe.predict(Xte)
                ber, tpr, tnr = ber_tpr_tnr(yte, pred)

                keys = selected_keys_from_pipe(pipe, sel_name, n_raw_features=n_raw)
                selected_sets[method_name].append(keys)
                selected_counter[method_name].update(keys)

                rows.append({
                    "method": method_name,
                    "seed": seed,
                    "fold": fold,
                    "BER": ber,
                    "True+": tpr,
                    "True-": tnr,
                })

    detail = pd.DataFrame(rows)

    # summary
    agg = (
        detail.groupby("method")[["BER", "True+", "True-"]]
        .agg(["mean", "std"])
    )
    agg.columns = ["_".join(c) for c in agg.columns]
    agg = agg.reset_index()

    agg["mean_pairwise_jaccard"] = agg["method"].map(
        lambda m: mean_pairwise_jaccard(selected_sets[m])
    )

    # top selection frequency per method; the denominator is the number of
    # folds actually fitted for that method (len(seeds) * n_splits).
    top_freq = {}
    for m in methods:
        n_runs = len(selected_sets[m])
        s = pd.Series(selected_counter[m]).sort_values(ascending=False) / n_runs
        top_freq[m] = s

    return detail, agg.sort_values("BER_mean"), top_freq


# ---- Execute ----
detail_df, summary_df, top_freq = run_finalists_repeated(
    X_base=X_base,
    y_bin=y_bin,
    k=40,
    seeds=(11, 22, 33, 44, 55),
    relief_neighbors=10,
)

print(summary_df)

for m in ["ReliefF", "F-test", "Welch-t"]:
    print(f"\nTop selection frequency: {m}")
    print(top_freq[m].head(20))

    method  BER_mean   BER_std  True+_mean  True+_std  True-_mean  True-_std  \
1  ReliefF  0.318685  0.072940    0.606909   0.151337    0.755721   0.039235   
2  Welch-t  0.340504  0.078633    0.631636   0.155153    0.687356   0.046366   
0   F-test  0.350492  0.067897    0.565455   0.135936    0.733562   0.036800   

   mean_pairwise_jaccard  
1               0.850505  
2               0.845169  
0               0.571730  

Top selection frequency: ReliefF
M447    1.0
M445    1.0
X281    1.0
X93     1.0
X48     1.0
X52     1.0
M65     1.0
X311    1.0
M279    1.0
X57     1.0
X405    1.0
X58     1.0
M446    1.0
M208    1.0
X312    1.0
X217    1.0
X55     1.0
M64     1.0
X218    1.0
M280    1.0
dtype: float64

Top selection frequency: F-test
X392    1.00
X25     1.00
X281    1.00
X93     1.00
X52     1.00
X119    1.00
X335    0.94
X334    0.92
X338    0.92
X186    0.92
X114    0.92
X18     0.92
X258    0.90
X340    0.90
X339    0.90
M280    0.88
X151    0.88
X244    0.88
M279    0.84
X2

In [32]:
# Decode top_freq keys to original sensor IDs and inspect missingness effect
raw_cols = list(X_base.columns)  # original feature IDs from SECOM after hygiene

def decode_key(k):
    """Map a selection key to ``(kind, sensor_id)``.

    ``"X<pos>"`` is a value column and ``"M<pos>"`` is the missing-indicator
    column for the same raw feature; ``<pos>`` indexes ``raw_cols``.

    Raises
    ------
    ValueError
        If the key has any other prefix (for example the ``"UNK<pos>"``
        fallback keys). Previously such keys either failed with an opaque
        ``int()`` error or were silently labelled as missing indicators.
    """
    prefix, position = k[:1], k[1:]
    if prefix == "X":
        kind = "value"
    elif prefix == "M":
        kind = "missing_indicator_for"
    else:
        raise ValueError(
            f"Unrecognised feature key {k!r}; expected 'X<idx>' or 'M<idx>'"
        )
    return kind, raw_cols[int(position)]

def missing_delta_for_idx(idx):
    """Missing-value rate of one raw feature among fails vs. passes.

    ``idx`` is a position into ``raw_cols``. Returns
    ``(miss_fail, miss_pass, miss_fail - miss_pass)`` where "fail" is
    ``y_bin == 1`` and "pass" is ``y_bin == 0``.
    """
    sensor = raw_cols[idx]
    fail_rate, pass_rate = [
        X_base.loc[y_bin == label, sensor].isna().mean()
        for label in (1, 0)
    ]
    return fail_rate, pass_rate, fail_rate - pass_rate

# Build one table row per top ReliefF key: decoded sensor ID plus, for
# missing-indicator keys only, the missing rate in each class.
top_relief = top_freq["ReliefF"].head(20)
rows = []
for k, f in top_relief.items():
    kind, sensor = decode_key(k)
    if kind == "missing_indicator_for":
        # k[1:] is the raw feature position encoded in the key.
        mf, mp, d = missing_delta_for_idx(int(k[1:]))
    else:
        # Missingness comparison is not reported for value columns.
        mf = mp = d = np.nan
    rows.append({
        "key": k, "freq": f, "kind": kind, "sensor_id": sensor,
        "miss_fail": mf, "miss_pass": mp, "delta_fail_minus_pass": d
    })

# Last expression: rendered as the cell's output table.
pd.DataFrame(rows)

Unnamed: 0,key,freq,kind,sensor_id,miss_fail,miss_pass,delta_fail_minus_pass
0,M447,1.0,missing_indicator_for,581,0.567308,0.608339,-0.041031
1,M445,1.0,missing_indicator_for,579,0.567308,0.608339,-0.041031
2,X281,1.0,value,348,,,
3,X93,1.0,value,103,,,
4,X48,1.0,value,55,,,
5,X52,1.0,value,59,,,
6,M65,1.0,missing_indicator_for,73,0.346154,0.518113,-0.17196
7,X311,1.0,value,405,,,
8,M279,1.0,missing_indicator_for,345,0.346154,0.518113,-0.17196
9,X57,1.0,value,64,,,


In [33]:
def ber_tpr_tnr(y_true, y_pred):
    """Return ``(BER, TPR, TNR)`` for 0/1 labels.

    This re-defines a helper of the same name from an earlier cell with the
    same formula. The 1e-12 in each denominator keeps a rate defined when the
    corresponding class has no samples.
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Rows are true labels, columns are predictions, both ordered [0, 1].
    hits_pos, miss_pos = cm[1, 1], cm[1, 0]
    hits_neg, miss_neg = cm[0, 0], cm[0, 1]
    recall_pos = hits_pos / (hits_pos + miss_pos + 1e-12)
    recall_neg = hits_neg / (hits_neg + miss_neg + 1e-12)
    return 1.0 - 0.5 * (recall_pos + recall_neg), recall_pos, recall_neg

def run_relieff_ablation(mode="both", k=40, n_neighbors=10, random_state=42):
    """Ablate ReliefF selection by the kind of column it may choose from.

    Runs 10-fold stratified CV on the notebook globals ``X_base`` / ``y_bin``.
    In every fold the data is median-imputed with missing indicators appended,
    restricted to the requested column group, scaled, ranked by ReliefF and
    classified with a balanced logistic regression on the top columns.

    Parameters
    ----------
    mode : {"both", "values", "indicators"}
        ``"values"`` keeps only imputed value columns, ``"indicators"`` only
        the missing-indicator columns, ``"both"`` keeps everything.
    k : int
        Number of columns to select (capped at the number available).
    n_neighbors : int
        Neighbour count forwarded to ``ReliefF``.
    random_state : int
        Seed for the CV shuffle and the classifier.

    Returns
    -------
    pandas.DataFrame
        One row per fold with ``fold``, ``BER``, ``True+``, ``True-``.

    Raises
    ------
    ValueError
        If ``mode`` is not recognised (now checked before any fitting) or the
        chosen mode leaves no columns.
    """
    # mode: "both", "values", "indicators"
    if mode not in ("both", "values", "indicators"):
        raise ValueError("mode must be one of: both, values, indicators")

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

    # The splitter yields positional indices; a plain array guarantees that
    # y[tr] is positional even if y_bin is a Series with a non-default index.
    y_arr = np.asarray(y_bin)
    rows = []

    for fold, (tr, te) in enumerate(cv.split(X_base, y_arr), start=1):
        Xtr, Xte = X_base.iloc[tr], X_base.iloc[te]
        ytr, yte = y_arr[tr], y_arr[te]

        imp = SimpleImputer(strategy="median", add_indicator=True)
        Xtr_i = imp.fit_transform(Xtr)
        Xte_i = imp.transform(Xte)

        # Imputer output layout: value columns first, indicator columns after.
        n_raw = Xtr.shape[1]
        if mode == "both":
            keep = np.arange(Xtr_i.shape[1])
        elif mode == "values":
            keep = np.arange(n_raw)
        else:  # mode == "indicators"
            keep = np.arange(n_raw, Xtr_i.shape[1])

        if len(keep) == 0:
            raise ValueError("No columns available for this mode.")

        sc = StandardScaler()
        Xtr_s = sc.fit_transform(Xtr_i[:, keep])
        Xte_s = sc.transform(Xte_i[:, keep])

        k_use = min(k, Xtr_s.shape[1])
        rf = ReliefF(n_features_to_select=k_use, n_neighbors=n_neighbors)
        rf.fit(Xtr_s, ytr)

        # Same NaN guard as ReliefFSelector: argsort places NaN last, so after
        # the reversal a NaN score would be picked first unless mapped to -inf.
        importances = np.nan_to_num(
            np.asarray(rf.feature_importances_, dtype=float), nan=-np.inf
        )
        sel = np.argsort(importances)[::-1][:k_use]

        clf = LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=3000,
            random_state=random_state
        )
        clf.fit(Xtr_s[:, sel], ytr)
        pred = clf.predict(Xte_s[:, sel])

        ber, tpr, tnr = ber_tpr_tnr(yte, pred)
        rows.append({"fold": fold, "BER": ber, "True+": tpr, "True-": tnr})

    out = pd.DataFrame(rows)
    print(f"\nReliefF mode={mode}")
    print(out[["BER", "True+", "True-"]].mean().rename("mean"))
    print(out[["BER", "True+", "True-"]].std().rename("std"))
    return out

# Same seed and folds for all three runs, so the per-fold tables are directly
# comparable: all columns vs. imputed values only vs. missing indicators only.
res_both = run_relieff_ablation("both")
res_val  = run_relieff_ablation("values")
res_miss = run_relieff_ablation("indicators")


ReliefF mode=both
BER      0.305183
True+    0.628182
True-    0.761453
Name: mean, dtype: float64
BER      0.096087
True+    0.171122
True-    0.049545
Name: std, dtype: float64

ReliefF mode=values
BER      0.342262
True+    0.571818
True-    0.743659
Name: mean, dtype: float64
BER      0.096057
True+    0.177393
True-    0.043541
Name: std, dtype: float64

ReliefF mode=indicators
BER      0.416490
True+    0.549091
True-    0.617929
Name: mean, dtype: float64
BER      0.060119
True+    0.115804
True-    0.043169
Name: std, dtype: float64


In [34]:
def ber_tpr_tnr(y_true, y_pred):
    """Return ``(BER, TPR, TNR)`` for 0/1 labels.

    This re-defines a helper of the same name from earlier cells with the
    same formula. TPR is the notebook's "True+" and TNR its "True-".
    """
    flat_counts = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    n_tn, n_fp, n_fn, n_tp = flat_counts
    # 1e-12 keeps each rate defined when its class is absent.
    rate_pos = n_tp / (n_tp + n_fn + 1e-12)  # True+
    rate_neg = n_tn / (n_tn + n_fp + 1e-12)  # True-
    error = 1.0 - 0.5 * (rate_pos + rate_neg)
    return error, rate_pos, rate_neg


class ReliefFSelector(TransformerMixin, BaseEstimator):
    """Keep the ``k`` columns with the highest ReliefF importance.

    This re-defines a class of the same name from an earlier cell so that
    this cell can run on its own.

    Base classes are ordered mixin-first, BaseEstimator-last, which is the
    order scikit-learn's developer guide asks for so that mixin behaviour is
    not shadowed in the MRO.

    Parameters
    ----------
    k : int
        Number of columns to keep.
    n_neighbors : int
        Neighbour count forwarded to ``ReliefF``.
    """

    def __init__(self, k=40, n_neighbors=10):
        self.k = k
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Fit ReliefF on (X, y) and store the top-k column indices in ``idx_``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.rf_ = ReliefF(n_features_to_select=self.k, n_neighbors=self.n_neighbors)
        self.rf_.fit(X, y)
        imp = np.asarray(self.rf_.feature_importances_, dtype=float)
        # argsort places NaN last, so after the reversal NaN would rank first;
        # mapping NaN to -inf pushes undefined scores to the bottom instead.
        imp = np.nan_to_num(imp, nan=-np.inf)
        self.idx_ = np.argsort(imp)[::-1][: self.k]
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``, most important first."""
        return np.asarray(X)[:, self.idx_]


def build_pipe(method="relief", k=40, n_neighbors=10, random_state=42):
    """Assemble an unfitted impute -> scale -> select -> classify pipeline.

    Parameters
    ----------
    method : {"relief", "f"}
        ``"relief"`` uses ``ReliefFSelector``; ``"f"`` uses ``SelectKBest``
        with ``f_classif``.
    k : int
        Number of columns the selector keeps.
    n_neighbors : int
        Neighbour count for ReliefF (unused when ``method == "f"``).
    random_state : int
        Seed passed to the logistic regression.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"relief"`` nor ``"f"``.
    """
    if method not in ("relief", "f"):
        raise ValueError("method must be 'relief' or 'f'")

    selector = (
        ReliefFSelector(k=k, n_neighbors=n_neighbors)
        if method == "relief"
        else SelectKBest(score_func=f_classif, k=k)
    )
    classifier = LogisticRegression(
        solver="lbfgs",
        class_weight="balanced",
        max_iter=3000,
        random_state=random_state
    )
    steps = [
        ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
        ("scaler", StandardScaler()),
        ("sel", selector),
        ("clf", classifier),
    ]
    return Pipeline(steps)


# ---------- Build chronological split ----------
# Use aligned timestamp from L (row i of L corresponds to row i of X_base).
# L["timestamp"] was already parsed in the loading cell, so this mainly
# re-validates it and yields NaT for anything unparseable.
ts = pd.to_datetime(L["timestamp"], errors="coerce")
valid = ts.notna().values

# Drop rows without a usable timestamp from features, labels and times alike.
Xv = X_base.loc[valid].copy()
yv = y_bin[valid]
tv = ts.loc[valid].reset_index(drop=True)

# Sort all three by time so that "earlier rows" really means "earlier runs".
# NOTE(review): yv[order] is positional only if y_bin is a NumPy array - confirm.
order = np.argsort(tv.values)
Xv = Xv.iloc[order].reset_index(drop=True)
yv = yv[order]
tv = tv.iloc[order].reset_index(drop=True)

# Choose the split point by row position in time order: the first 70% of rows
# train, the last 30% test (a fraction of rows, not of elapsed time).
cut = int(0.70 * len(Xv))
Xtr, Xte = Xv.iloc[:cut], Xv.iloc[cut:]
ytr, yte = yv[:cut], yv[cut:]
ttr, tte = tv.iloc[:cut], tv.iloc[cut:]

print("Train range:", ttr.min(), "->", ttr.max(), "n=", len(Xtr), "fail_rate=", ytr.mean())
print("Test  range:", tte.min(), "->", tte.max(), "n=", len(Xte), "fail_rate=", yte.mean())

# ---------- Evaluate finalists ----------
# Each method is fitted once on the early block and scored once on the late
# block (single hold-out, no shuffling).
results = []
for method in ["relief", "f"]:
    pipe = build_pipe(method=method, k=40, n_neighbors=10, random_state=42)
    pipe.fit(Xtr, ytr)
    pred = pipe.predict(Xte)
    ber, tpr, tnr = ber_tpr_tnr(yte, pred)

    results.append({
        "method": "ReliefF" if method == "relief" else "F-test",
        "BER": ber,
        "True+": tpr,
        "True-": tnr,
        "train_fail_rate": float(ytr.mean()),
        "test_fail_rate": float(yte.mean()),
    })

# Last expression: rendered as the cell's output table.
pd.DataFrame(results)

Train range: 2008-07-19 11:55:00 -> 2008-09-26 02:26:00 n= 1096 fail_rate= 0.07116788321167883
Test  range: 2008-09-26 03:12:00 -> 2008-10-17 06:07:00 n= 471 fail_rate= 0.055201698513800426


Unnamed: 0,method,BER,True+,True-,train_fail_rate,test_fail_rate
0,ReliefF,0.46089,0.269231,0.808989,0.071168,0.055202
1,F-test,0.418064,0.307692,0.85618,0.071168,0.055202


In [35]:
# 1) Prepare chronological dataset once
# L["timestamp"] was already parsed in the loading cell; errors="coerce" turns
# anything unparseable into NaT so it can be filtered out below.
ts = pd.to_datetime(L["timestamp"], format="%d/%m/%Y %H:%M:%S", errors="coerce")
valid = ts.notna().values

Xv = X_base.iloc[valid].copy()
yv = y_bin[valid]
tv = ts.iloc[valid].reset_index(drop=True)

# Sort features, labels and times by timestamp so slices are chronological.
order = np.argsort(tv.values)
Xv = Xv.iloc[order].reset_index(drop=True)
yv = yv[order]
tv = tv.iloc[order].reset_index(drop=True)

# 2) Evaluate multiple train/test cut ratios
cuts = [0.60, 0.70, 0.80]
methods = ["relief", "f"]

rows = []

for frac in cuts:
    # First `frac` of the time-ordered rows train, the remainder test.
    cut = int(frac * len(Xv))
    Xtr, Xte = Xv.iloc[:cut], Xv.iloc[cut:]
    ytr, yte = yv[:cut], yv[cut:]
    ttr, tte = tv.iloc[:cut], tv.iloc[cut:]

    # Round instead of truncating: int((1 - 0.8) * 100) is 19 because of
    # floating-point error, which used to label the 80/20 split as "80/19".
    train_pct = int(round(frac * 100))

    for method in methods:
        pipe = build_pipe(method=method, k=40, n_neighbors=10, random_state=42)
        pipe.fit(Xtr, ytr)
        pred = pipe.predict(Xte)

        ber, tpr, tnr = ber_tpr_tnr(yte, pred)

        rows.append({
            "cut": f"{train_pct}/{100 - train_pct}",
            "method": "ReliefF" if method == "relief" else "F-test",
            "BER": ber,
            "True+": tpr,
            "True-": tnr,
            "train_n": len(Xtr),
            "test_n": len(Xte),
            "train_fail_rate": float(ytr.mean()),
            "test_fail_rate": float(yte.mean()),
            "train_start": ttr.min(),
            "train_end": ttr.max(),
            "test_start": tte.min(),
            "test_end": tte.max(),
        })

time_sens = pd.DataFrame(rows).sort_values(["cut", "method"]).reset_index(drop=True)
time_sens

Unnamed: 0,cut,method,BER,True+,True-,train_n,test_n,train_fail_rate,test_fail_rate,train_start,train_end,test_start,test_end
0,60/40,F-test,0.501192,0.142857,0.854758,940,627,0.080851,0.044657,2008-07-19 11:55:00,2008-09-20 05:34:00,2008-09-20 06:08:00,2008-10-17 06:07:00
1,60/40,ReliefF,0.465657,0.285714,0.782972,940,627,0.080851,0.044657,2008-07-19 11:55:00,2008-09-20 05:34:00,2008-09-20 06:08:00,2008-10-17 06:07:00
2,70/30,F-test,0.418064,0.307692,0.85618,1096,471,0.071168,0.055202,2008-07-19 11:55:00,2008-09-26 02:26:00,2008-09-26 03:12:00,2008-10-17 06:07:00
3,70/30,ReliefF,0.46089,0.269231,0.808989,1096,471,0.071168,0.055202,2008-07-19 11:55:00,2008-09-26 02:26:00,2008-09-26 03:12:00,2008-10-17 06:07:00
4,80/19,F-test,0.42454,0.352941,0.79798,1253,314,0.069433,0.05414,2008-07-19 11:55:00,2008-10-02 19:25:00,2008-10-02 20:54:00,2008-10-17 06:07:00
5,80/19,ReliefF,0.355615,0.470588,0.818182,1253,314,0.069433,0.05414,2008-07-19 11:55:00,2008-10-02 19:25:00,2008-10-02 20:54:00,2008-10-17 06:07:00


In [36]:
# Assumes already defined:
# - X_base, y_bin, L
# - build_pipe(method="relief"/"f", k=40, n_neighbors=10, random_state=42)
# - ber_tpr_tnr(y_true, y_pred)

# ---------- prepare chronological arrays ----------
# Same preparation as the previous cell, repeated so this cell does not depend
# on that cell's leftover Xv/yv/tv. L["timestamp"] was parsed in the loading
# cell; errors="coerce" yields NaT for anything unparseable.
ts = pd.to_datetime(L["timestamp"], format="%d/%m/%Y %H:%M:%S", errors="coerce")
valid = ts.notna().values

# Keep only rows with a usable timestamp, in features, labels and times alike.
Xv = X_base.iloc[valid].copy()
yv = y_bin[valid]
tv = ts.iloc[valid].reset_index(drop=True)

# Sort everything by timestamp so positional slices are chronological.
order = np.argsort(tv.values)
Xv = Xv.iloc[order].reset_index(drop=True)
yv = yv[order]
tv = tv.iloc[order].reset_index(drop=True)

# ---------- helper: pick threshold on train only ----------
def best_threshold_by_ber(y_true, p_true, grid=None):
    """Pick the probability cutoff with the lowest balanced error rate.

    Parameters
    ----------
    y_true : array-like of 0/1
        True labels.
    p_true : numpy.ndarray
        Predicted probability of class 1; compared element-wise to each cutoff.
    grid : iterable of float, optional
        Candidate cutoffs. Defaults to 37 evenly spaced values in [0.05, 0.95].

    Returns
    -------
    (float, float)
        The chosen cutoff and its BER. Ties keep the earliest cutoff in the
        grid; an empty grid returns ``(0.5, inf)``.
    """
    candidates = np.linspace(0.05, 0.95, 37) if grid is None else grid

    chosen, lowest = 0.5, np.inf
    for cutoff in candidates:
        decisions = (p_true >= cutoff).astype(int)
        cm = confusion_matrix(y_true, decisions, labels=[0,1])
        # Rows are true labels, columns are predictions, both ordered [0, 1].
        recall_pos = cm[1, 1] / (cm[1, 1] + cm[1, 0] + 1e-12)
        recall_neg = cm[0, 0] / (cm[0, 0] + cm[0, 1] + 1e-12)
        error = 1.0 - 0.5 * (recall_pos + recall_neg)
        if error < lowest:
            chosen, lowest = float(cutoff), error
    return chosen, lowest

# ---------- build rolling anchored splits ----------
# Anchored train start at 0. For each split:
# train = [0 : train_end), test = [train_end : train_end + test_size)
# The training window only grows; the test window always directly follows it.
n = len(Xv)
test_frac = 0.15
test_size = int(np.floor(test_frac * n))
min_train_frac = 0.50
min_train = int(np.floor(min_train_frac * n))
# Advance by a third of the test window (at least 30 rows), so consecutive
# test windows overlap.
step = max(30, test_size // 3)  # move window forward

splits = []
train_end = min_train
# Stop as soon as a full-size test window no longer fits in the data.
while train_end + test_size <= n:
    splits.append((0, train_end, train_end, train_end + test_size))
    train_end += step

print(f"Total rolling splits: {len(splits)}")
print(f"n={n}, min_train={min_train}, test_size={test_size}, step={step}")

# ---------- evaluate methods ----------
rows = []
for i, (tr_s, tr_e, te_s, te_e) in enumerate(splits, start=1):
    Xtr, ytr = Xv.iloc[tr_s:tr_e], yv[tr_s:tr_e]
    Xte, yte = Xv.iloc[te_s:te_e], yv[te_s:te_e]
    ttr, tte = tv.iloc[tr_s:tr_e], tv.iloc[te_s:te_e]

    # Skip pathological windows with no positive/negative
    # (split_id still counts skipped windows because i comes from enumerate).
    if len(np.unique(ytr)) < 2 or len(np.unique(yte)) < 2:
        continue

    for method in ["relief", "f"]:
        pipe = build_pipe(method=method, k=40, n_neighbors=10, random_state=42)
        pipe.fit(Xtr, ytr)

        # threshold tuning on train probs only
        # (these are in-sample probabilities of the model just fitted on Xtr,
        # so train_BER_at_threshold is an optimistic figure)
        p_tr = pipe.predict_proba(Xtr)[:, 1]
        th, ber_tr = best_threshold_by_ber(ytr, p_tr)

        # evaluate on test with tuned threshold
        p_te = pipe.predict_proba(Xte)[:, 1]
        pred_te = (p_te >= th).astype(int)
        ber, tpr, tnr = ber_tpr_tnr(yte, pred_te)

        rows.append({
            "split_id": i,
            "method": "ReliefF" if method == "relief" else "F-test",
            "threshold": th,
            "train_BER_at_threshold": ber_tr,
            "BER": ber,
            "True+": tpr,
            "True-": tnr,
            "train_n": len(Xtr),
            "test_n": len(Xte),
            "train_fail_rate": float(ytr.mean()),
            "test_fail_rate": float(yte.mean()),
            "train_start": ttr.min(),
            "train_end": ttr.max(),
            "test_start": tte.min(),
            "test_end": tte.max(),
        })

# One row per (split, method); read by the summary and pivot cells below.
rolling_df = pd.DataFrame(rows)
rolling_df.head()

Total rolling splits: 8
n=1567, min_train=783, test_size=235, step=78


Unnamed: 0,split_id,method,threshold,train_BER_at_threshold,BER,True+,True-,train_n,test_n,train_fail_rate,test_fail_rate,train_start,train_end,test_start,test_end
0,1,ReliefF,0.35,0.211894,0.437316,0.333333,0.792035,783,235,0.085568,0.038298,2008-07-19 11:55:00,2008-09-11 07:43:00,2008-09-11 08:06:00,2008-09-22 21:31:00
1,1,F-test,0.35,0.213989,0.508358,0.222222,0.761062,783,235,0.085568,0.038298,2008-07-19 11:55:00,2008-09-11 07:43:00,2008-09-11 08:06:00,2008-09-22 21:31:00
2,2,ReliefF,0.425,0.231895,0.549185,0.142857,0.758772,861,235,0.082462,0.029787,2008-07-19 11:55:00,2008-09-16 08:50:00,2008-09-16 08:52:00,2008-09-26 02:26:00
3,2,F-test,0.375,0.234899,0.471178,0.285714,0.77193,861,235,0.082462,0.029787,2008-07-19 11:55:00,2008-09-16 08:50:00,2008-09-16 08:52:00,2008-09-26 02:26:00
4,3,ReliefF,0.425,0.2559,0.556277,0.0,0.887446,939,235,0.080937,0.017021,2008-07-19 11:55:00,2008-09-20 05:25:00,2008-09-20 05:34:00,2008-09-29 09:15:00


In [37]:
# Summary table: per-method mean, std and median of the test metrics and of the
# train-tuned threshold, aggregated over all evaluated rolling splits.
summary = (
    rolling_df.groupby("method")[["BER", "True+", "True-", "threshold"]]
    .agg(["mean", "std", "median"])
)
summary

Unnamed: 0_level_0,BER,BER,BER,True+,True+,True+,True-,True-,True-,threshold,threshold,threshold
Unnamed: 0_level_1,mean,std,median,mean,std,median,mean,std,median,mean,std,median
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
F-test,0.466426,0.088824,0.471902,0.314746,0.168216,0.332721,0.752403,0.075492,0.766496,0.4375,0.053452,0.45
ReliefF,0.465773,0.09012,0.476248,0.34457,0.257429,0.309524,0.723885,0.194639,0.76012,0.4375,0.075593,0.425


In [38]:
# Optional: per-split comparison. One row per split with each method's test
# BER side by side; a negative Relief_minus_F means ReliefF had the lower
# (better) BER on that split.
pivot_ber = rolling_df.pivot(index="split_id", columns="method", values="BER")
pivot_ber["Relief_minus_F"] = pivot_ber["ReliefF"] - pivot_ber["F-test"]
pivot_ber

method,F-test,ReliefF,Relief_minus_F
split_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.508358,0.437316,-0.071042
2,0.471178,0.549185,0.078008
3,0.590909,0.556277,-0.034632
4,0.4722,0.44582,-0.02638
5,0.429034,0.531031,0.101997
6,0.508122,0.506676,-0.001446
7,0.471604,0.4121,-0.059503
8,0.28,0.287778,0.007778
