In [2]:
import os, math, gc
import numpy as np
import pandas as pd
from pathlib import Path

# Seed NumPy's global RNG so the random sampling and shuffling below are repeatable.
np.random.seed(42)

# The preprocessed attack-period CSV must already exist; this cell only reads it.
OUT_DIR = "./wadi_preprocessed"
SRC = f"{OUT_DIR}/wadi_test_preprocessed.csv"
assert Path(SRC).exists(), "ابتدا پیش‌پردازش WADI را اجرا کنید."

# Windowing / sampling configuration.
SEQ_LEN   = 60       # rows per window
STRIDE    = 5        # step between candidate window start rows
DILATE    = 5        # half-width (rows) of the box kernel used to widen positive labels; 0 disables
NEG_RATIO = 1.0      # negatives requested per positive window
NON_OVERLAP = True   # when False, make_non_overlapping() returns its input unchanged
HARD_NEG_Q = 0.75    # activity quantile over negative windows; windows at/above it are "hard" negatives

# Every column except "label" is treated as a feature.
df = pd.read_csv(SRC)
FEATURES = [c for c in df.columns if c != "label"]
X = df[FEATURES].astype(np.float32).values
y = df["label"].astype(np.int64).values
del df; gc.collect()  # only the arrays are needed from here on

# Widen positive labels: convolving with 2*DILATE+1 ones marks every row that has
# a positive row within DILATE rows of it.
if DILATE > 0:
    k = np.ones(2*DILATE+1, dtype=np.int32)
    y = (np.convolve(y, k, mode='same') > 0).astype(np.int64)

# Number of full-length windows, and the candidate start rows (every STRIDE-th one).
n_seq = len(X) - SEQ_LEN + 1
starts = np.arange(0, n_seq, STRIDE, dtype=np.int64)

from numpy.lib.stride_tricks import sliding_window_view
# A window is positive when any row inside it is positive.
y_win_full = (sliding_window_view(y, SEQ_LEN).max(axis=1)).astype(np.int64)
y_win_sel = y_win_full[starts]

pos_idx = starts[y_win_sel==1].copy()
neg_idx = starts[y_win_sel==0].copy()

# Activity score per candidate window: absolute first differences averaged over
# features, then averaged over the SEQ_LEN-1 differences that fall inside the window.
diff_act = None
try:
    diffs = np.abs(np.diff(X, axis=0))
    act = diffs.mean(axis=1)
    act_win = sliding_window_view(act, SEQ_LEN-1).mean(axis=1)
    diff_act = act_win[starts]
except Exception:
    # NOTE(review): any failure is swallowed here; diff_act then stays None and the
    # sampling code below falls back to uniform random negatives without any message.
    pass

# Number of negatives to draw is NEG_RATIO times the number of positive windows (at least 1).
n_pos = len(pos_idx)
if n_pos == 0:
    raise RuntimeError("پنجره‌ی مثبت ندارید؛ SEQ_LEN را کم یا DILATE را زیاد کنید.")
n_neg = int(max(1, math.ceil(NEG_RATIO * n_pos)))

# Prefer "hard" negatives: negative windows whose activity score is at or above the
# HARD_NEG_Q quantile of all negative-window scores.
# NOTE(review): np.random.choice(..., replace=False) raises ValueError when `size`
# exceeds the population, which would happen if n_neg is larger than len(neg_idx).
if diff_act is not None:
    neg_scores = diff_act[y_win_sel==0]
    thr = np.quantile(neg_scores, HARD_NEG_Q)
    hard_neg = neg_idx[neg_scores >= thr]
    if len(hard_neg) >= n_neg:
        sel_neg = np.random.choice(hard_neg, size=n_neg, replace=False)
    else:
        # Not enough hard negatives: keep them all and top up from the remaining negatives.
        rest = np.setdiff1d(neg_idx, hard_neg)
        add = np.random.choice(rest, size=max(0, n_neg-len(hard_neg)), replace=False)
        sel_neg = np.concatenate([hard_neg, add])
else:
    # No activity scores available: sample negatives uniformly.
    sel_neg = np.random.choice(neg_idx, size=n_neg, replace=False)

def make_non_overlapping(idxs):
    """Greedily thin window start rows so the kept windows do not overlap.

    Starts are visited in ascending order; a start is kept when it lies at or
    beyond the end (start + SEQ_LEN) of the previously kept window. When the
    module-level NON_OVERLAP flag is false the input is returned untouched.

    Returns:
        An int64 array of kept starts (ascending), or `idxs` itself when
        NON_OVERLAP is false.
    """
    if NON_OVERLAP:
        kept = []
        boundary = -1  # end of the most recently kept window
        for start in np.sort(idxs):
            if start < boundary:
                continue
            kept.append(start)
            boundary = start + SEQ_LEN
        return np.array(kept, dtype=np.int64)
    return idxs

# Thin both classes (a no-op when NON_OVERLAP is False), then truncate each to the
# size of the smaller one so the two classes are balanced.
pos_idx = make_non_overlapping(pos_idx)
sel_neg = make_non_overlapping(sel_neg)
n = min(len(pos_idx), len(sel_neg))
pos_idx = pos_idx[:n]; sel_neg = sel_neg[:n]

# Concatenate positives then negatives with matching labels, and shuffle both with one permutation.
starts_final = np.concatenate([pos_idx, sel_neg])
labels_final = np.concatenate([np.ones(len(pos_idx), dtype=np.int64),
                               np.zeros(len(sel_neg), dtype=np.int64)])
order = np.random.permutation(len(starts_final))
starts_final = starts_final[order]; labels_final = labels_final[order]

def aggregate_window(feat_block):
    """Collapse one window (rows x features) into a flat vector of per-feature statistics.

    The result concatenates nine per-feature groups in this order: mean,
    std (ddof=0), min, max, last row, last minus first row, degree-1 polyfit
    slope against the row index, interquartile range, and the sum of absolute
    first differences.
    """
    steps = np.arange(len(feat_block))
    head, tail = feat_block[0], feat_block[-1]
    upper_q = np.quantile(feat_block, 0.75, axis=0)
    lower_q = np.quantile(feat_block, 0.25, axis=0)
    stats = (
        feat_block.mean(axis=0),
        feat_block.std(axis=0, ddof=0),
        feat_block.min(axis=0),
        feat_block.max(axis=0),
        tail,
        tail - head,
        np.polyfit(steps, feat_block, deg=1)[0],
        upper_q - lower_q,
        np.abs(np.diff(feat_block, axis=0)).sum(axis=0),
    )
    return np.concatenate(stats, axis=0)

# One aggregated feature vector per selected window.
blocks = [aggregate_window(X[s:s+SEQ_LEN, :]) for s in starts_final]
X_agg = np.stack(blocks, axis=0)

# Column names follow the concatenation order of aggregate_window:
# all features for the first statistic, then all features for the next, and so on.
cols = []
for prefix in ["mean","std","min","max","last","delta","slope","iqr","dsum"]:
    cols += [f"{prefix}_{f}" for f in FEATURES]
df_out = pd.DataFrame(X_agg, columns=cols)
df_out["label"] = labels_final

from sklearn.model_selection import train_test_split
# 70% train; the remaining 30% is split into 1/3 validation and 2/3 test, stratified by label.
train_df, temp_df = train_test_split(df_out, test_size=0.3, random_state=42, stratify=df_out["label"])
val_df, test_df   = train_test_split(temp_df, test_size=2/3, random_state=42, stratify=temp_df["label"])

# Write the three splits next to the preprocessed input and print a short summary.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
train_path = f"{OUT_DIR}/wadi_unswized_train.csv"
val_path   = f"{OUT_DIR}/wadi_unswized_val.csv"
test_path  = f"{OUT_DIR}/wadi_unswized_test.csv"
train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)
print("Saved:", train_path, val_path, test_path)
print("Shapes:", train_df.shape, val_df.shape, test_df.shape)
print("Class ratios (train/val/test):", train_df['label'].mean(), val_df['label'].mean(), test_df['label'].mean())


Saved: ./wadi_preprocessed/wadi_unswized_train.csv ./wadi_preprocessed/wadi_unswized_val.csv ./wadi_preprocessed/wadi_unswized_test.csv
Shapes: (264, 838) (38, 838) (76, 838)
Class ratios (train/val/test): 0.5 0.5 0.5


In [3]:
import os, re, sys, math, time, gc, subprocess, warnings
from pathlib import Path
from typing import Optional, List, Tuple
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Seed NumPy and PyTorch (and all CUDA devices when present) for repeatable runs.
SEED=42
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
DEVICE="cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# ====================== Config ======================
KAGGLE_SLUG = "giovannimonco/wadi-data"   # dataset identifier passed to kagglehub.dataset_download
RAW_OUT_DIR = "./wadi_preprocessed"
Path(RAW_OUT_DIR).mkdir(parents=True, exist_ok=True)
TRAIN_CSV = f"{RAW_OUT_DIR}/wadi_train_preprocessed.csv"
TEST_CSV  = f"{RAW_OUT_DIR}/wadi_test_preprocessed.csv"

# UNSW-ization
SEQ_LEN=60          # rows per window
STRIDE=5            # step between candidate window start rows
DILATE=5            # half-width (rows) of the label-dilation kernel; 0 disables
NEG_RATIO=1.0       # negatives requested per positive window
NON_OVERLAP=True    # when False, make_non_overlapping() returns its input unchanged
HARD_NEG_Q=0.75     # activity quantile over negative windows that defines "hard" negatives

# Model+train
MODEL_TYPE="trf_bilstm_sensors"   # "mlp" or "trf_bilstm_sensors"
BATCH_SIZE=256 if DEVICE=="cpu" else 512
EPOCHS=15
LR=3e-4
DROPOUT=0.1

# Transformer across sensors
TR_DIM=128          # d_model passed to TrBiLSTM_Sensors
NHEAD=4
N_LAY_TR=2
LSTM_H=128
N_LAY_LSTM=1

# Checkpoint location; the file name embeds MODEL_TYPE.
CKPT_DIR="./bilstm_trf_checkpoints"; os.makedirs(CKPT_DIR, exist_ok=True)
CKPT=f"{CKPT_DIR}/best_unswized_{MODEL_TYPE}.pt"

# ====================== Preprocess (WADI.ipynb-like) ======================
def strip_and_dedup(df):
    """Normalise column names and drop repeated columns.

    Each name is converted to str, stripped, and has internal whitespace runs
    collapsed to a single space. The cleaned names are assigned onto the
    passed-in frame (so the argument is modified); the returned frame keeps
    only the first column for each resulting name.
    """
    cleaned = []
    for raw in df.columns:
        cleaned.append(re.sub(r"\s+", " ", str(raw).strip()))
    df.columns = cleaned
    first_seen = ~df.columns.duplicated()
    return df.loc[:, first_seen]

def read_train_csv(p):
    """Read the training CSV and clean its column names.

    When the lower-cased file name contains "14days.csv" but not "new", the
    first 4 rows are skipped; otherwise the file is read from its first row.
    """
    fname = Path(p).name.lower()
    skip_preamble = "14days.csv" in fname and "new" not in fname
    if skip_preamble:
        frame = pd.read_csv(p, skiprows=4, low_memory=False)
    else:
        frame = pd.read_csv(p, low_memory=False)
    return strip_and_dedup(frame)

def read_test_csv(p):
    """Read the attack-period CSV, making sure real headers and numeric dtypes are in place.

    Files whose lower-cased name contains "lable" are read with the first row
    skipped; if that read fails the file is re-read without skipping. If the
    first data row still looks like a header (any cell mentions Row, Date,
    Time or Attack), that row is promoted to the column names.

    After such a promotion pandas has already typed every affected column as
    object (the header text was part of the data), so columns whose remaining
    values all parse as numbers are converted back to numeric. Without this,
    a later `is_numeric_dtype` filter would reject every sensor column.

    Returns:
        DataFrame with cleaned, de-duplicated column names.
    """
    name=Path(p).name.lower()
    try:
        df=pd.read_csv(p, skiprows=1, low_memory=False) if "lable" in name else pd.read_csv(p, low_memory=False)
    except Exception:
        # Best-effort fallback kept from the original: retry without skipping rows.
        df=pd.read_csv(p, low_memory=False)
    df=strip_and_dedup(df)
    # Guard against an empty frame, where iloc[0] would raise IndexError.
    if len(df)>0 and df.iloc[0].astype(str).str.contains("Row|Date|Time|Attack",case=False,regex=True).any():
        df.columns=df.iloc[0].astype(str).tolist()
        df=strip_and_dedup(df.iloc[1:].reset_index(drop=True)).copy()
        for c in df.columns:
            if pd.api.types.is_numeric_dtype(df[c]):
                continue
            as_num=pd.to_numeric(df[c], errors="coerce")
            # Convert only when no non-missing value was lost, i.e. the column is truly numeric.
            if as_num.notna().sum()==df[c].notna().sum():
                df[c]=as_num
    return df

def extract_label_from_test(df):
    """Build a 0/1 attack label Series named "label" from the test frame.

    The first column whose name matches label/lable/attack (case-insensitive)
    and whose non-missing numeric values are all in {-1, 0, 1} is used, with
    -1 mapped to 1 (attack) and both 1 and 0 mapped to 0. Rows where that
    column is missing or non-numeric are labelled 0; previously such rows
    produced NaN and the final int64 cast raised. If no column qualifies,
    an all-zero Series is returned.

    Returns:
        int64 Series named "label" with one entry per row of `df`.
    """
    cands=[c for c in df.columns if re.search(r"(label|lable|attack)",str(c),flags=re.I)]
    lab=None
    for c in cands:
        col=pd.to_numeric(df[c], errors="coerce")
        u=set(col.dropna().unique().tolist())
        if len(u)>0 and u.issubset({-1,0,1}):
            # .map leaves NaN for missing inputs; fill them so the int cast below cannot fail.
            lab=col.map({1:0,-1:1,0:0}).fillna(0)
            break
    if lab is None:
        lab=pd.Series(np.zeros(len(df),dtype=np.int64), name="label")
    return lab.astype("int64").rename("label")

def numeric_common(train_df, test_df):
    """Select the numeric columns shared by both frames and fill their gaps.

    Columns named in the local `drop` set are excluded. For each frame the
    shared columns are copied, placeholder strings are replaced by NaN, every
    column is coerced to numeric, and gaps are filled by interpolation in both
    directions followed by forward and backward fill.

    Returns:
        (train_features, test_features, sorted list of shared column names).

    Raises:
        RuntimeError: if the two frames share no numeric column.
    """
    drop={"Date","Time","datetime","timestamp","Row","row","index"}

    def _numeric_names(frame):
        return {c for c in frame.columns
                if pd.api.types.is_numeric_dtype(frame[c]) and c not in drop}

    feats=sorted(_numeric_names(train_df) & _numeric_names(test_df))
    if not feats:
        raise RuntimeError("No shared numeric columns.")

    def _clean(frame):
        out=frame[feats].copy()
        out=out.replace(["?","NA","NaN","nan",""], np.nan)
        for c in out.columns:
            out[c]=pd.to_numeric(out[c], errors="coerce")
        out=out.interpolate(limit_direction="both")
        return out.ffill().bfill()

    return _clean(train_df), _clean(test_df), feats

def safe_scale(Xtr, Xte, eps=1e-8):
    """Standardise both frames with training statistics, dropping near-constant columns.

    Columns whose training standard deviation (ddof=0) is not above `eps` are
    dropped from both frames. The rest are centred and scaled by the training
    mean and standard deviation, cast to float32, and any inf/NaN is replaced by 0.

    Returns:
        (scaled_train, scaled_test, list of kept column names).
    """
    std=Xtr.std(0, ddof=0)
    keep=std[std>eps].index.tolist()
    if len(keep)<len(std):
        print(f"[info] dropped {len(std)-len(keep)} low-variance cols")
    mu=Xtr[keep].mean(0)
    sd=Xtr[keep].std(0, ddof=0).clip(lower=eps)

    def _standardise(frame):
        z=(frame[keep]-mu)/sd
        z=z.astype("float32")
        return z.replace([np.inf,-np.inf],np.nan).fillna(0.0)

    return _standardise(Xtr), _standardise(Xte), keep

def find_file(cands, root):
    """Locate the first file under `root` matching one of the candidate names.

    Three passes are tried in order:
      1. `root/<candidate>` exists, checked in candidate order.
      2. A file anywhere under `root` whose lower-cased name equals a lower-cased candidate.
      3. A file anywhere under `root` whose lower-cased name, with "_" and "-"
         removed, contains a candidate treated the same way.

    Returns:
        The matching path as a string, or None when nothing matches.
    """
    def _squash(text):
        return text.replace("_","").replace("-","")

    low=[c.lower() for c in cands]

    # Pass 1: direct hit in the root directory.
    for c in cands:
        direct=Path(root)/c
        if direct.exists():
            return str(direct)

    # Pass 2: exact name match ignoring case, anywhere below root.
    for folder,_,files in os.walk(root):
        for fname in files:
            if fname.lower() in low:
                return str(Path(folder)/fname)

    # Pass 3: substring match after removing separators.
    for folder,_,files in os.walk(root):
        for fname in files:
            squashed=_squash(fname.lower())
            if any(_squash(cand) in squashed for cand in low):
                return str(Path(folder)/fname)
    return None

def download_or_cache():
    """Fetch the dataset named by KAGGLE_SLUG through kagglehub.

    If importing kagglehub fails, it is pip-installed into the running
    interpreter and imported again.

    Returns:
        Whatever `kagglehub.dataset_download` returns (used by the caller as a
        directory to search).
    """
    try:
        import kagglehub
    except Exception:
        install_cmd=[sys.executable,"-m","pip","install","-q","kagglehub"]
        subprocess.check_call(install_cmd)
        import kagglehub
    print("➜ kagglehub...")
    p=kagglehub.dataset_download(KAGGLE_SLUG)
    print("[ok]", p)
    return p

def ensure_preprocessed():
    """Create TRAIN_CSV and TEST_CSV unless both already exist.

    Downloads the dataset, locates the training and attack CSVs, extracts the
    attack label, keeps the numeric columns common to both files, standardises
    them with training statistics and writes both frames with a "label" column
    (all zeros for the training file).

    Raises:
        FileNotFoundError: if either source CSV cannot be located.
    """
    already_built=os.path.exists(TEST_CSV) and os.path.exists(TRAIN_CSV)
    if already_built:
        print("✓ CSVهای از قبل موجودند.")
        return

    print("➜ دانلود WADI ...")
    root=download_or_cache()
    tr=find_file(["WADI_14days_new.csv","WADI_14days.csv"], root)
    te=find_file(["WADI_attackdataLABLE.csv","WADI_attackdata.csv"], root)
    if not (tr and te):
        raise FileNotFoundError("WADI CSVs not found")
    print("✓", tr)
    print("✓", te)

    tr_df=read_train_csv(tr)
    te_df=read_test_csv(te)
    ytest=extract_label_from_test(te_df)
    Xtr_raw,Xte_raw,feats=numeric_common(tr_df, te_df)
    Xtr,Xte,feats_kept=safe_scale(Xtr_raw, Xte_raw)

    tr_out=Xtr.copy()
    tr_out["label"]=0
    te_out=Xte.copy()
    te_out["label"]=ytest.values
    tr_out.to_csv(TRAIN_CSV,index=False)
    te_out.to_csv(TEST_CSV,index=False)
    print("✓ ساخته شد:", TRAIN_CSV, TEST_CSV, "| n_feats:", len(feats_kept))

ensure_preprocessed()

# ====================== UNSW-ize WADI ======================
# Load the preprocessed attack-period CSV; every column except "label" is a feature.
SRC=TEST_CSV
df=pd.read_csv(SRC)
FEATURES=[c for c in df.columns if c!="label"]
X=df[FEATURES].astype(np.float32).values
y=df["label"].astype(np.int64).values
del df; gc.collect()

# Widen positive labels: every row with a positive row within DILATE rows becomes positive.
if DILATE>0:
    k=np.ones(2*DILATE+1, dtype=np.int32)
    y=(np.convolve(y, k, mode="same")>0).astype(np.int64)

# Candidate window starts (every STRIDE-th row); a window is positive if any row in it is.
n_seq=len(X)-SEQ_LEN+1
from numpy.lib.stride_tricks import sliding_window_view
starts=np.arange(0, n_seq, STRIDE, dtype=np.int64)
y_win_full=(sliding_window_view(y, SEQ_LEN).max(axis=1)).astype(np.int64)
y_sel=y_win_full[starts]
pos_idx=starts[y_sel==1].copy()
neg_idx=starts[y_sel==0].copy()

# Activity score per candidate window: absolute first differences averaged over
# features, then over the SEQ_LEN-1 differences inside the window.
diff_act=None
try:
    diffs=np.abs(np.diff(X,axis=0))
    act=diffs.mean(axis=1)
    act_win=sliding_window_view(act, SEQ_LEN-1).mean(axis=1)
    diff_act=act_win[starts]
except Exception:
    # NOTE(review): failures are swallowed; diff_act stays None and sampling below
    # silently falls back to uniform random negatives.
    pass

n_pos=len(pos_idx)
if n_pos==0: raise RuntimeError("پنجره‌ی مثبت ندارید؛ SEQ_LEN را کم یا DILATE را زیاد کنید.")
n_neg=int(max(1, math.ceil(NEG_RATIO*n_pos)))

# Prefer negatives whose activity is at/above the HARD_NEG_Q quantile of negative-window scores.
# NOTE(review): np.random.choice(..., replace=False) raises ValueError if `size`
# exceeds the population, e.g. when n_neg is larger than len(neg_idx).
if diff_act is not None:
    neg_scores=diff_act[y_sel==0]
    thr=np.quantile(neg_scores, HARD_NEG_Q)
    hard_neg=neg_idx[neg_scores>=thr]
    if len(hard_neg)>=n_neg:
        sel_neg=np.random.choice(hard_neg, size=n_neg, replace=False)
    else:
        # Keep all hard negatives and top up from the remaining negatives.
        rest=np.setdiff1d(neg_idx, hard_neg)
        add=np.random.choice(rest, size=max(0,n_neg-len(hard_neg)), replace=False)
        sel_neg=np.concatenate([hard_neg, add])
else:
    sel_neg=np.random.choice(neg_idx, size=n_neg, replace=False)

def make_non_overlapping(idxs):
    """Greedily thin window start rows so the kept windows do not overlap.

    Starts are visited in ascending order; a start is kept when it lies at or
    beyond the end (start + SEQ_LEN) of the previously kept window. When the
    module-level NON_OVERLAP flag is false the input is returned untouched.

    Returns:
        An int64 array of kept starts (ascending), or `idxs` itself when
        NON_OVERLAP is false.
    """
    if NON_OVERLAP:
        kept=[]
        boundary=-1  # end of the most recently kept window
        for start in np.sort(idxs):
            if start<boundary:
                continue
            kept.append(start)
            boundary=start+SEQ_LEN
        return np.array(kept, dtype=np.int64)
    return idxs

# Thin both classes (a no-op when NON_OVERLAP is False), then truncate each to the
# size of the smaller one so the classes are balanced.
pos_idx=make_non_overlapping(pos_idx)
sel_neg=make_non_overlapping(sel_neg)
m=min(len(pos_idx), len(sel_neg))
pos_idx=pos_idx[:m]; sel_neg=sel_neg[:m]

# Positives first, then negatives, with matching labels; both shuffled by one permutation.
starts_final=np.concatenate([pos_idx, sel_neg])
labels_final=np.concatenate([np.ones(len(pos_idx),dtype=np.int64), np.zeros(len(sel_neg),dtype=np.int64)])
perm=np.random.permutation(len(starts_final))
starts_final=starts_final[perm]; labels_final=labels_final[perm]

def aggregate_window(block):
    """Collapse one window (rows x features) into a flat vector of per-feature statistics.

    The result concatenates nine per-feature groups in this order: mean,
    std (ddof=0), min, max, last row, last minus first row, degree-1 polyfit
    slope against the row index, interquartile range, and the sum of absolute
    first differences.
    """
    steps=np.arange(len(block))
    head, tail = block[0], block[-1]
    upper_q=np.quantile(block,0.75,axis=0)
    lower_q=np.quantile(block,0.25,axis=0)
    stats=(
        block.mean(0),
        block.std(0,ddof=0),
        block.min(0),
        block.max(0),
        tail,
        tail-head,
        np.polyfit(steps, block, deg=1)[0],
        upper_q-lower_q,
        np.abs(np.diff(block,axis=0)).sum(0),
    )
    return np.concatenate(stats, axis=0)

# One aggregated feature vector per selected window.
blocks=[aggregate_window(X[s:s+SEQ_LEN,:]) for s in starts_final]
X_agg=np.stack(blocks, axis=0)

# Column names follow the concatenation order of aggregate_window:
# all features for the first statistic, then all features for the next, and so on.
stat_prefixes=["mean","std","min","max","last","delta","slope","iqr","dsum"]
cols=[]
for pfx in stat_prefixes:
    cols += [f"{pfx}_{f}" for f in FEATURES]
df_out=pd.DataFrame(X_agg, columns=cols); df_out["label"]=labels_final

from sklearn.model_selection import train_test_split
# 70% train; the remaining 30% is split into 1/3 validation and 2/3 test, stratified by label.
train_df, tmp_df = train_test_split(df_out, test_size=0.3, random_state=SEED, stratify=df_out["label"])
val_df, test_df  = train_test_split(tmp_df, test_size=2/3, random_state=SEED, stratify=tmp_df["label"])

# Persist the three splits and print a short summary.
UNSW_TRAIN=f"{RAW_OUT_DIR}/wadi_unswized_train.csv"
UNSW_VAL  =f"{RAW_OUT_DIR}/wadi_unswized_val.csv"
UNSW_TEST =f"{RAW_OUT_DIR}/wadi_unswized_test.csv"
train_df.to_csv(UNSW_TRAIN, index=False); val_df.to_csv(UNSW_VAL, index=False); test_df.to_csv(UNSW_TEST, index=False)
print("UNSW-ized CSVs:", UNSW_TRAIN, UNSW_VAL, UNSW_TEST)
print("Shapes:", train_df.shape, val_df.shape, test_df.shape)
print("Label ratio train/val/test:", train_df["label"].mean(), val_df["label"].mean(), test_df["label"].mean())

# ====================== Scaling ======================
# Split each frame into a float32 feature matrix and an int64 label vector.
all_cols=[c for c in train_df.columns if c!="label"]
Xtr=train_df[all_cols].values.astype(np.float32)
Xva=val_df[all_cols].values.astype(np.float32)
Xte=test_df[all_cols].values.astype(np.float32)
ytr=train_df["label"].values.astype(np.int64)
yva=val_df["label"].values.astype(np.int64)
yte=test_df["label"].values.astype(np.int64)

# Standardise all three splits with the training split's mean and std (std floored at 1e-8).
mu=Xtr.mean(0); sd=Xtr.std(0, ddof=0).clip(min=1e-8)
Xtr=((Xtr-mu)/sd).astype(np.float32)
Xva=((Xva-mu)/sd).astype(np.float32)
Xte=((Xte-mu)/sd).astype(np.float32)

# ====================== Datasets for two model types ======================
sensor_names=FEATURES[:]  # original sensor list
S=len(sensor_names); D=len(stat_prefixes)  # number of sensors and number of statistics per sensor

def build_sensor_token_tensor(flat_row: np.ndarray):
    """Reshape one flat feature row into a per-sensor token matrix.

    Each name in the module-level `all_cols` is split at its first underscore
    into (statistic prefix, sensor name). For every sensor in `sensor_names`
    that has a column for each prefix in `stat_prefixes`, one token is built
    from that sensor's values, ordered like `stat_prefixes`. Sensors missing
    any statistic are skipped.

    Returns:
        Array of shape [n_tokens, len(stat_prefixes)], or None when no sensor
        has a complete set of statistics.
    """
    col_of={}
    for pos, col in enumerate(all_cols):
        stat, _, sensor = col.partition("_")
        col_of.setdefault(sensor, {})[stat]=pos

    rows=[]
    for sensor in sensor_names:
        per_stat=col_of.get(sensor)
        if per_stat is None:
            continue
        picks=[per_stat.get(stat) for stat in stat_prefixes]
        if None in picks:
            continue
        rows.append(flat_row[picks])

    if not rows:
        return None
    return np.stack(rows, axis=0)  # [S_eff, D]

class DSMLP(Dataset):
    """Dataset over a flat feature matrix; items are (feature_row, label) with labels cast to float."""

    def __init__(self, X, y):
        self.X=torch.from_numpy(X)
        self.y=torch.from_numpy(y).float()

    def __len__(self):
        return len(self.y)

    def __getitem__(self,i):
        features=self.X[i]
        target=self.y[i]
        return features, target

class DSSensors(Dataset):
    """Dataset of per-sensor token matrices built from flat feature rows.

    Each row of `X` is converted with `build_sensor_token_tensor`; rows for
    which it returns None are dropped together with their own labels. The
    earlier version truncated the label vector from the tail instead, which
    paired the remaining samples with the wrong labels whenever a row that was
    not the last one was dropped.

    Items are (float32 token tensor, float label).
    """

    def __init__(self, X, y):
        labels=torch.from_numpy(y).float()
        xs, kept = [], []
        for i in range(len(X)):
            T=build_sensor_token_tensor(X[i])
            if T is None: continue
            xs.append(torch.from_numpy(T.astype(np.float32)))
            kept.append(i)  # remember which rows survived so labels stay aligned
        self.X=xs
        if len(kept)!=len(labels):
            labels=labels[torch.tensor(kept, dtype=torch.long)]
        self.y=labels

    def __len__(self): return len(self.y)

    def __getitem__(self,i): return self.X[i], self.y[i]

# Flat rows for the MLP, per-sensor token matrices for the sensor model.
if MODEL_TYPE=="mlp":
    tr_ds=DSMLP(Xtr, ytr); va_ds=DSMLP(Xva, yva); te_ds=DSMLP(Xte, yte)
else:
    tr_ds=DSSensors(Xtr, ytr); va_ds=DSSensors(Xva, yva); te_ds=DSSensors(Xte, yte)

# Loaders with the default collate function; only the training loader shuffles.
# NOTE(review): for the non-MLP model these three loaders are rebuilt with
# pad_collate in the model-selection code further down this cell.
train_loader=DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=None)
val_loader  =DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=None)
test_loader =DataLoader(te_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=None)

# ====================== Models ======================
class MLP(nn.Module):
    """Feed-forward classifier: in_dim -> 512 -> 256 -> 1 with ReLU and dropout after each hidden layer.

    `forward` returns one value per input row (the trailing size-1 dimension is squeezed).
    """

    def __init__(self, in_dim, p=0.2):
        super().__init__()
        layers=[]
        width_in=in_dim
        for width_out in (512, 256):
            layers += [nn.Linear(width_in, width_out), nn.ReLU(), nn.Dropout(p)]
            width_in=width_out
        layers.append(nn.Linear(width_in, 1))
        self.net=nn.Sequential(*layers)

    def forward(self,x):
        out=self.net(x)
        return out.squeeze(1)

class SinusoidalPositionalEncoding(nn.Module):
    """Adds a fixed sine/cosine positional table to a [batch, length, d_model] input.

    Even feature indices hold sines and odd indices hold cosines of
    position * frequency. The table is stored as the non-trainable buffer
    "pe" with shape [1, max_len, d_model].

    An odd `d_model` is supported: there is then one fewer cosine column than
    sine columns, so the cosine frequencies are truncated to match. The
    earlier version raised a shape-mismatch error in that case.
    """

    def __init__(self, d_model, max_len=2048):
        super().__init__()
        pe=torch.zeros(max_len, d_model, dtype=torch.float32)
        pos=torch.arange(0,max_len,dtype=torch.float32).unsqueeze(1)
        div=torch.exp(torch.arange(0,d_model,2,dtype=torch.float32)*(-math.log(10000.0)/d_model))
        angles=pos*div
        pe[:,0::2]=torch.sin(angles)
        # Odd columns number d_model//2, which is one fewer than the even columns when d_model is odd.
        pe[:,1::2]=torch.cos(angles[:, :d_model//2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self,x):
        # Slice the table to the input's length (dimension 1) and broadcast over the batch.
        return x + self.pe[:, :x.size(1), :]

class TrEncLayer(nn.Module):
    """One encoder layer: self-attention, then a GELU feed-forward block.

    Each sub-block is followed by dropout, a residual add and LayerNorm
    (normalisation is applied after the residual sum).

    Args:
        d: model width.
        nhead: number of attention heads.
        ff: hidden width of the feed-forward block.
        p: dropout probability (also passed to the attention module).
    """

    def __init__(self,d,nhead,ff,p):
        super().__init__()
        self.attn=nn.MultiheadAttention(d,nhead,dropout=p,batch_first=True)
        self.l1=nn.Linear(d,ff)
        self.l2=nn.Linear(ff,d)
        self.n1=nn.LayerNorm(d)
        self.n2=nn.LayerNorm(d)
        self.d1=nn.Dropout(p)
        self.d2=nn.Dropout(p)
        self.act=nn.GELU()

    def forward(self,x):
        attended,_=self.attn(x,x,x,need_weights=False)
        x=self.n1(x + self.d1(attended))
        # d1 is applied a second time here, inside the feed-forward block.
        hidden=self.d1(self.act(self.l1(x)))
        ff_out=self.d2(self.l2(hidden))
        return self.n2(x + ff_out)

class TrEnc(nn.Module):
    """Stack of `nlayer` TrEncLayer modules applied one after another."""

    def __init__(self,d,nhead,nlayer,ff,p):
        super().__init__()
        stack=[]
        for _ in range(nlayer):
            stack.append(TrEncLayer(d,nhead,ff,p))
        self.layers=nn.ModuleList(stack)

    def forward(self,x):
        out=x
        for layer in self.layers:
            out=layer(out)
        return out

class TrBiLSTM_Sensors(nn.Module):
    """Token-sequence classifier: linear projection, positional encoding,
    encoder stack, bidirectional LSTM, mean pooling over tokens, linear head.

    Args:
        token_dim: size of each input token.
        d_model: width after projection and inside the encoder.
        nhead: attention heads per encoder layer.
        nlay_tr: number of encoder layers.
        lstm_h: LSTM hidden size per direction.
        nlay_lstm: number of LSTM layers (LSTM dropout is used only when > 1).
        p: dropout probability.
    """

    def __init__(self, token_dim, d_model, nhead, nlay_tr, lstm_h, nlay_lstm, p=0.1):
        super().__init__()
        lstm_dropout=p if nlay_lstm>1 else 0.0
        self.proj=nn.Linear(token_dim, d_model)
        self.pos =SinusoidalPositionalEncoding(d_model)
        self.enc =TrEnc(d_model, nhead, nlay_tr, d_model*2, p)
        self.lstm=nn.LSTM(d_model, lstm_h, num_layers=nlay_lstm, batch_first=True,
                          bidirectional=True, dropout=lstm_dropout)
        self.drop=nn.Dropout(p)
        self.fc  =nn.Linear(lstm_h*2, 1)

    def forward(self, x):
        tokens=self.pos(self.proj(x))      # [B,S,Dm]
        encoded=self.enc(tokens)
        seq_out,_=self.lstm(encoded)
        pooled=self.drop(seq_out.mean(1))  # mean over sensors
        return self.fc(pooled).squeeze(1)

# collate for variable S across items (rare if sensors fixed)
def pad_collate(batch):
    """Zero-pad token matrices to the longest one in the batch and move the batch to DEVICE.

    Args:
        batch: sequence of (token_tensor, label) pairs.

    Returns:
        (padded float32 tensor [batch, longest, width], stacked labels), both on DEVICE.
    """
    xs, ys = zip(*batch)
    longest=max(tok.shape[0] for tok in xs)
    width=xs[0].shape[1]
    padded=torch.zeros(len(xs), longest, width, dtype=torch.float32)
    for row, tok in enumerate(xs):
        padded[row, :tok.shape[0], :]=tok
    labels=torch.stack(ys)
    return padded.to(DEVICE), labels.to(DEVICE)

# pick model
# Build the selected model on DEVICE. The sensor model also gets fresh loaders that
# use pad_collate, which pads each batch and moves it to DEVICE.
if MODEL_TYPE=="mlp":
    model=MLP(Xtr.shape[1], p=DROPOUT).to(DEVICE)
    collate_fn=None  # NOTE(review): assigned but not read anywhere in the visible code
else:
    model=TrBiLSTM_Sensors(token_dim=len(stat_prefixes), d_model=TR_DIM, nhead=NHEAD, nlay_tr=N_LAY_TR,
                           lstm_h=LSTM_H, nlay_lstm=N_LAY_LSTM, p=DROPOUT).to(DEVICE)
    train_loader=DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=pad_collate)
    val_loader  =DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=pad_collate)
    test_loader =DataLoader(te_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=pad_collate)
print(model); print("Total params:", sum(p.numel() for p in model.parameters()))

# AdamW with weight decay; the loss takes raw logits (sigmoid is applied inside the loss).
opt=torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
crit=nn.BCEWithLogitsLoss()

def run_epoch(loader, train=False):
    """Run one pass over `loader`, updating the model when `train` is true.

    Uses the module-level `model`, `crit`, `opt`, `MODEL_TYPE` and `DEVICE`.
    In training mode gradients are clipped to a norm of 1.0 before each
    optimiser step.

    Returns:
        (loss summed per item and divided by the dataset size, accuracy at
        threshold 0.5, F1 at threshold 0.5, label array, probability array).
    """
    if train:
        model.train()
    else:
        model.eval()

    loss_sum=0.0
    label_chunks, prob_chunks = [], []
    for batch in loader:
        if MODEL_TYPE=="mlp":
            xb=batch[0].to(DEVICE)
            yb=batch[1].to(DEVICE)
        else:
            # Batches from pad_collate are already on DEVICE.
            xb, yb = batch
        with torch.set_grad_enabled(train):
            logit=model(xb)
            loss=crit(logit, yb)
            if train:
                opt.zero_grad(set_to_none=True)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                opt.step()
        loss_sum += loss.item()*len(yb)
        prob_chunks.append(torch.sigmoid(logit).detach().cpu().numpy())
        label_chunks.append(yb.detach().cpu().numpy())

    y=np.concatenate(label_chunks)
    p=np.concatenate(prob_chunks)
    pred=(p>=0.5).astype(int)
    n_items=max(1,len(loader.dataset))
    return loss_sum/n_items, accuracy_score(y,pred), f1_score(y,pred,zero_division=0), y, p

def pick_threshold(y, p):
    """Pick the probability threshold that maximises F1 on (y, p).

    Candidates are 49 evenly spaced values in [0.01, 0.99] plus the matching
    quantiles of `p`, de-duplicated and sorted. The first candidate reaching
    the maximum F1 wins.

    Returns:
        (threshold, F1 at that threshold) as Python floats.
    """
    levels=np.linspace(0.01,0.99,49)
    from_scores=np.quantile(p, np.linspace(0.01,0.99,49))
    grid=np.unique(np.concatenate([levels, from_scores]))
    f1s=[]
    for t in grid:
        f1s.append(f1_score(y,(p>=t).astype(int),zero_division=0))
    i=int(np.argmax(f1s))
    return float(grid[i]), float(f1s[i])

# Train with early stopping on the best validation F1 over tuned thresholds:
# stop after `patience` consecutive epochs without improvement.
best_f1, best_thr, patience, noimp = -1.0, 0.5, 3, 0
for e in range(1, EPOCHS+1):
    t0=time.time()
    tr_loss,tr_acc,tr_f1,_,_=run_epoch(train_loader, True)
    va_loss,va_acc,va_f1,yv,pv=run_epoch(val_loader, False)
    # Tune the decision threshold on this epoch's validation probabilities.
    thr_opt,f1_opt=pick_threshold(yv,pv)
    print(f"Epoch {e:02d}/{EPOCHS} | {time.time()-t0:.1f}s | train {tr_loss:.4f}/{tr_acc:.3f}/{tr_f1:.3f} | val f1@0.5 {va_f1:.3f} | best f1 {f1_opt:.3f} @ thr={thr_opt:.2f}")
    if f1_opt>best_f1:
        # New best: store the weights together with the threshold that produced it.
        best_f1, best_thr, noimp = f1_opt, thr_opt, 0
        torch.save({"model":model.state_dict(),"thr":best_thr}, CKPT)
        print("  ✓ Saved:", CKPT)
    else:
        noimp += 1
        if noimp >= patience:
            print("Early stopping."); break

# Restore the best checkpoint (weights and threshold) before evaluating.
ckpt=torch.load(CKPT, map_location=DEVICE)
model.load_state_dict(ckpt["model"]); best_thr=ckpt["thr"]

def evaluate(loader, thr):
    """Score `loader` with the module-level model and threshold its probabilities at `thr`.

    Returns:
        (accuracy, F1, label array, predicted 0/1 array).
    """
    model.eval()
    label_chunks, prob_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            if MODEL_TYPE=="mlp":
                xb=batch[0].to(DEVICE)
                yb=batch[1]
            else:
                xb, yb = batch
                yb=yb.cpu()  # labels are converted to NumPy below
            logits=model(xb.to(DEVICE))
            pr=torch.sigmoid(logits).cpu().numpy()
            label_chunks.append(yb.numpy())
            prob_chunks.append(pr)
    y=np.concatenate(label_chunks)
    p=np.concatenate(prob_chunks)
    pred=(p>=thr).astype(int)
    acc=accuracy_score(y,pred)
    f1=f1_score(y,pred,zero_division=0)
    return acc, f1, y, pred

# Final evaluation on the held-out test split using the threshold stored in the checkpoint.
acc,f1,y_true,y_pred=evaluate(test_loader, best_thr)
print("\n--- Test (UNSW-ized records) ---")
print({"acc": round(acc,4), "f1": round(f1,4), "thr": round(best_thr,2)})
print(confusion_matrix(y_true,y_pred))
print(classification_report(y_true,y_pred, target_names=["Normal (0)","Attack (1)"], zero_division=0))


Device: cuda
✓ CSVهای از قبل موجودند.
UNSW-ized CSVs: ./wadi_preprocessed/wadi_unswized_train.csv ./wadi_preprocessed/wadi_unswized_val.csv ./wadi_preprocessed/wadi_unswized_test.csv
Shapes: (264, 838) (38, 838) (76, 838)
Label ratio train/val/test: 0.5 0.5 0.5
TrBiLSTM_Sensors(
  (proj): Linear(in_features=9, out_features=128, bias=True)
  (pos): SinusoidalPositionalEncoding()
  (enc): TrEnc(
    (layers): ModuleList(
      (0-1): 2 x TrEncLayer(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (l1): Linear(in_features=128, out_features=256, bias=True)
        (l2): Linear(in_features=256, out_features=128, bias=True)
        (n1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (n2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (d1): Dropout(p=0.1, inplace=False)
        (d2): Dropout(p=0.1, inplace=False)
        (act): GELU(approximate='none')
    

In [4]:
import os, re, sys, math, time, gc, subprocess, warnings
from pathlib import Path
from typing import Optional, List, Tuple
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, precision_recall_curve

# ===================== Setup =====================
# Seed NumPy and PyTorch (and all CUDA devices when present) for repeatable runs.
SEED = 42
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# ===================== Config =====================
KAGGLE_SLUG = "giovannimonco/wadi-data"   # dataset identifier passed to kagglehub.dataset_download

OUT_DIR_RAW = "./wadi_preprocessed"
Path(OUT_DIR_RAW).mkdir(parents=True, exist_ok=True)
TRAIN_CSV = f"{OUT_DIR_RAW}/wadi_train_preprocessed.csv"
TEST_CSV  = f"{OUT_DIR_RAW}/wadi_test_preprocessed.csv"

# UNSW-ization (recordization)
SEQ_LEN     = 40      # rows per window
STRIDE      = 1       # step between candidate window start rows
DILATE      = 7       # half-width (rows) of the label-dilation kernel; 0 disables
NEG_RATIO   = 2.0     # negatives requested per positive window
NON_OVERLAP = False   # when False, make_non_overlapping() returns its input unchanged
HARD_NEG_Q  = 0.90    # activity quantile over attack-file negative windows for "hard" negatives
HARD_NEG_Q_TR = 0.90  # activity quantile over training-file windows for "hard" negatives
# NOTE(review): with STRIDE=1 and NON_OVERLAP=False neighbouring windows share most of
# their rows; confirm the later split keeps overlapping windows out of different splits.
VAL_SIZE = 0.15
TEST_SIZE = 0.15

# Model/training
MODEL_TYPE = "trf_bilstm_sensors"  # "mlp" or "trf_bilstm_sensors"
BATCH_SIZE = 512 if DEVICE == "cuda" else 256
EPOCHS     = 20
LR         = 3e-4
DROPOUT    = 0.15
WEIGHT_DECAY = 3e-4
MIN_PREC_AT_TUNE = 0.6
EARLY_PATIENCE = 4

# Transformer across sensors (token = per-sensor stats)
TR_DIM    = 192
NHEAD     = 4
N_LAY_TR  = 2
LSTM_H    = 192
N_LAY_LSTM= 1

# Checkpoint location; the file name embeds MODEL_TYPE.
CKPT_DIR = "./bilstm_trf_checkpoints"
Path(CKPT_DIR).mkdir(parents=True, exist_ok=True)
CKPT = f"{CKPT_DIR}/best_unswized_{MODEL_TYPE}.pt"

# ===================== Preprocess (WADI-like) =====================
def strip_and_dedup(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise column names and drop repeated columns.

    Each name is converted to str, stripped, and has internal whitespace runs
    collapsed to a single space. The cleaned names are assigned onto the
    passed-in frame (so the argument is modified); the returned frame keeps
    only the first column for each resulting name.
    """
    cleaned = []
    for raw in df.columns:
        cleaned.append(re.sub(r"\s+", " ", str(raw).strip()))
    df.columns = cleaned
    first_seen = ~df.columns.duplicated()
    return df.loc[:, first_seen]

def read_train_csv(p: str) -> pd.DataFrame:
    """Read the training CSV and clean its column names.

    When the lower-cased file name contains "14days.csv" but not "new", the
    first 4 rows are skipped; otherwise the file is read from its first row.
    """
    fname = Path(p).name.lower()
    skip_preamble = "14days.csv" in fname and "new" not in fname
    if skip_preamble:
        frame = pd.read_csv(p, skiprows=4, low_memory=False)
    else:
        frame = pd.read_csv(p, low_memory=False)
    return strip_and_dedup(frame)

def read_test_csv(p: str) -> pd.DataFrame:
    """Read the attack-period CSV, making sure real headers and numeric dtypes are in place.

    Files whose lower-cased name contains "lable" are read with the first row
    skipped; if that read fails the file is re-read without skipping. If the
    first data row still looks like a header (any cell mentions Row, Date,
    Time or Attack), that row is promoted to the column names.

    After such a promotion pandas has already typed every affected column as
    object (the header text was part of the data), so columns whose remaining
    values all parse as numbers are converted back to numeric. Without this,
    a later `is_numeric_dtype` filter would reject every sensor column.

    Returns:
        DataFrame with cleaned, de-duplicated column names.
    """
    name = Path(p).name.lower()
    try:
        df = pd.read_csv(p, skiprows=1, low_memory=False) if "lable" in name else pd.read_csv(p, low_memory=False)
    except Exception:
        # Best-effort fallback kept from the original: retry without skipping rows.
        df = pd.read_csv(p, low_memory=False)
    df = strip_and_dedup(df)
    # Guard against an empty frame, where iloc[0] would raise IndexError.
    if len(df) > 0 and df.iloc[0].astype(str).str.contains("Row|Date|Time|Attack", case=False, regex=True).any():
        df.columns = df.iloc[0].astype(str).tolist()
        df = strip_and_dedup(df.iloc[1:].reset_index(drop=True)).copy()
        for c in df.columns:
            if pd.api.types.is_numeric_dtype(df[c]):
                continue
            as_num = pd.to_numeric(df[c], errors="coerce")
            # Convert only when no non-missing value was lost, i.e. the column is truly numeric.
            if as_num.notna().sum() == df[c].notna().sum():
                df[c] = as_num
    return df

def extract_label_from_test(df: pd.DataFrame) -> pd.Series:
    """Build a 0/1 attack label Series named "label" from the test frame.

    The first column whose name matches label/lable/attack (case-insensitive)
    and whose non-missing numeric values are all in {-1, 0, 1} is used, with
    -1 mapped to 1 (attack) and both 1 and 0 mapped to 0. Rows where that
    column is missing or non-numeric are labelled 0; previously such rows
    produced NaN and the final int64 cast raised. If no column qualifies,
    an all-zero Series is returned.

    Returns:
        int64 Series named "label" with one entry per row of `df`.
    """
    cands = [c for c in df.columns if re.search(r"(label|lable|attack)", str(c), flags=re.I)]
    lab = None
    for c in cands:
        col = pd.to_numeric(df[c], errors="coerce")
        u = set(col.dropna().unique().tolist())
        if len(u) > 0 and u.issubset({-1,0,1}):
            # .map leaves NaN for missing inputs; fill them so the int cast below cannot fail.
            lab = col.map({1:0, -1:1, 0:0}).fillna(0)
            break
    if lab is None:
        lab = pd.Series(np.zeros(len(df), dtype=np.int64))
    return lab.astype(np.int64).rename("label")

def numeric_common(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Select the numeric columns shared by both frames and fill their gaps.

    Columns named in the local `drop` set are excluded. For each frame the
    shared columns are copied, placeholder strings are replaced by NaN, every
    column is coerced to numeric, and gaps are filled by interpolation in both
    directions followed by forward and backward fill.

    Returns:
        (train_features, test_features, sorted list of shared column names).

    Raises:
        RuntimeError: if the two frames share no numeric column.
    """
    drop = {"Date","Time","datetime","timestamp","Row","row","index"}

    def _numeric_names(frame):
        return {c for c in frame.columns
                if pd.api.types.is_numeric_dtype(frame[c]) and c not in drop}

    feats = sorted(_numeric_names(train_df) & _numeric_names(test_df))
    if not feats:
        raise RuntimeError("No shared numeric columns.")

    def _clean(frame):
        out = frame[feats].copy()
        out = out.replace(["?","NA","NaN","nan",""], np.nan)
        for c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce")
        out = out.interpolate(limit_direction="both")
        return out.ffill().bfill()

    return _clean(train_df), _clean(test_df), feats

def safe_scale(Xtr: pd.DataFrame, Xte: pd.DataFrame, eps: float=1e-8):
    """Standardise both frames with training statistics, dropping near-constant columns.

    Columns whose training standard deviation (ddof=0) is not above `eps` are
    dropped from both frames. The rest are centred and scaled by the training
    mean and standard deviation, cast to float32, and any inf/NaN is replaced by 0.

    Returns:
        (scaled_train, scaled_test, list of kept column names).
    """
    std = Xtr.std(0, ddof=0)
    keep = std[std > eps].index.tolist()
    if len(keep) < len(std):
        print(f"[info] dropped {len(std)-len(keep)} low-variance cols")
    mu = Xtr[keep].mean(0)
    sd = Xtr[keep].std(0, ddof=0).clip(lower=eps)

    def _standardise(frame):
        z = (frame[keep] - mu) / sd
        z = z.astype(np.float32)
        return z.replace([np.inf,-np.inf], np.nan).fillna(0.0)

    return _standardise(Xtr), _standardise(Xte), keep

def find_file(cands: List[str], root: str) -> Optional[str]:
    """Locate the first file under `root` matching one of the candidate names.

    Three passes are tried in order:
      1. `root/<candidate>` exists, checked in candidate order.
      2. A file anywhere under `root` whose lower-cased name equals a lower-cased candidate.
      3. A file anywhere under `root` whose lower-cased name, with "_" and "-"
         removed, contains a candidate treated the same way.

    Returns:
        The matching path as a string, or None when nothing matches.
    """
    def _squash(text: str) -> str:
        return text.replace("_","").replace("-","")

    low = [c.lower() for c in cands]

    # Pass 1: direct hit in the root directory.
    for c in cands:
        direct = Path(root)/c
        if direct.exists():
            return str(direct)

    # Pass 2: exact name match ignoring case, anywhere below root.
    for folder,_,files in os.walk(root):
        for fname in files:
            if fname.lower() in low:
                return str(Path(folder)/fname)

    # Pass 3: substring match after removing separators.
    for folder,_,files in os.walk(root):
        for fname in files:
            squashed = _squash(fname.lower())
            if any(_squash(cand) in squashed for cand in low):
                return str(Path(folder)/fname)
    return None

def download_or_cache() -> str:
    """Fetch the dataset named by KAGGLE_SLUG through kagglehub.

    If importing kagglehub fails, it is pip-installed into the running
    interpreter and imported again.

    Returns:
        Whatever `kagglehub.dataset_download` returns (used by the caller as a
        directory to search).
    """
    try:
        import kagglehub
    except Exception:
        install_cmd = [sys.executable,"-m","pip","install","-q","kagglehub"]
        subprocess.check_call(install_cmd)
        import kagglehub
    print("➜ kagglehub ...")
    p = kagglehub.dataset_download(KAGGLE_SLUG)
    print("[ok]", p)
    return p

def ensure_preprocessed():
    """Create TRAIN_CSV and TEST_CSV unless both already exist.

    Downloads the dataset, locates the training and attack CSVs, extracts the
    attack label, keeps the numeric columns common to both files, standardises
    them with training statistics and writes both frames with a "label" column
    (all zeros for the training file).

    Raises:
        FileNotFoundError: if either source CSV cannot be located.
    """
    already_built = os.path.exists(TEST_CSV) and os.path.exists(TRAIN_CSV)
    if already_built:
        print("✓ CSVهای از قبل موجودند.")
        return

    print("➜ دانلود WADI ...")
    root = download_or_cache()
    tr = find_file(["WADI_14days_new.csv","WADI_14days.csv"], root)
    te = find_file(["WADI_attackdataLABLE.csv","WADI_attackdata.csv"], root)
    if not (tr and te):
        raise FileNotFoundError("WADI CSVs not found")

    train_df = read_train_csv(tr)
    test_df = read_test_csv(te)
    ytest = extract_label_from_test(test_df)
    Xtr_raw, Xte_raw, feats = numeric_common(train_df, test_df)
    Xtr, Xte, kept = safe_scale(Xtr_raw, Xte_raw)

    tr_out = Xtr.copy()
    tr_out["label"] = 0
    te_out = Xte.copy()
    te_out["label"] = ytest.values
    tr_out.to_csv(TRAIN_CSV, index=False)
    te_out.to_csv(TEST_CSV, index=False)
    print("✓ ساخته شد:", TRAIN_CSV, TEST_CSV, "| n_feats:", len(kept))

ensure_preprocessed()

# ===================== UNSW-ize (record extraction) =====================
# Load the preprocessed attack-period CSV; every column except "label" is a feature.
df_attack = pd.read_csv(TEST_CSV)
FEATURES = [c for c in df_attack.columns if c != "label"]
X_attack = df_attack[FEATURES].astype(np.float32).values
y_attack = df_attack["label"].astype(np.int64).values
del df_attack; gc.collect()

# Widen positive labels: every row with a positive row within DILATE rows becomes positive.
if DILATE > 0:
    k = np.ones(2*DILATE+1, dtype=np.int32)
    y_attack = (np.convolve(y_attack, k, mode="same") > 0).astype(np.int64)

# Candidate window starts (every STRIDE-th row).
n_seq_att = len(X_attack) - SEQ_LEN + 1
starts_att = np.arange(0, n_seq_att, STRIDE, dtype=np.int64)

from numpy.lib.stride_tricks import sliding_window_view
# A window is positive when any row inside it is positive.
y_win_full_att = (sliding_window_view(y_attack, SEQ_LEN).max(axis=1)).astype(np.int64)
y_sel_att = y_win_full_att[starts_att]
pos_idx_att = starts_att[y_sel_att==1].copy()
neg_idx_att = starts_att[y_sel_att==0].copy()

# Activity score per candidate window: absolute first differences averaged over
# features, then over the SEQ_LEN-1 differences inside the window.
diffs = np.abs(np.diff(X_attack, axis=0))
act = diffs.mean(axis=1)
act_win = sliding_window_view(act, SEQ_LEN-1).mean(axis=1)
scores_att = act_win[starts_att]
# "Hard" negatives: negative windows at/above the HARD_NEG_Q quantile of negative-window scores.
thr_att = np.quantile(scores_att[y_sel_att==0], HARD_NEG_Q)
hard_neg_att = neg_idx_att[scores_att[y_sel_att==0] >= thr_att]

n_pos = len(pos_idx_att)
if n_pos == 0:
    raise RuntimeError("No positive windows; decrease SEQ_LEN or increase DILATE.")
n_neg = int(max(1, math.ceil(NEG_RATIO * n_pos)))

# NOTE(review): np.random.choice(..., replace=False) raises ValueError if `size`
# exceeds the population, e.g. when n_neg is larger than len(neg_idx_att).
if len(hard_neg_att) >= n_neg:
    sel_neg_att = np.random.choice(hard_neg_att, size=n_neg, replace=False)
else:
    # Keep all hard negatives and top up from the remaining negatives.
    rest_att = np.setdiff1d(neg_idx_att, hard_neg_att)
    add_att = np.random.choice(rest_att, size=max(0, n_neg-len(hard_neg_att)), replace=False)
    sel_neg_att = np.concatenate([hard_neg_att, add_att]) if len(hard_neg_att)>0 else add_att

def make_non_overlapping(idxs: np.ndarray, seq_len=None, non_overlap=None) -> np.ndarray:
    """Greedily thin window start indices so the kept windows do not overlap.

    Starts are visited in ascending order; a start is kept when it is at or
    after the end (start + seq_len) of the previously kept window.

    Args:
        idxs: 1-D array of window start rows (any order).
        seq_len: window length in rows. Defaults to the module-level SEQ_LEN.
        non_overlap: when falsy the input is returned untouched. Defaults to
            the module-level NON_OVERLAP.

    Returns:
        `idxs` itself when thinning is disabled, otherwise a new sorted
        int64 array containing the kept starts.
    """
    # Fall back to the notebook-wide settings only when the caller did not
    # override them, so existing one-argument calls behave as before.
    if non_overlap is None:
        non_overlap = NON_OVERLAP
    if not non_overlap:
        return idxs
    if seq_len is None:
        seq_len = SEQ_LEN

    picked, last_end = [], -1
    for s in np.sort(idxs):
        if s >= last_end:
            picked.append(s)
            last_end = s + seq_len
    return np.array(picked, dtype=np.int64)

# De-overlap the positive and the sampled negative window starts
# (each set is thinned independently of the other).
pos_idx_att = make_non_overlapping(pos_idx_att)
sel_neg_att = make_non_overlapping(sel_neg_att)

# negatives from 14-day normal
# Every window taken from TRAIN_CSV is treated as a negative; its "label"
# column, if present, is dropped rather than read.
df_trnorm = pd.read_csv(TRAIN_CSV)
X_trn = df_trnorm.drop(columns=["label"], errors="ignore").astype(np.float32).values
n_seq_tr = len(X_trn) - SEQ_LEN + 1
starts_tr = np.arange(0, n_seq_tr, STRIDE, dtype=np.int64)
# Same activity score as for the attack table: mean absolute first
# difference over features, averaged over each window.
diffs_tr = np.abs(np.diff(X_trn, axis=0))
act_tr = diffs_tr.mean(axis=1)
act_win_tr = sliding_window_view(act_tr, SEQ_LEN-1).mean(axis=1)
scores_tr = act_win_tr[starts_tr]
# Keep only windows at or above the HARD_NEG_Q_TR activity quantile.
thr_tr = np.quantile(scores_tr, HARD_NEG_Q_TR)
hard_neg_tr = starts_tr[scores_tr >= thr_tr]
# Draw at most 30% as many train-period negatives as attack-period
# negatives, capped by how many hard candidates exist.
take_tr = int(0.3 * len(sel_neg_att))
take_tr = min(take_tr, len(hard_neg_tr))
# NOTE(review): unlike the two attack-period sets above, this selection is
# not passed through make_non_overlapping — confirm that is intended.
sel_neg_tr = np.random.choice(hard_neg_tr, size=take_tr, replace=False) if take_tr>0 else np.array([], dtype=np.int64)

def aggregate_window(block: np.ndarray) -> np.ndarray:
    """Collapse one window into a flat vector of per-column statistics.

    Args:
        block: 2-D array, rows are consecutive samples and columns are features.

    Returns:
        1-D array holding nine statistics for every column, laid out
        statistic-major: all means, then all stds, mins, maxs, last values,
        last-minus-first deltas, linear-fit slopes, inter-quartile ranges and
        finally the summed absolute first differences.
    """
    first_row, last_row = block[0], block[-1]

    # polyfit with a 2-D `y` fits every column independently; row 0 of the
    # coefficient matrix is the degree-1 (slope) term.
    steps = np.arange(len(block))
    slope = np.polyfit(steps, block, deg=1)[0]

    upper_q = np.quantile(block, 0.75, axis=0)
    lower_q = np.quantile(block, 0.25, axis=0)

    stats = (
        block.mean(0),
        block.std(0, ddof=0),
        block.min(0),
        block.max(0),
        last_row,
        last_row - first_row,
        slope,
        upper_q - lower_q,
        np.abs(np.diff(block, axis=0)).sum(0),
    )
    return np.concatenate(stats, axis=0)

# Statistic names, in the same order in which aggregate_window concatenates them.
stat_prefixes = ["mean","std","min","max","last","delta","slope","iqr","dsum"]

# One aggregated record per selected window: positives from the attack table ...
blocks_pos = [aggregate_window(X_attack[s:s+SEQ_LEN,:]) for s in pos_idx_att]
y_pos = np.ones(len(blocks_pos), dtype=np.int64)

# ... negatives from the attack table ...
blocks_neg_att = [aggregate_window(X_attack[s:s+SEQ_LEN,:]) for s in sel_neg_att]
y_neg_att = np.zeros(len(blocks_neg_att), dtype=np.int64)

# ... and negatives from the normal-operation (train) table.
blocks_neg_tr = [aggregate_window(X_trn[s:s+SEQ_LEN,:]) for s in sel_neg_tr]
y_neg_tr = np.zeros(len(blocks_neg_tr), dtype=np.int64)

# Stack the three groups; the train-period group is skipped when empty
# because np.stack cannot stack an empty list.
X_agg = np.vstack([np.stack(blocks_pos), np.stack(blocks_neg_att)] + ([np.stack(blocks_neg_tr)] if len(blocks_neg_tr)>0 else []))
y_agg = np.concatenate([y_pos, y_neg_att] + ([y_neg_tr] if len(y_neg_tr)>0 else []))

# Column names follow the statistic-major layout: "<stat>_<feature>".
cols = []
for pfx in stat_prefixes:
    cols += [f"{pfx}_{f}" for f in FEATURES]
df_out = pd.DataFrame(X_agg, columns=cols)
df_out["label"] = y_agg

from sklearn.model_selection import train_test_split
# Two-stage stratified split: first hold out VAL_SIZE + TEST_SIZE of the
# records, then divide that hold-out so the test part is TEST_SIZE of the total.
# NOTE(review): the split is random over windows; positive and negative
# windows were de-overlapped separately, so time-adjacent windows may land in
# different splits — confirm this is acceptable for the evaluation.
tmp_size = VAL_SIZE + TEST_SIZE
train_df, tmp_df = train_test_split(df_out, test_size=tmp_size, random_state=SEED, stratify=df_out["label"])
rel_test = TEST_SIZE / (VAL_SIZE + TEST_SIZE)
val_df, test_df = train_test_split(tmp_df, test_size=rel_test, random_state=SEED, stratify=tmp_df["label"])

# Persist the three splits next to the preprocessed data.
UNSW_TRAIN = f"{OUT_DIR_RAW}/wadi_unswized_train.csv"
UNSW_VAL   = f"{OUT_DIR_RAW}/wadi_unswized_val.csv"
UNSW_TEST  = f"{OUT_DIR_RAW}/wadi_unswized_test.csv"
train_df.to_csv(UNSW_TRAIN, index=False)
val_df.to_csv(UNSW_VAL, index=False)
test_df.to_csv(UNSW_TEST, index=False)

# The mean of a 0/1 label column is the fraction of positive records.
print("UNSW-ized CSVs:", UNSW_TRAIN, UNSW_VAL, UNSW_TEST)
print("Shapes:", train_df.shape, val_df.shape, test_df.shape)
print("Label ratio train/val/test:", round(train_df["label"].mean(),3), round(val_df["label"].mean(),3), round(test_df["label"].mean(),3))

# ===================== Scaling =====================
# Split each frame into a float32 feature matrix and a float32 label vector
# (labels are float because the loss used below is a BCE-with-logits variant).
all_cols = [c for c in train_df.columns if c != "label"]
Xtr = train_df[all_cols].values.astype(np.float32)
Xva = val_df[all_cols].values.astype(np.float32)
Xte = test_df[all_cols].values.astype(np.float32)
ytr = train_df["label"].values.astype(np.float32)
yva = val_df["label"].values.astype(np.float32)
yte = test_df["label"].values.astype(np.float32)

# Standardise with statistics from the training split only; the std is
# floored at 1e-8 so constant columns do not divide by zero.
mu = Xtr.mean(0); sd = Xtr.std(0, ddof=0).clip(min=1e-8)
Xtr = ((Xtr - mu)/sd).astype(np.float32)
Xva = ((Xva - mu)/sd).astype(np.float32)
Xte = ((Xte - mu)/sd).astype(np.float32)
# Replace any NaN / +-inf left after scaling with 0.
Xtr = np.nan_to_num(Xtr, nan=0.0, posinf=0.0, neginf=0.0)
Xva = np.nan_to_num(Xva, nan=0.0, posinf=0.0, neginf=0.0)
Xte = np.nan_to_num(Xte, nan=0.0, posinf=0.0, neginf=0.0)

# ===================== Datasets =====================
sensor_names = FEATURES[:]  # original feature (sensor) names
PFX = stat_prefixes
token_dim = len(PFX)  # each sensor token carries one value per statistic

# Column name -> position of its FIRST occurrence in all_cols (the value
# list.index would return), built once so the lookups below are O(1) instead
# of a linear scan per (sensor, statistic) pair. Iterating in reverse lets
# earlier positions overwrite later duplicates.
_col_pos = {name: i for i, name in reversed(list(enumerate(all_cols)))}

# build fixed index map: for every sensor, indices of its stats in all_cols
# (a sensor is kept only when every one of its statistic columns exists)
sensor_to_idx = {}
for feat in sensor_names:
    idxs = []
    ok = True
    for p in PFX:
        col = f"{p}_{feat}"
        if col not in _col_pos:
            ok = False; break
        idxs.append(_col_pos[col])
    if ok:
        sensor_to_idx[feat] = idxs
sensor_order = [s for s in sensor_names if s in sensor_to_idx]
S_eff = len(sensor_order)
sensor_index_matrix = np.array([sensor_to_idx[s] for s in sensor_order], dtype=np.int64)  # [S_eff, token_dim]

def row_to_tokens(row_vec: np.ndarray) -> np.ndarray:
    """Regroup one flat record into per-sensor tokens.

    Gathers entries of `row_vec` at the positions stored in the module-level
    `sensor_index_matrix`, so the result has that matrix's leading shape
    ([S_eff, token_dim]): one row per sensor, one column per statistic.
    """
    return np.take(row_vec, sensor_index_matrix, axis=0)

class DSMLP(Dataset):
    """Dataset of flat feature vectors and scalar labels for the MLP model.

    Args:
        X: array-like of per-record feature vectors (first axis = records).
        y: array-like of per-record labels.

    Both inputs are exposed as float32 tensors, matching what DSSensors does,
    so float64 / integer arrays do not cause dtype mismatches in the linear
    layers or the BCE-style loss. Arrays that are already float32 are wrapped
    without copying.
    """
    def __init__(self, X, y):
        self.X = torch.from_numpy(np.asarray(X, dtype=np.float32))
        self.y = torch.from_numpy(np.asarray(y, dtype=np.float32))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

class DSSensors(Dataset):
    """Dataset that serves each record as a sequence of per-sensor tokens.

    Every row of `X` is converted with `row_to_tokens`, and the results are
    stacked into one float32 tensor of shape [N, S_eff, token_dim]; labels are
    stored as a float32 tensor.
    """
    def __init__(self, X, y):
        # Iterating a 2-D array yields its rows in order.
        per_row = [row_to_tokens(row) for row in X]
        tokens = np.stack(per_row, axis=0)  # [N,S_eff,token_dim]
        self.X = torch.from_numpy(tokens.astype(np.float32))
        self.y = torch.from_numpy(y.astype(np.float32))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

# Pick the dataset wrapper that matches the model input: flat vectors for the
# MLP, [S_eff, token_dim] sensor tokens for every other MODEL_TYPE value.
if MODEL_TYPE == "mlp":
    tr_ds = DSMLP(Xtr, ytr); va_ds = DSMLP(Xva, yva); te_ds = DSMLP(Xte, yte)
else:
    tr_ds = DSSensors(Xtr, ytr); va_ds = DSSensors(Xva, yva); te_ds = DSSensors(Xte, yte)

# Only the training loader shuffles; validation and test keep record order.
train_loader = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=0)
val_loader   = DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader  = DataLoader(te_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# ===================== Models =====================
class MLP(nn.Module):
    """Feed-forward binary classifier: in_dim -> 512 -> 256 -> 1 logit.

    Each hidden layer is Linear + ReLU + Dropout(p). `forward` returns one raw
    logit per input row (the trailing size-1 axis is squeezed away).
    """
    def __init__(self, in_dim, p=0.2):
        super().__init__()
        widths = (in_dim, 512, 256)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(p)])
        layers.append(nn.Linear(widths[-1], 1))
        # Kept under the attribute name `net` so state-dict keys are unchanged.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits.squeeze(1)

class SinusoidalPositionalEncoding(nn.Module):
    """Adds fixed sine/cosine position codes to a [B, S, d_model] input.

    Even feature indices hold sin(pos * freq) and odd indices hold
    cos(pos * freq) with freq_k = 10000^(-2k / d_model). The table is
    precomputed for `max_len` positions and stored as the non-trainable
    buffer `pe` of shape [1, max_len, d_model].

    Args:
        d_model: size of the feature axis (even or odd).
        max_len: number of positions to precompute.
    """
    def __init__(self, d_model, max_len=4096):
        super().__init__()
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        pos = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0)/d_model))
        angles = pos * div  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # drop the surplus frequency; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        # Slice the table to the sequence length of x and broadcast over batch.
        return x + self.pe[:, :x.size(1), :]

class TrEncLayer(nn.Module):
    """One post-norm Transformer encoder layer.

    Self-attention followed by a two-layer GELU feed-forward network; each
    sub-layer is wrapped as LayerNorm(x + Dropout(sublayer(x))).

    Args:
        d: model width.
        nhead: number of attention heads.
        ff: hidden width of the feed-forward network.
        p: dropout probability (attention and both Dropout modules).
    """
    def __init__(self, d, nhead, ff, p):
        super().__init__()
        self.attn = nn.MultiheadAttention(d, nhead, dropout=p, batch_first=True)
        self.l1 = nn.Linear(d, ff)
        self.l2 = nn.Linear(ff, d)
        self.n1 = nn.LayerNorm(d)
        self.n2 = nn.LayerNorm(d)
        self.d1 = nn.Dropout(p)
        self.d2 = nn.Dropout(p)
        self.act = nn.GELU()

    def forward(self, x):
        # Self-attention sub-layer with residual connection.
        attn_out, _ = self.attn(x, x, x, need_weights=False)
        x = self.n1(x + self.d1(attn_out))
        # Feed-forward sub-layer; d1 is reused for the hidden activation.
        hidden = self.d1(self.act(self.l1(x)))
        x = self.n2(x + self.d2(self.l2(hidden)))
        return x

class TrEnc(nn.Module):
    """Stack of `nlayer` TrEncLayer blocks applied one after another."""
    def __init__(self, d, nhead, nlayer, ff, p):
        super().__init__()
        stack = []
        for _ in range(nlayer):
            stack.append(TrEncLayer(d, nhead, ff, p))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        out = x
        for block in self.layers:
            out = block(out)
        return out

class TrBiLSTM_Sensors_Att(nn.Module):
    """Transformer encoder + BiLSTM over sensor tokens with attention pooling.

    Pipeline: linear projection of each token to d_model, sinusoidal position
    codes, `nlay_tr` encoder layers (feed-forward width 2 * d_model), a
    bidirectional LSTM, dropout, softmax attention pooling over the token
    axis, and a final linear layer producing one logit per sample.

    Args:
        token_dim: size of each input token.
        d_model: Transformer width.
        nhead: attention heads per encoder layer.
        nlay_tr: number of encoder layers.
        lstm_h: LSTM hidden size per direction.
        nlay_lstm: number of LSTM layers.
        p: dropout probability.
    """
    def __init__(self, token_dim, d_model, nhead, nlay_tr, lstm_h, nlay_lstm, p=0.1):
        super().__init__()
        # Dropout between LSTM layers is only applied when there is more than one layer.
        lstm_dropout = p if nlay_lstm > 1 else 0.0
        bi_width = lstm_h * 2
        self.proj = nn.Linear(token_dim, d_model)
        self.pos = SinusoidalPositionalEncoding(d_model)
        self.enc = TrEnc(d_model, nhead, nlay_tr, d_model*2, p)
        self.lstm = nn.LSTM(
            d_model,
            lstm_h,
            num_layers=nlay_lstm,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
        )
        self.drop = nn.Dropout(p)
        self.att = nn.Sequential(
            nn.Linear(bi_width, d_model//2),
            nn.GELU(),
            nn.Linear(d_model//2, 1),
        )
        self.fc = nn.Linear(bi_width, 1)

    def forward(self, x):
        # x: [B,S,token_dim]
        h = self.proj(x)                                   # [B,S,D]
        h = self.pos(h)
        h = self.enc(h)
        h, _ = self.lstm(h)                                # [B,S,2H]
        h = self.drop(h)
        scores = self.att(h).squeeze(-1)
        weights = torch.softmax(scores, dim=1)             # [B,S]
        pooled = (h * weights.unsqueeze(-1)).sum(1)        # [B,2H]
        return self.fc(pooled).squeeze(1)

# Instantiate the model selected by MODEL_TYPE and move it to DEVICE:
# the flat-vector MLP, or (for any other value) the sensor-token
# Transformer + BiLSTM with attention pooling.
if MODEL_TYPE == "mlp":
    model = MLP(Xtr.shape[1], p=DROPOUT).to(DEVICE)
else:
    model = TrBiLSTM_Sensors_Att(token_dim=token_dim, d_model=TR_DIM, nhead=NHEAD, nlay_tr=N_LAY_TR,
                                 lstm_h=LSTM_H, nlay_lstm=N_LAY_LSTM, p=DROPOUT).to(DEVICE)

# Show the architecture and the total number of parameter elements.
print(model)
print("Total params:", sum(p.numel() for p in model.parameters()))

# ===================== Loss/Opt =====================
class FocalBCEWithLogitsLoss(nn.Module):
    """Binary focal loss computed from raw logits.

    loss = alpha_t * (1 - p_t)^gamma * BCE(logit, target), averaged over all
    elements, where p_t is the predicted probability of the true class and
    alpha_t = alpha for positive targets and (1 - alpha) for negative ones.

    Args:
        alpha: weight of the positive class in [0, 1]; negatives get 1 - alpha.
        gamma: focusing exponent; 0 disables the down-weighting of easy examples.
    """
    def __init__(self, alpha=0.6, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.bce = nn.BCEWithLogitsLoss(reduction="none")

    def forward(self, logits, targets):
        loss_bce = self.bce(logits, targets)
        p = torch.sigmoid(logits)
        pt = torch.where(targets == 1.0, p, 1.0 - p)
        mod = (1.0 - pt).pow(self.gamma)
        # Class-dependent balancing weight. Applying a single `alpha` to both
        # classes would only rescale the loss and balance nothing.
        alpha_t = self.alpha * targets + (1.0 - self.alpha) * (1.0 - targets)
        loss = alpha_t * mod * loss_bce
        return loss.mean()

# Training criterion (focal BCE on logits) and AdamW optimizer over all
# model parameters, using the notebook-level LR and WEIGHT_DECAY settings.
crit = FocalBCEWithLogitsLoss(alpha=0.6, gamma=2.0)
opt  = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

def run_epoch(loader, train=False):
    """Run the global `model` over `loader` once, optionally updating it.

    Args:
        loader: DataLoader yielding (inputs, labels) batches.
        train: when True the model is put in train mode and every batch does a
            backward pass, gradient-norm clipping at 1.0 and an optimizer
            step; otherwise the model is in eval mode and gradients are off.

    Returns:
        (mean_loss, accuracy, f1, labels, probabilities) where mean_loss is
        the batch-size-weighted loss sum divided by the dataset size,
        accuracy / f1 use a fixed 0.5 probability threshold, and the last two
        items are the concatenated per-sample labels and sigmoid outputs.
    """
    if train:
        model.train()
    else:
        model.eval()

    loss_sum = 0.0
    label_chunks, prob_chunks = [], []
    for xb, yb in loader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)
        with torch.set_grad_enabled(train):
            logit = model(xb)
            loss = crit(logit, yb)
            if train:
                opt.zero_grad(set_to_none=True)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                opt.step()
        # Weight by batch size so the final average is per sample.
        loss_sum += loss.item() * len(yb)
        prob_chunks.append(torch.sigmoid(logit).detach().cpu().numpy())
        label_chunks.append(yb.detach().cpu().numpy())

    if label_chunks:
        y = np.concatenate(label_chunks)
    else:
        y = np.array([])
    if prob_chunks:
        p = np.concatenate(prob_chunks)
    else:
        p = np.array([])

    if len(p):
        pred = (p >= 0.5).astype(int)
    else:
        pred = np.array([])

    if len(y):
        acc = accuracy_score(y, pred)
        f1 = f1_score(y, pred, zero_division=0)
    else:
        acc = 0.0
        f1 = 0.0

    mean_loss = loss_sum / max(1, len(loader.dataset))
    return mean_loss, acc, f1, y, p

def pick_threshold_constrained(y, p, min_prec=0.6):
    """Choose a decision threshold that maximises F1 under a precision floor.

    Walks the precision-recall curve and, among thresholds whose precision is
    at least `min_prec`, keeps the first one with the highest F1. If no
    threshold meets the floor, falls back to the best-F1 candidate among six
    quantiles (40%..90%) of the predicted probabilities.

    Args:
        y: true binary labels.
        p: predicted probabilities.
        min_prec: minimum precision a threshold must reach to be eligible.

    Returns:
        (threshold, f1) with F1 recomputed at the chosen threshold using
        `p >= threshold` as the positive prediction.
    """
    prec, rec, thr = precision_recall_curve(y, p)

    # (f1, threshold) for every curve point that satisfies the precision floor.
    eligible = [
        (0.0 if (pr + rc) == 0 else 2*pr*rc/(pr+rc), float(t))
        for pr, rc, t in zip(prec[:-1], rec[:-1], thr)
        if pr >= min_prec
    ]

    if eligible:
        # max() returns the first item among ties, i.e. the earliest threshold.
        best_t = max(eligible, key=lambda item: item[0])[1]
    else:
        qs = np.quantile(p, np.linspace(0.4,0.9,6))
        f1s = [f1_score(y, (p>=q).astype(int), zero_division=0) for q in qs]
        best_t = float(qs[int(np.argmax(f1s))])

    final_pred = (p>=best_t).astype(int)
    return best_t, float(f1_score(y, final_pred, zero_division=0))

# Training with early stopping on the validation F1 obtained at the tuned
# threshold. best_f1 starts at -1.0, so the first epoch with a comparable
# F1 always writes a checkpoint.
best_f1, best_thr, patience, noimp = -1.0, 0.5, EARLY_PATIENCE, 0
for e in range(1, EPOCHS+1):
    t0 = time.time()
    tr_loss, tr_acc, tr_f1, _, _ = run_epoch(train_loader, True)
    va_loss, va_acc, va_f1, yv, pv = run_epoch(val_loader, False)
    # Re-tune the decision threshold on this epoch's validation probabilities.
    thr_opt, f1_opt = pick_threshold_constrained(yv, pv, min_prec=MIN_PREC_AT_TUNE)
    print(f"Epoch {e:02d}/{EPOCHS} | {time.time()-t0:.1f}s | train {tr_loss:.4f}/{tr_acc:.3f}/{tr_f1:.3f} | val f1@0.5 {va_f1:.3f} | best f1 {f1_opt:.3f} @ thr={thr_opt:.2f}")
    if f1_opt > best_f1:
        # New best: reset the patience counter and save weights together with
        # the threshold that produced this F1.
        best_f1, best_thr, noimp = f1_opt, thr_opt, 0
        torch.save({"model": model.state_dict(), "thr": best_thr}, CKPT)
        print("  ✓ Saved:", CKPT)
    else:
        noimp += 1
        if noimp >= patience:
            print("Early stopping."); break

# Restore the best checkpoint (weights and its tuned threshold) before testing.
ckpt = torch.load(CKPT, map_location=DEVICE)
model.load_state_dict(ckpt["model"]); best_thr = ckpt["thr"]

def evaluate(loader, thr):
    """Score the global `model` on `loader` at a given probability threshold.

    Args:
        loader: DataLoader yielding (inputs, labels) batches.
        thr: samples with sigmoid output >= thr are predicted positive.

    Returns:
        (accuracy, f1, labels, predictions) over all samples in the loader.
    """
    model.eval()
    label_chunks, prob_chunks = [], []
    with torch.no_grad():
        for xb, yb in loader:
            batch_prob = torch.sigmoid(model(xb.to(DEVICE)))
            prob_chunks.append(batch_prob.cpu().numpy())
            label_chunks.append(yb.numpy())

    y = np.concatenate(label_chunks)
    p = np.concatenate(prob_chunks)
    pred = (p >= thr).astype(int)

    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred, zero_division=0)
    return acc, f1, y, pred

# Final evaluation on the held-out test records, using the threshold stored
# with the best checkpoint; prints summary metrics, the confusion matrix and
# a per-class report.
acc, f1, y_true, y_pred = evaluate(test_loader, best_thr)
print("\n--- Test (UNSW-ized records) ---")
print({"acc": round(acc,4), "f1": round(f1,4), "thr": round(best_thr,3)})
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred, target_names=["Normal (0)","Attack (1)"], zero_division=0))


Device: cuda
✓ CSVهای از قبل موجودند.
UNSW-ized CSVs: ./wadi_preprocessed/wadi_unswized_train.csv ./wadi_preprocessed/wadi_unswized_val.csv ./wadi_preprocessed/wadi_unswized_test.csv
Shapes: (27011, 838) (5788, 838) (5789, 838)
Label ratio train/val/test: 0.278 0.278 0.278
TrBiLSTM_Sensors_Att(
  (proj): Linear(in_features=9, out_features=192, bias=True)
  (pos): SinusoidalPositionalEncoding()
  (enc): TrEnc(
    (layers): ModuleList(
      (0-1): 2 x TrEncLayer(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (l1): Linear(in_features=192, out_features=384, bias=True)
        (l2): Linear(in_features=384, out_features=192, bias=True)
        (n1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (n2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (d1): Dropout(p=0.15, inplace=False)
        (d2): Dropout(p=0.15, inplace=False)
        (act): GELU(approx