<a href="https://colab.research.google.com/github/sankeawthong/Project-1-Lita-Chatbot/blob/main/WSN-DS_Hybrid%20RF-LSTM_kfold_logging%20%5B20250903%5D..ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

WSN-DS — Leakage-Safe 10-Fold CV with SMOTE–NearMiss and RF→LSTM Hybrid
This notebook reproduces fold-wise metrics (mean ± std) for WSN SecureNet on WSN-DS with:
# **bold text**
Stratified 10-fold CV
No resampling leakage: SMOTE→NearMiss is applied only on training split inside each fold
Sequence modeling: sliding windows (configurable TT, stride) for LSTM
Hybrid pipeline: RF (feature selection / reduction) → LSTM (temporal modeling)
Baselines: RF-only, LSTM-only, and LSTM→RF (optional)
Metrics per fold: Accuracy, Precision (macro), Recall (macro), F1 (macro), micro-AUC (one-vs-rest)
Outputs: per-fold logs and summary CSVs saved to ./outputs/
Important: Please set the dataset path and column names in the Configuration cell.

In [1]:
# ==========================
# Configuration
# ==========================
DATASET_PATH = "/content/dataset_WSN-DS.csv"   # <-- update to your file path
LABEL_COL     = "Class"                  # <-- update to your ground-truth label column
TIME_COL      = None                     # e.g., "timestamp" or None if not available
GROUP_COL     = None                     # e.g., "node_id"/"flow_id" if you want per-entity windowing; else None

# Feature columns: if None, will auto-detect (all numeric except LABEL_COL)
FEATURE_COLS  = None

# Sequence modeling params
TT            = 20   # sequence length (time steps)
STRIDE        = 5    # window stride
PAD_MODE      = "edge"   # padding mode if sequences needed (edge/constant)

# Cross-validation & training params
N_FOLDS       = 10
RANDOM_STATE  = 42

# RF params
RF_N_ESTIMATORS = 100
RF_MAX_DEPTH    = None

# LSTM params
LSTM_HIDDEN     = [64, 32, 16]
LSTM_DROPOUT    = 0.2
EPOCHS          = 100
BATCH_SIZE      = 64
VAL_SPLIT       = 0.1  # internal validation for early stopping inside training split

# Resampling params (applied only within train split per fold)
SMOTE_K_NEIGHBORS  = 5
NEARMISS_VERSION   = 1  # 1, 2, or 3

# Output directory
OUTPUT_DIR = "./outputs"
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Config loaded. Update DATASET_PATH/LABEL_COL/etc. before running.")

Config loaded. Update DATASET_PATH/LABEL_COL/etc. before running.


In [2]:
# ==========================
# Imports
# ==========================
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)


In [3]:
# ==========================
# Utilities
# ==========================
def load_dataset(path, label_col, feature_cols=None):
    df = pd.read_csv(path)
    if feature_cols is None:
        feature_cols = [c for c in df.columns if c != label_col and pd.api.types.is_numeric_dtype(df[c])]
    X = df[feature_cols].copy()
    y_raw = df[label_col].astype(str).values
    le = LabelEncoder()
    y = le.fit_transform(y_raw)
    return df, X, y, feature_cols, le

def safe_fit_scaler(X_train):
    scaler = StandardScaler()
    scaler.fit(X_train)
    return scaler

def apply_scaler(scaler, X):
    return scaler.transform(X)

def smote_nearmiss_balance(X_train, y_train, smote_k=5, nm_version=1):
    sm = SMOTE(k_neighbors=smote_k, random_state=42)
    X_sm, y_sm = sm.fit_resample(X_train, y_train)
    nm = NearMiss(version=nm_version)
    X_bal, y_bal = nm.fit_resample(X_sm, y_sm)
    return X_bal, y_bal

def make_sequences(X_df, y_arr, tt=20, stride=5, time_col=None, group_col=None):
    if group_col is not None:
        groups = X_df[group_col].values
    else:
        groups = np.zeros(len(X_df), dtype=int)
    if time_col is not None:
        order = np.argsort(X_df[time_col].values)
    else:
        order = np.arange(len(X_df))
    X_ord = X_df.iloc[order].reset_index(drop=True)
    y_ord = y_arr[order]
    grp_ord = groups[order]
    X_values = X_ord.drop(columns=[c for c in [time_col, group_col] if c is not None and c in X_ord.columns], errors='ignore').values
    X_seq_list, y_seq_list = [], []
    for g in np.unique(grp_ord):
        idx = np.where(grp_ord == g)[0]
        seq_data = X_values[idx]
        seq_labels = y_ord[idx]
        for s in range(0, max(0, len(seq_data) - tt + 1), stride):
            window = seq_data[s:s+tt]
            if window.shape[0] < tt:
                continue
            window_labels = seq_labels[s:s+tt]
            vals, counts = np.unique(window_labels, return_counts=True)
            maj = vals[np.argmax(counts)]
            X_seq_list.append(window)
            y_seq_list.append(maj)
    if len(X_seq_list) == 0:
        return None, None
    return np.stack(X_seq_list), np.array(y_seq_list)

def rf_feature_transformer(X_train, y_train, X_val, n_estimators=100, max_depth=None, top_k=32):
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    importances = rf.feature_importances_
    k = min(top_k, X_train.shape[1])
    top_idx = np.argsort(importances)[::-1][:k]
    return rf, top_idx

def build_lstm(input_shape, hidden=[64,32,16], dropout=0.2, n_classes=2):
    model = models.Sequential()
    model.add(layers.Input(shape=input_shape))
    for h in hidden[:-1]:
        model.add(layers.LSTM(h, return_sequences=True))
        model.add(layers.Dropout(dropout))
    model.add(layers.LSTM(hidden[-1]))
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def compute_fold_metrics(y_true, y_pred, y_proba, average='macro', n_classes=None):
    acc = accuracy_score(y_true, y_pred)
    pr  = precision_score(y_true, y_pred, average=average, zero_division=0)
    rc  = recall_score(y_true, y_pred, average=average, zero_division=0)
    f1  = f1_score(y_true, y_pred, average=average, zero_division=0)
    auc_micro = np.nan
    if y_proba is not None and n_classes is not None and n_classes > 2:
        y_true_ovr = tf.keras.utils.to_categorical(y_true, num_classes=n_classes)
        try:
            auc_micro = roc_auc_score(y_true_ovr, y_proba, average='micro', multi_class='ovr')
        except Exception:
            auc_micro = np.nan
    elif y_proba is not None and y_proba.shape[1] == 2:
        try:
            auc_micro = roc_auc_score(y_true, y_proba[:,1])
        except Exception:
            auc_micro = np.nan
    return acc, pr, rc, f1, auc_micro

In [4]:
# ==========================
# Main: Leakage-safe CV + logging
# ==========================
df, X_full, y_full, feature_cols, le = load_dataset(DATASET_PATH, LABEL_COL, feature_cols=FEATURE_COLS)
n_classes = len(np.unique(y_full))
print(f"Loaded: {df.shape[0]} rows, {len(feature_cols)} features, {n_classes} classes")

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

columns = ['fold','ACC','Precision','Recall','F1','AUC_micro']
logs_rf, logs_lstm, logs_hybrid = [], [], []

fold_idx = 0
for train_idx, val_idx in skf.split(X_full.values, y_full):
    fold_idx += 1
    print(f"\n=== Fold {fold_idx}/{N_FOLDS} ===")
    X_tr_raw = X_full.iloc[train_idx].copy()
    y_tr     = y_full[train_idx].copy()
    X_val_raw= X_full.iloc[val_idx].copy()
    y_val    = y_full[val_idx].copy()

    scaler = safe_fit_scaler(X_tr_raw.values)
    X_tr_scaled  = apply_scaler(scaler, X_tr_raw.values)
    X_val_scaled = apply_scaler(scaler, X_val_raw.values)

    X_tr_bal, y_tr_bal = smote_nearmiss_balance(X_tr_scaled, y_tr, smote_k=SMOTE_K_NEIGHBORS, nm_version=NEARMISS_VERSION)

    # RF-only
    rf_base = RandomForestClassifier(n_estimators=RF_N_ESTIMATORS, max_depth=RF_MAX_DEPTH, random_state=RANDOM_STATE, n_jobs=-1)
    rf_base.fit(X_tr_bal, y_tr_bal)
    rf_pred = rf_base.predict(X_val_scaled)
    try:
        rf_proba = rf_base.predict_proba(X_val_scaled)
    except Exception:
        rf_proba = None
    acc, pr, rc, f1, auc_m = compute_fold_metrics(y_val, rf_pred, rf_proba, average='macro', n_classes=n_classes)
    logs_rf.append([fold_idx, acc, pr, rc, f1, auc_m])

    # LSTM-only sequences
    X_tr_bal_df = pd.DataFrame(X_tr_bal, columns=feature_cols)
    X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=feature_cols)
    if TIME_COL and TIME_COL in df.columns:
        X_tr_bal_df[TIME_COL] = df.iloc[train_idx][TIME_COL].values[:len(X_tr_bal_df)]
        X_val_scaled_df[TIME_COL] = df.iloc[val_idx][TIME_COL].values[:len(X_val_scaled_df)]
    if GROUP_COL and GROUP_COL in df.columns:
        X_tr_bal_df[GROUP_COL] = df.iloc[train_idx][GROUP_COL].values[:len(X_tr_bal_df)]
        X_val_scaled_df[GROUP_COL] = df.iloc[val_idx][GROUP_COL].values[:len(X_val_scaled_df)]

    Xtr_seq, ytr_seq = make_sequences(X_tr_bal_df, y_tr_bal, tt=TT, stride=STRIDE, time_col=TIME_COL, group_col=GROUP_COL)
    Xva_seq, yva_seq = make_sequences(X_val_scaled_df, y_val,    tt=TT, stride=STRIDE, time_col=TIME_COL, group_col=GROUP_COL)
    if Xtr_seq is not None and Xva_seq is not None:
        lstm_model = build_lstm(input_shape=(Xtr_seq.shape[1], Xtr_seq.shape[2]), hidden=LSTM_HIDDEN, dropout=LSTM_DROPOUT, n_classes=n_classes)
        cb = [callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)]
        lstm_model.fit(Xtr_seq, ytr_seq, validation_split=VAL_SPLIT, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0, callbacks=cb)
        yva_proba = lstm_model.predict(Xva_seq, verbose=0)
        yva_pred  = np.argmax(yva_proba, axis=1)
        acc, pr, rc, f1, auc_m = compute_fold_metrics(yva_seq, yva_pred, yva_proba, average='macro', n_classes=n_classes)
        logs_lstm.append([fold_idx, acc, pr, rc, f1, auc_m])
    else:
        print('Warning: Not enough samples to form sequences in this fold; skipping LSTM-only metrics for this fold.')

    # Hybrid RF->LSTM
    rf_ft, top_idx = rf_feature_transformer(X_tr_bal, y_tr_bal, X_val_scaled, n_estimators=RF_N_ESTIMATORS, max_depth=RF_MAX_DEPTH)
    X_tr_top  = X_tr_bal[:, top_idx]
    X_val_top = X_val_scaled[:, top_idx]
    top_cols = [feature_cols[i] for i in top_idx]
    X_tr_top_df = pd.DataFrame(X_tr_top, columns=top_cols)
    X_val_top_df= pd.DataFrame(X_val_top, columns=top_cols)
    if TIME_COL and TIME_COL in df.columns:
        X_tr_top_df[TIME_COL]  = X_tr_bal_df[TIME_COL].values[:len(X_tr_top_df)]
        X_val_top_df[TIME_COL] = X_val_scaled_df[TIME_COL].values[:len(X_val_top_df)]
    if GROUP_COL and GROUP_COL in df.columns:
        X_tr_top_df[GROUP_COL]  = X_tr_bal_df[GROUP_COL].values[:len(X_tr_top_df)]
        X_val_top_df[GROUP_COL] = X_val_scaled_df[GROUP_COL].values[:len(X_val_top_df)]

    Xtr_seq_h, ytr_seq_h = make_sequences(X_tr_top_df, y_tr_bal, tt=TT, stride=STRIDE, time_col=TIME_COL, group_col=GROUP_COL)
    Xva_seq_h, yva_seq_h = make_sequences(X_val_top_df, y_val,    tt=TT, stride=STRIDE, time_col=TIME_COL, group_col=GROUP_COL)
    if Xtr_seq_h is not None and Xva_seq_h is not None:
        lstm_h = build_lstm(input_shape=(Xtr_seq_h.shape[1], Xtr_seq_h.shape[2]), hidden=LSTM_HIDDEN, dropout=LSTM_DROPOUT, n_classes=n_classes)
        lstm_h.fit(Xtr_seq_h, ytr_seq_h, validation_split=VAL_SPLIT, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0, callbacks=cb)
        yva_proba_h = lstm_h.predict(Xva_seq_h, verbose=0)
        yva_pred_h  = np.argmax(yva_proba_h, axis=1)
        acc, pr, rc, f1, auc_m = compute_fold_metrics(yva_seq_h, yva_pred_h, yva_proba_h, average='macro', n_classes=n_classes)
        logs_hybrid.append([fold_idx, acc, pr, rc, f1, auc_m])
    else:
        print('Warning: Not enough samples to form sequences for Hybrid in this fold; skipping hybrid metrics for this fold.')

# Save per-fold logs
df_rf     = pd.DataFrame(logs_rf,    columns=columns)
df_lstm   = pd.DataFrame(logs_lstm,  columns=columns)
df_hybrid = pd.DataFrame(logs_hybrid,columns=columns)
df_rf.to_csv(os.path.join(OUTPUT_DIR, 'fold_logs_rf.csv'), index=False)
df_lstm.to_csv(os.path.join(OUTPUT_DIR, 'fold_logs_lstm.csv'), index=False)
df_hybrid.to_csv(os.path.join(OUTPUT_DIR, 'fold_logs_hybrid.csv'), index=False)
print('Saved per-fold logs to:', OUTPUT_DIR)

def summarize(df, name):
    if len(df)==0:
        return pd.Series({'model': name})
    s = df[['ACC','Precision','Recall','F1','AUC_micro']].agg(['mean','std']).T
    s.columns = [f'{c}_{name}' for c in s.columns]
    return s

sum_rf     = summarize(df_rf, 'RF')
sum_lstm   = summarize(df_lstm, 'LSTM')
sum_hybrid = summarize(df_hybrid, 'RF_LSTM')
summary = pd.concat([sum_rf, sum_lstm, sum_hybrid], axis=1)
summary.to_csv(os.path.join(OUTPUT_DIR, 'summary_mean_std.csv'))
summary


Loaded: 374661 rows, 18 features, 5 classes

=== Fold 1/10 ===

=== Fold 2/10 ===

=== Fold 3/10 ===

=== Fold 4/10 ===

=== Fold 5/10 ===

=== Fold 6/10 ===

=== Fold 7/10 ===

=== Fold 8/10 ===

=== Fold 9/10 ===

=== Fold 10/10 ===
Saved per-fold logs to: ./outputs


Unnamed: 0,mean_RF,std_RF,mean_LSTM,std_LSTM,mean_RF_LSTM,std_RF_LSTM
ACC,0.996618,0.00046,0.939052,0.022599,0.927597,0.022802
Precision,0.974779,0.003919,0.619391,0.081494,0.614726,0.078901
Recall,0.982854,0.002701,0.530625,0.04228,0.549956,0.06444
F1,0.978492,0.003132,0.538911,0.050911,0.549988,0.055212
AUC_micro,0.999639,8.9e-05,0.991157,0.007024,0.98988,0.005925


In [5]:
# ==========================
# Optional: Paired t-tests
# ==========================
from scipy.stats import ttest_rel

def paired_t(df_a, df_b, metric):
    if len(df_a)==0 or len(df_b)==0:
        return np.nan, np.nan
    common = sorted(set(df_a['fold']).intersection(set(df_b['fold'])))
    if len(common) < 2:
        return np.nan, np.nan
    a = df_a[df_a['fold'].isin(common)][metric].values
    b = df_b[df_b['fold'].isin(common)][metric].values
    stat, p = ttest_rel(a, b)
    return stat, p

mean_rf_acc   = df_rf['ACC'].mean() if len(df_rf) else -1
mean_lstm_acc = df_lstm['ACC'].mean() if len(df_lstm) else -1
best_baseline = 'RF' if mean_rf_acc >= mean_lstm_acc else 'LSTM'
print('Strongest baseline (by mean ACC):', best_baseline)
base_df = df_rf if best_baseline=='RF' else df_lstm
metrics = ['ACC','Precision','Recall','F1','AUC_micro']
rows = []
for m in metrics:
    stat, p = paired_t(df_hybrid, base_df, m)
    rows.append([m, stat, p])
ttest_df = pd.DataFrame(rows, columns=['Metric','t_stat','p_value'])
ttest_df.to_csv(os.path.join(OUTPUT_DIR, 'paired_ttests_hybrid_vs_baseline.csv'), index=False)
ttest_df


Strongest baseline (by mean ACC): RF


Unnamed: 0,Metric,t_stat,p_value
0,ACC,-9.490517,5.520005e-06
1,Precision,-14.864831,1.220342e-07
2,Recall,-21.341103,5.118565e-09
3,F1,-25.131629,1.201558e-09
4,AUC_micro,-5.162993,0.0005927069


In [6]:
# ==========================
# Download results
# ==========================
import shutil
from google.colab import files
import os

# Define the output directory and the name of the zip file
output_dir = "./outputs"
zip_filename = 'outputs.zip'
zip_filepath = os.path.join('.', zip_filename) # Create in the current directory for easy download

# Zip the output directory
shutil.make_archive(zip_filepath.replace('.zip', ''), 'zip', output_dir)

# Provide a download link
try:
  files.download(zip_filepath)
except FileNotFoundError:
  print(f"Error: The file {zip_filename} was not found. Please check if the outputs directory exists and contains files.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>