<a href="https://colab.research.google.com/github/prksh830/Healthcare/blob/main/WSN-BFSFSmotetomek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: installs (uncomment if needed, e.g., in Colab)
# !pip install -q imbalanced-learn xgboost lightgbm seaborn

# Core imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# sklearn / imbalanced-learn / models
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc, roc_auc_score)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.combine import SMOTETomek

# optional models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import seaborn as sns
sns.set(style='whitegrid')

In [2]:
# Cell 2: configuration
INPUT_CSV = "/mnt/data/WSNBFSFdataset.csv"   # change if different
OUTPUT_DIR = "./outputs"
RANDOM_STATE = 42
TEST_SIZE = 0.20   # 80/20 split
FIG_DPI = 300
FIG_FORMAT = 'tiff'  # all images saved as .tiff

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Cell 3: read CSV and quick inspection
df = pd.read_csv("/content/WSNBFSFdataset.csv")
print("Dataset shape:", df.shape)
display(df.head())
print("\nColumn names:", df.columns.tolist())

Dataset shape: (312106, 18)


Unnamed: 0,Event,Time,S_Node,Node_id,Rest_Energy,Trace_Level,Mac_Type_Pckt,Source_IP_Port,Des_IP_Port,Packet_Size,TTL,Hop_Count,Broadcast_ID,Dest_Node_Num,Dest_Seq_Num,Src_Node_ID,Src_Seq_Num,Class
0,1,0.1,79,79,600.0,5,0,79.255,1.255,48,30,1,1,100,0,79,4,normal
1,2,0.100963,78,78,599.979723,5,800,79.255,1.255,48,30,1,1,100,0,79,4,normal
2,2,0.100963,76,76,599.979722,5,800,79.255,1.255,48,30,1,1,100,0,79,4,normal
3,2,0.100964,75,75,599.979722,5,800,79.255,1.255,48,30,1,1,100,0,79,4,normal
4,2,0.100964,118,118,599.979722,5,800,79.255,1.255,48,30,1,1,100,0,79,4,normal



Column names: ['Event', 'Time', 'S_Node', 'Node_id', 'Rest_Energy', 'Trace_Level', 'Mac_Type_Pckt', 'Source_IP_Port', 'Des_IP_Port', 'Packet_Size', 'TTL', 'Hop_Count', 'Broadcast_ID', 'Dest_Node_Num', 'Dest_Seq_Num', 'Src_Node_ID', 'Src_Seq_Num', 'Class']


In [4]:
# Cell 4: pick target column (auto-detect common names, else last column)
possible_targets = ['label','class','target','attack','Attack','type']
target_col = None
for c in possible_targets:
    if c in df.columns:
        target_col = c
        break
if target_col is None:
    target_col = df.columns[-1]  # fallback: last column

print("Using target column:", target_col)

Using target column: Class


In [5]:
# Cell 5: split X/y, basic NA handling, encode categorical features
X = df.drop(columns=[target_col]).copy()
y = df[target_col].copy()

# identify numeric and categorical columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# Fill missing values
if len(num_cols) > 0:
    X[num_cols] = X[num_cols].fillna(X[num_cols].median())
if len(cat_cols) > 0:
    X[cat_cols] = X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

# Encode categorical features (OrdinalEncoder)
if len(cat_cols) > 0:
    oe = OrdinalEncoder()
    X[cat_cols] = oe.fit_transform(X[cat_cols])

# Standardize numeric features
scaler = StandardScaler()
if len(num_cols) > 0:
    X[num_cols] = scaler.fit_transform(X[num_cols])

# Encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = le.classes_
n_classes = len(classes)

print("Features shape:", X.shape)
print("Number of classes:", n_classes, "Classes:", list(classes))

Features shape: (312106, 17)
Number of classes: 4 Classes: ['Blackhole', 'Flooding', 'Forwarding', 'normal']


In [6]:
# Cell 6: split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_enc
)
print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (249684, 17) Test: (62422, 17)


In [7]:
# Cell 7: SMOTETomek on TRAINING set
smt = SMOTETomek(random_state=RANDOM_STATE)
X_train_res, y_train_res = smt.fit_resample(X_train, y_train)
print("After SMOTETomek: X_train_res:", X_train_res.shape)
# show counts per class after resampling
unique, counts = np.unique(y_train_res, return_counts=True)
print("Label counts after resample:", dict(zip(le.inverse_transform(unique), counts)))

After SMOTETomek: X_train_res: (839992, 17)
Label counts after resample: {'Blackhole': np.int64(209909), 'Flooding': np.int64(210143), 'Forwarding': np.int64(210087), 'normal': np.int64(209853)}


In [8]:
# Cell 8: helper functions for saving TIFF plots

def save_confusion_matrix(y_true, y_pred, model_name, classes, outdir=OUTPUT_DIR):
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=classes, yticklabels=classes)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - {model_name}')
    fname = os.path.join(outdir, f'confusion_{model_name}.{FIG_FORMAT}')
    plt.tight_layout()
    fig.savefig(fname, dpi=FIG_DPI, format=FIG_FORMAT)
    plt.close(fig)
    return cm, fname

def save_roc_curves(fpr_dict, tpr_dict, roc_auc_dict, model_name, classes, outdir=OUTPUT_DIR):
    fig, ax = plt.subplots(figsize=(7,6))
    # micro-average if present
    if 'micro' in fpr_dict:
        ax.plot(fpr_dict['micro'], tpr_dict['micro'],
                label=f"micro-average (AUC = {roc_auc_dict['micro']:.4f})", lw=2)
    # per-class
    for key in sorted([k for k in roc_auc_dict.keys() if k != 'micro']):
        ax.plot(fpr_dict[key], tpr_dict[key],
                label=f"{classes[int(key)]} (AUC = {roc_auc_dict[key]:.4f})", lw=1)
    ax.plot([0,1],[0,1], 'k--', lw=0.7)
    ax.set_xlim([-0.01,1.01])
    ax.set_ylim([-0.01,1.01])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc='lower right', fontsize='small')
    fname = os.path.join(outdir, f'roc_{model_name}.{FIG_FORMAT}')
    plt.tight_layout()
    fig.savefig(fname, dpi=FIG_DPI, format=FIG_FORMAT)
    plt.close(fig)
    return fname

In [9]:
# Cell 9: define models
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=400, random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
    'LightGBM': LGBMClassifier(random_state=RANDOM_STATE)
}

In [10]:
# Cell 10: train & evaluate
results = []
# Prepare binarized y_test for multiclass ROC if needed
if n_classes > 2:
    y_test_binarized = label_binarize(y_test, classes=np.arange(n_classes))
else:
    y_test_binarized = label_binarize(y_test, classes=np.arange(n_classes))

for name, model in models.items():
    print("Training", name)
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)

    # Probabilities / scores for ROC
    y_score = None
    try:
        y_score = model.predict_proba(X_test)
    except Exception:
        try:
            y_score = model.decision_function(X_test)
        except Exception:
            y_score = None

    # Metrics (macro averages appropriate for multiclass)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Save confusion matrix TIFF
    cm, cm_file = save_confusion_matrix(y_test, y_pred, name, [str(c) for c in classes], outdir=OUTPUT_DIR)

    # Compute ROC if possible
    roc_fname = None
    roc_auc_out = None
    if y_score is not None:
        if n_classes == 2:
            # binary: use positive-class probs if shape (n,2)
            if y_score.ndim == 1:
                fpr, tpr, _ = roc_curve(y_test, y_score)
            else:
                fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
            roc_auc = auc(fpr, tpr)
            fpr_dict = {1: fpr, 'micro': fpr}
            tpr_dict = {1: tpr, 'micro': tpr}
            roc_auc_dict = {1: roc_auc, 'micro': roc_auc}
            roc_fname = save_roc_curves(fpr_dict, tpr_dict, roc_auc_dict, name, [str(c) for c in classes], outdir=OUTPUT_DIR)
            roc_auc_out = roc_auc
        else:
            # multiclass: per-class ROC + micro-average
            y_score_arr = np.array(y_score)
            if y_score_arr.ndim == 1:
                roc_fname = None
            else:
                fpr = dict(); tpr = dict(); roc_auc = dict()
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score_arr[:, i])
                    roc_auc[i] = auc(fpr[i], tpr[i])
                fpr['micro'], tpr['micro'], _ = roc_curve(y_test_binarized.ravel(), y_score_arr.ravel())
                roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])
                roc_fname = save_roc_curves(fpr, tpr, roc_auc, name, [str(c) for c in classes], outdir=OUTPUT_DIR)
                roc_auc_out = roc_auc.get('micro', None)

    # record result row
    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision_macro': precision,
        'Recall_macro': recall,
        'F1_macro': f1,
        'ConfusionMatrixFile': cm_file,
        'ROCFile': roc_fname,
        'ROC_AUC_micro': roc_auc_out
    })
    print(f" -> {name}: Acc={acc:.4f}, F1={f1:.4f}, ROC_saved={bool(roc_fname)}")

Training DecisionTree
 -> DecisionTree: Acc=0.9994, F1=0.9972, ROC_saved=True
Training RandomForest
 -> RandomForest: Acc=0.9996, F1=0.9984, ROC_saved=True
Training MLP
 -> MLP: Acc=0.9375, F1=0.8259, ROC_saved=True
Training KNN
 -> KNN: Acc=0.9329, F1=0.7987, ROC_saved=True
Training XGBoost
 -> XGBoost: Acc=0.9700, F1=0.8934, ROC_saved=True
Training LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.135157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3085
[LightGBM] [Info] Number of data points in the train set: 839992, number of used features: 15
[LightGBM] [Info] Start training from score -1.386718
[LightGBM] [Info] Start training from score -1.385604
[LightGBM] [Info] Start training from score -1.385871
[LightGBM] [Info] Start training from score -1.386985
 -> LightGBM: Acc=0.9982, F1=0.9933, ROC_saved=True


In [11]:
# New Cell: Comparison of ROC curves for all models

from sklearn.preprocessing import label_binarize

# Prepare binarized labels for multiclass ROC (same as in section 10)
if n_classes > 2:
    y_test_binarized = label_binarize(y_test, classes=np.arange(n_classes))
else:
    y_test_binarized = label_binarize(y_test, classes=np.arange(n_classes))

plt.figure(figsize=(8,7))

for res in results:
    model_name = res['Model']
    clf = models[model_name]

    try:
        y_score = clf.predict_proba(X_test)
    except:
        try:
            y_score = clf.decision_function(X_test)
        except:
            y_score = None

    if y_score is not None:
        if n_classes == 2:
            # binary case
            if y_score.ndim > 1:
                fpr, tpr, _ = roc_curve(y_test, y_score[:,1])
            else:
                fpr, tpr, _ = roc_curve(y_test, y_score)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.4f})')
        else:
            # multiclass: plot only micro-average ROC
            fpr, tpr, _ = roc_curve(y_test_binarized.ravel(), y_score.ravel())
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.4f})')

# Random baseline
plt.plot([0,1],[0,1],'k--', lw=1)

plt.xlim([-0.01,1.01])
plt.ylim([-0.01,1.01])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison Across Models")
plt.legend(loc="lower right", fontsize="small")

# Save figure
roc_comparison_path = os.path.join(OUTPUT_DIR, f'roc_comparison_all.{FIG_FORMAT}')
plt.tight_layout()
plt.savefig(roc_comparison_path, dpi=FIG_DPI, format=FIG_FORMAT)
plt.close()

print("ROC comparison figure saved at:", roc_comparison_path)

ROC comparison figure saved at: ./outputs/roc_comparison_all.tiff


In [13]:
# New Cell: Comparison of performance metrics across all models

# Select relevant metrics
# metrics_df = res_df[['Model', 'Accuracy', 'Precision_macro', 'Recall_macro', 'F1_macro']]
metrics_df = pd.DataFrame(results)[['Model', 'Accuracy', 'Precision_macro', 'Recall_macro', 'F1_macro']]

# Plot grouped bar chart
ax = metrics_df.set_index('Model').plot(kind='bar', figsize=(10,6))
plt.title("Comparison of Performance Metrics Across Models")
plt.ylabel("Score")
plt.ylim(0,1.05)
plt.legend(title="Metrics", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha="right")

# Save figure
metrics_comparison_path = os.path.join(OUTPUT_DIR, f'performance_comparison.{FIG_FORMAT}')
plt.tight_layout()
plt.savefig(metrics_comparison_path, dpi=FIG_DPI, format=FIG_FORMAT)
plt.close()

print("Performance comparison figure saved at:", metrics_comparison_path)

Performance comparison figure saved at: ./outputs/performance_comparison.tiff


In [16]:
# New Cell: Comparison of performance metrics with custom labels

# Mapping full names -> abbreviations
name_map = {
    'DecisionTree': 'DT',
    'RandomForest': 'RF',
    'MLP': 'MLP',
    'KNN': 'KNN',
    'XGBoost': 'XGBoost',
    'LightGBM': 'LightGBM'
}

# Replace names in DataFrame
metrics_df = pd.DataFrame(results)[['Model', 'Accuracy', 'Precision_macro', 'Recall_macro', 'F1_macro']].copy()
metrics_df['Model'] = metrics_df['Model'].map(name_map)

# Plot grouped bar chart
ax = metrics_df.set_index('Model').plot(kind='bar', figsize=(10,6))
plt.title("Comparison of Performance Metrics Across Models")
plt.ylabel("Performance Metrics")
plt.ylim(0,1.05)
plt.legend(title="Metrics", bbox_to_anchor=(1.05, 1), loc='upper left')

# Set x-axis labels horizontal
plt.xticks(rotation=0, ha="center")

# Save figure
metrics_comparison_path = os.path.join(OUTPUT_DIR, f'performance_comparison.{FIG_FORMAT}')
plt.tight_layout()
plt.savefig(metrics_comparison_path, dpi=FIG_DPI, format=FIG_FORMAT)
plt.close()

print("Performance comparison figure saved at:", metrics_comparison_path)

Performance comparison figure saved at: ./outputs/performance_comparison.tiff
