In [6]:
import os, time, random, math, gc
from functools import partial
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.utils.class_weight import compute_class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score
import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


2025-11-30 13:12:51.387417: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-30 13:12:51.804721: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-30 13:12:53.935515: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [1]:

# STREAM & BALANCE: create a balanced CSV on disk without loading everything in memory
import os, glob, math, random, gc
import pandas as pd, numpy as np
from pathlib import Path

# Input folder holding the raw CSV files. Set the CIC_IOT_DATA_DIR environment
# variable to point somewhere else without editing the notebook; when it is
# unset or empty the original location is used. Kept as a plain str because it
# is concatenated into an error message further down.
DATA_DIR = os.environ.get("CIC_IOT_DATA_DIR") or "/home/rafia-khan/S7/Cnt/kaggle"
OUT_BALANCED = "cic_iot_balanced.csv"    # output smaller balanced file
SAMPLE_FRAC = 1.0          # keep 1.0 here; not referenced by the cells shown, the cap below does the undersampling
UNDERSAMPLE_CAP = 50000    # maximum number of rows kept per class (paper-style)
CHUNKSIZE = 200_000       # rows per streamed chunk; adjust to available RAM (100k - 500k)
RANDOM_STATE = 42          # seed for the row sampling

# 1) find CSV files
# glob returns matches in arbitrary (filesystem-dependent) order. Sorting makes
# csv_files[0] (used for schema detection) and the streaming order identical on
# every run, which the seeded sampling needs in order to be reproducible.
csv_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))
if not csv_files:
    raise FileNotFoundError("No CSV files found in " + DATA_DIR)
print("Found", len(csv_files), "csv files.")

# 2) quick scan first file to detect candidate numeric columns and label column
sample = pd.read_csv(csv_files[0], nrows=2000)
print("Sample columns:", sample.columns.tolist()[:40])

# heuristics for label column (you can modify if different)
label_candidates = ['label','Label','attack','Attack','Type','type','category','Category','Class','class']
# First choice: an exact column name, tried in the priority order listed above.
label_col = next((c for c in label_candidates if c in sample.columns), None)
if label_col is None:
    # Fallback: substring match. Keywords are tried in priority order (not
    # column order) so that a feature such as 'Protocol Type' cannot be picked
    # ahead of a later column whose name contains 'label' or 'attack'.
    for keyword in ('label', 'attack', 'type'):
        label_col = next((c for c in sample.columns if keyword in str(c).lower()), None)
        if label_col is not None:
            break
if label_col is None:
    raise RuntimeError("Label column not found automatically. Please set label_col manually.")
print("Detected label column:", label_col)

# Collect the numeric columns of the sample, skipping names that are known
# identifiers / bookkeeping fields rather than features.
drop_guesses = {'src_ip','dst_ip','timestamp','time','date','flow_id','__source_file__'}
numeric_candidates = []
for col_name in sample.columns:
    if col_name in drop_guesses:
        continue
    if pd.api.types.is_numeric_dtype(sample[col_name]):
        numeric_candidates.append(col_name)
print("Numeric candidate count (sample):", len(numeric_candidates))
print("Numeric sample:", numeric_candidates[:40])

# 3) two-pass approach:
#   Pass A: compute per-class total counts by streaming
#   Pass B: stream again and write undersampled rows to balanced CSV

# PASS A: count class totals
from collections import Counter
# Accumulate the total number of rows per class label over all files, reading
# only the label column and only one chunk at a time.
class_counts = Counter()
for csv_path in csv_files:
    print("Counting classes in", os.path.basename(csv_path))
    label_chunks = pd.read_csv(csv_path, usecols=[label_col], chunksize=CHUNKSIZE)
    for label_chunk in label_chunks:
        # labels are stringified so keys are consistent across files
        chunk_tally = label_chunk[label_col].astype(str).value_counts()
        for cls_name, n_rows in chunk_tally.items():
            class_counts[cls_name] += int(n_rows)
print("Class counts (sample):", list(class_counts.items())[:10])

# Per-class quota (paper-style): classes larger than UNDERSAMPLE_CAP are cut
# down to the cap, every other class keeps all of its rows.
keep_counts = {
    cls_name: min(n_rows, UNDERSAMPLE_CAP)
    for cls_name, n_rows in class_counts.items()
}
print("Example keep counts (first 10):", list(keep_counts.items())[:10])
total_keep = sum(keep_counts.values())
print("Total rows after balancing (approx):", total_keep)

# PASS B: stream again and write an undersampled selection per class.
#
# For every class we need keep_counts[cls] rows out of class_counts[cls].
# Taking the first rows encountered would bias the result towards the earliest
# files, so rows are chosen by sequential sampling instead: when a group of g
# rows of a class arrives, with N rows of that class still unseen (including
# the group) and k rows still needed, the number taken from the group is drawn
# from Hypergeometric(ngood=g, nbad=N-g, nsample=k). Over the whole stream this
# yields a uniform random sample of exactly k rows per class, while only one
# chunk is held in memory at a time.
if os.path.exists(OUT_BALANCED):
    os.remove(OUT_BALANCED)

# Fixed output schema (label + numeric candidates, in the sample file's column
# order). Rows are appended without a header, so every write must use the same
# column order even if a later file orders its columns differently.
wanted_columns = {label_col, *numeric_candidates}
out_columns = [c for c in sample.columns if c in wanted_columns]

# One generator for the whole pass: selections are reproducible for a given
# file order, and different chunks do not reuse the same random positions.
rng = np.random.default_rng(RANDOM_STATE)

# How many rows have been written per class so far
written_counts = Counter()
# How many rows of each class have not been streamed yet (from pass A)
remaining_rows = Counter(class_counts)
# True until the first write, which creates the file and emits the header
header_pending = True

for f in csv_files:
    print("Processing", os.path.basename(f))
    # Read the header once per file and keep only the columns this file has.
    file_columns = set(pd.read_csv(f, nrows=0).columns)
    usecols = [c for c in out_columns if c in file_columns]
    reader = pd.read_csv(f, usecols=usecols, chunksize=CHUNKSIZE)
    for chunk in reader:
        # ensure label is string for consistent keys
        chunk[label_col] = chunk[label_col].astype(str)
        if list(chunk.columns) != out_columns:
            # align to the output schema; columns missing in this file become NaN
            chunk = chunk.reindex(columns=out_columns)
        # iterate groups to avoid large memory copies
        for cls, group in chunk.groupby(label_col):
            gsize = len(group)
            unseen = remaining_rows.get(cls, 0)      # includes this group
            remaining_rows[cls] = unseen - gsize
            remain = keep_counts.get(cls, 0) - written_counts.get(cls, 0)
            if remain <= 0:
                continue
            if remain >= unseen:
                # Every remaining row is needed (classes at or below the cap,
                # or a count mismatch between the two passes).
                n_take = min(gsize, remain)
            else:
                n_take = int(rng.hypergeometric(gsize, max(unseen - gsize, 0), remain))
            if n_take == 0:
                continue
            sel = group if n_take == gsize else group.sample(n=n_take, random_state=rng)
            sel.to_csv(
                OUT_BALANCED,
                mode='w' if header_pending else 'a',
                header=header_pending,
                index=False,
            )
            header_pending = False
            written_counts[cls] += n_take
        # small GC
        del chunk
        gc.collect()

print("Finished writing balanced file:", OUT_BALANCED)
print("Written counts sample:", list(written_counts.items())[:10])


Found 169 csv files.
Sample columns: ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT']
Detected label column: label
Numeric candidate count (sample): 46
Numeric sample: ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', '

In [2]:
import pandas as pd, numpy as np

BALANCED_FILE = "cic_iot_balanced.csv"

# Read the balanced CSV once and bind it to two names: `df_bal` is the
# descriptive one, `df` is an alias so existing code works without edits.
df = df_bal = pd.read_csv(BALANCED_FILE)
print(f"Loaded balanced file: {BALANCED_FILE}")
print(f"Shape: {df_bal.shape}")

Loaded balanced file: cic_iot_balanced.csv
Shape: (1279753, 47)


In [7]:

# ============ USER CONFIG ============
# Tunable settings for the cells that follow, grouped by purpose.

# -- data / reproducibility --
BALANCED_FILE = "cic_iot_balanced.csv"
RANDOM_STATE = 42

# -- feature selection --
TOP_K_FEATURES = 40     # set None to use all numeric features

# -- encoder / training settings --
EMBED_DIM = 64
TRANSFORMER_LAYERS = 1
TRANSFORMER_HEADS = 2
DROP_RATE = 0.4
LR = 1e-4
BATCH_SIZE = 512
EPOCHS = 20             # encoder training epochs (kept modest)

# -- other settings --
RF_EST = 200
VERBOSE = 2
# =====================================

# Seed NumPy, Python's `random` and TensorFlow with the same value, in that order.
for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    seed_fn(RANDOM_STATE)
start_time = time.time()   # wall-clock reference taken before loading

# ---- 1) Load balanced dataset ----
if os.path.exists(BALANCED_FILE):
    df = pd.read_csv(BALANCED_FILE)
else:
    raise FileNotFoundError(f"{BALANCED_FILE} not found")
print(f"Loaded: {BALANCED_FILE} shape: {df.shape}")

Loaded: cic_iot_balanced.csv shape: (1279753, 47)


In [8]:
# detect label column
# First choice: an exact column name, tried in the priority order listed here.
label_col = next(
    (cand for cand in ['label','Label','attack','Attack','Type','type','category','Category','Class','class']
     if cand in df.columns),
    None,
)
if label_col is None:
    # Fallback: substring match. Keywords are tried in priority order (not
    # column order) so that a feature such as 'Protocol Type' cannot be picked
    # ahead of a later column whose name contains 'label' or 'attack'.
    for keyword in ('label', 'attack', 'type'):
        label_col = next((c for c in df.columns if keyword in str(c).lower()), None)
        if label_col is not None:
            break
if label_col is None:
    raise RuntimeError("Label column not found in balanced CSV.")
print("Using label column:", label_col)

# ---- 2) select numeric features ----
drop_guesses = {'src_ip','dst_ip','timestamp','time','date','flow_id','__source_file__','__domain_synth__'}
# The label column is excluded explicitly: if the labels are numerically coded
# it would otherwise pass the dtype test and leak the target into the features.
numeric_candidates = [
    c for c in df.columns
    if c != label_col and c not in drop_guesses and pd.api.types.is_numeric_dtype(df[c])
]
if not numeric_candidates:
    raise RuntimeError("No numeric features found.")
if TOP_K_FEATURES and len(numeric_candidates) > TOP_K_FEATURES:
    print("Selecting top features by mutual info...")
    tmp_y = LabelEncoder().fit_transform(df[label_col].astype(str).values)
    # mutual_info_classif is randomized; seeding it keeps the selected feature
    # set identical between runs. partial (rather than a lambda) keeps the
    # fitted selector picklable.
    # NOTE(review): the scores are computed on the full frame, before the
    # train/test split performed later, so test rows influence the selection.
    score_func = partial(mutual_info_classif, random_state=RANDOM_STATE)
    selector = SelectKBest(score_func, k=TOP_K_FEATURES)
    selector.fit(df[numeric_candidates].fillna(0).values, tmp_y)
    chosen = [numeric_candidates[i] for i in selector.get_support(indices=True)]
else:
    # TOP_K_FEATURES is None/0 or there are not more candidates than requested
    chosen = numeric_candidates
print("Using {} features (sample): {}".format(len(chosen), chosen[:20]))



Using label column: label
Selecting top features by mutual info...
Using 40 features (sample): ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS']


In [9]:
# ---- 3) prepare arrays & split ----
# Drop rows with a missing value in any selected feature or in the label.
required_cols = [*chosen, label_col]
df = df.dropna(subset=required_cols).reset_index(drop=True)

# Feature matrix as floats; labels as strings, then integer-encoded.
X = df[chosen].astype(float).values
y_raw = df[label_col].astype(str).values
le_y = LabelEncoder()
y = le_y.fit_transform(y_raw)

# 80/20 split, stratified on the encoded labels.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
print(f"Train/test shapes: {X_tr.shape} {X_te.shape}")

Train/test shapes: (1023802, 40) (255951, 40)
