# Cleaning Dataset and Obtaining Samples for Testing:

## a preprocessing guide for raw NIDS data for use in QML testing

### Import dependencies

In [1]:
import os, glob
import pandas as pd
import numpy as np
import xgboost as xgb
import dask.dataframe as dd

In [2]:
import sys
print(sys.executable) 
import xgboost
print(xgboost.__version__) 

import glob
csv_files = glob.glob("datasets/Phishing/*.csv")

/Users/dr.ivanramirez/venvs/qiskit_env/bin/python
3.1.0


In [3]:
import re
from pathlib import Path

def load_wordlist(path: str):
    words = []
    p = Path(path)
    if not p.exists():
        print(f"⚠️ Missing wordlist: {path}")
        return words

    for line in p.read_text(encoding="utf-8", errors="ignore").splitlines():
        s = line.strip()
        if not s:
            continue
        if s.startswith("#"):
            continue
        words.append(s.lower())
    return sorted(set(words))

def compile_phrase_regex(phrases):
    phrases = sorted(set([p.strip().lower() for p in phrases if p.strip()]), key=len, reverse=True)
    escaped = []
    for p in phrases:
        e = re.escape(p)
        e = e.replace(r"\ ", r"\s+")
        escaped.append(e)

    if not escaped:
        return None

    pattern = r"(?i)(?:\b" + r"\b|\b".join(escaped) + r"\b)"
    return re.compile(pattern)

def count_phrase_hits(text: str, rx):
    if rx is None:
        return 0
    if text is None:
        return 0
    s = str(text).lower()
    return len(rx.findall(s))

# Load lists once (paths are relative to QML-NIDS)
spam_words = load_wordlist("wordlists/spam_words_prasidhda.txt")
urgency_words = load_wordlist("wordlists/phishing_urgency_words.txt")

combined_words = sorted(set(spam_words + urgency_words))

spam_rx = compile_phrase_regex(spam_words)
urgency_rx = compile_phrase_regex(urgency_words)
combined_rx = compile_phrase_regex(combined_words)

url_rx = re.compile(r"https?://|www\.", re.IGNORECASE)

print("✅ spam_words:", len(spam_words), "urgency_words:", len(urgency_words), "combined:", len(combined_words))

✅ spam_words: 556 urgency_words: 18 combined: 568


### Load chosen datasets

In [4]:
# 2. Load chosen datasets
# If you want to use specific files, set them here. Otherwise it will load all CSVs in datasets/Phishing/
attack_files = [
    "datasets/Phishing/CEAS_08.csv",
    "datasets/Phishing/Enron.csv",
    "datasets/Phishing/Ling.csv",
    "datasets/Phishing/Nazario.csv",
    "datasets/Phishing/Nigerian_Fraud.csv",
    "datasets/Phishing/phishing_email.csv",
    "datasets/Phishing/SpamAssasin.csv",
    "datasets/Phishing/TREC_07.csv"

]

# Optionally list benign files; if empty we will split by 'label' if present in CSVs
benign_files = []  # keep empty unless you have explicit benign CSVs

# Read attack files with Dask and compute to pandas (safer for mixed schemas)
ddf_attack = dd.read_csv(attack_files, low_memory=False)
attack_df = ddf_attack.compute()
attack_df["_source"] = attack_df.get("_source_file", "")  # tag source if present

# If explicit benign_files provided, load them; otherwise try to split by label
if benign_files:
    benign_df = dd.read_csv(benign_files, low_memory=False).compute()
else:
    benign_df = pd.DataFrame()  # may fill below if 'label' exists

# If there is a 'label' column in the combined attack_df, split on it
if "label" in attack_df.columns and benign_df.empty:
    # normalize label to 0/1
    attack_df["label"] = attack_df["label"].apply(lambda v: 1 if (pd.notna(v) and int(float(v)) != 0) else 0)
    benign_df = attack_df[attack_df["label"] == 0].copy().reset_index(drop=True)
    attack_df = attack_df[attack_df["label"] == 1].copy().reset_index(drop=True)
    print("split by label -> attack:", attack_df.shape, "benign:", benign_df.shape)
else:
    print("Loaded attack files shape:", attack_df.shape, "benign explicit provided:", not benign_df.empty)

# Random sample 500 each (if available)
n_sample = 500
if len(attack_df) > n_sample:
    attack_df = attack_df.sample(n=n_sample, random_state=42).reset_index(drop=True)
if not benign_df.empty and len(benign_df) > n_sample:
    benign_df = benign_df.sample(n=n_sample, random_state=42).reset_index(drop=True)

print("After sampling -> attack:", attack_df.shape, "benign:", benign_df.shape)

split by label -> attack: (115181, 9) benign: (103548, 9)
After sampling -> attack: (500, 9) benign: (500, 9)


## DATA CLEANING

In [6]:
# 3. DATA CLEANING

def text_to_numeric_features(df):
    df = df.copy()

    subj = df.get("subject", "").astype(str).fillna("")
    body = df.get("body", "").astype(str).fillna("")

    # Length + word count
    df["subject_len"] = subj.apply(len)
    df["subject_words"] = subj.apply(lambda s: len(s.split()))
    df["body_len"] = body.apply(len)
    df["body_words"] = body.apply(lambda s: len(s.split()))

    # URLs
    df["body_num_urls"] = body.apply(lambda s: len(url_rx.findall(s)))
    if "urls" in df.columns:
        df["urls_count"] = pd.to_numeric(df["urls"], errors="coerce").fillna(0).astype(int)
    else:
        df["urls_count"] = df["body_num_urls"]

    # NEW: spam/phishing keyword metrics (Mary + Brian request)
    df["subject_spamword_hits"] = subj.apply(lambda s: count_phrase_hits(s, spam_rx))
    df["body_spamword_hits"] = body.apply(lambda s: count_phrase_hits(s, spam_rx))

    df["subject_urgency_hits"] = subj.apply(lambda s: count_phrase_hits(s, urgency_rx))
    df["body_urgency_hits"] = body.apply(lambda s: count_phrase_hits(s, urgency_rx))

    df["subject_keyword_hits"] = subj.apply(lambda s: count_phrase_hits(s, combined_rx))
    df["body_keyword_hits"] = body.apply(lambda s: count_phrase_hits(s, combined_rx))

    # Extra simple phishing signals
    df["subject_exclaim"] = subj.apply(lambda s: s.count("!"))
    df["body_exclaim"] = body.apply(lambda s: s.count("!"))

    df["subject_qmark"] = subj.apply(lambda s: s.count("?"))
    df["body_qmark"] = body.apply(lambda s: s.count("?"))

    df["subject_digits"] = subj.apply(lambda s: sum(ch.isdigit() for ch in s))
    df["body_digits"] = body.apply(lambda s: sum(ch.isdigit() for ch in s))

    df["subject_upper_ratio"] = subj.apply(lambda s: (sum(ch.isupper() for ch in s) / max(len(s), 1)))
    df["body_upper_ratio"] = body.apply(lambda s: (sum(ch.isupper() for ch in s) / max(len(s), 1)))

    df["subject_money"] = subj.apply(lambda s: s.count("$") + s.lower().count("usd"))
    df["body_money"] = body.apply(lambda s: s.count("$") + s.lower().count("usd"))

    return df
    

attack_df = text_to_numeric_features(attack_df)
if not benign_df.empty:
    benign_df = text_to_numeric_features(benign_df)

attack_num_preview = attack_df.select_dtypes(include=[np.number])
print("✅ Numeric features available:", attack_num_preview.shape[1])
print(attack_num_preview.columns.tolist())
attack_num_preview.head() 

# Keep only numeric columns (your original intent)
attack_num = attack_df.select_dtypes(include=[np.number]).copy()
benign_num = benign_df.select_dtypes(include=[np.number]).copy() if not benign_df.empty else pd.DataFrame(columns=attack_num.columns)

# Drop label + urls so they never become features (prevents leakage / weird encoding)
attack_num = attack_num.drop(columns=["label", "urls"], errors="ignore")
if not benign_num.empty:
    benign_num = benign_num.drop(columns=["label", "urls"], errors="ignore")

# Replace inf with NaN
attack_num.replace([np.inf, -np.inf], np.nan, inplace=True)
if not benign_num.empty:
    benign_num.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop columns with >20% missing in attack (use attack_num only)
cols_ok = attack_num.isnull().mean() < 0.2
cols_ok_names = cols_ok[cols_ok].index.tolist()
attack_num = attack_num.loc[:, cols_ok_names]

# Align benign to same columns if exists (use intersection to avoid boolean index misalignment)
if not benign_num.empty:
    shared = [c for c in cols_ok_names if c in benign_num.columns]
    attack_num = attack_num.loc[:, shared]
    benign_num = benign_num.loc[:, shared]
else:
    shared = attack_num.columns.tolist()

# Drop single-value columns in attack
multi_cols = attack_num.nunique()[attack_num.nunique() > 1].index.tolist()
attack_num = attack_num.loc[:, multi_cols]
if not benign_num.empty:
    benign_num = benign_num.reindex(columns=multi_cols, fill_value=np.nan)

# Drop highly collinear columns (corr > 0.95) computed on attack_num
if attack_num.shape[1] > 1:
    corr = attack_num.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]
    if to_drop:
        attack_num.drop(columns=to_drop, inplace=True, errors='ignore')
        if not benign_num.empty:
            benign_num.drop(columns=to_drop, inplace=True, errors='ignore')

# Instead of dropping columns (too aggressive), fill missing values
attack_num = attack_num.fillna(0)
if not benign_num.empty:
    benign_num = benign_num.fillna(0)

# Remove rows that are all zeros
if not benign_num.empty and benign_num.shape[1] > 0:
    benign_num = benign_num.loc[~(benign_num == 0.0).all(axis=1)]
attack_num = attack_num.loc[~(attack_num == 0.0).all(axis=1)]

print("After cleaning -> attack_num:", attack_num.shape, "benign_num:", benign_num.shape)

✅ Numeric features available: 24
['label', 'urls', 'subject_len', 'subject_words', 'body_len', 'body_words', 'body_num_urls', 'urls_count', 'subject_spamword_hits', 'body_spamword_hits', 'subject_urgency_hits', 'body_urgency_hits', 'subject_keyword_hits', 'body_keyword_hits', 'subject_exclaim', 'body_exclaim', 'subject_qmark', 'body_qmark', 'subject_digits', 'body_digits', 'subject_upper_ratio', 'body_upper_ratio', 'subject_money', 'body_money']
After cleaning -> attack_num: (500, 19) benign_num: (500, 19)


## Get importances from GBF model

In [7]:
# 4. Get importances from GBF model

# If we have no numeric columns at all, abort
if attack_num.shape[1] == 0:
    raise RuntimeError("No numeric features available after cleaning. Check the data or relax numeric-only requirement.")

# If we have benign rows, combine and train; otherwise fallback selection
if benign_num.empty or len(benign_num) < 2:
    print("No benign data available or too few benign rows to train classifier.")
    # fallback: choose top 8 features by variance in attack
    top_features = attack_num.var().nlargest(8).index.tolist()
    print("Fallback top_features (by variance):", top_features)
else:
    # 1. Combine and label
    X = pd.concat([benign_num, attack_num], axis=0).reset_index(drop=True)
    y = np.array([0] * len(benign_num) + [1] * len(attack_num))

    # verify there are two classes
    if len(np.unique(y)) < 2:
        print("Warning: labels contain only one class. Falling back to variance-based feature selection.")
        top_features = attack_num.var().nlargest(8).index.tolist()
    else:
        # 2. Train XGBoost
        model = xgb.XGBClassifier(
            eval_metric='logloss',
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42,
            use_label_encoder=False
        )
        model.fit(X, y)
        # 3. Pick top 8 features
        importances = pd.Series(model.feature_importances_, index=X.columns)
        top_features = importances.nlargest(8).index.tolist()
        print("Top 8 features:", top_features)
        print("Top features selected:", top_features)


Top 8 features: ['body_num_urls', 'body_exclaim', 'subject_spamword_hits', 'body_len', 'body_digits', 'body_spamword_hits', 'subject_upper_ratio', 'subject_digits']
Top features selected: ['body_num_urls', 'body_exclaim', 'subject_spamword_hits', 'body_len', 'body_digits', 'body_spamword_hits', 'subject_upper_ratio', 'subject_digits']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Construct new cleaned dataset for testing using selected features

In [8]:
# 5. Construct new cleaned dataset for testing using selected features

selected = [c for c in top_features if c in attack_num.columns]
attack_final = attack_num[selected].reset_index(drop=True)
benign_final = benign_num[selected].reset_index(drop=True) if not benign_num.empty else pd.DataFrame(columns=selected)

# Save arrays
np.save('500-attack.npy', attack_final.to_numpy())
np.save('500-benign.npy', benign_final.to_numpy())

print("Saved: 500-attack.npy (shape {})".format(attack_final.shape))
print("Saved: 500-benign.npy (shape {})".format(benign_final.shape))

Saved: 500-attack.npy (shape (500, 8))
Saved: 500-benign.npy (shape (500, 8))
