# Cleaning Dataset and Obtaining Samples for Testing:

## A preprocessing guide for raw NIDS data for use in QML testing

### Import dependencies

In [1]:
import os, glob
import pandas as pd
import numpy as np
import xgboost as xgb
import dask.dataframe as dd

In [2]:
# Environment sanity check: show which interpreter and which XGBoost release
# this notebook is running against.
import glob
import sys

import xgboost

print(sys.executable)
print(xgboost.__version__)

# Every CSV file currently present in the phishing dataset folder.
csv_files = glob.glob("datasets/Phishing/*.csv")

/Users/dr.ivanramirez/venvs/qiskit_env/bin/python
3.1.0


### Load chosen datasets

In [3]:
# 2. Load chosen datasets
# List the specific files to use here. If the list is left empty, every CSV
# found in datasets/Phishing/ is loaded instead.
attack_files = [
    "datasets/Phishing/CEAS_08.csv",
    "datasets/Phishing/Enron.csv",
    "datasets/Phishing/Ling.csv",
    "datasets/Phishing/Nazario.csv",
    "datasets/Phishing/Nigerian_Fraud.csv",
    "datasets/Phishing/phishing_email.csv",
    "datasets/Phishing/SpamAssasin.csv",
    "datasets/Phishing/TREC_07.csv",
]
if not attack_files:
    # Honour the documented fallback; sorted() keeps the load order stable.
    attack_files = sorted(glob.glob("datasets/Phishing/*.csv"))
if not attack_files:
    raise FileNotFoundError("No CSV files found in datasets/Phishing/")

# Optionally list benign files; if empty we will split by 'label' if present in CSVs
benign_files = []  # keep empty unless you have explicit benign CSVs

# Read attack files with Dask, then materialise the result as one pandas frame.
ddf_attack = dd.read_csv(attack_files, low_memory=False)
attack_df = ddf_attack.compute()
# Copy a `_source_file` column into `_source` when the CSVs provide one;
# otherwise `_source` is filled with empty strings.
attack_df["_source"] = attack_df.get("_source_file", "")

# If explicit benign_files provided, load them; otherwise try to split by label
if benign_files:
    benign_df = dd.read_csv(benign_files, low_memory=False).compute()
else:
    benign_df = pd.DataFrame()  # may fill below if 'label' exists

# If there is a 'label' column in the combined attack_df, split on it
if "label" in attack_df.columns and benign_df.empty:
    # Normalise label to 0/1 in one vectorised pass:
    #   * missing labels count as 0,
    #   * values are truncated toward zero before the non-zero test
    #     (so 0.5 -> 0, 1.7 -> 1),
    #   * labels that cannot be parsed as numbers raise ValueError.
    numeric_label = pd.to_numeric(attack_df["label"]).astype(float).fillna(0)
    attack_df["label"] = (np.trunc(numeric_label) != 0).astype(int)
    benign_df = attack_df[attack_df["label"] == 0].copy().reset_index(drop=True)
    attack_df = attack_df[attack_df["label"] == 1].copy().reset_index(drop=True)
    print("split by label -> attack:", attack_df.shape, "benign:", benign_df.shape)
else:
    print("Loaded attack files shape:", attack_df.shape, "benign explicit provided:", not benign_df.empty)

# Random sample 500 each (if available); a fixed seed keeps the sample reproducible.
n_sample = 500
if len(attack_df) > n_sample:
    attack_df = attack_df.sample(n=n_sample, random_state=42).reset_index(drop=True)
if not benign_df.empty and len(benign_df) > n_sample:
    benign_df = benign_df.sample(n=n_sample, random_state=42).reset_index(drop=True)

print("After sampling -> attack:", attack_df.shape, "benign:", benign_df.shape)

split by label -> attack: (115181, 9) benign: (103548, 9)
After sampling -> attack: (500, 9) benign: (500, 9)


## DATA CLEANING

In [4]:
# 3. DATA CLEANING

# --- Ensure we have some numeric features: if dataset is text-only, derive simple numeric features ---
def _text_column(df, name):
    """Return column *name* of *df* as strings, with missing values as ""."""
    if name not in df.columns:
        # Absent column: behave as if every row held an empty string.
        return pd.Series("", index=df.index, dtype=object)
    values = df[name].astype(object)
    # Blank out missing entries BEFORE str conversion; converting first would
    # turn NaN/None into the literal text "nan"/"None".
    return values.where(values.notna(), "").astype(str)


def text_to_numeric_features(df):
    """Return a copy of *df* with simple numeric features derived from text.

    Added columns:
        subject_len, subject_words -- character / whitespace-separated word
            counts of the ``subject`` column.
        body_len, body_words -- the same for the ``body`` column.
        body_num_urls -- occurrences of ``http://``, ``https://`` or ``www.``
            in ``body``.
        urls_count -- taken from the ``urls`` column when present: numeric
            values are truncated to int, non-numeric text is scanned for URL
            markers, missing values count as 0. Without a ``urls`` column it
            equals ``body_num_urls``.

    A missing ``subject`` or ``body`` column, or a missing value in one,
    contributes zeros. The input frame is not modified.
    """
    import re  # local import keeps the function self-contained

    url_re = re.compile(r"https?://|www\.")

    def count_urls(text):
        return text.apply(lambda s: len(url_re.findall(s)))

    def count_words(text):
        return text.apply(lambda s: len(s.split()))

    df = df.copy()
    subject = _text_column(df, "subject")
    body = _text_column(df, "body")

    df["subject_len"] = subject.apply(len)
    df["subject_words"] = count_words(subject)
    df["body_len"] = body.apply(len)
    df["body_words"] = count_words(body)
    df["body_num_urls"] = count_urls(body)

    if "urls" in df.columns:
        try:
            as_number = pd.to_numeric(df["urls"], errors="coerce").astype(float)
        except (TypeError, ValueError):
            # Values pandas cannot even coerce: fall back to text scanning for all rows.
            as_number = pd.Series(np.nan, index=df.index, dtype=float)
        # Infinite values cannot be cast to int; treat them as non-numeric.
        as_number = as_number.replace([np.inf, -np.inf], np.nan)
        url_hits = count_urls(_text_column(df, "urls"))
        # Per row: use the numeric value when there is one, else the URL-marker count.
        merged = np.where(
            as_number.notna().to_numpy(),
            as_number.to_numpy(),
            url_hits.to_numpy(dtype=float),
        )
        df["urls_count"] = pd.Series(merged, index=df.index).astype(int)
    else:
        df["urls_count"] = df["body_num_urls"]
    return df

# Derive the text-based numeric features for both classes.
attack_df = text_to_numeric_features(attack_df)
if not benign_df.empty:
    benign_df = text_to_numeric_features(benign_df)

# Keep only numeric columns (your original intent).
# NOTE: a numeric `label` column, if present, also passes this filter. When the
# frames come from the split-by-label step it is constant within attack_df and
# is removed by the single-value filter further down.
attack_num = attack_df.select_dtypes(include=[np.number]).copy()
benign_num = benign_df.select_dtypes(include=[np.number]).copy() if not benign_df.empty else pd.DataFrame(columns=attack_num.columns)

# Replace inf with NaN so the missing-value checks below also catch them
attack_num.replace([np.inf, -np.inf], np.nan, inplace=True)
if not benign_num.empty:
    benign_num.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop columns with >20% missing in attack (use attack_num only)
cols_ok = attack_num.isnull().mean() < 0.2
cols_ok_names = cols_ok[cols_ok].index.tolist()
attack_num = attack_num.loc[:, cols_ok_names]

# Align benign to same columns if exists (use intersection to avoid boolean index misalignment)
if not benign_num.empty:
    shared = [c for c in cols_ok_names if c in benign_num.columns]
    attack_num = attack_num.loc[:, shared]
    benign_num = benign_num.loc[:, shared]
else:
    shared = attack_num.columns.tolist()

# Drop single-value columns in attack (they carry no information)
attack_nunique = attack_num.nunique()
multi_cols = attack_nunique[attack_nunique > 1].index.tolist()
attack_num = attack_num.loc[:, multi_cols]
if not benign_num.empty:
    benign_num = benign_num.reindex(columns=multi_cols, fill_value=np.nan)

# Drop highly collinear columns (corr > 0.95) computed on attack_num.
# Only the strict upper triangle is inspected, so for each correlated pair the
# column that appears later in the frame is the one dropped.
if attack_num.shape[1] > 1:
    corr = attack_num.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]
    if to_drop:
        attack_num.drop(columns=to_drop, inplace=True, errors='ignore')
        if not benign_num.empty:
            benign_num.drop(columns=to_drop, inplace=True, errors='ignore')

# Drop any column that still has NaNs. With benign data present a column is
# kept only if it is NaN-free in BOTH frames, so neither saved array can
# contain missing values; otherwise only attack is checked.
if not benign_num.empty:
    attack_nan_free = set(attack_num.columns[~attack_num.isnull().any()])
    no_nan_cols = [
        c for c in benign_num.columns[~benign_num.isnull().any()]
        if c in attack_nan_free
    ]
    attack_num = attack_num.loc[:, no_nan_cols]
    benign_num = benign_num.loc[:, no_nan_cols]
else:
    attack_num = attack_num.dropna(axis=1, how='any')

# Remove rows that are all zeros. The column-count guard matters: with zero
# columns `.all(axis=1)` is vacuously True and would discard every row.
if not benign_num.empty and benign_num.shape[1] > 0:
    benign_num = benign_num.loc[~(benign_num == 0.0).all(axis=1)]
if attack_num.shape[1] > 0:
    attack_num = attack_num.loc[~(attack_num == 0.0).all(axis=1)]

print("After cleaning -> attack_num:", attack_num.shape, "benign_num:", benign_num.shape)

After cleaning -> attack_num: (500, 5) benign_num: (500, 5)


## Get feature importances from the GBF (gradient-boosted XGBoost) model

In [5]:
# 4. Get feature importances from the gradient-boosted (XGBoost) model
#
# Produces `top_features`: up to 8 column names, ranked by XGBoost feature
# importance when both classes are available, otherwise by variance within
# the attack samples.

# If we have no numeric columns at all, abort
if attack_num.shape[1] == 0:
    raise RuntimeError("No numeric features available after cleaning. Check the data or relax numeric-only requirement.")


def _top_by_variance(frame, k=8):
    """Return the names of the *k* highest-variance columns of *frame*."""
    return frame.var().nlargest(k).index.tolist()


# If we have benign rows, combine and train; otherwise fallback selection
if benign_num.empty or len(benign_num) < 2:
    print("No benign data available or too few benign rows to train classifier.")
    # fallback: choose top 8 features by variance in attack
    top_features = _top_by_variance(attack_num)
    print("Fallback top_features (by variance):", top_features)
else:
    # 1. Combine and label: benign rows are class 0, attack rows are class 1
    X = pd.concat([benign_num, attack_num], axis=0).reset_index(drop=True)
    y = np.array([0] * len(benign_num) + [1] * len(attack_num))

    # verify there are two classes (attack_num may have lost all of its rows)
    if len(np.unique(y)) < 2:
        print("Warning: labels contain only one class. Falling back to variance-based feature selection.")
        top_features = _top_by_variance(attack_num)
    else:
        # 2. Train XGBoost.
        # `use_label_encoder` is intentionally not passed: the installed
        # XGBoost reports it as an unused parameter, and y is already 0/1.
        model = xgb.XGBClassifier(
            eval_metric='logloss',
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42,
        )
        model.fit(X, y)
        # 3. Pick top 8 features (fewer if fewer columns survived cleaning)
        importances = pd.Series(model.feature_importances_, index=X.columns)
        top_features = importances.nlargest(8).index.tolist()
        print("Top 8 features:", top_features)


Top 8 features: ['body_num_urls', 'body_len', 'subject_words', 'subject_len', 'urls_count']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Construct new cleaned dataset for testing using selected features

In [6]:
# 5. Construct new cleaned dataset for testing using selected features

# Keep the ranked features that are still present in attack_num, preserving
# the ranking order.
selected = [name for name in top_features if name in attack_num.columns]

attack_final = attack_num[selected].reset_index(drop=True)
if benign_num.empty:
    # No benign rows: emit an empty frame that has the same columns.
    benign_final = pd.DataFrame(columns=selected)
else:
    benign_final = benign_num[selected].reset_index(drop=True)

# Persist both classes as NumPy arrays (rows = samples, columns = `selected`).
np.save('500-attack.npy', attack_final.to_numpy())
np.save('500-benign.npy', benign_final.to_numpy())

print("Saved: 500-attack.npy (shape {})".format(attack_final.shape))
print("Saved: 500-benign.npy (shape {})".format(benign_final.shape))

Saved: 500-attack.npy (shape (500, 5))
Saved: 500-benign.npy (shape (500, 5))


In [7]:
# Print 20 cells