# Healthcare IoT Device Network Intrusion Detection System

### Creating the dataframes

In [111]:
import pandas as pd
import os
import glob

# show every column when a dataframe is displayed
pd.set_option("display.max_columns", None)

# Root folder that contains the "Train" and "Test" csv folders.
# Set the CMPE132_DATASET_DIR environment variable to point at your own copy of
# the dataset; the original local path is kept as the fallback so existing
# setups keep working unchanged.
dataset_dir = (
    os.environ.get(
        "CMPE132_DATASET_DIR",
        r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset",
    )
    .replace("\\", "/")
    .rstrip("/")
)
train_dir = f"{dataset_dir}/Train"
test_dir = f"{dataset_dir}/Test"


def load_labeled_csvs(csv_files, suffix):
    """Read each csv file and tag its rows with the attack class taken from the file name.

    Parameters
    ----------
    csv_files : list of str
        Paths of the csv files to read.
    suffix : str
        Part of the file name that follows the attack name,
        e.g. ``"_train.pcap.csv"``.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per file, each with an added ``"classification"`` column.
    """
    frames = []
    for path in csv_files:
        # everything in front of the suffix is the attack name
        attack_type = os.path.basename(path).split(suffix)[0]

        # Attacks split over several files carry a numeric index at the end of
        # the name. Strip *all* trailing digits so an index of 10 or more does
        # not leave a stray digit behind and create a bogus extra class; a name
        # made only of digits is left untouched.
        attack_type = attack_type.rstrip("0123456789") or attack_type

        frame = pd.read_csv(path)
        frame["classification"] = attack_type
        frames.append(frame)
    return frames


# store paths of all csv files in each folder (sorted so the row order does not
# depend on the order in which the file system lists the files)
train_csv_files = sorted(glob.glob(os.path.join(train_dir, "*.csv")))
test_csv_files = sorted(glob.glob(os.path.join(test_dir, "*.csv")))

# fail early with a readable message instead of the "No objects to concatenate"
# error that pd.concat raises for an empty list
for folder, found_files in ((train_dir, train_csv_files), (test_dir, test_csv_files)):
    if not found_files:
        raise FileNotFoundError(
            f"No csv files found in {folder!r}; check the dataset path."
        )

# one labeled dataframe per csv file
train_dataframes = load_labeled_csvs(train_csv_files, "_train.pcap.csv")
test_dataframes = load_labeled_csvs(test_csv_files, "_test.pcap.csv")

# combine all of the dataframes into one
train_df = pd.concat(train_dataframes, ignore_index=True)
test_df = pd.concat(test_dataframes, ignore_index=True)

# Downsample the training data per class with a fixed seed:
#   * attack classes with more rows than the cap are cut to the cap,
#   * benign (normal) traffic is cut to its own, larger cap,
#   * classes that are already at or under their cap keep all of their rows.
ATTACK_ROWS_PER_CLASS = 1000
BENIGN_ROWS = 2000

result_dfs = []
for class_name, group in train_df.groupby("classification"):
    if class_name.lower() == "benign":
        # min() keeps this from raising ValueError when fewer benign rows than
        # the cap are available (sampling without replacement cannot return
        # more rows than the group holds)
        kept = group.sample(n=min(len(group), BENIGN_ROWS), random_state=42)
    elif len(group) > ATTACK_ROWS_PER_CLASS:
        kept = group.sample(n=ATTACK_ROWS_PER_CLASS, random_state=42)
    else:
        kept = group
    result_dfs.append(kept)
train_df = pd.concat(result_dfs, ignore_index=True)

def _binary_label(label):
    """Return 0 for benign traffic and 1 for any attack class."""
    if label.lower() == "benign":
        return 0
    return 1


# derive the binary target (0 = benign, 1 = attack) for both datasets so binary
# classification can be evaluated next to the multi class task
for labeled_df in (train_df, test_df):
    labeled_df["binary_classification"] = labeled_df["classification"].map(
        _binary_label
    )

# class balance of the downsampled training data and of the test data
train_class_counts = train_df["classification"].value_counts()
train_binary_counts = train_df["binary_classification"].value_counts()
test_binary_counts = test_df["binary_classification"].value_counts()
print(train_class_counts)
print(train_binary_counts)
print(test_binary_counts)

# dataset sizes
print(f"rows in training dataset: {len(train_df)}")
print(f"rows in testing dataset: {len(test_df)}")

classification
Benign                     2000
ARP_Spoofing               1000
MQTT-DDoS-Connect_Flood    1000
MQTT-DDoS-Publish_Flood    1000
MQTT-DoS-Connect_Flood     1000
MQTT-DoS-Publish_Flood     1000
MQTT-Malformed_Data        1000
Recon-OS_Scan              1000
Recon-Port_Scan            1000
Recon-VulScan              1000
TCP_IP-DDoS-ICMP           1000
TCP_IP-DoS-ICMP            1000
TCP_IP-DDoS-SYN            1000
TCP_IP-DDoS-TCP            1000
TCP_IP-DDoS-UDP            1000
TCP_IP-DoS-TCP             1000
TCP_IP-DoS-SYN             1000
TCP_IP-DoS-UDP             1000
Recon-Ping_Sweep            740
Name: count, dtype: int64
binary_classification
1    17740
0     2000
Name: count, dtype: int64
binary_classification
1    1576575
0      37607
Name: count, dtype: int64
rows in training dataset: 19740
rows in testing dataset: 1614182


### Data Preprocessing

In [112]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Split the dataframes built earlier into features and targets.
# Both label columns are dropped from the feature matrices; each of them is
# kept as its own target so the multi class and the binary task can be trained.
label_columns = ["classification", "binary_classification"]

X_train = train_df.drop(columns=label_columns)
X_test = test_df.drop(columns=label_columns)

y_train_multi, y_train_binary = (train_df[column] for column in label_columns)
y_test_multi, y_test_binary = (test_df[column] for column in label_columns)

# standardize features: statistics are fitted on the training data only and
# then re-used for the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# reduce the number of features with PCA, keeping enough components to explain
# 95% of the variance of the training data
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

n_features_after_pca = X_train_pca.shape[1]
print(f"Reduced number of features after PCA: {n_features_after_pca}")

# the intermediate matrices are not used again, release their memory
del X_train, X_test, X_train_scaled, X_test_scaled

Reduced number of features after PCA: 23


### Random Forest Classifier Training

In [113]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [114]:
# Baseline random forest models: one fitted on the multi class labels and one
# fitted on the binary labels, both built from the same settings.
rf_settings = {"random_state": 42, "n_jobs": 12}

rf_multi = RandomForestClassifier(**rf_settings)
rf_multi.fit(X_train_pca, y_train_multi)

rf_binary = RandomForestClassifier(**rf_settings)
rf_binary.fit(X_train_pca, y_train_binary)

#### Accuracy of Random Forest Classifier Model

In [115]:
# Score the baseline random forests on the test data. The reports are kept as
# dictionaries (output_dict=True) so they can be turned into tables later.

# multi class task
rf_y_pred_multi = rf_multi.predict(X_test_pca)
rf_multi_classification_report = classification_report(
    y_test_multi,
    rf_y_pred_multi,
    output_dict=True,
)

# binary task
rf_y_pred_binary = rf_binary.predict(X_test_pca)
rf_binary_classification_report = classification_report(
    y_test_binary,
    rf_y_pred_binary,
    output_dict=True,
)

multi_title = "\nClassification Report for Multi-Class Classification:\n"
binary_title = "\nClassification Report for Binary Classification:\n"
print(multi_title, rf_multi_classification_report)
print(binary_title, rf_binary_classification_report)


Classification Report for Multi-Class Classification:
 {'ARP_Spoofing': {'precision': 0.14668185702079178, 'recall': 0.5905963302752294, 'f1-score': 0.2349988592288387, 'support': 1744.0}, 'Benign': {'precision': 0.9483335941167267, 'recall': 0.8058074294679182, 'f1-score': 0.871280296713723, 'support': 37607.0}, 'MQTT-DDoS-Connect_Flood': {'precision': 0.9900966066313899, 'recall': 0.9731367496898559, 'f1-score': 0.9815434222874606, 'support': 41916.0}, 'MQTT-DDoS-Publish_Flood': {'precision': 0.9270664505672609, 'recall': 0.20389733840304183, 'f1-score': 0.334274861205805, 'support': 8416.0}, 'MQTT-DoS-Connect_Flood': {'precision': 0.7331736526946108, 'recall': 0.9776429255828809, 'f1-score': 0.8379414180125924, 'support': 3131.0}, 'MQTT-DoS-Publish_Flood': {'precision': 0.5709095801937567, 'recall': 0.9977660199882422, 'f1-score': 0.7262612863194831, 'support': 8505.0}, 'MQTT-Malformed_Data': {'precision': 0.4382202304737516, 'recall': 0.7836290784201488, 'f1-score': 0.562102237733

#### Optimizing the Random Forest Classifier

In [116]:
import numpy as np

# hyper parameter values the randomized search can draw from
rf_params_grid = dict(
    n_estimators=[100, 200, 400, 800],
    max_depth=[None, 10, 20, 40, 80],
    min_samples_split=[2, 4, 8, 16],
    min_samples_leaf=[1, 2, 4, 8],
)

# Settings shared by the multi class and the binary search.
# NOTE(review): n_iter=1 means a single parameter combination is drawn and
# cross-validated, so "best" is simply that one candidate - confirm this is
# intended.
rf_search_settings = dict(
    param_distributions=rf_params_grid,
    cv=3,
    n_iter=1,
    random_state=42,
    n_jobs=12,
    verbose=2,
    scoring="f1_macro",
)

# one RandomizedSearchCV object per task, each starting from its baseline model
rf_multi_randomized_search = RandomizedSearchCV(
    estimator=rf_multi, **rf_search_settings
)
rf_binary_randomized_search = RandomizedSearchCV(
    estimator=rf_binary, **rf_search_settings
)

# run both searches on the training data
rf_multi_search = rf_multi_randomized_search.fit(X_train_pca, y_train_multi)
rf_binary_search = rf_binary_randomized_search.fit(X_train_pca, y_train_binary)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [117]:
# winning estimator and hyper parameters of each search
rf_multi_best_estimator = rf_multi_search.best_estimator_
rf_multi_best_params = rf_multi_search.best_params_
rf_binary_best_estimator = rf_binary_search.best_estimator_
rf_binary_best_params = rf_binary_search.best_params_

print(f"Best estimator for multi class classification: {rf_multi_best_estimator}")
print(f"Best parameters for multi class classification: {rf_multi_best_params}")
print(f"Best estimator for binary classification: {rf_binary_best_estimator}")
print(f"Best parameters for binary classification: {rf_binary_best_params}")

# Score the searched models on the test data. The reports are kept as
# dictionaries so they can be compared with the baseline reports.
rf_y_pred_multi_random_search = rf_multi_search.predict(X_test_pca)
rf_multi_search_classification_report = classification_report(
    y_test_multi,
    rf_y_pred_multi_random_search,
    output_dict=True,
)

rf_y_pred_binary_random_search = rf_binary_search.predict(X_test_pca)
rf_binary_search_classification_report = classification_report(
    y_test_binary,
    rf_y_pred_binary_random_search,
    output_dict=True,
)

multi_search_title = (
    "\nClassification Report for RandomizedSearchCV Multi Class Classification:\n"
)
binary_search_title = (
    "\nClassification Report for RandomizedSearchCV Binary Classification:\n"
)
print(multi_search_title, rf_multi_search_classification_report)
print(binary_search_title, rf_binary_search_classification_report)

Best estimator for multi class classification: RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=4,
                       n_estimators=400, n_jobs=12, random_state=42)
Best parameters for multi class classification: {'n_estimators': 400, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 10}
Best estimator for binary classification: RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=4,
                       n_estimators=400, n_jobs=12, random_state=42)
Best parameters for binary classification: {'n_estimators': 400, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 10}

Classification Report for RandomizedSearchCV Multi Class Classification:
 {'ARP_Spoofing': {'precision': 0.15051104516979888, 'recall': 0.5235091743119266, 'f1-score': 0.23380281690140844, 'support': 1744.0}, 'Benign': {'precision': 0.9245973863602536, 'recall': 0.8259100699337889, 'f1-score': 0.8724719101123596, 'support': 37607.0}, 'MQTT-DDoS-Connec

#### Tables for Model Scores Before and After Optimization

In [None]:
def classification_report_tables(
    report_before,
    report_after,
    algorithm_name="Model",
    classification_type="Multiclass",
):
    """Display styled before/after comparison tables for two classification reports.

    Three tables are shown with ``display``: per-class precision, recall and
    f1-score (plus support), the macro and weighted averages, and the overall
    accuracy.

    Parameters
    ----------
    report_before, report_after : dict
        Reports in the layout produced by
        ``classification_report(..., output_dict=True)``: one dict of metrics
        per class label, a ``"macro avg"`` and a ``"weighted avg"`` dict, and
        an ``"accuracy"`` number.
    algorithm_name : str, default "Model"
        Name used in the table captions.
    classification_type : str, default "Multiclass"
        Task description appended to the name in the table captions.

    Returns
    -------
    None
        The tables are displayed, nothing is returned.

    Notes
    -----
    A class label, average or accuracy value that is present in only one of
    the two reports is shown as NaN in the columns of the other report instead
    of raising ``KeyError`` or breaking the number formatting.
    """
    metrics = ["precision", "recall", "f1-score"]
    avg_types = ["macro avg", "weighted avg"]
    missing = float("nan")

    def _lookup(report, key, field):
        # value of report[key][field], or NaN when the entry or field is absent
        entry = report.get(key)
        if not isinstance(entry, dict):
            return missing
        return entry.get(field, missing)

    def _class_keys(report):
        # keys whose value is a metrics dict, without the average rows
        return [
            key
            for key, value in report.items()
            if isinstance(value, dict) and key not in avg_types
        ]

    # Labels of the "before" report first (original order), followed by labels
    # that only exist in the "after" report.
    class_labels = _class_keys(report_before)
    class_labels += [k for k in _class_keys(report_after) if k not in class_labels]

    # Table 1: Per-class scores + Support
    class_rows = []
    for label in class_labels:
        row = {"Label": label}
        for metric in metrics:
            row[f"{metric.capitalize()} (Before)"] = _lookup(
                report_before, label, metric
            )
            row[f"{metric.capitalize()} (After)"] = _lookup(
                report_after, label, metric
            )
        # support is read from the "after" report, falling back to "before"
        # for a label the "after" report does not contain
        support_source = report_after if label in report_after else report_before
        row["Support"] = _lookup(support_source, label, "support")
        class_rows.append(row)

    # "Support" is inserted last, so it already follows "F1-score (After)".
    # Worst classes (lowest f1 after optimization) come first.
    df_class = (
        pd.DataFrame(class_rows).set_index("Label").sort_values(by="F1-score (After)")
    )

    # Table 2: Macro avg and Weighted avg
    avg_rows = []
    for avg in avg_types:
        row = {"Average Type": avg}
        for metric in metrics:
            row[f"{metric.capitalize()} (Before)"] = _lookup(report_before, avg, metric)
            row[f"{metric.capitalize()} (After)"] = _lookup(report_after, avg, metric)
        avg_rows.append(row)
    df_avg = (
        pd.DataFrame(avg_rows)
        .set_index("Average Type")
        .sort_values(by="F1-score (After)")
    )

    # Table 3: Accuracy
    df_acc = pd.DataFrame(
        {
            "Accuracy (Before)": [report_before.get("accuracy", missing)],
            "Accuracy (After)": [report_after.get("accuracy", missing)],
        },
        index=["Accuracy"],
    )

    # Title prefix
    title_prefix = f"{algorithm_name} ({classification_type})"

    # Background gradient subset: exclude "Support" so the row counts do not
    # dominate the colour scale of the 0-1 metrics
    gradient_cols = [col for col in df_class.columns if col != "Support"]

    # Format dict: all columns as 3 decimals, except "Support"
    format_dict = {col: "{:.3f}" for col in df_class.columns}
    format_dict["Support"] = "{:.1f}"

    # Display with captions
    display(
        df_class.style.set_caption(f"{title_prefix} Per-Class Metrics")
        .background_gradient(cmap="YlGnBu", axis=None, subset=gradient_cols)
        .format(format_dict)
    )

    display(
        df_avg.style.set_caption(f"{title_prefix} Average Metrics")
        .background_gradient(cmap="YlGnBu", axis=None)
        .format("{:.3f}")
    )

    display(
        df_acc.style.set_caption(f"{title_prefix} Overall Accuracy")
        .background_gradient(cmap="YlGnBu", axis=None)
        .format("{:.3f}")
    )

In [120]:
# baseline vs. searched random forest, first for the multi class task and then
# for the binary task
rf_report_pairs = (
    (
        "Multiclass Classification",
        rf_multi_classification_report,
        rf_multi_search_classification_report,
    ),
    (
        "Binary Classification",
        rf_binary_classification_report,
        rf_binary_search_classification_report,
    ),
)
for task_name, baseline_report, searched_report in rf_report_pairs:
    classification_report_tables(
        baseline_report,
        searched_report,
        algorithm_name="Random Forest Classifier",
        classification_type=task_name,
    )


Unnamed: 0_level_0,Precision (Before),Precision (After),Recall (Before),Recall (After),F1-score (Before),F1-score (After),Support
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Recon-VulScan,0.152,0.136,0.404,0.302,0.221,0.187,1034.0
Recon-Ping_Sweep,0.112,0.123,0.844,0.849,0.198,0.216,186.0
ARP_Spoofing,0.147,0.151,0.591,0.524,0.235,0.234,1744.0
MQTT-DDoS-Publish_Flood,0.927,0.916,0.204,0.212,0.334,0.344,8416.0
Recon-OS_Scan,0.342,0.524,0.593,0.376,0.434,0.438,3834.0
TCP_IP-DoS-UDP,0.337,0.335,0.685,0.71,0.452,0.455,137553.0
MQTT-Malformed_Data,0.438,0.38,0.784,0.777,0.562,0.51,1747.0
TCP_IP-DDoS-UDP,0.803,0.808,0.488,0.464,0.607,0.589,362070.0
MQTT-DoS-Publish_Flood,0.571,0.571,0.998,0.996,0.726,0.726,8505.0
TCP_IP-DoS-SYN,0.861,0.865,0.843,0.673,0.852,0.757,98595.0


Unnamed: 0_level_0,Precision (Before),Precision (After),Recall (Before),Recall (After),F1-score (Before),F1-score (After)
Average Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
macro avg,0.671,0.674,0.763,0.741,0.666,0.66
weighted avg,0.84,0.831,0.791,0.78,0.799,0.786


Unnamed: 0,Accuracy (Before),Accuracy (After)
Accuracy,0.791,0.78


Unnamed: 0_level_0,Precision (Before),Precision (After),Recall (Before),Recall (After),F1-score (Before),F1-score (After),Support
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.986,0.991,0.747,0.727,0.85,0.839,37607.0
1,0.994,0.994,1.0,1.0,0.997,0.997,1576575.0


Unnamed: 0_level_0,Precision (Before),Precision (After),Recall (Before),Recall (After),F1-score (Before),F1-score (After)
Average Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
macro avg,0.99,0.992,0.873,0.863,0.923,0.918
weighted avg,0.994,0.993,0.994,0.993,0.993,0.993


Unnamed: 0,Accuracy (Before),Accuracy (After)
Accuracy,0.994,0.993


### Gradient Boosting Classifier Training

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Baseline gradient boosting models: one for the multi class labels and one
# for the binary labels.
# NOTE(review): LOKY_MAX_CPU_COUNT presumably limits the CPU count seen by
# joblib's loky backend - confirm against the joblib documentation.
os.environ["LOKY_MAX_CPU_COUNT"] = "12"

hgbc_settings = {"random_state": 42}

hgbc_multi = HistGradientBoostingClassifier(**hgbc_settings)
hgbc_multi.fit(X_train_pca, y_train_multi)

hgbc_binary = HistGradientBoostingClassifier(**hgbc_settings)
hgbc_binary.fit(X_train_pca, y_train_binary)

#### Accuracy of Gradient Boosting Classifier Model

In [None]:
# Score the baseline gradient boosting models on the test data.
# These are the un-tuned models, so the titles must not mention
# RandomizedSearchCV (the search results are reported separately).
hgbc_y_pred_multi = hgbc_multi.predict(X_test_pca)
hgbc_y_pred_binary = hgbc_binary.predict(X_test_pca)
print(
    "\nClassification Report for Multi-Class Classification:\n",
    classification_report(y_test_multi, hgbc_y_pred_multi),
)
print(
    "\nClassification Report for Binary Classification:\n",
    classification_report(y_test_binary, hgbc_y_pred_binary),
)

#### Optimizing Gradient Boosting Classifier Model

In [None]:
import numpy as np

# hyper parameter values the randomized search can draw from
hgbc_params_grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_iter=np.linspace(100, 1000, 10, dtype=int),
    max_leaf_nodes=[None, *np.linspace(10, 100, 10, dtype=int)],
    max_depth=[None, *np.linspace(1, 100, 10, dtype=int)],
    min_samples_leaf=np.linspace(1, 200, 10, dtype=int),
    l2_regularization=np.linspace(0.0, 1.0, 10),
    max_features=np.linspace(0.1, 1.0, 10),
    max_bins=np.linspace(2, 255, 10, dtype=int),
)

# settings shared by the multi class and the binary search
# (n_iter is not set, so RandomizedSearchCV uses its default)
hgbc_search_settings = dict(
    param_distributions=hgbc_params_grid,
    cv=3,
    random_state=42,
    n_jobs=12,
    verbose=2,
    scoring="f1_macro",
)

# one RandomizedSearchCV object per task, each starting from its baseline model
hgbc_multi_randomized_search = RandomizedSearchCV(
    estimator=hgbc_multi, **hgbc_search_settings
)
hgbc_binary_randomized_search = RandomizedSearchCV(
    estimator=hgbc_binary, **hgbc_search_settings
)

# run both searches on the training data
hgbc_multi_search = hgbc_multi_randomized_search.fit(X_train_pca, y_train_multi)
hgbc_binary_search = hgbc_binary_randomized_search.fit(X_train_pca, y_train_binary)

In [None]:
# winning estimator and hyper parameters of each search
hgbc_multi_best_estimator = hgbc_multi_search.best_estimator_
hgbc_multi_best_params = hgbc_multi_search.best_params_
hgbc_binary_best_estimator = hgbc_binary_search.best_estimator_
hgbc_binary_best_params = hgbc_binary_search.best_params_

print(f"Best estimator for multi class classification: {hgbc_multi_best_estimator}")
print(f"Best parameters for multi class classification: {hgbc_multi_best_params}")
print(f"Best estimator for binary classification: {hgbc_binary_best_estimator}")
print(f"Best parameters for binary classification: {hgbc_binary_best_params}")

# score the searched models on the test data
hgbc_multi_y_pred_random_search = hgbc_multi_search.predict(X_test_pca)
hgbc_binary_y_pred_random_search = hgbc_binary_search.predict(X_test_pca)

hgbc_multi_search_report = classification_report(
    y_test_multi, hgbc_multi_y_pred_random_search
)
hgbc_binary_search_report = classification_report(
    y_test_binary, hgbc_binary_y_pred_random_search
)

print(
    "\nClassification Report for best parameters for multi class classification:\n",
    hgbc_multi_search_report,
)
print(
    "\nClassification Report for best parameters for binary classification:\n",
    hgbc_binary_search_report,
)

### Multi-layer Perceptron (MLP) Classifier Training

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [None]:
# train MLP model
# `y_train` is never defined in this notebook (the targets are
# `y_train_multi` and `y_train_binary`), so the original call raised a
# NameError on a fresh kernel. The multi class labels are used here.
mlpc = MLPClassifier(random_state=42, max_iter=10000)
mlpc.fit(X_train_pca, y_train_multi)

#### Accuracy of MLP Classifier Model

In [None]:
# check accuracy of MLP neural network
# `y_test` is never defined in this notebook (the test targets are
# `y_test_multi` and `y_test_binary`), so the original call raised a NameError.
# NOTE(review): assumes `mlpc` is fitted on the multi class labels - confirm
# against the training cell.
y_pred = mlpc.predict(X_test_pca)
print("\nClassification Report:\n", classification_report(y_test_multi, y_pred))

#### Optimizing MLP Classifier Model