# Healthcare IoT Device Network Intrusion Detection System

### Dataframe Creation

In [1]:
import pandas as pd
import os
import glob

# Display option: show every column when a dataframe is rendered.
pd.set_option("display.max_columns", None)

# Folders holding the train and test CSV files (make sure to replace these with
# the paths on your computer). Backslashes are replaced with forward slashes.
train_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Train".replace(
    "\\", "/"
)
test_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Test".replace(
    "\\", "/"
)

# Paths of all CSV files in each folder. glob returns its matches in arbitrary
# order, so the lists are sorted: a stable file order keeps the row order of the
# concatenated dataframes -- and any seeded sampling done on them -- identical
# from one run or machine to the next.
train_csv_files = sorted(glob.glob(os.path.join(train_dir, "*.csv")))
test_csv_files = sorted(glob.glob(os.path.join(test_dir, "*.csv")))

# Lists that collect one dataframe per CSV file.
train_dataframes = []
test_dataframes = []


def load_labeled_csvs(csv_files, filename_suffix):
    """Read each CSV file into a dataframe labelled with its attack type.

    The attack type is derived from the file name: it is the text that comes
    before ``filename_suffix``, with all trailing digits removed (some attacks
    are split across several numbered files).

    Parameters
    ----------
    csv_files : iterable of str
        Paths of the CSV files to read.
    filename_suffix : str
        Text that follows the attack name in every file name, for example
        ``"_train.pcap.csv"``.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per file, in the order given, each with an added
        ``"classification"`` column holding the attack type.
    """
    labeled_frames = []
    for csv_path in csv_files:
        filename = os.path.basename(csv_path)

        # everything before the suffix is the (possibly numbered) attack name
        attack_type = filename.split(filename_suffix)[0]

        # Remove *all* trailing digits, not just the last character, so that a
        # part number of 10 or more does not leave a digit behind in the label.
        # rstrip is also safe when the stem is empty.
        attack_type = attack_type.rstrip("0123456789")

        frame = pd.read_csv(csv_path)
        frame["classification"] = attack_type
        labeled_frames.append(frame)
    return labeled_frames


# Build the labelled dataframes for the train and the test files.
train_dataframes.extend(load_labeled_csvs(train_csv_files, "_train.pcap.csv"))
test_dataframes.extend(load_labeled_csvs(test_csv_files, "_test.pcap.csv"))


# merge the per-file dataframes into one train and one test dataframe
train_df = pd.concat(train_dataframes, ignore_index=True)
test_df = pd.concat(test_dataframes, ignore_index=True)

print(train_df["classification"].value_counts())

# Downsample the training data class by class: the more rows a class has, the
# smaller the random fraction that is kept. Classes with at most 10000 rows are
# kept in full.
result_dfs = []
for _, class_rows in train_df.groupby("classification"):
    row_count = len(class_rows)
    if row_count > 500000:
        keep_fraction = 0.01
    elif row_count > 100000:
        keep_fraction = 0.1
    elif row_count > 10000:
        keep_fraction = 0.2
    else:
        keep_fraction = None  # small class: keep every row

    if keep_fraction is None:
        result_dfs.append(class_rows)
    else:
        result_dfs.append(class_rows.sample(frac=keep_fraction, random_state=42))
train_df = pd.concat(result_dfs, ignore_index=True)

print(train_df["classification"].value_counts())
print(test_df["classification"].value_counts())

classification
TCP_IP-DDoS-UDP            1635956
TCP_IP-DDoS-ICMP           1537476
TCP_IP-DDoS-TCP             804465
TCP_IP-DDoS-SYN             801962
TCP_IP-DoS-UDP              566950
TCP_IP-DoS-SYN              441903
TCP_IP-DoS-ICMP             416292
TCP_IP-DoS-TCP              380384
Benign                      192732
MQTT-DDoS-Connect_Flood     173036
Recon-Port_Scan              83981
MQTT-DoS-Publish_Flood       44376
MQTT-DDoS-Publish_Flood      27623
Recon-OS_Scan                16832
ARP_Spoofing                 16047
MQTT-DoS-Connect_Flood       12773
MQTT-Malformed_Data           5130
Recon-VulScan                 2173
Recon-Ping_Sweep               740
Name: count, dtype: int64
classification
TCP_IP-DoS-SYN             44190
TCP_IP-DoS-ICMP            41629
TCP_IP-DoS-TCP             38038
Benign                     19273
MQTT-DDoS-Connect_Flood    17304
Recon-Port_Scan            16796
TCP_IP-DDoS-UDP            16360
TCP_IP-DDoS-ICMP           15375
MQTT-DoS-Publis

### Data Preprocessing

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Split the dataframes built earlier into labels (y) and feature columns (X).
y_train = train_df["classification"]
X_train = train_df.drop(columns=["classification"])

y_test = test_df["classification"]
X_test = test_df.drop(columns=["classification"])

# Standardize the features. The scaler is fitted on the training data only and
# the same fitted scaler is then applied to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reduce the number of features with PCA (n_components=0.95). As with the
# scaler, PCA is fitted on the training data and reused for the test data.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")

# the full dataframes are no longer referenced by name after this point
del train_df, test_df


Reduced number of features after PCA: 22


### Random Forest Classifier Training

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [4]:
# Fit a random forest on the PCA features. Only the seed and the number of
# parallel jobs are set; every other hyperparameter is left at its default.
rf = RandomForestClassifier(n_jobs=4, random_state=42)
rf.fit(X_train_pca, y_train)

### Accuracy of Random Forest Classifier Model

In [5]:
# Predict on the PCA-transformed test set and print the classification report
# for the random forest.
y_pred = rf.predict(X_test_pca)
rf_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", rf_report)


Classification Report:
                          precision    recall  f1-score   support

           ARP_Spoofing       0.20      0.52      0.29      1744
                 Benign       0.93      0.89      0.91     37607
MQTT-DDoS-Connect_Flood       0.99      1.00      1.00     41916
MQTT-DDoS-Publish_Flood       0.98      0.15      0.26      8416
 MQTT-DoS-Connect_Flood       0.95      0.94      0.95      3131
 MQTT-DoS-Publish_Flood       0.54      1.00      0.70      8505
    MQTT-Malformed_Data       0.68      0.80      0.74      1747
          Recon-OS_Scan       0.70      0.47      0.56      3834
       Recon-Ping_Sweep       0.46      0.67      0.54       186
        Recon-Port_Scan       0.87      0.92      0.90     22622
          Recon-VulScan       0.24      0.32      0.28      1034
       TCP_IP-DDoS-ICMP       0.99      0.97      0.98    349699
        TCP_IP-DDoS-SYN       0.98      0.90      0.94    172397
        TCP_IP-DDoS-TCP       0.98      0.96      0.97    182598

### Optimizing the Random Forest Classifier

In [6]:
# Candidate values for each random forest hyperparameter; the randomized
# search draws its combinations from these lists.
params_grid = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 10, 20, 30, 40],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "class_weight": [None, "balanced"],
}

# Search settings: 10 sampled candidates, 5-fold cross-validation, candidates
# scored with "f1_macro", fixed seed, 4 parallel jobs, verbose progress output.
search_settings = dict(
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=4,
    verbose=2,
    scoring="f1_macro",
)
rf_randomized_search = RandomizedSearchCV(
    estimator=rf, param_distributions=params_grid, **search_settings
)

# run the hyperparameter search on the training data
search = rf_randomized_search.fit(X_train_pca, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [7]:
# Show the best model found by the randomized search and its hyperparameters.
print(f"best estimator: {search.best_estimator_}")
print(f"best parameters: {search.best_params_}")

# Evaluate the best model on the test set. The report is printed with a single
# print call: print returns None, so wrapping it in a second print would emit
# an extra "None" line after the report.
y_pred = search.best_estimator_.predict(X_test_pca)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

best estimator: RandomForestClassifier(max_features=None, min_samples_leaf=4, n_estimators=200,
                       n_jobs=4, random_state=42)
best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': None, 'class_weight': None}

Classification Report:
                          precision    recall  f1-score   support

           ARP_Spoofing       0.21      0.50      0.29      1744
                 Benign       0.92      0.88      0.90     37607
MQTT-DDoS-Connect_Flood       0.99      1.00      0.99     41916
MQTT-DDoS-Publish_Flood       0.95      0.16      0.27      8416
 MQTT-DoS-Connect_Flood       0.94      0.94      0.94      3131
 MQTT-DoS-Publish_Flood       0.55      1.00      0.71      8505
    MQTT-Malformed_Data       0.61      0.80      0.69      1747
          Recon-OS_Scan       0.82      0.44      0.57      3834
       Recon-Ping_Sweep       0.50      0.66      0.57       186
        Recon-Port_Scan      

### Gradient Boosting Classifier Training

In [8]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report

In [10]:
# Set LOKY_MAX_CPU_COUNT to "4" before training.
# NOTE(review): presumably this caps the CPU count that joblib/loky detects --
# confirm against the joblib documentation.
os.environ.update(LOKY_MAX_CPU_COUNT="4")

# Fit a histogram-based gradient boosting classifier (fixed seed, otherwise
# default hyperparameters) on the PCA features.
hgbc = HistGradientBoostingClassifier(random_state=42)
hgbc.fit(X_train_pca, y_train)

### Accuracy of Gradient Boosting Classifier Model

In [11]:
# Predict on the PCA-transformed test set and print the classification report
# for the gradient boosting classifier.
y_pred = hgbc.predict(X_test_pca)
hgbc_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", hgbc_report)


Classification Report:
                          precision    recall  f1-score   support

           ARP_Spoofing       0.26      0.43      0.32      1744
                 Benign       0.91      0.88      0.90     37607
MQTT-DDoS-Connect_Flood       0.99      0.99      0.99     41916
MQTT-DDoS-Publish_Flood       0.95      0.16      0.28      8416
 MQTT-DoS-Connect_Flood       0.90      0.91      0.91      3131
 MQTT-DoS-Publish_Flood       0.56      1.00      0.71      8505
    MQTT-Malformed_Data       0.67      0.72      0.70      1747
          Recon-OS_Scan       0.66      0.28      0.40      3834
       Recon-Ping_Sweep       0.04      0.68      0.07       186
        Recon-Port_Scan       0.82      0.95      0.88     22622
          Recon-VulScan       0.20      0.20      0.20      1034
       TCP_IP-DDoS-ICMP       0.97      0.94      0.95    349699
        TCP_IP-DDoS-SYN       0.98      0.81      0.88    172397
        TCP_IP-DDoS-TCP       0.98      0.81      0.89    182598

### Optimizing Gradient Boosting Classifier Model

### Multi-layer Perceptron (MLP) Classifier Training

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [14]:
# Fit a multi-layer perceptron on the PCA features: fixed seed, at most 1000
# training iterations, every other hyperparameter left at its default.
mlpc = MLPClassifier(max_iter=1000, random_state=42)
mlpc.fit(X_train_pca, y_train)

### Accuracy of MLP Classifier Model

In [15]:
# Predict on the PCA-transformed test set and print the classification report
# for the MLP classifier.
y_pred = mlpc.predict(X_test_pca)
mlpc_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", mlpc_report)


Classification Report:
                          precision    recall  f1-score   support

           ARP_Spoofing       0.24      0.58      0.34      1744
                 Benign       0.95      0.90      0.92     37607
MQTT-DDoS-Connect_Flood       1.00      1.00      1.00     41916
MQTT-DDoS-Publish_Flood       0.99      0.15      0.26      8416
 MQTT-DoS-Connect_Flood       0.97      0.95      0.96      3131
 MQTT-DoS-Publish_Flood       0.54      1.00      0.70      8505
    MQTT-Malformed_Data       0.71      0.74      0.72      1747
          Recon-OS_Scan       0.70      0.07      0.13      3834
       Recon-Ping_Sweep       0.32      0.82      0.47       186
        Recon-Port_Scan       0.85      0.95      0.90     22622
          Recon-VulScan       0.24      0.23      0.24      1034
       TCP_IP-DDoS-ICMP       0.88      0.18      0.29    349699
        TCP_IP-DDoS-SYN       0.94      0.42      0.58    172397
        TCP_IP-DDoS-TCP       0.74      0.06      0.11    182598

### Optimizing MLP Classifier Model