# Healthcare IoT Device Network Intrusion Detection System

### Dataframe Creation

In [1]:
import pandas as pd
import os
import glob
import gc

#options to see all columns in dataframes
pd.set_option("display.max_columns", None)

#get directory folder of train and test csv files (make sure to replace with path on your computer)
train_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Train".replace("\\", "/")
test_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Test".replace("\\", "/")

#training files with more than SAMPLE_ROW_THRESHOLD rows are randomly reduced to SAMPLE_FRACTION of their rows
SAMPLE_ROW_THRESHOLD = 10000
SAMPLE_FRACTION = 0.05
#fixed seed so the same rows are sampled on every run
SAMPLE_SEED = 42

#store paths of all csv files in the folder in a list
#(glob returns files in arbitrary order, so sort to get the same row order on every run)
train_csv_files = sorted(glob.glob(os.path.join(train_dir, "*.csv")))
test_csv_files = sorted(glob.glob(os.path.join(test_dir, "*.csv")))


def get_attack_type(filename, suffix):
    """Return the attack class encoded in a dataset csv file name.

    The class is everything before `suffix`. Some attacks are split over
    several numbered files, so every trailing digit is removed (not only the
    last one) to map all parts of an attack to the same class name.

    Parameters:
        filename: base name of the csv file (no directory).
        suffix: file name ending that follows the attack name.

    Returns:
        The attack class name as a string (empty if nothing precedes `suffix`).
    """
    return filename.split(suffix)[0].rstrip("0123456789")


def load_labeled_csvs(csv_files, suffix, downsample=False):
    """Read csv files into one dataframe with a 'classification' label column.

    Parameters:
        csv_files: list of csv paths to read.
        suffix: file name ending used to extract the attack class.
        downsample: when True, files with more than SAMPLE_ROW_THRESHOLD rows
            are reduced to a seeded random sample of SAMPLE_FRACTION of their rows.

    Returns:
        A single dataframe with the rows of all files and a fresh index.

    Raises:
        FileNotFoundError: if `csv_files` is empty (usually a wrong folder path).
    """
    if not csv_files:
        raise FileNotFoundError(
            "no csv files found for suffix " + suffix + "; check train_dir/test_dir"
        )

    frames = []
    for file in csv_files:
        attack_type = get_attack_type(os.path.basename(file), suffix)

        #create a dataframe for the specific csv file
        df = pd.read_csv(file)

        #large files are reduced to a reproducible random sample
        if downsample and df.shape[0] > SAMPLE_ROW_THRESHOLD:
            df = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

        #label every row with the attack type of its file (assign returns a new frame)
        frames.append(df.assign(classification=attack_type))

    #combine all of the dataframes into one
    return pd.concat(frames, ignore_index=True)


#only the training data is downsampled; the test data is kept whole
train_df = load_labeled_csvs(train_csv_files, "_train.pcap.csv", downsample=True)
test_df = load_labeled_csvs(test_csv_files, "_test.pcap.csv")

gc.collect()

23

### Data Preprocessing

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# separate the label column (y) from the feature columns (X) for both splits
y_train = train_df['classification']
X_train = train_df.drop("classification", axis=1)

y_test = test_df['classification']
X_test = test_df.drop('classification', axis=1)

# standardize features: the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# reduce the feature count with PCA; a float n_components asks PCA to keep
# enough components to explain that fraction of the variance
pca = PCA(n_components=0.99)

X_train_pca = pca.fit_transform(X_train_scaled)
print(f'number of rows in train: {len(X_train_pca)}')

X_test_pca = pca.transform(X_test_scaled)
print(f'number of rows in test: {len(X_test_pca)}')

print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")


number of rows in train: 365683
number of rows in test: 1614182
Reduced number of features after PCA: 29


### Random Forest Classifier Training

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# train random forest classifier model
# random_state is fixed so the fitted forest (and the report computed from it)
# is the same on every run; n_jobs=8 builds trees in parallel
rf = RandomForestClassifier(n_jobs=8, random_state=42)
rf.fit(X_train_pca, y_train)

### Accuracy of Random Forest Classifier Model

In [4]:
# evaluate the random forest on the held-out test set:
# predict a class for every test row, then print per-class precision/recall/f1
y_pred = rf.predict(X_test_pca)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)


Classification Report:
                          precision    recall  f1-score   support

           ARP_Spoofing       0.21      0.44      0.29      1744
                 Benign       0.94      0.89      0.91     37607
MQTT-DDoS-Connect_Flood       0.99      1.00      1.00     41916
MQTT-DDoS-Publish_Flood       0.99      0.17      0.29      8416
 MQTT-DoS-Connect_Flood       0.97      0.94      0.96      3131
 MQTT-DoS-Publish_Flood       0.55      1.00      0.71      8505
    MQTT-Malformed_Data       0.55      0.89      0.68      1747
          Recon-OS_Scan       0.71      0.40      0.51      3834
       Recon-Ping_Sweep       0.31      0.77      0.44       186
        Recon-Port_Scan       0.91      0.89      0.90     22622
          Recon-VulScan       0.18      0.44      0.25      1034
       TCP_IP-DDoS-ICMP       0.98      0.99      0.99    349699
        TCP_IP-DDoS-SYN       0.93      0.96      0.95    172397
        TCP_IP-DDoS-TCP       0.97      0.99      0.98    182598

### K-Nearest Neighbors (KNN) Training

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# build a k-nearest-neighbours classifier (8 parallel jobs) and fit it
# on the PCA-reduced training features
neigh = KNeighborsClassifier(n_jobs=8)
neigh.fit(X=X_train_pca, y=y_train)

### Accuracy of KNN Model

In [None]:
# evaluate the KNN model on the held-out test set
# NOTE(review): the recorded output of this cell shows joblib/loky failing to run
# `wmic` while counting physical cores ([WinError 2]) and no report - confirm
# whether prediction actually completed on this machine

y_pred = neigh.predict(X_test_pca)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

[WinError 2] The system cannot find the file specified
  File "c:\Users\spark\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\spark\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\spark\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                   

### Neural Network (Multi-layer Perceptron) Training

In [None]:
from sklearn.neural_network import MLPClassifier

# train MLP model
# random_state fixes the weight initialisation and batch shuffling so the
# trained network (and its classification report) is reproducible between runs
clf = MLPClassifier(random_state=42)
clf.fit(X_train_pca, y_train)

### MLP Classification Report

In [None]:
# evaluate the MLP neural network on the held-out test set:
# predict a class for every test row, then print per-class precision/recall/f1

y_pred = clf.predict(X_test_pca)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)