# Healthcare IoT Device Network Intrusion Detection System

### Dataframe Creation

In [None]:
import pandas as pd
import os
import glob
import gc

# options to see all columns in dataframes
pd.set_option("display.max_columns", None)

# get directory folder of train and test csv files (make sure to replace with path on your computer)
train_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Train".replace(
    "\\", "/"
)
test_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Test".replace(
    "\\", "/"
)

# store paths of all csv files in the folder in a list
# sorted because glob returns files in filesystem order; a fixed order keeps the
# combined row order (and the seeded sampling applied to it) the same on every machine
train_csv_files = sorted(glob.glob(os.path.join(train_dir, "*.csv")))
test_csv_files = sorted(glob.glob(os.path.join(test_dir, "*.csv")))


def attack_type_from_filename(filename, suffix):
    """Return the attack class name encoded in a capture file name.

    Args:
        filename: base name of the csv file, e.g. "TCP_IP-DDoS-UDP2_train.pcap.csv".
        suffix: the split marker that follows the class name,
            e.g. "_train.pcap.csv" or "_test.pcap.csv".

    Returns:
        The text before ``suffix`` with every trailing digit removed, so the
        numbered parts of one attack ("UDP1", "UDP2", ..., "UDP12") all map to
        the same class. An empty stem yields an empty string instead of raising.
    """
    stem = filename.split(suffix)[0]
    # some attacks are split into multiple numbered files; drop the whole part number
    return stem.rstrip("0123456789")


def load_labeled_csvs(csv_paths, suffix):
    """Read each csv into a dataframe and label its rows with the attack class.

    Args:
        csv_paths: iterable of csv file paths to read.
        suffix: file-name suffix passed to ``attack_type_from_filename``.

    Returns:
        A list with one dataframe per path, in the given order, each with an
        added "classification" column holding the attack class of that file.
    """
    frames = []
    for path in csv_paths:
        frame = pd.read_csv(path)
        # every row of a file belongs to the class named in the file name
        frame["classification"] = attack_type_from_filename(
            os.path.basename(path), suffix
        )
        frames.append(frame)
    return frames


# list of per-file dataframes, each labelled with the class type
train_dataframes = load_labeled_csvs(train_csv_files, "_train.pcap.csv")
test_dataframes = load_labeled_csvs(test_csv_files, "_test.pcap.csv")


# combine all of the per-file dataframes into one dataframe per split
train_df = pd.concat(train_dataframes, ignore_index=True)
test_df = pd.concat(test_dataframes, ignore_index=True)

# the per-file frames are not needed once concatenated; drop them now so they are
# not kept in memory next to the combined copies while downsampling runs
del train_dataframes
del test_dataframes

print(train_df["classification"].value_counts())

# downsample to a certain fraction of random samples per class depending on how many rows in the class
# each rule is (row count the class must exceed, fraction of rows kept), checked from the
# largest threshold down; classes with 10000 rows or fewer match no rule and are kept whole
downsample_rules = ((500000, 0.01), (100000, 0.1), (10000, 0.2))

result_dfs = []
for _, group in train_df.groupby("classification"):
    keep_frac = next(
        (frac for min_rows, frac in downsample_rules if len(group) > min_rows),
        None,
    )
    if keep_frac is None:
        # smaller class: don't cut out rows
        result_dfs.append(group)
    else:
        # fixed seed so the same rows are drawn on every run
        result_dfs.append(group.sample(frac=keep_frac, random_state=42))
train_df = pd.concat(result_dfs, ignore_index=True)

print(train_df["classification"].value_counts())
print(test_df["classification"].value_counts())

gc.collect()

### Data Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def split_features_and_labels(frame):
    """Split a labelled dataframe into (features, labels).

    Features are every column except "classification"; labels are the
    "classification" column itself.
    """
    return frame.drop(columns=["classification"]), frame["classification"]


# feature matrices and label columns for both splits built earlier
X_train, y_train = split_features_and_labels(train_df)
X_test, y_test = split_features_and_labels(test_df)

# standardize features: the scaler is fitted on the training data only and the
# same fitted scaler is then applied to the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# reduce the number of features with PCA (n_components=0.95), again fitted on
# the training data only and reused for the test data
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")


### Random Forest Classifier Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# baseline random forest: default hyper parameters, fixed seed, 7 parallel jobs
rf = RandomForestClassifier(n_jobs=7, random_state=42)
# fit on the PCA-reduced training features
rf.fit(X=X_train_pca, y=y_train)

### Accuracy of Random Forest Classifier Model

In [None]:
# predict the test set with the baseline random forest and print per-class metrics
y_pred = rf.predict(X=X_test_pca)
rf_report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", rf_report_text)

### Optimizing the Random Forest Classifier

In [None]:
# candidate values for each random forest hyper parameter
params_grid = {
    "n_estimators": list(range(100, 501, 100)),
    "max_depth": [None, *range(10, 41, 10)],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "class_weight": [None, "balanced"],
}

# search configuration: 10 sampled parameter combinations, 5-fold cross
# validation, scored by macro F1, seeded, 7 parallel jobs
search_settings = {
    "param_distributions": params_grid,
    "n_iter": 10,
    "cv": 5,
    "scoring": "f1_macro",
    "random_state": 42,
    "n_jobs": 7,
    "verbose": 2,
}
rf_randomized_search = RandomizedSearchCV(rf, **search_settings)

# run the hyper parameter search on the PCA-reduced training data
search = rf_randomized_search.fit(X_train_pca, y_train)


In [None]:
# show the winning estimator and hyper parameters from the randomized search
print(f"best estimator: {search.best_estimator_}")
print(f"best parameters: {search.best_params_}")

# get classification report from best model on the test set
y_pred = search.best_estimator_.predict(X_test_pca)
# single print call: wrapping it in a second print() also printed a stray "None"
print("\nClassification Report:\n", classification_report(y_test, y_pred))

### Gradient Boosting Classifier Training

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report

In [None]:
# set the loky CPU-count environment variable to 7 before training
os.environ.update({"LOKY_MAX_CPU_COUNT": "7"})

# histogram-based gradient boosting with default hyper parameters and a fixed seed,
# fitted on the PCA-reduced training features
hgbc = HistGradientBoostingClassifier(random_state=42)
hgbc.fit(X=X_train_pca, y=y_train)

### Accuracy of Gradient Boosting Classifier Model

In [None]:
# predict the test set with the gradient boosting model and print per-class metrics
y_pred = hgbc.predict(X=X_test_pca)
hgbc_report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", hgbc_report_text)

### Optimizing Gradient Boosting Classifier Model

### Multi-layer Perceptron (MLP) Classifier Training

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [None]:
# multi-layer perceptron with default hyper parameters and a fixed seed,
# fitted on the PCA-reduced training features
mlpc = MLPClassifier(random_state=42)
mlpc.fit(X=X_train_pca, y=y_train)

### Accuracy of MLP Classifier Model

In [None]:
# predict the test set with the MLP neural network and print per-class metrics
y_pred = mlpc.predict(X=X_test_pca)
mlpc_report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", mlpc_report_text)

### Optimizing MLP Classifier Model