# Healthcare IoT Device Network Intrusion Detection System

### Dataframe Creation

In [None]:
import pandas as pd
import os
import glob
import gc

#options to see all columns in dataframes
pd.set_option("display.max_columns", None)

#get directory folder of train and test csv files (make sure to replace with path on your computer)
train_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Train".replace("\\", "/")
test_dir = r"C:\Users\spark\CMPE 132 Project\CMPE132Project\Dataset\Test".replace("\\", "/")

#store paths of all csv files in the folder in a list
#glob returns paths in arbitrary order, so sort them to get the same row order on every run
train_csv_files = sorted(glob.glob(os.path.join(train_dir, "*.csv")))
test_csv_files = sorted(glob.glob(os.path.join(test_dir, "*.csv")))


def attack_type_from_filename(filename, suffix):
    """Return the attack class encoded in a csv file name.

    The class is the part of ``filename`` that comes before ``suffix``.
    Some attacks are split into several numbered files, so every trailing
    digit is removed (not just the last one) to map all parts of the same
    attack to a single class name.

    Args:
        filename: base name of the csv file (no directory part).
        suffix: text that follows the class name, e.g. "_train.pcap.csv".

    Returns:
        The class name without the suffix and without trailing digits.
        An empty string is returned if nothing precedes the suffix.
    """
    attack_type = filename.split(suffix)[0]

    #rstrip is also safe on an empty string, unlike indexing attack_type[-1]
    return attack_type.rstrip("0123456789")


def load_labeled_dataframes(csv_files, suffix):
    """Read each csv file into a dataframe labelled with its attack class.

    Args:
        csv_files: paths of the csv files to read.
        suffix: file name suffix passed to attack_type_from_filename.

    Returns:
        A list with one dataframe per file, in the order of ``csv_files``.
        Each dataframe has an extra 'classification' column holding the
        attack class taken from the file name.
    """
    dataframes = []

    for file in csv_files:
        #create a dataframe for the specific csv file
        df = pd.read_csv(file)

        #create a new column to classify the attack type in that csv file
        df['classification'] = attack_type_from_filename(os.path.basename(file), suffix)

        dataframes.append(df)

    return dataframes


#lists that store the csv files as dataframes, one dataframe per file
train_dataframes = load_labeled_dataframes(train_csv_files, "_train.pcap.csv")
test_dataframes = load_labeled_dataframes(test_csv_files, "_test.pcap.csv")


#combine all of the dataframes into one
#the list of per-file training dataframes is released before the test data is combined,
#so it is no longer held in memory while the second concat allocates its result
train_df = pd.concat(train_dataframes, ignore_index=True)
del train_dataframes

test_df = pd.concat(test_dataframes, ignore_index=True)
del test_dataframes

gc.collect()

### Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

#split each dataframe made earlier into the label column (y) and the remaining feature columns (X)
y_train = train_df['classification']
X_train = train_df.drop(columns=["classification"])

y_test = test_df['classification']
X_test = test_df.drop(columns=['classification'])

#standardize features: the scaler is fitted on the training features only
#and the same fitted scaler is then applied to both sets
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
del X_train

X_test_scaled = scaler.transform(X_test)
del X_test

#reduce the number of features with PCA
#a fractional n_components asks PCA to keep enough components to explain that share of the variance
#PCA is fitted on the training set only and then applied to the test set
pca = PCA(n_components=0.95)

X_train_pca = pca.fit_transform(X_train_scaled)
del X_train_scaled

X_test_pca = pca.transform(X_test_scaled)
del X_test_scaled

print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")


Reduced number of features after PCA: 22


### Random Forest Classifier Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

#train random forest classifier model
#random_state fixes the randomness used while building the trees,
#so the accuracy and report printed below are the same on every run
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_pca, y_train)

#predict the class of every test row and compare against the true labels
y_pred = rf.predict(X_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))