In [1]:
import pandas as pd
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report


In [2]:
# Load preprocessed data
ML_data = pd.read_csv("preprocessed_ML_data.csv")

In [3]:
# Function to preprocess data, train and evaluate Random Forest model
def train_and_evaluate_random_forest(X, y, class_type):
    # Apply SMOTE
    smote = SMOTE(random_state=42)
    X, y = smote.fit_resample(X, y)
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale data
    scaler = StandardScaler()
    columns = X_train.columns
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Convert the scaled data back to DataFrame
    X_train = pd.DataFrame(X_train, columns=columns)
    X_test = pd.DataFrame(X_test, columns=columns)
    
    # Apply PCA
    pca = PCA(n_components=0.95)  # Retain 95% of the variance
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    
    # Convert the PCA-transformed data back to DataFrame
    X_train = pd.DataFrame(X_train_pca)
    X_test = pd.DataFrame(X_test_pca)
    
    # Train Random Forest model
    model = RandomForestClassifier(criterion='gini', min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Evaluate model
    print(f"{class_type} - Random Forest Classifier")
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred), "\n")
    report = classification_report(y_test, y_pred)
    print(report)


In [4]:
# 2-class classification
X_2 = ML_data.drop(columns=['Attack_label', 'Attack_type', 'Attack_class'])
y_2_class = ML_data['Attack_label']
train_and_evaluate_random_forest(X_2, y_2_class, "2-class")


2-class - Random Forest Classifier
Accuracy: 1.0 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25830
           1       1.00      1.00      1.00     25413

    accuracy                           1.00     51243
   macro avg       1.00      1.00      1.00     51243
weighted avg       1.00      1.00      1.00     51243



In [5]:
# 6-class classification
X_6 = ML_data.drop(columns=['Attack_label', 'Attack_type', 'Attack_class'])
y_6_class = ML_data['Attack_class']
train_and_evaluate_random_forest(X_6, y_6_class, "6-class")

6-class - Random Forest Classifier
Accuracy: 0.9092648782842563 

                   precision    recall  f1-score   support

         DoS/DDoS       0.90      0.89      0.89      9662
Injection Attacks       0.75      0.82      0.79      9576
             MITM       1.00      1.00      1.00      9641
          Malware       0.85      0.79      0.82      9585
           Normal       1.00      1.00      1.00      9751
 Scanning_Attacks       0.97      0.95      0.96      9789

         accuracy                           0.91     58004
        macro avg       0.91      0.91      0.91     58004
     weighted avg       0.91      0.91      0.91     58004



In [6]:
# 15-class classification
X_15 = ML_data.drop(columns=['Attack_label', 'Attack_type', 'Attack_class'])
y_15_class = ML_data['Attack_type']
train_and_evaluate_random_forest(X_15, y_15_class, "15-class")

15-class - Random Forest Classifier
Accuracy: 0.8562210708117444 

                       precision    recall  f1-score   support

             Backdoor       1.00      0.98      0.99      4859
            DDoS_HTTP       0.68      0.62      0.65      4812
            DDoS_ICMP       1.00      1.00      1.00      4868
             DDoS_TCP       0.88      0.94      0.91      4796
             DDoS_UDP       1.00      1.00      1.00      4805
       Fingerprinting       0.91      0.90      0.90      4845
                 MITM       1.00      1.00      1.00      4868
               Normal       1.00      1.00      1.00      4782
             Password       0.49      0.52      0.51      4847
        Port_Scanning       0.97      0.95      0.96      4797
           Ransomware       0.97      0.95      0.96      4842
        SQL_injection       0.50      0.55      0.52      4815
            Uploading       0.60      0.55      0.57      4861
Vulnerability_scanner       0.94      0.95      0.