<a href="https://colab.research.google.com/github/sankeawthong/Project-1-Lita-Chatbot/blob/main/Hybrid%20RF-Bi-LSTM%20for%2010%20class%20classifications%20based%20on%20UNSW-NB15%20dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Hybrid RF-Bi-LSTM for 10 class classifications based on UNSW-NB15 dataset**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [2]:
# Load the dataset
dataset = pd.read_csv("dataset_P2.2.csv")
dataset.shape
dataset.isnull().sum()
dataset.info()
dataset["Class"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 41 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   spkts              175341 non-null  int64  
 3   dpkts              175341 non-null  int64  
 4   sbytes             175341 non-null  int64  
 5   dbytes             175341 non-null  int64  
 6   rate               175341 non-null  float64
 7   sttl               175341 non-null  int64  
 8   dttl               175341 non-null  int64  
 9   sload              175341 non-null  float64
 10  dload              175341 non-null  float64
 11  sloss              175341 non-null  int64  
 12  dloss              175341 non-null  int64  
 13  sinpkt             175341 non-null  float64
 14  dinpkt             175341 non-null  float64
 15  sjit               175341 non-null  float64
 16  dj

array([0, 2, 1, 5, 8, 7, 4, 3, 9, 6])

In [3]:
# Data preprocessing
y = dataset['Class'].values
X = dataset.drop(['Class'], axis=1)
X = X.values

In [4]:
# Data balancing using SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [5]:
# Reshape the data for LSTM
X = X.reshape(X.shape[0], X.shape[1], 1)

In [6]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)  # Reshape X_train to 2D

y_pred_rf = rf_model.predict(X_test.reshape(X_test.shape[0], -1))  # Reshape X_test to 2D

from sklearn.metrics import accuracy_score

# Calculate accuracy for Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

# Evaluate Random Forest model
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Model Classification Report:")
print(rf_report)

Random Forest Accuracy: 0.8667857142857143
Random Forest Model Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     11176
           1       0.76      0.81      0.79     11344
           2       0.74      0.84      0.79     11130
           3       0.63      0.59      0.61     11037
           4       0.78      0.74      0.76     11263
           5       0.92      0.91      0.91     11308
           6       1.00      0.98      0.99     11179
           7       0.90      0.83      0.87     11358
           8       0.96      1.00      0.98     11137
           9       1.00      1.00      1.00     11068

    accuracy                           0.87    112000
   macro avg       0.87      0.87      0.87    112000
weighted avg       0.87      0.87      0.87    112000



In [8]:
from tensorflow.keras.utils import to_categorical
# Bi-LSTM model
lstm_model = Sequential()
lstm_model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(X_train.shape[1], 1)))
lstm_model.add(Dropout(0.2))
lstm_model.add(Bidirectional(LSTM(64)))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(10, activation='softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Convert labels to one-hot encoded vectors
num_classes = len(np.unique(y))
y_train_encoded = to_categorical(y_train, num_classes)
y_test_encoded = to_categorical(y_test, num_classes)

lstm_model.fit(X_train, y_train_encoded, epochs=10, batch_size=64, validation_data=(X_test, y_test_encoded), verbose=1)

##y_pred_lstm = lstm_model.predict_classes(X_test)
y_pred_lstm_prob = lstm_model.predict(X_test)
y_pred_lstm = np.argmax(y_pred_lstm_prob, axis=1)

# Calculate accuracy for Bi-LSTM model
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
print("Bi-LSTM Accuracy:", accuracy_lstm)

# Evaluate Bi-LSTM model
lstm_report = classification_report(y_test, y_pred_lstm)
print("Bi-LSTM Model Classification Report:")
print(lstm_report)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 40, 128)          33792     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 40, 128)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
Total params: 133,898
Trainable params: 133,898
Non-trai

In [9]:
# Combine Random Forest and Bi-LSTM predictions
combined_pred = np.concatenate((y_pred_rf.reshape(-1, 1), y_pred_lstm.reshape(-1, 1)), axis=1)
y_pred_combined = [np.bincount(row).argmax() for row in combined_pred]

# Calculate accuracy for the combined model
accuracy_combined = accuracy_score(y_test, y_pred_combined)
print("Combined Model Accuracy:", accuracy_combined)

# Evaluate the combined model
combined_report = classification_report(y_test, y_pred_combined)
print("Combined Model Classification Report:")
print(combined_report)

Combined Model Accuracy: 0.8
Combined Model Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     11176
           1       0.48      0.90      0.62     11344
           2       0.69      0.55      0.61     11130
           3       0.51      0.48      0.49     11037
           4       0.83      0.52      0.64     11263
           5       0.94      0.87      0.90     11308
           6       1.00      0.98      0.99     11179
           7       0.93      0.79      0.86     11358
           8       0.95      0.96      0.96     11137
           9       1.00      0.98      0.99     11068

    accuracy                           0.80    112000
   macro avg       0.83      0.80      0.80    112000
weighted avg       0.83      0.80      0.80    112000

