<a href="https://colab.research.google.com/github/sankeawthong/Project-1-Lita-Chatbot/blob/main/Project%232%20practice%207(02062023).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Hybrid RF - LSTM Model for 10 classes classifications based on UNSW-NB15 dataset**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Read the dataset
dataset = pd.read_csv("dataset_P2.2.csv")

In [3]:
# Check for missing values
dataset.isnull().sum()

id                   0
dur                  0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
Class                0
dtype: int64

### **Data preprocessing**

In [4]:
# Data preprocessing
y = dataset['Class'].values
X = dataset.drop(['Class'], axis=1)
X = X.values

In [5]:
# Perform oversampling using SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [6]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

### **RF Model**

In [7]:
# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=30, max_depth=9)
rf.fit(X_train, y_train)

In [8]:
# Make predictions on the test set
y_test_prediction_rf = rf.predict(X_test)

In [9]:
# Generate confusion matrix and classification report for Random Forest
cm_rf = confusion_matrix(y_test, y_test_prediction_rf)
print("Random Forest - Confusion Matrix:")
print(cm_rf)
cr_rf = classification_report(y_test, y_test_prediction_rf)
print("Random Forest - Classification Report:")
print(cr_rf)

Random Forest - Confusion Matrix:
[[10288   112     1     1    19   816     0    12    24     4]
 [    0  4238  4662  2157    51    43     0     1   191     0]
 [    0   588  8061  1605   222   178     0   101   344   126]
 [    0   756  3999  4003  1539   154     0    46   546   176]
 [    0   251  1604  2525  5269   306     0   323   434   585]
 [    2    91   489   390    71  9617     0    74   315   125]
 [    0     4    41    50   108    29 10804     3    12    23]
 [    0   199  1147   583    34   207     0  8623   294    42]
 [    0     0     0    23     0   250     0   295 10459    85]
 [    0    14     0     0     6     0     0    11   564 10555]]
Random Forest - Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95     11277
           1       0.68      0.37      0.48     11343
           2       0.40      0.72      0.52     11225
           3       0.35      0.36      0.35     11219
           4       0.72  

### **Define the LSTM model**

In [11]:
from tensorflow.keras.utils import to_categorical
# Define the LSTM model
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], 1)))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert the target variables to categorical variables
y_train_lstm = to_categorical(y_train)
y_test_lstm = to_categorical(y_test)

# Train the LSTM model
model.fit(X_train.reshape(X_train.shape[0], X_train.shape[1], 1), y_train_lstm, epochs=15, batch_size=32, validation_data=(X_test.reshape(X_test.shape[0], X_test.shape[1], 1), y_test_lstm))

# Evaluate the LSTM model
score, acc = model.evaluate(X_test.reshape(X_test.shape[0], X_test.shape[1], 1), y_test_lstm, batch_size=32)
print('LSTM - Test score:', score)
print('LSTM - Test accuracy:', acc)

# Make predictions with the LSTM model
predictions_lstm = model.predict(X_test)
predicted_classes_lstm = np.argmax(predictions_lstm, axis=1)

# Generate confusion matrix and classification report for LSTM
cm_lstm = confusion_matrix(y_test, predicted_classes_lstm)
print("LSTM - Confusion Matrix:")
print(cm_lstm)
cr_lstm = classification_report(y_test, predicted_classes_lstm)
print("LSTM - Classification Report:")
print(cr_lstm)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
LSTM - Test score: 0.6577540636062622
LSTM - Test accuracy: 0.7347678542137146
LSTM - Confusion Matrix:
[[ 9357   151    10     4   115  1577     1    13    42     7]
 [    7  4255  4506  2484    37     4     4     6    40     0]
 [    3   700  7662  2472    81    21     1    76   180    29]
 [    2   751  3727  5130  1109   107     1    32   300    60]
 [   14   247  1488  3274  5297   201     0   252   311   213]
 [  237    91   444   450   118  9508     1    35   249    41]
 [    0     6    51    67    95    14 10802     2    21    16]
 [    2   181   999   814   126    42     1  8674   266    24]
 [   10     3     3    11    13   113     0   251 10639    69]
 [    0     0     0     1    50     0     0     3   126 10970]]
LSTM - Classification Report:
              precision    recall  f1-score   support

         

**Combine Model RF- LSTM**

In [13]:
# Combine predictions from Random Forest and LSTM models
ensemble_predictions = np.vstack((y_test_prediction_rf, predicted_classes_lstm)).T
ensemble_predicted_classes = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=ensemble_predictions)

# Generate confusion matrix and classification report for the ensemble model
cm_ensemble = confusion_matrix(y_test, ensemble_predicted_classes)
print("Ensemble - Confusion Matrix:")
print(cm_ensemble
)
cr_ensemble = classification_report(y_test, ensemble_predicted_classes)
print("Ensemble - Classification Report:")
print(cr_ensemble)

Ensemble - Confusion Matrix:
[[10434   100     4     3    38   677     0     7    13     1]
 [    7  4753  5169  1352    22     5     0     0    35     0]
 [    3   984  8851  1051    70    23     0    67   151    25]
 [    2  1099  4791  3794  1088    88     1    34   290    32]
 [   14   373  2140  2590  5318   159     0   252   281   170]
 [  238   133   588   340   141  9491     0    47   167    29]
 [    0     6    60    59    99    17 10804     2    17    10]
 [    2   283  1303   436   122   221     1  8529   217    15]
 [   10     3     3    32    13   310     0   491 10223    27]
 [    0    14     0     1    56     0     0    14   562 10503]]
Ensemble - Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95     11277
           1       0.61      0.42      0.50     11343
           2       0.39      0.79      0.52     11225
           3       0.39      0.34      0.36     11219
           4       0.76      0.47  