# Jupyter notebook to test the effectiveness of the CC detection models.

In [3]:
import numpy as np
import pandas as pd
import argparse

import tensorflow as tf
import sklearn as sk
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-04-20 16:34:31.767647: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 16:34:31.767718: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 16:34:31.769822: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-20 16:34:31.784120: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [38]:
# Default values
csvfile = '../datasets/all_datasets_raw.tsv'
min_letters = 5
max_letters = 500
take_last_num = lambda x: x[: max_letters]

In [55]:
df = pd.read_csv(
        csvfile,
        delimiter="|",
        names=["note", "label", "model_id", "state"],
        skipinitialspace=True,
        converters={"state": take_last_num},
    )

In [56]:
# Clean the dataset
df.dropna(axis=0, how="any", inplace=True)
df.drop(axis=1, columns=["note", "model_id"], inplace=True)

In [57]:
# Delete the strings of letters with less than a certain amount
indexNames = df[df["state"].str.len() < min_letters].index
df.drop(indexNames, inplace=True)

In [58]:
# Add a new column to the dataframe with the label. The label is 'Normal' for the normal data and 'Malcious' for the malware data
df.loc[df.label.str.contains("Normal"), "label"] = "Normal"
df.loc[df.label.str.contains("Botnet"), "label"] = "Malicious"
df.loc[df.label.str.contains("Malware"), "label"] = "Malicious"

In [62]:
df.label = df.label.replace("Malicious", 1)
df.label = df.label.replace("Normal", 0)

  df.label = df.label.replace("Normal", 0)


In [65]:
# Convert each of the stratosphere letters to an integer. There are 50
vocabulary = list("abcdefghiABCDEFGHIrstuvwxyzRSTUVWXYZ1234567890,.+*")
int_of_letters = {}
for i, letter in enumerate(vocabulary):
    int_of_letters[letter] = float(i)
print( f"There are {len(int_of_letters)} letters in total. From letter index {min(int_of_letters.values())} to letter index {max(int_of_letters.values())}.")
vocabulary_size = len(int_of_letters)

There are 50 letters in total. From letter index 0.0 to letter index 49.0.


In [66]:
# Change the letters in the state to an integer representing it uniquely. We 'encode' them.
df["state"] = df["state"].apply(lambda x: [[int_of_letters[i]] for i in x])
# So far, only 1 feature per letter
features_per_sample = 1

In [67]:
df.state

70      [[44.0], [44.0], [45.0], [17.0], [17.0], [49.0...
71      [[36.0], [36.0], [47.0], [27.0], [47.0], [27.0...
73               [[44.0], [41.0], [47.0], [14.0], [47.0]]
75      [[41.0], [41.0], [47.0], [35.0], [47.0], [26.0...
76      [[44.0], [44.0], [47.0], [35.0], [47.0], [26.0...
                              ...                        
7609    [[44.0], [44.0], [47.0], [17.0], [47.0], [17.0...
7610    [[44.0], [44.0], [47.0], [17.0], [47.0], [26.0...
7616             [[43.0], [44.0], [46.0], [35.0], [48.0]]
7625    [[39.0], [39.0], [47.0], [27.0], [48.0], [21.0...
7630    [[43.0], [43.0], [47.0], [7.0], [47.0], [7.0],...
Name: state, Length: 2228, dtype: object

In [68]:
# Convert the data into the appropriate shape
# x_data is a list of lists. The 1st dimension is the outtuple, the second the letter. Each letter is now an int value. shape=(num_outuples, features_per_sample)
x_data = df["state"].to_numpy()
print(f"There are {len(x_data)} outtuples")

There are 2228 outtuples


In [69]:
# y_data is a list of ints that are 0 or 1. One integer per outtupple. shape=(num_outuples, 1)
y_data = df["label"].to_numpy()
print(f"There are {len(y_data)} labels")

There are 2228 labels


In [70]:
# Search the sample with max len in the training. It should be already cuted by the csv_read function to a max. Here we just check
max_length_of_outtupple = max([len(sublist) for sublist in df.state.to_list()])
print(f"The max len of the letters in all outtuples is: {max_length_of_outtupple}")

The max len of the letters in all outtuples is: 500


In [71]:
# Here x_data is a array of lists [[]]
print(f"x_data type {type(x_data)} of shape {x_data.shape}. x_data[0] type is {type(x_data[0])}")
print(f"x_data[0] is {x_data[0]}")

x_data type <class 'numpy.ndarray'> of shape (2228,). x_data[0] type is <class 'list'>
x_data[0] is [[44.0], [44.0], [45.0], [17.0], [17.0], [49.0], [26.0], [49.0], [45.0], [35.0], [45.0], [17.0]]


In [72]:
# Padding.
# Since not all outtuples have the same amount of letters, we need to add padding at the end
# Transforms the list to a 2D Numpy array of shape (num_samples, num_timesteps)
# num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise.
# Sequences that are shorter than num_timesteps are padded with value at the end.
# padding: 'pre' or 'post': pad either before or after each sequence.
# truncating: 'pre' or 'post': remove values from sequences larger than maxlen, either at the beginning or at the end of the sequences.

# If the input are integers
padded_x_data = pad_sequences(
    x_data, maxlen=max_length_of_outtupple, padding="post"
)
print(
        f"padded_x_data is of type {type(padded_x_data)}, of shape {padded_x_data.shape}. padded_x_data[0] type is {type(padded_x_data[0])}. Shape of second list is {padded_x_data[0].shape}"
    )

padded_x_data is of type <class 'numpy.ndarray'>, of shape (2228, 500, 1). padded_x_data[0] type is <class 'numpy.ndarray'>. Shape of second list is (500, 1)


In [74]:
# Split the data in training and testing
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(df, test_size=0.2, shuffle=True)

In [76]:
train_x_data = padded_x_data
train_y_data = y_data

In [77]:
num_outtuples = train_x_data.shape[0]  # number_of_outtuples in general

# In the case of hot-encoding, the amount of features per letter per sample, is 50, which is the vocabulary size
# features_per_sample = vocabulary_size # amount of positions of the hot encoding (50 letters, so 50)
# print(f'We have as input shape: {num_outtuples}, {max_length_of_outtupple}, {features_per_sample}')
# input_shape = (max_length_of_outtupple, features_per_sample)

# In the case of not using hot-encoding, the amount of features per sample is 1, because we only have one value
# The amount of time steps is the amount of letters, since one letter is one time step, which is the amount of letters max, which 500
timesteps = max_length_of_outtupple
input_shape = (timesteps, features_per_sample)
print(
    f"We have as shape: Num of samples: {num_outtuples}, Num of letters per sample (timesteps): {timesteps}, each letter has {features_per_sample} values. The input shape is {input_shape}"
)

We have as shape: Num of samples: 2228, Num of letters per sample (timesteps): 500, each letter has 1 values. The input shape is (500, 1)


In [78]:
# Create the model of RNN
model = tf.keras.models.Sequential()
model.add(layers.Embedding(vocabulary_size, 16, mask_zero=True))
# GRU is the main RNN layer, inputs: A 3D tensor, with shape [batch, timesteps, feature]
model.add(
    layers.Bidirectional(
        layers.GRU(32, return_sequences=False), merge_mode="concat"
    )
)
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation="sigmoid"))
# Fully connected layer with 1 neuron output
# Final output value between 0 and 1 as probability
model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001, momentum=0.05),
    metrics=["accuracy"],
)

In [79]:
# Train the model
# This is already separating in trainign and validation

num_epochs = 500
batch_size = 100 # group of outtuples as a batch

history = model.fit(
    train_x_data,
    train_y_data,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
    shuffle=True,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
 2/21 [=>............................] - ETA: 6s - loss: 0.2406 - accuracy: 0.9350

In [None]:
model_outputfile = './rnn_model_2024-04-20.h5'
model.summary()
model.save(model_outputfile, overwrite=False)

In [None]:
# To plot the results
import matplotlib.pyplot as plt

acc = history.history["accuracy"]
val_acc = history.history["val_accuracy"]
loss = history.history["loss"]
val_loss = history.history["val_loss"]
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, "ro", label="Training acc")
plt.plot(epochs, val_acc, "r", label="Validation acc")

plt.title("Training and validation accuracy")
plt.legend()
plt.savefig("test_results_acc.png")

plt.close()
plt.plot(epochs, loss, "bo", label="Training loss")
plt.plot(epochs, val_loss, "b", label="Validation loss")
plt.title("Training and validation loss")
plt.legend()
plt.savefig("test_results_loss.png")