In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./CLUSTER_INFORMATION.csv")

In [3]:
df

Unnamed: 0,Cluster Name,Combination List,Cluster Centroid
0,Normal,"['Regular', 'Typical', 'Common', 'Standard', '...","[0.04477231577038765, -1.1067426204681396, 0.6..."
1,Heartbleed,"['SSL vulnerability', 'OpenSSL exploit', 'Info...","[0.20892228186130524, -1.8042747974395752, 2.1..."
2,"Normal, DoS Hulk","['DDoS Shield', 'Steadyflow', 'Regular Traffic...","[-0.6773971915245056, 0.3524855971336365, -0.2..."
3,"Normal, DDoS","['Coordinated Overload', 'Cyber Storm', 'Netwo...","[1.542575716972351, 1.337799310684204, -0.9742..."
4,"Exploits, SSH Patator","['Malicious SSH Exploiter', 'Unauthorized SSH ...","[0.8602568507194519, 0.048649415373802185, -0...."
...,...,...,...
511,"DoS, FTP Patator","['BruteFTP', 'PayloadStorm', 'PassiveFlooder',...","[0.5987561941146851, -0.7863489985466003, -0.7..."
512,"DoS GoldenEye, Normal","['DestructiveStrike', 'PeakPerformance', 'Ruth...","[0.7857335209846497, 0.16938543319702148, -2.4..."
513,SSH Patator,"['SSH Brute Force', 'SSH Dictionary Attack', '...","[0.5677225589752197, 0.7569266557693481, -0.43..."
514,"Fuzzers, Exploits, Normal","['Vulnerability Probing', 'Attack Simulation',...","[-1.5628975629806519, 1.0497198104858398, -0.8..."


In [4]:
import pickle
with open('./SCALER.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [5]:
import torch
from transformers import AutoModel, AutoTokenizer, logging
import numpy as np
import pandas as pd
import warnings
import docx
import re
import nltk
import wikipedia
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
try:
  ipython = get_ipython()
  from tqdm.notebook import tqdm
except:
  from tqdm import tqdm

logging.set_verbosity_error()
logging.disable_progress_bar()
warnings.filterwarnings('ignore')
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('tagsets', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [8]:
class BERTSimilarWords:

    def __init__(self, model='bert-base-cased', max_heading_length = 10, max_document_length = 300, exclude_stopwords=[], embeddings_scaler=None):

        for i in tqdm(range(2), unit=' it', desc='Initializing', postfix='Tokenizer and Model'): pass
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.lemmatizer = WordNetLemmatizer()
        self.min_max_scaler = MinMaxScaler()
        self.scaler = embeddings_scaler
        self.scaler_col_names = [str(i) for i in range(768)]
        self.model = AutoModel.from_pretrained(model)
        if torch.cuda.is_available():
            self.processor = 'GPU'
            self.cuda_current_device = torch.cuda.current_device()
            self.model = self.model.to(self.cuda_current_device)
        else:
            self.processor = 'CPU'
        self.max_document_length = max_document_length
        self.max_heading_length = max_heading_length
        self.max_ngram = 10
        self.wikipedia_dataset_info = {}
        self.document_list = []
        self.bert_words = []
        self.bert_vectors = []
        self.bert_documents = []
        self.continous_words = []
        self.temporary_ngram_words = []
        self.count_vectorizer_words = []
        self.cv_counts = []
        self.cv_words = []
        self.count_vectorizer = CountVectorizer(analyzer=self._custom_analyzer)
        self.stop_words = [word for word in stopwords.words() if word not in exclude_stopwords]
        self.punctuations = '''!"#$%&'()*+,-./:—;<=>−?–@[\]^_`{|}~'''
        self.doc_regex = "[\([][0-9]+[\])]|[”“‘’‛‟]|\d+\s"
        self.punctuations_continuity_exclude = '''—-–,−'''
        self.pos_tags_info = nltk.help.upenn_tagset
        self.bert_words_ngram = [[] for _ in range(self.max_ngram)]
        self.bert_vectors_ngram = [[] for _ in range(self.max_ngram)]
        self.bert_documents_ngram = [[] for _ in range(self.max_ngram)]
        self.bert_words_all = []
        self.bert_vectors_all = []

    def load_dataset(self, dataset_path=None, wikipedia_query=None, wikipedia_query_limit=10, wikipedia_page_list=None):

        """
        This method extracts and processes the text content and generates word embeddings using the BERT model. Either one of the (dataset_path,wikipedia_query,wikipedia_page_list) parameters should be given.

        Parameters
        ----------
        dataset_path : the dataset paths of the text files either as a string (one file) or a list of strings (multiple files) (supported files: .docx / .txt)

        wikipedia_query : the Wikipedia search queries either as a string (one query) or a list of strings (multiple queries)

        wikipedia_query_limit : maximum number of pages to extract for each query (only when wikipedia_query is given)

        wikipedia_page_list : the list of names of Wikipedia pages to be extracted

        """

        if wikipedia_query is not None or wikipedia_page_list is not None:
            if wikipedia_query is not None:
                query_results = []
                if type(wikipedia_query) == str:
                    wikipedia_query = [wikipedia_query]
                for query in wikipedia_query:
                    query_results += wikipedia.search(query, results=wikipedia_query_limit)
            else:
                query_results = wikipedia_page_list
            page_content = []
            for result in tqdm(query_results, unit=' pages', desc='Extracting', postfix='Data from Wikipedia'):
                if '(disambiguation)' not in result and result not in self.wikipedia_dataset_info.keys():
                    try:
                        page = wikipedia.page(result, auto_suggest=False)
                    except:
                        continue
                    page_content += ['== New page =='] + page.content.split('\n\n\n')
                    self.wikipedia_dataset_info[page.title] = page.url
            self.document_list = self._process_wikipedia_dataset(page_content)
        elif dataset_path is not None:
            if type(dataset_path) == str:
                dataset_path = [dataset_path]
            for path in dataset_path:
                if path.endswith('.docx'):
                    docx_content = docx.Document(path)
                    self.document_list += self._process_docx_dataset(docx_content)
                elif path.endswith('.txt'):
                    self.document_list += self._process_txt_dataset(path)
                else:
                    raise ValueError("Files supported: .docx / .txt")
        for words, vectors, document, continous in self._tokenize_and_embeddings(self.document_list):
            self.temporary_ngram_words = []
            for i in range(len(words)):
                self._generate_n_grams(i, words, vectors, document, continous)
            self.bert_words.extend(words)
            self.bert_vectors.extend(vectors)
            self.bert_documents.extend(document)
            self.continous_words.extend(continous)
            self.count_vectorizer_words.append(words + self.temporary_ngram_words)
        self.bert_words_ngram[0] = self.bert_words
        self.bert_vectors_ngram[0] = self.bert_vectors
        self.bert_documents_ngram[0] = self.bert_documents
        self.cv_counts = self.count_vectorizer.fit_transform(self.count_vectorizer_words)
        self.cv_words = self.count_vectorizer.get_feature_names_out()
        self.bert_words_all = np.array(list(itertools.chain.from_iterable(self.bert_words_ngram)))
        self.bert_vectors_all = np.array(list(itertools.chain.from_iterable(self.bert_vectors_ngram)))
        if self.scaler is not None:
            df = pd.DataFrame(self.bert_vectors_all, columns=self.scaler_col_names)
            self.bert_vectors_all = self.scaler.transform(df)
            del df
            for i in tqdm(range(self.max_ngram), desc='Producing', postfix='N-gram Words and Embeddings'):
                df = pd.DataFrame(self.bert_vectors_ngram[i], columns=self.scaler_col_names)
                self.bert_vectors_ngram[i] = self.scaler.transform(df)
                del df
        else:
            for i in tqdm(range(self.max_ngram), desc='Producing', postfix='N-gram Words and Embeddings'):
                self.bert_vectors_ngram[i] = np.array(self.bert_vectors_ngram[i])
        return self

    def _process_wikipedia_dataset(self, page_content):

        document_list = []
        for section in page_content:
            if not any(exclude in section for exclude in
                       ['== Further reading ==', '== References ==', '== External links ==', '== See also ==',
                        '== Notes ==']):
                if "==" in section[:self.max_heading_length] and "===" not in section[:self.max_heading_length]:
                    flag = 0
                paragraph = section.split('\n')
                for sentence in paragraph:
                    sentence_words = sentence.split()
                    sentence_length = len(sentence_words)
                    if sentence_length > self.max_heading_length:
                        if len(document_list) != 0 and flag == 1 and len(
                                document_list[-1].split() + sentence_words) < self.max_document_length:
                            document_list[-1] += ' ' + sentence
                        else:
                            document_list = self._process_dataset_long_paragraph(document_list, sentence,
                                                                                 sentence_length)
                            flag = 1
        return document_list

    def _process_docx_dataset(self, docx_content):

        document_list = []
        for paragraph in tqdm(docx_content.paragraphs, unit=' paragraphs', desc='Extracting',
                              postfix='Data from Dataset'):
            if 'Heading' in str(paragraph.style):
                text = re.sub(self.doc_regex, '', paragraph.text)
                if len(document_list) != 0 and len(document_list[-1].split()) <= self.max_heading_length:
                    document_list[-1] = text + '.'
                else:
                    document_list.append(text + '.')
            if 'Body Text' in str(paragraph.style):
                sentence = re.sub(self.doc_regex, '', paragraph.text)
                sentence_length = len(sentence.split())
                if sentence_length > self.max_heading_length:
                    if len(document_list) != 0 and len(
                            document_list[-1].split()) + sentence_length < self.max_document_length:
                        document_list[-1] += ' ' + sentence
                    else:
                        document_list = self._process_dataset_long_paragraph(document_list, sentence, sentence_length)
        return document_list

    def _process_txt_dataset(self, path):

        document_list = []
        with open(path) as file:
            for line in tqdm(file.readlines(), unit=' paragraphs', desc='Extracting', postfix='Data from Dataset'):
                line_text = line.strip()
                line_text = re.sub(self.doc_regex, '', line_text)
                line_length = len(line_text.split())
                if 0 < line_length <= self.max_heading_length:
                    if len(document_list) != 0 and len(document_list[-1].split()) <= self.max_heading_length:
                        document_list[-1] = line_text + '.'
                    else:
                        document_list.append(line_text + '.')
                elif line_length > self.max_heading_length:
                    if len(document_list) != 0 and len(
                            document_list[-1].split()) + line_length <= self.max_document_length:
                        document_list[-1] += ' ' + line_text
                    else:
                        document_list = self._process_dataset_long_paragraph(document_list, line_text, len(
                            document_list[-1].split()) + line_length)
        return document_list

    def _process_dataset_long_paragraph(self, document_list, sentence, sentence_length):

        if sentence_length > self.max_document_length:
            for i in range(2, sentence_length):
                div = sentence_length / i
                if div < self.max_document_length:
                    break
            temp_sent = ''
            sm_sent = sent_tokenize(sentence)

            for sent in sm_sent:
                if len(temp_sent.split() + sent.split()) > div:
                    if len(document_list[-1].split()) <= self.max_heading_length:
                        document_list[-1] += ' ' + temp_sent
                    else:
                        document_list.append(temp_sent)
                    temp_sent = ''
                temp_sent = temp_sent + sent

            if len(document_list[-1].split() + temp_sent.split()) < self.max_document_length:
                document_list[-1] += ' ' + temp_sent
            else:
                document_list.append(temp_sent)
        else:
            document_list.append(sentence)
        return document_list

    def _tokenize_and_embeddings(self, document_list):

        continous_index = 0
        document_index = 0
        for document in tqdm(document_list, unit=' documents', desc='Processing', postfix='Word Embeddings'):
            if self.processor == 'GPU':
                tokens = self.tokenizer(document, truncation=True, return_tensors='pt').to(self.cuda_current_device)
            else:
                tokens = self.tokenizer(document, truncation=True, return_tensors='pt')
            words = self.tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
            word_ids = tokens.word_ids()
            output = self.model(**tokens)
            if self.processor == 'GPU':
                vectors = output.last_hidden_state[0].cpu().detach().numpy()
            else:
                vectors = output.last_hidden_state[0].detach().numpy()
            word_list = []
            vector_list = []
            continous_words = []
            word_index = -1
            for i in range(len(words)):
                if word_ids[i] is None or words[i] in self.punctuations:
                    if words[i] in self.punctuations_continuity_exclude:
                        pass
                    else:
                        continous_index = continous_index + 1
                    continue
                if word_ids[i] > word_index:
                    if len(word_list) != 0 and word_list[-1].lower() in self.stop_words:
                        word_list.pop()
                        vector_list.pop()
                        continous_words.pop()
                        continous_index = continous_index + 1
                    word_list.append(words[i])
                    vector_list.append(vectors[i])
                    continous_words.append(continous_index)
                    word_index = word_ids[i]
                elif word_ids[i] == word_index:
                    sub_word = words[i].replace('##', "")
                    word_list[-1] = word_list[-1] + sub_word
                    vector_list[-1] = (vector_list[-1] + vectors[i])
                    if word_ids[i + 1] != word_ids[i]:
                        vector_list[-1] = vector_list[-1] / word_ids.count(word_index)
            yield word_list, vector_list, [document_index] * len(word_list), continous_words
            document_index += 1

    def _generate_n_grams(self, i, words, vectors, document, continous, n=1):

        if i > n - 1 and n < self.max_ngram and continous[i] == continous[i - n]:
            temp_word = ''
            temp_vector = np.zeros([len(vectors[i])])
            for j in range(n, -1, -1):
                temp_word = temp_word + ' ' + words[i - j]
                temp_vector = temp_vector + vectors[i - j]
            self.temporary_ngram_words.append(temp_word.strip())
            self.bert_words_ngram[n].append(temp_word.strip())
            self.bert_vectors_ngram[n].append(temp_vector / (n + 1))
            self.bert_documents_ngram[n].append(document[i])
            self._generate_n_grams(i, words, vectors, document, continous, n=n + 1)
        return

    def _custom_analyzer(self, words):

        final_list = []
        for word in words:
            final_list.append(word)
            lemmatized_word = ' '.join([self.lemmatizer.lemmatize(token.lower()) for token in word.split()])
            if word != lemmatized_word:
                final_list.append(lemmatized_word)
        return final_list

    def _context_similarity_measurement(self, features, context_length):

        context_total = 0
        word_total = 0
        for i in range(context_length):
            if features[i] != 0:
                context_total += 1
        for i, x in enumerate(features[context_length:]):
            if x != 0:
                word_total += 1
        word_mean = 0.5 * np.mean(features[context_length:])
        if len(features[:context_length]) == 0:
            context_mean = 0
        else:
            context_mean = 0.5 * np.mean(features[:context_length])
        return int(str(context_total) + str(word_total)) + context_mean + word_mean

    def _get_article_words_vectors(self, similar_documents, similarity_scores, similarity_factor, input_words_max):

        document_words = []
        document_vectors = np.empty((0, similar.bert_vectors[0].shape[0]))
        for article in similar_documents:
            if similarity_scores[article] < similarity_scores[similar_documents[0]] - similarity_factor:
                break
            if article == len(similar_documents) - 1:
                for i in range(input_words_max):
                    document_words += self.bert_words_ngram[i][self.bert_documents_ngram[i].index(article):]
                    document_vectors = np.append(document_vectors, self.bert_vectors_ngram[i][self.bert_documents_ngram[i].index(article):], axis=0)
            else:
                for i in range(input_words_max):
                    document_words += self.bert_words_ngram[i][
                                      self.bert_documents_ngram[i].index(article):self.bert_documents_ngram[i].index(
                                          article + 1)]
                    document_vectors = np.append(document_vectors, self.bert_vectors_ngram[i][
                                        self.bert_documents_ngram[i].index(article):self.bert_documents_ngram[i].index(
                                            article + 1)], axis=0)
        return document_words, document_vectors

    def _calculate_input_word_embedding(self, input_words, document_words, document_vectors, uncased_lemmatization):

        average_list = np.zeros([len(input_words), len(document_vectors[0])])
        mean_index = []
        for i_index, i_word in enumerate(input_words):
            a_count = 0
            for a_index, a_word in enumerate(document_words):
                if uncased_lemmatization and i_word == self.lemmatizer.lemmatize(a_word.lower()):
                    average_list[i_index] += document_vectors[a_index]
                    a_count = a_count + 1
                elif i_word == a_word:
                    average_list[i_index] += document_vectors[a_index]
                    a_count = a_count + 1
            if average_list[i_index].any():
                average_list[i_index] = average_list[i_index] / a_count
                mean_index.append(i_index)
        average = np.mean(average_list[mean_index], axis=0)
        return average

    def _context_similarity_document_scores(self, input_context_words, input_context_length, input_words_length,
                                            context_similarity_factor):

        cv_list = []
        cv_counts = self.cv_counts.toarray()
        index = [i for i in np.searchsorted(self.cv_words, input_context_words) if
                 self.cv_words[i] in input_context_words]

        for i in range(len(self.document_list)):
            cv_list.append(cv_counts[i][index].tolist())

        cv_list = self.min_max_scaler.fit_transform(cv_list)
        similarity_scores = [self._context_similarity_measurement(counts, input_context_length) for counts in cv_list]
        similarity_factor = context_similarity_factor * input_words_length
        similar_documents = np.flip(np.argsort(similarity_scores))
        return similar_documents, similarity_scores, similarity_factor

    def _find_nearest_cosine_words(self, input_context_words, cosine_sim, cosine_words, pos_to_exclude,
                                   max_output_words, output_filter_factor):

        output_dict = {}
        sorted_list = np.flip(np.argsort(cosine_sim))
        lemmatized_words = {self.lemmatizer.lemmatize(token.lower()) for word in input_context_words for token in
                            word.split()}

        for i in range(len(cosine_words)):
            stop = 0
            pop_list = []
            original_word = cosine_words[sorted_list[i]]
            pos_tags = [pos[1] for pos in nltk.pos_tag(original_word.split())]
            lemmatized_word = {self.lemmatizer.lemmatize(token.lower()) for token in original_word.split()}
            if len(lemmatized_words.intersection(lemmatized_word)) > output_filter_factor * len(original_word.split()):
                continue
            if any(pos in pos_tags for pos in pos_to_exclude):
                continue
            if original_word not in output_dict.keys():
                for word in output_dict.keys():
                    if original_word in word:
                        stop = 1
                        break
                    elif word in original_word:
                        pop_list.append(word)
                        stop = 0
                if stop == 0:
                    pop = [output_dict.pop(key) for key in pop_list]
                    output_dict[original_word] = cosine_sim[sorted_list[i]]
                    if len(output_dict.keys()) == max_output_words:
                        break
        return output_dict

    def _process_input_context_words(self, input_context, input_words, single_word_split, uncased_lemmatization):

        if single_word_split:
            input_context_split = input_context.split()
            input_words_split = list(itertools.chain.from_iterable([word.split() for word in input_words]))
            input_words_max = 1
        else:
            input_context_split = [] if input_context == '' else [input_context]
            input_words_split = input_words
            input_words_max = max([len(word.split()) for word in input_words])
        if uncased_lemmatization:
            input_context_split = [' '.join([self.lemmatizer.lemmatize(token.lower()) for token in word.split()]) for
                                   word in input_context_split]
            input_words_split = [' '.join([self.lemmatizer.lemmatize(token.lower()) for token in word.split()]) for word
                                 in input_words_split]
        input_context_words = input_context_split + input_words_split
        input_context_words_max = max([len(word.split()) for word in input_context_words])
        return input_context_split, input_words_split, input_words_max, input_context_words, input_context_words_max

    def find_similar_words(self,
                           input_context='',
                           input_words=[],
                           input_embedding = np.array([]),
                           output_words_ngram=1,
                           pos_to_exclude=[],
                           max_output_words=10,
                           context_similarity_factor=0.25,
                           output_filter_factor=0.5,
                           single_word_split=True,
                           uncased_lemmatization=True
                           ):
        """
        This method calculates the cosine similarity between the average of the input words based on the given context and all the words present in the given vocabulary.

        Parameters
        ----------
        input_context : the input context (string) (optional) (default: None)

        input_words : the input words as (list of strings)

        output_words_ngram : n-gram words expected as output (integer) (optional) (default: 1)

        pos_to_exclude : the words are ignored in the output if these part of speech tags are present in it (list of strings) (optional) (default: None)

        max_output_words : the maximum number of output words to be generated (integer) (optional) (default: 10)

        context_similarity_factor : uses to tune the context-matching process, find the best paragraphs related to the given input words (float) (optional) (default: 0.25) (Range: 0 to 1)

        output_filter_factor : uses to ignore words that are similar to the given input in the output (float) (optional) (default: 0.5) (Range: 0 to 1)

        single_word_split : whether to split n-gram words when given as input (boolean) (optional) (default: True)

        uncased_lemmatization : whether to uncase and lemmatize the input (boolean) (optional) (default: True)

        """

        if input_embedding.size == 0:

            input_context_split, input_words_split, input_words_max, input_context_words, input_context_words_max = self._process_input_context_words(
                input_context, input_words, single_word_split, uncased_lemmatization)

            similar_documents, similarity_scores, similarity_factor = self._context_similarity_document_scores(
                input_context_words, len(input_context_split), len(input_words_split), context_similarity_factor)

            document_words, document_vectors = self._get_article_words_vectors(similar_documents, similarity_scores,
                                                                              similarity_factor, input_words_max)

            input_embedding = self._calculate_input_word_embedding(input_words_split, document_words, document_vectors,
                                                                  uncased_lemmatization)
        else:
            input_context_words = []

        if output_words_ngram == 0:
            cosine_sim = cosine_similarity(self.bert_vectors_all,
                                           [input_embedding]).flatten()
            cosine_words = self.bert_words_all
        else:
            cosine_sim = cosine_similarity(self.bert_vectors_ngram[output_words_ngram - 1], [input_embedding]).flatten()
            cosine_words = self.bert_words_ngram[output_words_ngram - 1]

        output_dictionary = self._find_nearest_cosine_words(input_context_words, cosine_sim, cosine_words,
                                                            pos_to_exclude, max_output_words, output_filter_factor)
        return output_dictionary, input_embedding

In [9]:
similar = BERTSimilarWords(model='rdpahalavan/bert-network-packet-flow-header-payload', max_document_length=375, exclude_stopwords=['dos'], embeddings_scaler=scaler).load_dataset(dataset_path='CORPUS.txt')

Initializing:   0%|          | 0/2 [00:00<?, ? it/s, Tokenizer and Model]

Extracting:   0%|          | 0/6020 [00:00<?, ? paragraphs/s, Data from Dataset]

Processing:   0%|          | 0/3010 [00:00<?, ? documents/s, Word Embeddings]

Creating:   0%|          | 0/10 [00:00<?, ?it/s, N-gram Words]

In [8]:
len(similar.bert_words)

476013

In [9]:
len(similar.bert_words_all)

868533

In [12]:
len(similar.bert_words_ngram[0])

476013

In [16]:
for i in range(10):
    print(f'{i+1}-gram words: {len(similar.bert_words_ngram[i])}')

1-gram words: 476013
2-gram words: 224565
3-gram words: 94640
4-gram words: 39059
5-gram words: 17326
6-gram words: 8469
7-gram words: 4313
8-gram words: 2284
9-gram words: 1218
10-gram words: 646


In [11]:
df

Unnamed: 0,Cluster Name,Combination List,Cluster Centroid
0,Normal,"['Regular', 'Typical', 'Common', 'Standard', '...","[0.04477231577038765, -1.1067426204681396, 0.6..."
1,Heartbleed,"['SSL vulnerability', 'OpenSSL exploit', 'Info...","[0.20892228186130524, -1.8042747974395752, 2.1..."
2,"Normal, DoS Hulk","['DDoS Shield', 'Steadyflow', 'Regular Traffic...","[-0.6773971915245056, 0.3524855971336365, -0.2..."
3,"Normal, DDoS","['Coordinated Overload', 'Cyber Storm', 'Netwo...","[1.542575716972351, 1.337799310684204, -0.9742..."
4,"Exploits, SSH Patator","['Malicious SSH Exploiter', 'Unauthorized SSH ...","[0.8602568507194519, 0.048649415373802185, -0...."
...,...,...,...
511,"DoS, FTP Patator","['BruteFTP', 'PayloadStorm', 'PassiveFlooder',...","[0.5987561941146851, -0.7863489985466003, -0.7..."
512,"DoS GoldenEye, Normal","['DestructiveStrike', 'PeakPerformance', 'Ruth...","[0.7857335209846497, 0.16938543319702148, -2.4..."
513,SSH Patator,"['SSH Brute Force', 'SSH Dictionary Attack', '...","[0.5677225589752197, 0.7569266557693481, -0.43..."
514,"Fuzzers, Exploits, Normal","['Vulnerability Probing', 'Attack Simulation',...","[-1.5628975629806519, 1.0497198104858398, -0.8..."


In [12]:
df['Combination List'][0]

"['Regular', 'Typical', 'Common', 'Standard', 'Regularized']"

In [21]:
df['Combination List'][500]

"['Regular', 'Standard', 'Routine', 'Common', 'Typical']"

In [22]:
tags, emb = similar.find_similar_words(input_words=eval(df['Combination List'][500]), context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=True, single_word_split=False, output_filter_factor=1)
tags

{'accepted behavior patterns observed': 0.9800301522430844,
 'Standard behavior': 0.9775606474812201,
 'signifies': 0.9773024906893971,
 'differentiate regular traffic': 0.9757654693549477,
 'vulnerabilities': 0.9756843923825949,
 'routine network activities including regular communication data transfer': 0.9750090825589134,
 'typical': 0.9741505065735123,
 'malicious intent': 0.9732447395603571,
 'conforming': 0.9718956874685923,
 'unexpected behavior helping': 0.970450529274012}

In [23]:
tag_embeddings = []

In [None]:
for i in range(len(df)):
    try:
        tags, emb = similar.find_similar_words(input_words=eval(df['Combination List'][i]), context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=True, single_word_split=False, output_filter_factor=1)
    except:
        tags, emb = similar.find_similar_words(input_words=eval(df['Combination List'][i]), context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=True, single_word_split=True, output_filter_factor=1)
    tag_embeddings.append(emb)
    print(f'Completed: {i}')

In [25]:
len(tag_embeddings)

516

In [26]:
np.save('./TAGS-NAMES-EMBEDDINGS.npy', tag_embeddings)

In [27]:
cluster_centers = np.load('./KMEANS-CLUSTER-CENTERS.npy')
tag_names = np.load('./TAGS-NAMES-EMBEDDINGS.npy')

In [28]:
main_df = pd.read_csv('./DATA_TRAIN.csv')

In [29]:
main_df

Unnamed: 0,packet_dat,attack_cat
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk
...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk


In [30]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("rdpahalavan/bert-network-packet-flow-header-payload")

model = AutoModel.from_pretrained("rdpahalavan/bert-network-packet-flow-header-payload")

In [31]:
model = model.to(0)

In [32]:
scaler_col_names = [str(i) for i in range(768)]

In [33]:
from sklearn.metrics.pairwise import euclidean_distances

In [34]:
def get_packet_embedding(packet):
    tokens = tokenizer(packet, truncation=True, return_tensors='pt').to(0)
    output = model(**tokens)
    embedding = output.last_hidden_state[:, 1:-1, :].mean(dim=1).cpu().detach().numpy()
    df = pd.DataFrame(embedding[0].reshape(1, -1), columns=scaler_col_names)
    embedding = scaler.transform(df)[0]
    euclidean_distance = euclidean_distances(cluster_centers, [embedding])
    data = {i: euclidean_distance[i][0] for i in range(len(euclidean_distance))}
    df = pd.DataFrame(list(data.items()), columns=['Cluster', 'Euclidean Distance'])
    df.sort_values(by='Euclidean Distance', inplace=True)
    return int(df.iloc[0]['Cluster']), df.iloc[0]['Euclidean Distance'], embedding

In [35]:
df.iloc[178]

Cluster Name                                   DoS, Exploits, Generic
Combination List    ['Vulnerability Bomb', 'Overwhelming Assault',...
Cluster Centroid    [-0.8283041715621948, -0.6053891777992249, -0....
Name: 178, dtype: object

In [36]:
main_df

Unnamed: 0,packet_dat,attack_cat
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk
...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk


In [79]:
cluster, distance, embedding = get_packet_embedding(main_df['packet_dat'][953456])

In [80]:
cluster

63

In [81]:
distance

2.968107223510742

In [82]:
df.loc[63]

Cluster Name                                              SSH Patator
Combination List    ['SSH Password Cracker', 'Brute Force SSH Atta...
Cluster Centroid    [0.6212525963783264, 0.7067753672599792, -0.76...
Name: 63, dtype: object

In [83]:
tags, emb = similar.find_similar_words(input_embedding = (tag_names[cluster] + (embedding - cluster_centers[cluster])), context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=True, single_word_split=False, output_filter_factor=1)
tags

{'SSH Credential Guessing': 0.955725148046819,
 'SSH Password Guessing': 0.9526428861127418,
 'Secure Shell': 0.9485803671888409,
 'SSH Dictionary based Attack': 0.9485583280322055,
 'secure remote access': 0.9458850748318011,
 'SSH Login Attack': 0.9451930643133879,
 'unauthorized access': 0.9451043138417081,
 'SSH authentication attack': 0.94485077834196,
 'successful login attempts': 0.9435012262817355,
 'SSH Dictionary Attacker': 0.9430031735410134}

# Testing

In [86]:
test_df = pd.read_csv('./DATA_TEST.csv')

In [134]:
test_df[test_df['attack_cat']!='DoS Hulk'].head(20)

Unnamed: 0,packet_dat,attack_cat
1,0 0 2 -1 80 56471 10175 10135 64 0 5 3 -1 112 ...,DDoS
4,111 16 147912 -1 38156 25 1500 1460 254 0 5 0 ...,DoS
5,14 11 1332 -1 13447 111 100 60 254 0 5 0 -1 12...,Reconnaissance
6,22 14 63411 -1 444 45022 8740 8688 64 0 8 0 -1...,Heartbleed
12,157 213 474513 -1 443 54320 1500 1460 53 0 5 0...,Normal
15,122 18 163802 -1 4664 25 1500 1460 254 0 5 0 -...,DoS
16,2 1 3347 -1 80 33580 471 419 64 0 8 3 -1 219 1...,Web Attack - XSS
17,127 298 495757 -1 80 22701 2960 2920 64 0 5 0 ...,DDoS
18,8130 4065 56910 -1 21 59698 72 20 64 0 8 3 -1 ...,FTP Patator
21,0 0 4 -1 80 50564 2875 2835 64 0 5 2 -1 109 97...,DDoS


In [143]:
cluster, distance, embedding = get_packet_embedding(test_df['packet_dat'][29969])

In [144]:
test_df.loc[29969]

packet_dat    31 31 15599 -1 54131 444 343 303 128 0 5 3 -1 ...
attack_cat                                         Infiltration
Name: 29969, dtype: object

In [145]:
cluster, distance

(17, 1.8301955461502075)

In [146]:
tags, emb = similar.find_similar_words(input_embedding = (tag_names[cluster] + (embedding - cluster_centers[cluster])), context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=True, single_word_split=False, output_filter_factor=1)
tags

{'routers firewalls': 0.9642805725907245,
 'subversive entry': 0.9623438670437829,
 'malicious payloads': 0.9617977693854897,
 'address software vulnerabilities robust network monitoring': 0.961637982183049,
 'IP spoofing session hijacking': 0.9603539512293087,
 'configuration perform regular security audits': 0.9602574015344214,
 'strict access': 0.9599809415814653,
 'packets reach': 0.9592730400260735,
 'safe online': 0.9591345643930638,
 'intrusion detection systems': 0.9591310571446596}

In [139]:
test_df = test_df.groupby('attack_cat').apply(lambda x: x.sample(min(len(x), 2500))).reset_index(drop=True)

In [140]:
test_df = test_df.sample(frac=1.0)

In [141]:
test_df

Unnamed: 0,packet_dat,attack_cat
29969,31 31 15599 -1 54131 444 343 303 128 0 5 3 -1 ...,Infiltration
26900,125 19 167141 -1 7977 25 1500 1460 254 0 5 0 -...,Generic
35952,53 31 5932 -1 80 34338 1500 1460 252 0 5 0 -1 ...,Reconnaissance
3801,0 0 1 -1 80 56417 4335 4295 64 0 5 3 -1 114 11...,DDoS
541,19 15 3868 -1 42812 80 239 199 62 0 5 3 -1 71 ...,Analysis
...,...,...
43585,7326 0 0 -1 80 59042 1919 1867 64 0 8 3 -1 72 ...,Web Attack - XSS
45875,14 77 103183 -1 80 49608 1500 1460 253 0 5 0 -...,Worms
30893,15 15 4167 -1 54119 444 165 125 128 0 5 3 -1 3...,Infiltration
46303,14 77 103183 -1 80 49608 1500 1460 252 0 5 0 -...,Worms


In [142]:
test_df.to_csv('tag_generation.csv', index=False)

In [151]:
tags_testing_list = []
index_list = []

In [None]:
for i, packet in enumerate(test_df['packet_dat'].to_list()):
    if i > -1:
        cluster, distance, embedding = get_packet_embedding(test_df['packet_dat'][i])
        tags, emb = similar.find_similar_words(input_embedding = (tag_names[cluster] + (embedding - cluster_centers[cluster])), context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=True, single_word_split=False, output_filter_factor=1)
        tags_testing_list.append(tags)
        index_list.append(i)
        if i%1000 == 0:
            temp = pd.DataFrame({'df_index': index_list, 'tags': tags_testing_list})
            temp.to_csv(f'./Tags Testing/Tags{i}.csv', index=False)
        print(f'Completed: {i}')

In [265]:
test_df['packet_dat'][index_list[-1]]

'11 71 94466 -1 80 40371 1500 1460 253 0 5 0 -1 15 136 177 4 0 0 131 125 8 12 15 143 167 4 0 0 139 117 12 133 246 15 132 39 5 0 0 139 85 8 185 0 48 11 8 139 69 12 139 92 145 64 137 4 36 137 92 36 4 232 41 99 1 0 133 192 137 218 117 10 141 101 244 137 208 91 94 95 93 195 199 4 36 229 221 9 8 49 201 49 219 137 157 96 255 255 255 137 141 100 255 255 255 232 205 94 0 0 133 192 116 9 128 56 0 15 133 119 4 0 0 131 125 8 6 15 132 71 1 0 0 139 93 12 184 0 0 0 0 133 192 137 157 92 255 255 255 116 12 199 4 36 92 83 11 8 232 184 193 249 247 139 149 100 255 255 255 141 181 92 255 255 255 139 133 96 255 255 255 137 52 36 139 77 8 232 219 6 0 0 131 236 4 137 195 133 192 15 132 245 0 0 0 131 120 24 255 116 7 199 64 24 255 255 255 255 139 133 92 255 255 255 61 117 160 10 8 116 22 137 4 36 232 236 98 1 0 137 133 92 255 255 255 133 192 15 132 197 0 0 0 137 116 36 4 139 69 8 137 4 36 232 255 4 0 0 133 192 137 199 15 132 151 0 0 0 139 85 8 190 0 48 11 8 139 4 149 128 222 9 8 137 28 149 0 48 11 8 133 192 1

In [266]:
index_list[-1]

46780

In [269]:
test_df.loc[46780]

packet_dat    11 71 94466 -1 80 40371 1500 1460 253 0 5 0 -1...
attack_cat                                                Worms
Name: 46780, dtype: object

In [155]:
len(tags_testing_list)

46781

In [156]:
df = pd.DataFrame({'id':index_list, 'tags':tags_testing_list})

In [157]:
df

Unnamed: 0,id,tags
0,0,"{'ZeroImpact': 0.9326767845982844, 'injecting'..."
1,1,{'injecting unauthorized packets': 0.929202050...
2,2,"{'unknowingly triggering': 0.9352225230817839,..."
3,3,"{'Exploit DoS': 0.9215557725519854, 'mitigate ..."
4,4,"{'ZeroImpact': 0.9545252141232174, 'anomalies'..."
...,...,...
46776,46776,{'successful Exploit Normalizer attack': 0.940...
46777,46777,{'misconfigured firewalls weak authentication ...
46778,46778,"{'malicious code': 0.9816162270729385, 'infect..."
46779,46779,"{'Vulnerability Hunter': 0.9274233934166645, '..."


In [158]:
df.to_csv('GENERATED_TAGS.csv', index=False)

In [163]:
test_df

Unnamed: 0,packet_dat,attack_cat
29969,31 31 15599 -1 54131 444 343 303 128 0 5 3 -1 ...,Infiltration
26900,125 19 167141 -1 7977 25 1500 1460 254 0 5 0 -...,Generic
35952,53 31 5932 -1 80 34338 1500 1460 252 0 5 0 -1 ...,Reconnaissance
3801,0 0 1 -1 80 56417 4335 4295 64 0 5 3 -1 114 11...,DDoS
541,19 15 3868 -1 42812 80 239 199 62 0 5 3 -1 71 ...,Analysis
...,...,...
43585,7326 0 0 -1 80 59042 1919 1867 64 0 8 3 -1 72 ...,Web Attack - XSS
45875,14 77 103183 -1 80 49608 1500 1460 253 0 5 0 -...,Worms
30893,15 15 4167 -1 54119 444 165 125 128 0 5 3 -1 3...,Infiltration
46303,14 77 103183 -1 80 49608 1500 1460 252 0 5 0 -...,Worms


In [18]:
df = pd.read_csv('./GENERATED_TAGS.csv')

In [19]:
df

Unnamed: 0,id,tags
0,0,"{'ZeroImpact': 0.9326767845982844, 'injecting'..."
1,1,{'injecting unauthorized packets': 0.929202050...
2,2,"{'unknowingly triggering': 0.9352225230817839,..."
3,3,"{'Exploit DoS': 0.9215557725519854, 'mitigate ..."
4,4,"{'ZeroImpact': 0.9545252141232174, 'anomalies'..."
...,...,...
46776,46776,{'successful Exploit Normalizer attack': 0.940...
46777,46777,{'misconfigured firewalls weak authentication ...
46778,46778,"{'malicious code': 0.9816162270729385, 'infect..."
46779,46779,"{'Vulnerability Hunter': 0.9274233934166645, '..."


In [40]:
test_df2 = pd.read_csv('./tag_generation.csv')

In [41]:
test_df2

Unnamed: 0,packet_dat,attack_cat
0,31 31 15599 -1 54131 444 343 303 128 0 5 3 -1 ...,Infiltration
1,125 19 167141 -1 7977 25 1500 1460 254 0 5 0 -...,Generic
2,53 31 5932 -1 80 34338 1500 1460 252 0 5 0 -1 ...,Reconnaissance
3,0 0 1 -1 80 56417 4335 4295 64 0 5 3 -1 114 11...,DDoS
4,19 15 3868 -1 42812 80 239 199 62 0 5 3 -1 71 ...,Analysis
...,...,...
46776,7326 0 0 -1 80 59042 1919 1867 64 0 8 3 -1 72 ...,Web Attack - XSS
46777,14 77 103183 -1 80 49608 1500 1460 253 0 5 0 -...,Worms
46778,15 15 4167 -1 54119 444 165 125 128 0 5 3 -1 3...,Infiltration
46779,14 77 103183 -1 80 49608 1500 1460 252 0 5 0 -...,Worms


In [43]:
test_df2['attack_cat'].value_counts()

Infiltration                  2500
Heartbleed                    2500
Worms                         2500
Fuzzers                       2500
FTP Patator                   2500
Web Attack - Brute Force      2500
DoS GoldenEye                 2500
Generic                       2500
SSH Patator                   2500
DoS                           2500
Normal                        2500
DoS Hulk                      2500
Exploits                      2500
DDoS                          2500
Reconnaissance                2500
DoS SlowHTTPTest              2432
DoS Slowloris                 1852
Web Attack - XSS              1549
Bot                           1460
Analysis                       793
Backdoor                       507
Shellcode                      435
Port Scan                      238
Web Attack - SQL Injection      15
Name: attack_cat, dtype: int64

In [187]:
df.loc[0]

id                                                      0
tags    {'ZeroImpact': 0.9326767845982844, 'injecting'...
Name: 0, dtype: object

In [188]:
test_df['attack_cat'][0]

'Analysis'

In [271]:
df.loc[46780]

id                                                  46780
tags    {'misconfigured firewalls weak authentication ...
Name: 46780, dtype: object

In [281]:
', '.join(df.loc[46780, 'tags'].keys())

'misconfigured firewalls weak authentication mechanisms, unauthorized, PersistentIntrusion attack, Malware Launcher, StealthySilencer, EncryptionBreach, exploits vulnerabilities, VulnerabilityChain, Packet spoofing, Vulnerability Scanner'

In [243]:
list(df.loc[0, 'tags'].keys())[:3]

['ZeroImpact', 'injecting', 'responsiveness']

In [250]:
[','.join(list(df.loc[0, 'tags'].keys())[:3])]

['ZeroImpact,injecting,responsiveness']

In [210]:
from sentence_transformers import SentenceTransformer, util

In [257]:
[test_df['attack_cat'][0]]

['Analysis']

In [279]:
index = 46780

sentences = [f"{test_df['attack_cat'][index]}, network packet"] + list(df.loc[index, 'tags'].keys()) + [', '.join(df.loc[index, 'tags'].keys())] + [','.join(list(df.loc[index, 'tags'].keys())[:3])] + [','.join(list(df.loc[index, 'tags'].keys())[:5])] + [','.join(list(df.loc[index, 'tags'].keys())[6:])]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

In [282]:
sentences

['Worms, network packet',
 'misconfigured firewalls weak authentication mechanisms',
 'unauthorized',
 'PersistentIntrusion attack',
 'Malware Launcher',
 'StealthySilencer',
 'EncryptionBreach',
 'exploits vulnerabilities',
 'VulnerabilityChain',
 'Packet spoofing',
 'Vulnerability Scanner',
 'misconfigured firewalls weak authentication mechanisms, unauthorized, PersistentIntrusion attack, Malware Launcher, StealthySilencer, EncryptionBreach, exploits vulnerabilities, VulnerabilityChain, Packet spoofing, Vulnerability Scanner',
 'misconfigured firewalls weak authentication mechanisms,unauthorized,PersistentIntrusion attack',
 'misconfigured firewalls weak authentication mechanisms,unauthorized,PersistentIntrusion attack,Malware Launcher,StealthySilencer',
 'exploits vulnerabilities,VulnerabilityChain,Packet spoofing,Vulnerability Scanner']

In [298]:
test_df[test_df['attack_cat']=='Web Attack - SQL Injection']

Unnamed: 0,packet_dat,attack_cat
42728,3937 1968 0 -1 80 36196 582 530 64 0 8 3 -1 72...,Web Attack - SQL Injection
42719,0 0 523 -1 80 36214 2073 2021 64 0 8 3 -1 72 8...,Web Attack - SQL Injection
42730,12820 12820 0 -1 36204 80 651 599 62 0 8 3 -1 ...,Web Attack - SQL Injection
42717,1 0 947 -1 80 36212 1500 1448 64 0 8 0 -1 72 8...,Web Attack - SQL Injection
42720,0 0 523 -1 36204 80 651 599 62 0 8 3 -1 71 69 ...,Web Attack - SQL Injection
42724,0 0 210 -1 80 36206 582 530 64 0 8 3 -1 72 84 ...,Web Attack - SQL Injection
42729,0 0 933 -1 80 36202 4201 4149 64 0 8 3 -1 72 8...,Web Attack - SQL Injection
42718,0 0 195 -1 80 36196 582 530 64 0 8 3 -1 72 84 ...,Web Attack - SQL Injection
42725,13698 13698 0 -1 36242 80 651 599 62 0 8 3 -1 ...,Web Attack - SQL Injection
42721,14084 14084 0 -1 36206 80 575 523 62 0 8 3 -1 ...,Web Attack - SQL Injection


In [350]:
i = 0

sentences = [f"The network packet attack label is {test_df['attack_cat'][i]}"] + list(df.loc[i, 'tags'].keys()) + [', '.join(df.loc[i, 'tags'].keys())] + [', '.join(list(df.loc[i, 'tags'].keys())[:3])] + [', '.join(list(df.loc[i, 'tags'].keys())[:5])] + [', '.join(list(df.loc[i, 'tags'].keys())[5:])]
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

In [351]:
cos_sim = util.cos_sim(embeddings[0], embeddings)
print(cos_sim[0][1:])
print(sentences)

tensor([0.0762, 0.0879, 0.0472, 0.2492, 0.3381, 0.0877, 0.0468, 0.0061, 0.3375,
        0.4865, 0.4895, 0.0506, 0.3742, 0.3952])
['The network packet attack label is Analysis', 'ZeroImpact', 'injecting', 'responsiveness', 'spoofing', 'data exfiltration', 'encrypted', 'reassembly', 'reordering', 'vulnerabilities', 'intrusion detection', 'ZeroImpact, injecting, responsiveness, spoofing, data exfiltration, encrypted, reassembly, reordering, vulnerabilities, intrusion detection', 'ZeroImpact, injecting, responsiveness', 'ZeroImpact, injecting, responsiveness, spoofing, data exfiltration', 'encrypted, reassembly, reordering, vulnerabilities, intrusion detection']


In [389]:
i = 0

sentences = ['Network Packet attack label'] + list(df.loc[i, 'tags'].keys()) + [', '.join(df.loc[i, 'tags'].keys())] + [', '.join(list(df.loc[i, 'tags'].keys())[:3])] + [', '.join(list(df.loc[i, 'tags'].keys())[:5])] + [', '.join(list(df.loc[i, 'tags'].keys())[5:])]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

In [390]:
cos_sim = util.cos_sim(embeddings[0], embeddings)
print(cos_sim[0][1:])
print(sentences)

tensor([ 0.1084,  0.1272, -0.0045,  0.2424,  0.2889,  0.1280,  0.0351,  0.0356,
         0.2897,  0.4265,  0.4104,  0.0445,  0.3287,  0.3273])
['Network Packet attack label', 'ZeroImpact', 'injecting', 'responsiveness', 'spoofing', 'data exfiltration', 'encrypted', 'reassembly', 'reordering', 'vulnerabilities', 'intrusion detection', 'ZeroImpact, injecting, responsiveness, spoofing, data exfiltration, encrypted, reassembly, reordering, vulnerabilities, intrusion detection', 'ZeroImpact, injecting, responsiveness', 'ZeroImpact, injecting, responsiveness, spoofing, data exfiltration', 'encrypted, reassembly, reordering, vulnerabilities, intrusion detection']


In [346]:
i = 0

sentences = [test_df['attack_cat'][i]] + list(df.loc[i, 'tags'].keys()) + [', '.join(df.loc[i, 'tags'].keys())] + [','.join(list(df.loc[i, 'tags'].keys())[:3])] + [','.join(list(df.loc[i, 'tags'].keys())[:5])] + [','.join(list(df.loc[i, 'tags'].keys())[6:])]
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

In [347]:
cos_sim = util.cos_sim(embeddings[0], embeddings)
print(cos_sim[0][1:])
print(sentences)

tensor([0.2340, 0.1321, 0.2948, 0.1979, 0.2507, 0.1533, 0.1558, 0.1399, 0.2518,
        0.3591, 0.2131, 0.1419, 0.1951, 0.1674])
['Analysis', 'ZeroImpact', 'injecting', 'responsiveness', 'spoofing', 'data exfiltration', 'encrypted', 'reassembly', 'reordering', 'vulnerabilities', 'intrusion detection', 'ZeroImpact, injecting, responsiveness, spoofing, data exfiltration, encrypted, reassembly, reordering, vulnerabilities, intrusion detection', 'ZeroImpact,injecting,responsiveness', 'ZeroImpact,injecting,responsiveness,spoofing,data exfiltration', 'reassembly,reordering,vulnerabilities,intrusion detection']


In [315]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [324]:
cos_sim[0][1:]

tensor([0.0762, 0.0879, 0.0472, 0.2492, 0.3381, 0.0877, 0.0468, 0.0061, 0.3375,
        0.4865, 0.4895, 0.0506, 0.3742, 0.4002])

In [391]:
similarity_list = []

In [None]:
for i, packet in enumerate(test_df['attack_cat'].to_list()):
    if i > -1:
        sentences = [test_df['attack_cat'][i]] + list(df.loc[i, 'tags'].keys()) + [', '.join(df.loc[i, 'tags'].keys())] + [', '.join(list(df.loc[i, 'tags'].keys())[:3])] + [', '.join(list(df.loc[i, 'tags'].keys())[:5])] + [', '.join(list(df.loc[i, 'tags'].keys())[5:])]
        embeddings = model.encode(sentences)
        cos_sim_attack = util.cos_sim(embeddings[0], embeddings)[0].tolist()
        sentences = ['Network Packet'] + list(df.loc[i, 'tags'].keys()) + [', '.join(df.loc[i, 'tags'].keys())] + [', '.join(list(df.loc[i, 'tags'].keys())[:3])] + [', '.join(list(df.loc[i, 'tags'].keys())[:5])] + [', '.join(list(df.loc[i, 'tags'].keys())[5:])]
        embeddings = model.encode(sentences)
        cos_sim_lingo = util.cos_sim(embeddings[0], embeddings)[0].tolist()
        similarity_list.append([(x + y) / 2 for x, y in zip(cos_sim_attack, cos_sim_lingo)])
        if i%100 == 0:
            print(f'Completed: {i}')

In [320]:
df.loc[0]

id                                                      0
tags    {'ZeroImpact': 0.9326767845982844, 'injecting'...
Name: 0, dtype: object

In [321]:
test_df.loc[0]

packet_dat    9 7 1889 -1 64439 80 240 200 62 0 5 3 -1 71 69...
attack_cat                                             Analysis
Name: 0, dtype: object

In [395]:
len(similarity_list)

46781

In [396]:
mdf = pd.DataFrame(similarity_list)

In [397]:
mdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.165734,0.153081,0.229065,0.203539,0.282942,0.185188,0.105524,0.111780,0.219842,0.357846,0.241838,0.098677,0.227567,0.172046
1,1.0,0.212547,0.422899,0.162640,0.274683,0.320445,0.382868,0.104131,0.328420,0.339410,0.097423,0.304525,0.260117,0.304683,0.236394
2,1.0,0.123161,0.165813,0.101715,0.149232,0.129282,0.085533,0.190391,0.039290,0.106399,0.113634,0.173009,0.112740,0.106716,0.121531
3,1.0,0.174206,0.039290,0.165813,0.107845,0.066654,0.435127,0.111372,0.065605,0.113634,0.149232,0.199035,0.136042,0.167589,0.219220
4,1.0,0.165734,0.183315,0.219842,0.213787,0.111780,0.272382,0.185188,0.248124,0.357846,0.041874,0.183145,0.129901,0.162203,0.175423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46776,1.0,0.037546,0.206883,0.361385,0.119604,0.166541,0.095780,0.295328,0.320644,0.134524,0.170841,0.250173,0.308410,0.234362,0.222349
46777,1.0,0.121958,0.164295,0.183354,0.159643,0.164175,0.153030,0.320644,0.087626,0.137386,0.161356,0.202870,0.161359,0.187015,0.213972
46778,1.0,0.259527,0.178403,0.150412,0.222469,0.159309,0.129087,0.325329,0.110329,0.358205,0.046278,0.236216,0.168021,0.198743,0.261288
46779,1.0,0.164419,0.370298,0.296461,0.227149,0.095630,0.151565,0.249103,0.498921,0.054749,0.349575,0.262295,0.325645,0.273556,0.285411


In [399]:
mdf.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0,46781.0
mean,1.0,0.219515,0.214031,0.215339,0.215046,0.211866,0.214049,0.211245,0.210009,0.209603,0.208452,0.266279,0.240022,0.258612,0.26009
std,7.387071e-08,0.10858,0.106218,0.109244,0.11031,0.11261,0.113846,0.112782,0.112755,0.113025,0.112829,0.086012,0.099156,0.092054,0.090923
min,0.9999998,-0.076408,-0.076408,-0.076408,-0.076408,-0.076408,-0.076408,-0.076408,-0.076408,-0.076408,-0.056033,0.003943,-0.076668,-0.010201,-0.026272
25%,0.9999999,0.138151,0.137058,0.140748,0.135511,0.132277,0.131804,0.129087,0.126102,0.12551,0.125944,0.211459,0.16865,0.194956,0.198387
50%,1.0,0.21504,0.211609,0.208698,0.206679,0.198504,0.198504,0.198285,0.197349,0.193769,0.189399,0.264363,0.240741,0.258014,0.258389
75%,1.0,0.292255,0.281561,0.283656,0.287505,0.281036,0.285785,0.281846,0.280474,0.280592,0.275343,0.324074,0.311717,0.32601,0.322065
max,1.0,0.608874,0.639383,0.639383,0.696787,0.696787,0.696787,0.696787,0.696787,0.696787,0.696787,0.550049,0.568608,0.567608,0.594196


In [None]:
# cos_sim_attack = (cos_sim_attack + 1) / 2

In [20]:
variability_list = []

In [26]:
for i in df['tags'].to_list():
    variability_list.append(list(eval(i).keys()))

In [29]:
tdf = pd.DataFrame(variability_list)

In [30]:
tdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,ZeroImpact,injecting,responsiveness,spoofing,data exfiltration,encrypted,reassembly,reordering,vulnerabilities,intrusion detection
1,injecting unauthorized packets,packet injection,exploit vulnerabilities,injects malicious packets,SYN packets overwhelming,HTTP packets,Host Header Injection attack,inject packets,attacker intercepts network traffic,buffer overflow input validation issues
2,unknowingly triggering,exploiting vulnerabilities,VulneraKit,compromising,ExploitBot attack,ExploShock attack typically,ExploitAnalyzer,mitigate ExploDos,misconfigured,ReconFuzz
3,Exploit DoS,mitigate ExploDos,exploiting vulnerabilities,mitigate ScanFuzz attacks,Exploit Normalizer,TCP,misconfigurations,ReconFuseDoS,ReconFuzz,compromising
4,ZeroImpact,anomalies,vulnerabilities,intricacies,reordering,DataGatheringGrenade,encrypted,evade detection,intrusion detection,prevent SQLStorm attacks
...,...,...,...,...,...,...,...,...,...,...
46776,successful Exploit Normalizer attack,malicious actor exploits vulnerabilities,attacker sends specially crafted network packets,detecting malformed,malicious activities including remote code exe...,filter FTP,malicious traffic diverting legitimate,Sockets Layer,clients updated,Vulnerability
46777,misconfigured firewalls weak authentication me...,Vulnerability Scanner,disguising,unauthorized,ExploitSQL,PersistentIntrusion attack,Sockets Layer,VulnerabilityChain attack relies,exploiting multiple vulnerabilities,exploits vulnerabilities
46778,malicious code,infectious exploits involves regular patching,specific vulnerabilities discovered allowing,detecting infectious exploits,exploit vulnerabilities,sensitive information,malicious activity,execute arbitrary commands install malware,Additionally network traffic monitoring intrusion,software bugs configuration weaknesses
46779,Vulnerability Hunter,injects malicious packets,attacks network administrators,security measures including encryption protoco...,unauthorized location,simple buffer overflow attacks,employing robust network security measures,network packet attack,diverting sensitive,Additionally monitoring network traffic


In [63]:
tdf['c'] = tdf.iloc[:, 0:10].apply(lambda x: ' '.join(x.astype(str)), axis=1)

In [64]:
tdf['c'][0]

'ZeroImpact injecting responsiveness spoofing data exfiltration encrypted reassembly reordering vulnerabilities intrusion detection'

In [65]:
tdf['c'].nunique()

42766

In [45]:
df1

0    2302
1    3051
2    3712
3    4249
4    4685
5    5214
6    5631
7    6092
8    6424
9    6036
dtype: int64

# Text for Tag

In [1]:
import pandas as pd

In [16]:
df = pd.read_csv('./GENERATED_TAGS.csv')

In [3]:
df

Unnamed: 0,id,tags
0,0,"{'ZeroImpact': 0.9326767845982844, 'injecting'..."
1,1,{'injecting unauthorized packets': 0.929202050...
2,2,"{'unknowingly triggering': 0.9352225230817839,..."
3,3,"{'Exploit DoS': 0.9215557725519854, 'mitigate ..."
4,4,"{'ZeroImpact': 0.9545252141232174, 'anomalies'..."
...,...,...
46776,46776,{'successful Exploit Normalizer attack': 0.940...
46777,46777,{'misconfigured firewalls weak authentication ...
46778,46778,"{'malicious code': 0.9816162270729385, 'infect..."
46779,46779,"{'Vulnerability Hunter': 0.9274233934166645, '..."


In [4]:
df['tags'][0]

"{'ZeroImpact': 0.9326767845982844, 'injecting': 0.9281784060320444, 'responsiveness': 0.9276119540057473, 'spoofing': 0.9275524398471746, 'data exfiltration': 0.9271565682528817, 'encrypted': 0.9271299868504239, 'reassembly': 0.9270286951368641, 'reordering': 0.9263218957220968, 'vulnerabilities': 0.9262331496090768, 'intrusion detection': 0.9258145334115236}"

In [17]:
df['tags'] = df['tags'].apply(lambda x: [key for key in eval(x).keys()])

In [18]:
df['tags'][516]

['ZeroImpact',
 'reordering',
 'anomalies',
 'vulnerabilities',
 'DataGatheringGrenade',
 'RegularOperation',
 'firewalls intrusion',
 'intricacies',
 'reroute',
 'StealthySilencer']