# Fine-tuning

In [None]:
!pip install transformers datasets torchinfo accelerate pycaret BERTSimilarWords

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pycaret
  Downloading pycaret-3.0.4-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.4/484.4 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting BERTSimilarWords
  Downloading BERTSimilarWords-0.0.13-py3-none-any.whl (11 kB)
Collecting hug

In [None]:
# Download the training CSV from Google Drive; the recorded log shows it is saved as
# /content/DATA_TRAIN.csv (about 2 GB).
# NOTE(review): the clustering section reads DATA_TRAIN.csv from the mounted Drive folder,
# not from this download - confirm which copy is intended.
!gdown "https://drive.google.com/uc?id=1A10G8rwLMSjMCYdHPOdCONe27kwXloi8&confirm=t"

Downloading...
From: https://drive.google.com/uc?id=1A10G8rwLMSjMCYdHPOdCONe27kwXloi8&confirm=t
To: /content/DATA_TRAIN.csv
100% 2.03G/2.03G [00:25<00:00, 78.5MB/s]


In [None]:
# Download EMBEDDING.csv from Google Drive; the recorded log shows it is saved as
# /content/EMBEDDING.csv (about 11 GB).
!gdown "https://drive.google.com/uc?id=1bCIbmUf9_swEtuPdC_ZA09X48YTazWfz&confirm=t"

Downloading...
From: https://drive.google.com/uc?id=1bCIbmUf9_swEtuPdC_ZA09X48YTazWfz&confirm=t
To: /content/EMBEDDING.csv
100% 11.1G/11.1G [00:58<00:00, 188MB/s]


In [None]:
from transformers import AutoConfig, AutoTokenizer, TrainingArguments, AutoModel, AutoModelForSequenceClassification, Trainer, pipeline, DataCollatorForTokenClassification, AutoModelForTokenClassification, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric, Dataset
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score
import torch
import torchinfo

In [None]:
# Name of the pretrained checkpoint that the tokenizer, config and model are loaded from.
checkpoint = 'distilbert-base-cased'

In [None]:
# Load the tokenizer that belongs to `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
tokenizer("Hello World")

{'input_ids': [101, 8667, 1291, 102], 'attention_mask': [1, 1, 1, 1]}

In [None]:
tokenizer.tokenize("Hello world full of Chimponzees")

['Hello', 'world', 'full', 'of', 'Chi', '##mpo', '##nz', '##ees']

In [None]:
# Encode a sample sentence to token ids; they are mapped back to tokens right after.
ids = tokenizer.encode('Hello world')

In [None]:
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'Hello', 'world', '[SEP]']

In [None]:
# Load the training table (columns packet_dat and attack_cat, per the display below).
# NOTE(review): no visible cell creates or downloads train_10_sentence.csv (the gdown cells
# fetch DATA_TRAIN.csv and EMBEDDING.csv) - confirm where this file comes from.
train_df = pd.read_csv('./train_10_sentence.csv')

In [None]:
train_df

Unnamed: 0,packet_dat,attack_cat
0,18 14 3916 -1 65387 80 229 189 63 0 5 3 -1 71 ...,Analysis
1,11 8 2278 -1 80 46315 482 442 253 0 5 0 -1 72 ...,Analysis
2,13 10 2757 -1 63628 80 221 181 62 0 5 3 -1 71 ...,Analysis
3,12 12 2757 -1 20528 80 226 186 63 0 5 3 -1 71 ...,Analysis
4,5 4 1289 -1 5855 80 235 195 62 0 5 3 -1 71 69 ...,Analysis
...,...,...
61767,The consequences of a successful DDoS attack c...,DDoS
61768,DDoS attacks are not limited to large organiza...,DDoS
61769,"As technology evolves, so do the techniques em...",DDoS
61770,DDoS attacks are not only a concern for the pr...,DDoS


In [None]:
# Keep at most 100000 rows per attack category. random_state pins the draw so that
# Restart & Run All yields the same rows in the same order every time (the over-sampler
# and the train/test split in this notebook are already seeded with 42).
train_df = train_df.groupby('attack_cat').apply(lambda x: x.sample(min(len(x), 100000), random_state=42)).reset_index(drop=True)

In [None]:
train_df

Unnamed: 0,packet_dat,attack_cat,embedding
0,18 14 3916 -1 65387 80 229 189 63 0 5 3 -1 71 ...,Analysis,"[-0.22003393, -0.7812249, 1.1886178, -0.751072..."
1,11 8 2278 -1 80 46315 482 442 253 0 5 0 -1 72 ...,Analysis,"[0.14469527, -0.6813727, 1.0579, -0.93477905, ..."
2,13 10 2757 -1 63628 80 221 181 62 0 5 3 -1 71 ...,Analysis,"[-0.1992719, -0.7335618, 0.92741567, -0.822641..."
3,12 12 2757 -1 20528 80 226 186 63 0 5 3 -1 71 ...,Analysis,"[-0.1047841, -0.82374144, 1.1461064, -0.726840..."
4,5 4 1289 -1 5855 80 235 195 62 0 5 3 -1 71 69 ...,Analysis,"[-0.08865672, -0.92923635, 1.0538378, -0.79000..."
...,...,...,...
61757,11 60 82612 -1 80 16565 363 323 253 0 5 0 -1 2...,Worms,"[-0.56639206, 1.6416208, -0.3714938, 0.482306,..."
61758,11 71 94466 -1 80 40371 1500 1460 252 0 5 0 -1...,Worms,"[-0.64701754, 1.0316591, -0.7111639, 0.6273469..."
61759,14 90 119416 -1 80 3404 41 6 253 0 5 0 -1 116 ...,Worms,"[-0.48861536, 1.439461, -0.59521705, 0.8684100..."
61760,14 87 119898 -1 80 63800 1500 1460 253 0 5 0 -...,Worms,"[-0.6815854, 1.1861004, -0.75322866, 0.5188070..."


In [None]:
train_df.attack_cat.value_counts()

DDoS                          3010
Generic                       3000
Fuzzers                       3000
Web Attack - XSS              3000
Web Attack - Brute Force      3000
SSH Patator                   3000
Reconnaissance                3000
Normal                        3000
Infiltration                  3000
Heartbleed                    3000
Worms                         3000
FTP Patator                   3000
Exploits                      3000
DoS Slowloris                 3000
DoS SlowHTTPTest              3000
DoS Hulk                      3000
DoS GoldenEye                 3000
DoS                           3000
Bot                           3000
Analysis                      1819
Backdoor                      1245
Shellcode                     1063
Port Scan                      605
Web Attack - SQL Injection      30
Name: attack_cat, dtype: int64

In [None]:
# Random over-sampler with a fixed seed so the resampled rows are reproducible.
ros = RandomOverSampler(random_state=42)

In [None]:
# Over-sample by attack category; the value counts shown further down confirm every class
# ends up with the same number of rows.
# NOTE(review): this runs before raw_dataset.train_test_split, so duplicated rows can land
# in both the train and the test split and inflate the evaluation scores - consider
# splitting first and over-sampling only the training part.
X, y = ros.fit_resample(train_df[['packet_dat']], train_df[['attack_cat']])

In [None]:
# Put the resampled text column and label column back side by side in one frame.
data = pd.concat([X,y], axis=1)

In [None]:
data

Unnamed: 0,packet_dat,attack_cat
0,18 14 3916 -1 65387 80 229 189 63 0 5 3 -1 71 ...,Analysis
1,11 8 2278 -1 80 46315 482 442 253 0 5 0 -1 72 ...,Analysis
2,13 10 2757 -1 63628 80 221 181 62 0 5 3 -1 71 ...,Analysis
3,12 12 2757 -1 20528 80 226 186 63 0 5 3 -1 71 ...,Analysis
4,5 4 1289 -1 5855 80 235 195 62 0 5 3 -1 71 69 ...,Analysis
...,...,...
72235,11 71 94466 -1 80 40371 1500 1460 252 0 5 0 -1...,Worms
72236,14 90 119416 -1 80 3404 1500 1460 253 0 5 0 -1...,Worms
72237,14 90 119416 -1 80 3404 1500 1460 253 0 5 0 -1...,Worms
72238,12 68 92012 -1 80 63213 1500 1460 253 0 5 0 -1...,Worms


In [None]:
data.attack_cat.value_counts()

Analysis                      3010
Backdoor                      3010
Web Attack - XSS              3010
Web Attack - SQL Injection    3010
Web Attack - Brute Force      3010
Shellcode                     3010
SSH Patator                   3010
Reconnaissance                3010
Port Scan                     3010
Normal                        3010
Infiltration                  3010
Heartbleed                    3010
Generic                       3010
Fuzzers                       3010
FTP Patator                   3010
Exploits                      3010
DoS Slowloris                 3010
DoS SlowHTTPTest              3010
DoS Hulk                      3010
DoS GoldenEye                 3010
DoS                           3010
DDoS                          3010
Bot                           3010
Worms                         3010
Name: attack_cat, dtype: int64

In [None]:
# Distinct attack-category names present in the resampled data.
classes = data['attack_cat'].unique()

# Number the categories 0..K-1 following the sorted order of their names.
target_map = dict(zip(sorted(classes), range(len(classes))))

In [None]:
target_map

{'Analysis': 0,
 'Backdoor': 1,
 'Bot': 2,
 'DDoS': 3,
 'DoS': 4,
 'DoS GoldenEye': 5,
 'DoS Hulk': 6,
 'DoS SlowHTTPTest': 7,
 'DoS Slowloris': 8,
 'Exploits': 9,
 'FTP Patator': 10,
 'Fuzzers': 11,
 'Generic': 12,
 'Heartbleed': 13,
 'Infiltration': 14,
 'Normal': 15,
 'Port Scan': 16,
 'Reconnaissance': 17,
 'SSH Patator': 18,
 'Shellcode': 19,
 'Web Attack - Brute Force': 20,
 'Web Attack - SQL Injection': 21,
 'Web Attack - XSS': 22,
 'Worms': 23}

In [None]:
# Add the integer class id of each row, looked up from target_map by category name.
data['target'] = data['attack_cat'].map(target_map)

In [None]:
data

Unnamed: 0,packet_dat,attack_cat,target
0,13 10 2607 -1 64294 80 236 196 62 0 5 3 -1 71 ...,Analysis,0
1,4 3 938 -1 8501 80 220 180 62 0 5 3 -1 71 69 8...,Analysis,0
2,13 10 2941 -1 80 65403 521 481 253 0 5 0 -1 72...,Analysis,0
3,18 14 3458 -1 34422 80 229 189 63 0 5 3 -1 71 ...,Analysis,0
4,16 12 3060 -1 80 65401 512 472 253 0 5 0 -1 72...,Analysis,0
...,...,...,...
2399995,11 58 78185 -1 80 63496 1500 1460 253 0 5 0 -1...,Worms,23
2399996,12 75 98580 -1 80 24418 1500 1460 252 0 5 0 -1...,Worms,23
2399997,15 89 121890 -1 80 17744 1500 1460 253 0 5 0 -...,Worms,23
2399998,14 87 119898 -1 80 63800 363 323 252 0 5 0 -1 ...,Worms,23


In [None]:
# Keep only the packet text and its integer class id.
data = data[['packet_dat', 'target']]

In [None]:
# Rename the two columns to 'packet' (the key tokenize_batch reads) and 'label'.
data.columns = ['packet', 'label']

In [None]:
data

Unnamed: 0,packet,label
0,13 10 2607 -1 64294 80 236 196 62 0 5 3 -1 71 ...,0
1,4 3 938 -1 8501 80 220 180 62 0 5 3 -1 71 69 8...,0
2,13 10 2941 -1 80 65403 521 481 253 0 5 0 -1 72...,0
3,18 14 3458 -1 34422 80 229 189 63 0 5 3 -1 71 ...,0
4,16 12 3060 -1 80 65401 512 472 253 0 5 0 -1 72...,0
...,...,...
2399995,11 58 78185 -1 80 63496 1500 1460 253 0 5 0 -1...,23
2399996,12 75 98580 -1 80 24418 1500 1460 252 0 5 0 -1...,23
2399997,15 89 121890 -1 80 17744 1500 1460 253 0 5 0 -...,23
2399998,14 87 119898 -1 80 63800 363 323 252 0 5 0 -1 ...,23


In [None]:
# Wrap the pandas frame in a `datasets.Dataset`.
raw_dataset = Dataset.from_pandas(data)

In [None]:
raw_dataset

Dataset({
    features: ['packet', 'label'],
    num_rows: 72240
})

In [None]:
# Seeded 70/30 split; raw_dataset is rebound to a DatasetDict with 'train' and 'test'
# entries (see the display below), so this cell is meant to be run once per kernel.
raw_dataset = raw_dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['packet', 'label'],
        num_rows: 50568
    })
    test: Dataset({
        features: ['packet', 'label'],
        num_rows: 21672
    })
})

In [None]:
raw_dataset['train'][0]

{'packet': '169 25 225892 -1 58006 25 1500 1460 63 0 5 0 -1 109 10 75 83 73 103 73 120 48 98 72 104 56 100 73 67 77 104 74 67 81 105 74 83 77 104 74 67 77 104 74 67 107 110 75 105 65 101 73 84 48 55 80 109 104 109 97 86 49 98 88 105 89 107 74 120 77 82 70 67 115 112 10 76 67 77 104 74 67 65 101 73 82 48 98 72 104 115 90 72 66 111 89 71 120 115 90 72 67 65 101 73 83 85 106 74 104 119 97 72 84 48 55 80 109 70 102 89 109 90 107 90 48 49 76 84 105 56 116 77 66 56 100 10 73 66 48 98 72 105 103 109 75 83 65 101 73 82 111 89 71 120 115 90 72 67 69 102 73 105 89 107 74 121 81 105 74 83 65 101 73 83 77 104 74 70 66 79 85 88 70 118 99 109 78 104 90 68 107 51 79 104 52 99 10 72 120 48 98 72 105 89 107 74 121 103 109 75 83 73 103 73 120 48 98 72 104 56 100 73 67 77 104 74 67 81 105 74 83 77 104 74 67 77 104 74 80 47 43 47 47 47 43 47 119 65 65 70 120 85 89 74 121 85 111 10 84 69 112 78 98 50 49 119 90 87 78 109 79 68 89 53 70 120 85 89 69 81 56 83 71 120 107 99 73 105 65 106 75 67 89 112 75 83 99 

In [None]:
def tokenize_batch(batch):
  """Tokenize the 'packet' texts of one batch with the notebook-level `tokenizer`.

  Args:
    batch: mapping whose 'packet' entry holds the texts to encode.

  Returns:
    Whatever `tokenizer` returns for those texts when called with truncation enabled.
  """
  packets = batch['packet']
  encoded = tokenizer(packets, truncation=True)
  return encoded

In [None]:
# Tokenize both splits in batches; the result keeps the original columns and adds
# 'input_ids' and 'attention_mask' (see the display below).
tokenized_datasets = raw_dataset.map(function=tokenize_batch, batched=True)

Map:   0%|          | 0/50568 [00:00<?, ? examples/s]

Map:   0%|          | 0/21672 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['packet', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50568
    })
    test: Dataset({
        features: ['packet', 'label', 'input_ids', 'attention_mask'],
        num_rows: 21672
    })
})

In [None]:
# Load the checkpoint's configuration so the label maps can be attached before the model is built.
config = AutoConfig.from_pretrained(checkpoint)

In [None]:
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.30.2",
  "vocab_size": 28996
}

In [None]:
# Category name -> class id, same mapping that produced the 'label' column.
config.label2id = target_map

In [None]:
# Inverse lookup: class id -> category name.
config.id2label = {class_id: class_name for class_name, class_id in target_map.items()}

In [None]:
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Analysis",
    "1": "Backdoor",
    "2": "Bot",
    "3": "DDoS",
    "4": "DoS",
    "5": "DoS GoldenEye",
    "6": "DoS Hulk",
    "7": "DoS SlowHTTPTest",
    "8": "DoS Slowloris",
    "9": "Exploits",
    "10": "FTP Patator",
    "11": "Fuzzers",
    "12": "Generic",
    "13": "Heartbleed",
    "14": "Infiltration",
    "15": "Normal",
    "16": "Port Scan",
    "17": "Reconnaissance",
    "18": "SSH Patator",
    "19": "Shellcode",
    "20": "Web Attack - Brute Force",
    "21": "Web Attack - SQL Injection",
    "22": "Web Attack - XSS",
    "23": "Worms"
  },
  "initializer_range": 0.02,
  "label2id": {
    "Analysis": 0,
    "Backdoor": 1,
    "Bot": 2,
    "DDoS": 3,
    "DoS": 4,
    "DoS GoldenEye": 5,
    "DoS Hulk": 6,
    "DoS SlowHTTPTest": 7,
    "DoS Slowloris": 8,
    "Exploits"

In [None]:
# Build the sequence-classification model from the pretrained checkpoint using the config
# that carries the label maps; the recorded warning below lists the classifier weights as
# newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.wei

In [None]:
torchinfo.summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 65,783,042
Trainable params: 65,783,042
Non-trainable params: 0

In [None]:
# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    output_dir='BERT',                # checkpoints are written under this directory
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    evaluation_strategy='epoch',      # evaluate once per epoch ...
    save_strategy='epoch',            # ... and save a checkpoint at the same cadence
)

In [None]:
def compute_metric(logits_and_labels):
  """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

  Args:
    logits_and_labels: two-item sequence ``(logits, labels)``. The predicted class of
      each row is the argmax of ``logits`` over its last axis.

  Returns:
    dict with the keys 'accuracy' and 'f1-score'.
  """
  scores, references = logits_and_labels
  predicted = np.argmax(scores, axis=-1)
  macro_f1 = f1_score(references, predicted, average='macro')
  accuracy = np.mean(predicted == references)
  return {'accuracy': accuracy, 'f1-score': macro_f1}

In [None]:
# Wire the model, training arguments, data splits, tokenizer and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
    train_dataset=tokenized_datasets['train'],
    # The 'test' split produced by train_test_split is used as the evaluation set.
    eval_dataset=tokenized_datasets['test'],
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1-score
1,1.0066,0.213723,0.940384,0.939577


TrainOutput(global_step=791, training_loss=0.7369703927022015, metrics={'train_runtime': 647.4518, 'train_samples_per_second': 78.103, 'train_steps_per_second': 1.222, 'total_flos': 6701239540187136.0, 'train_loss': 0.7369703927022015, 'epoch': 1.0})

In [None]:
# Archive one saved checkpoint directory into downloads.zip.
# NOTE(review): the checkpoint step is hard-coded; the training log recorded above reports
# global_step=791, so confirm that checkpoint-78750 is the directory you want to archive.
!zip -r downloads.zip /content/BERT/checkpoint-78750


  adding: content/BERT/checkpoint-78750/ (stored 0%)
  adding: content/BERT/checkpoint-78750/scheduler.pt (deflated 49%)
  adding: content/BERT/checkpoint-78750/optimizer.pt (deflated 38%)
  adding: content/BERT/checkpoint-78750/tokenizer_config.json (deflated 41%)
  adding: content/BERT/checkpoint-78750/vocab.txt (deflated 49%)
  adding: content/BERT/checkpoint-78750/tokenizer.json (deflated 70%)
  adding: content/BERT/checkpoint-78750/trainer_state.json (deflated 87%)
  adding: content/BERT/checkpoint-78750/pytorch_model.bin (deflated 8%)
  adding: content/BERT/checkpoint-78750/rng_state.pth (deflated 28%)
  adding: content/BERT/checkpoint-78750/special_tokens_map.json (deflated 42%)
  adding: content/BERT/checkpoint-78750/training_args.bin (deflated 48%)
  adding: content/BERT/checkpoint-78750/config.json (deflated 58%)


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime); the clustering
# section reads its input CSV from a path under this mount point.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
trainer.state.log_history

[{'loss': 0.8671,
  'learning_rate': 4.968253968253969e-05,
  'epoch': 0.02,
  'step': 500},
 {'loss': 0.2133,
  'learning_rate': 4.936507936507937e-05,
  'epoch': 0.04,
  'step': 1000},
 {'loss': 0.1414,
  'learning_rate': 4.904761904761905e-05,
  'epoch': 0.06,
  'step': 1500},
 {'loss': 0.1091,
  'learning_rate': 4.873015873015873e-05,
  'epoch': 0.08,
  'step': 2000},
 {'loss': 0.0924,
  'learning_rate': 4.841269841269841e-05,
  'epoch': 0.1,
  'step': 2500},
 {'loss': 0.0813,
  'learning_rate': 4.80952380952381e-05,
  'epoch': 0.11,
  'step': 3000},
 {'loss': 0.0782,
  'learning_rate': 4.7777777777777784e-05,
  'epoch': 0.13,
  'step': 3500},
 {'loss': 0.0707,
  'learning_rate': 4.746031746031746e-05,
  'epoch': 0.15,
  'step': 4000},
 {'loss': 0.0672,
  'learning_rate': 4.714285714285714e-05,
  'epoch': 0.17,
  'step': 4500},
 {'loss': 0.0624,
  'learning_rate': 4.682539682539683e-05,
  'epoch': 0.19,
  'step': 5000},
 {'loss': 0.0595,
  'learning_rate': 4.6507936507936515e-05,
 

# Clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
import itertools
import torch
from pycaret.clustering import *

In [None]:
# Load the full packet table (columns packet_dat and attack_cat, per the display below).
# Needs Google Drive mounted at /content/drive.
# NOTE(review): the first section downloads DATA_TRAIN.csv to /content/ with gdown, but
# this reads a copy under MyDrive/NLP - confirm which file is the intended input.
df = pd.read_csv('/content/drive/MyDrive/NLP/DATA_TRAIN.csv')

In [None]:
df

Unnamed: 0,packet_dat,attack_cat
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk
...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk


In [None]:
# Load EMBEDDING.csv from the working directory into a DataFrame.
new_df = pd.read_csv('./EMBEDDING.csv')

In [None]:
# Standardizer that is fitted on the embedding table in the next step.
scaler = StandardScaler()

In [None]:
# Fit the scaler and standardize the table. Note that new_df is rebound from a DataFrame
# to the NumPy array returned by fit_transform (see the array display below).
new_df = scaler.fit_transform(new_df)

In [None]:
new_df

array([[ 1.39890419,  1.62363812, -0.4314475 , ..., -0.58798875,
         0.12279084, -0.12605832],
       [-0.01599618, -1.00361614,  0.59355158, ..., -1.58293487,
         0.50788063, -1.36838972],
       [ 1.68385094,  1.1786868 , -0.93584771, ..., -1.19744494,
         0.2811531 , -0.35964584],
       ...,
       [ 1.19198466,  0.46667399, -1.40050749, ...,  0.5583523 ,
         1.50819284,  2.51796654],
       [-0.53120693,  0.7115948 , -0.29749342, ...,  0.18920795,
        -0.99527525,  0.18722665],
       [ 0.68815732, -0.80118774, -0.48318969, ...,  0.57190332,
        -1.10341874, -0.6256423 ]])

In [None]:
import pickle

# Persist the object bound to `scaler` (expected to be the fitted StandardScaler) to
# SCALER.pkl so it can be loaded again later.
with open('SCALER.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
type(df['embedding'][0])

str

In [None]:
[float(x) for x in df['embedding'][0].strip("[]").replace("\n", "").split()]

In [None]:
def _parse_embedding(text):
  """Turn one stringified embedding, e.g. "[0.1 -0.2 0.3]", into a list of floats.

  Args:
    text: the bracketed string form of a vector, with values separated by whitespace
      (including line breaks) and/or commas. A value that is not a string is treated
      as an already-parsed sequence of numbers.

  Returns:
    list of float.
  """
  if not isinstance(text, str):
    # Already parsed (e.g. the cell was run twice): pass the numbers through instead of
    # failing on the missing .strip().
    return [float(i) for i in text]
  # str.split() with no argument splits on any run of whitespace, newlines included, so
  # line-wrapped values stay separate; commas are turned into spaces first so a
  # list-style "[a, b]" string parses as well.
  return [float(i) for i in text.strip("[]").replace(",", " ").split()]

df['embedding'] = df['embedding'].apply(_parse_embedding)

In [None]:
df

Unnamed: 0,packet_dat,attack_cat,embedding
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS,"[0.566230714, 1.03200865, -0.469253629, 0.1800..."
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal,"[-0.232119739, -0.668878675, 0.132740527, 0.70..."
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS,"[0.727010489, 0.743946671, -0.76549387, 0.0006..."
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk,"[-1.04123712, 0.482669771, -0.46553719, 0.4727..."
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk,"[-1.07848549, 0.413386047, -0.456292927, 0.502..."
...,...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator,"[-0.0342493653, 0.366511583, -0.643470943, 0.8..."
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS,"[0.73437655, 0.516807735, 0.00688571436, -0.29..."
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator,"[0.449477404, 0.282988787, -1.03839409, 0.7508..."
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk,"[-0.522824824, 0.441550791, -0.390580803, 0.75..."


In [None]:
# Cap every attack category at 3000 rows. random_state pins the draw so the subsample
# (and anything computed from it) is identical on every Restart & Run All.
s_df = df.groupby('attack_cat').apply(lambda x: x.sample(min(len(x), 3000), random_state=42)).reset_index(drop=True)

In [None]:
df

Unnamed: 0,packet_dat,attack_cat
0,14 11 2798 -1 80 62088 414 374 252 0 5 0 -1 72...,Analysis
1,11 8 2110 -1 80 54809 398 358 253 0 5 0 -1 72 ...,Analysis
2,16 12 3060 -1 80 65401 401 361 253 0 5 0 -1 72...,Analysis
3,9 9 1958 -1 3154 80 238 198 63 0 5 3 -1 71 69 ...,Analysis
4,17 14 3494 -1 80 65393 462 422 253 0 5 0 -1 72...,Analysis
...,...,...
61757,21 114 149193 -1 80 30394 363 323 252 0 5 0 -1...,Worms
61758,14 87 119898 -1 80 63800 1500 1460 253 0 5 0 -...,Worms
61759,14 87 119898 -1 80 63800 1500 1460 252 0 5 0 -...,Worms
61760,14 87 119898 -1 80 63800 1500 1460 252 0 5 0 -...,Worms


In [None]:
df.attack_cat.value_counts()

DoS Hulk                      417967
Normal                        181742
DDoS                          127975
Heartbleed                    117056
SSH Patator                    59905
DoS                            49047
Exploits                       45931
Generic                        34343
FTP Patator                    31684
Fuzzers                        25409
Reconnaissance                 24250
DoS GoldenEye                  23355
Web Attack - Brute Force       13014
Infiltration                    7514
Worms                           6552
DoS SlowHTTPTest                5567
DoS Slowloris                   4424
Bot                             3679
Web Attack - XSS                3605
Analysis                        1819
Backdoor                        1245
Shellcode                       1063
Port Scan                        605
Web Attack - SQL Injection        30
Name: attack_cat, dtype: int64

In [None]:
# Load the tokenizer and the base model of the packet checkpoint.
# These rebind the names `tokenizer` and `model` used in the fine-tuning section.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("rdpahalavan/bert-network-packet-v2")

# AutoModel loads the model without the classifier weights (see the recorded warning below).
model = AutoModel.from_pretrained("rdpahalavan/bert-network-packet-v2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at rdpahalavan/bert-network-packet-v2 were not used when initializing DistilBertModel: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Move the model to device index 0 (the first CUDA GPU); this requires a GPU runtime.
model = model.to(0)

In [None]:
time = 0  # number of get_embedding() calls so far; only drives the progress message
def get_embedding(text):
  """Embed `text` as the mean of the model's token vectors.

  The text is tokenized (with truncation) by the notebook-level `tokenizer`, run through
  the notebook-level `model` without gradient tracking, and `last_hidden_state` is
  averaged over the sequence axis after dropping the first and last positions
  (presumably the [CLS]/[SEP] special tokens - confirm for this tokenizer).

  Side effects:
    Increments the global call counter `time` and prints a progress line on every
    10000th call.

  Args:
    text: the string to embed.

  Returns:
    NumPy array holding one averaged vector per input sequence; callers in this notebook
    pass a single string and take element [0].
  """
  global time
  with torch.no_grad():
    # Place the inputs on whatever device the model currently lives on, rather than
    # repeating a hard-coded device index here.
    tokens = tokenizer(text, truncation=True, return_tensors='pt').to(model.device)
    output = model(**tokens)
    embedding = output.last_hidden_state[:, 1:-1, :].mean(dim=1).detach().cpu().numpy()
  time += 1
  if time % 10000 == 0:
    print(f'Completed: {time}')
  return embedding

In [None]:
# Embed every packet string, one get_embedding() call per row; [0] takes the first
# (only) row of the array that get_embedding returns.
df['embedding'] = df['packet_dat'].apply(lambda packet: get_embedding(packet)[0])

In [None]:
df

Unnamed: 0,packet_dat,attack_cat,embedding
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS,"[0.5662307, 1.0320086, -0.46925363, 0.18004657..."
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal,"[-0.23211974, -0.6688787, 0.13274053, 0.706968..."
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS,"[0.7270105, 0.7439467, -0.76549387, 0.00066453..."
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk,"[-1.0412371, 0.48266977, -0.4655372, 0.4727513..."
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk,"[-1.0784855, 0.41338605, -0.45629293, 0.502546..."
...,...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator,"[-0.034249365, 0.36651158, -0.64347094, 0.8059..."
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS,"[0.73437655, 0.51680773, 0.0068857144, -0.2929..."
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator,"[0.4494774, 0.2829888, -1.0383941, 0.7508589, ..."
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk,"[-0.5228248, 0.4415508, -0.3905808, 0.7572069,..."


In [None]:
new_df = pd.DataFrame(df['embedding'].tolist())

In [None]:
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.566231,1.032009,-0.469254,0.180047,0.144989,-0.368475,0.388921,-0.388886,-0.409928,0.086448,...,0.997707,0.302570,0.416993,-0.164334,0.978619,-0.614433,-0.225654,-0.198800,0.149472,0.062751
1,-0.232120,-0.668879,0.132741,0.706968,0.573730,-0.091610,-0.746398,0.110077,-0.098132,0.728295,...,0.610387,1.002173,0.250074,0.580995,0.631434,0.913427,-0.673387,-0.836257,0.322913,-0.601973
2,0.727010,0.743947,-0.765494,0.000665,-0.111112,0.011211,0.097044,-0.321532,-0.127531,0.285301,...,0.953088,0.175978,0.388354,-0.161116,0.888571,-0.626760,-0.128763,-0.589276,0.220797,-0.062233
3,-1.041237,0.482670,-0.465537,0.472751,0.450419,-0.012997,0.360186,-0.291310,-0.660684,-0.551630,...,0.556321,-1.433114,0.972958,-0.444996,0.162548,0.186794,0.724563,0.283182,-0.143693,0.216256
4,-1.078485,0.413386,-0.456293,0.502546,0.506876,0.088572,0.388568,-0.276944,-0.725662,-0.497849,...,0.518704,-1.470442,0.968616,-0.455446,0.188966,0.139524,0.763116,0.369715,-0.173726,0.215749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187776,-0.034249,0.366512,-0.643471,0.805957,0.360807,0.034010,0.003919,0.297773,0.219370,-1.185546,...,-1.329836,0.587557,0.192689,0.213404,-1.510436,-0.384090,-0.142956,0.454081,0.637317,1.526956
1187777,0.734377,0.516808,0.006886,-0.292908,-0.670592,-0.364789,-0.779417,0.754055,-0.454206,-1.008551,...,0.138735,-0.164899,-0.186572,-0.171693,-0.893917,0.706636,-0.283599,1.632657,0.015785,1.158581
1187778,0.449477,0.282989,-1.038394,0.750859,0.311943,-0.042874,-0.198380,0.102160,0.148049,-0.898640,...,-1.250497,0.535474,0.156221,-0.133085,-1.531517,-0.032511,-0.246156,0.535655,0.773442,1.477466
1187779,-0.522825,0.441551,-0.390581,0.757207,0.475397,-0.104455,0.140438,-0.334763,-0.926548,-0.971389,...,0.550062,-1.232339,0.967732,-0.318511,0.054592,0.235966,0.631848,0.299146,-0.354092,0.230377


In [None]:
# Pin the PyCaret session seed so the clustering experiment can be repeated.
# 8917 is the session id that the recorded run drew at random (see the setup summary
# in this cell's output).
s = setup(new_df, session_id=8917, use_gpu=True)

Unnamed: 0,Description,Value
0,Session id,8917
1,Original data shape,"(1187781, 768)"
2,Transformed data shape,"(1187781, 768)"
3,Numeric features,768
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,True


In [None]:
kmeans = create_model('kmeans', num_clusters=516)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1392,114273.2133,1.973,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
save_model(kmeans, 'K-MEANS')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['feature_1', 'feature_2',
                                              'feature_3', 'feature_4',
                                              'feature_5', 'feature_6',
                                              'feature_7', 'feature_8',
                                              'feature_9', 'feature_10',
                                              'feature_11', 'feature_12',
                                              'feature_13', 'feature_14',
                                              'feature_15', 'feature_16',
                                              'feature_17', 'feature_18',
                                              'feature_19', 'feature_20',
                                              'feature_21', 'feature_22',
                                              'feature_23', 'feature_24',
                               

In [None]:
kmeans.n_iter_

238

In [None]:
kmeans.inertia_

18039302.0

In [None]:
np.save('KMEANS-CLUSTER-CENTERS.npy', kmeans.cluster_centers_)

In [None]:
kmeans.cluster_centers_.shape

(516, 768)

In [None]:
# Attach each sample's attack category to its cluster assignment.
# NOTE(review): labels are matched by position (.tolist()), which assumes
# assign_model() returns rows in the same order as df -- confirm.
kmeans_cluster = assign_model(kmeans).assign(label=df['attack_cat'].tolist())
# Contingency table: one row per cluster, one column per label, cells = sample counts.
kmeans_cluster.pivot_table(index='Cluster', columns='label', aggfunc='size', fill_value=0)

label,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,...,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cluster 0,0,0,0,0,0,0,0,0,0,0,...,0,6297,0,0,0,0,0,0,0,0
Cluster 1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cluster 10,0,0,0,0,0,686,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cluster 100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,627,0,0,0,0,0
Cluster 101,0,0,0,0,0,0,6149,0,0,0,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cluster 95,0,0,0,5594,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
Cluster 96,0,0,0,0,0,0,0,0,0,1,...,0,4,0,0,0,0,0,0,0,0
Cluster 97,0,0,0,4512,0,0,0,0,0,0,...,0,9,0,0,0,0,0,0,0,0
Cluster 98,0,0,0,0,1941,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0




In [None]:
# Sweep the number of clusters (12, 24, ..., 576) and keep the metric grid PyCaret
# reports for each k.
sweep_results = []
for k in range(12, 577, 12):
  kmeans = create_model('kmeans', num_clusters=k)
  # pull() returns the score grid of the model just created; tag it with its k.
  kmeans_results = pull().assign(k=k)
  sweep_results.append(kmeans_results)
# Concatenate once at the end: avoids re-copying the growing frame on every iteration
# and avoids concatenating onto an empty DataFrame.
final_df = pd.concat(sweep_results, ignore_index=True)
# NOTE: after the loop `kmeans` is the last model fitted (k=576).

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4321,7184.8126,1.4128,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.7356,31712.0698,0.5285,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5677,29072.7487,1.0423,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.52,26263.825,1.1606,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4333,23907.1886,1.2923,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3405,21440.3835,1.4594,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3496,19691.5151,1.5085,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3183,18123.3537,1.5738,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2732,16690.2888,1.6344,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2847,15725.1398,1.6503,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2822,14738.0777,1.7348,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2624,13899.3207,1.7279,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2542,13122.5528,1.7914,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2546,12501.792,1.783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.231,11877.9954,1.8024,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2323,11415.5225,1.812,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2274,10926.04,1.8062,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2221,10539.6328,1.8132,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2147,10121.0497,1.8322,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2221,9777.504,1.8471,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2133,9421.4609,1.8763,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2162,9119.2774,1.8729,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2026,8822.5519,1.8836,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1996,8596.0282,1.8878,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1987,8340.1317,1.8922,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2004,8129.0135,1.859,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1955,7918.8973,1.8687,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1967,7711.0718,1.9046,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.193,7521.2678,1.8827,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1902,7340.5846,1.8894,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.184,7185.3269,1.9159,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1926,7023.996,1.8909,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1929,6886.6468,1.8799,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1862,6725.3418,1.9045,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1731,6587.9074,1.8905,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1722,6459.4822,1.8945,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.185,6342.5843,1.8796,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1782,6219.3792,1.9163,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.181,6110.8203,1.8893,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1859,6017.3388,1.8953,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1683,5893.4768,1.9022,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1667,5815.0121,1.9009,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1812,5701.7928,1.8695,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1653,5614.3201,1.8774,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1796,5538.3649,1.8858,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1661,5436.9846,1.8935,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1631,5368.5841,1.8911,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1795,5296.2274,1.8711,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
final_df

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness,k
0,0.4321,7184.8126,1.4128,0,0,0,12
1,0.7356,31712.0698,0.5285,0,0,0,24
2,0.5677,29072.7487,1.0423,0,0,0,36
3,0.52,26263.825,1.1606,0,0,0,48
4,0.4333,23907.1886,1.2923,0,0,0,60
5,0.3405,21440.3835,1.4594,0,0,0,72
6,0.3496,19691.5151,1.5085,0,0,0,84
7,0.3183,18123.3537,1.5738,0,0,0,96
8,0.2732,16690.2888,1.6344,0,0,0,108
9,0.2847,15725.1398,1.6503,0,0,0,120


In [None]:
kmeans

In [None]:
kmeans = create_model('kmeans', num_clusters=456)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1782,6219.3792,1.9163,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Custom Clusters
# Attach each sample's attack category to its cluster assignment.
# NOTE(review): labels are matched by position (.tolist()), which assumes
# assign_model() returns rows in the same order as df -- confirm.
kmeans_cluster = assign_model(kmeans).assign(label=df['attack_cat'].tolist())
# Contingency table: one row per cluster, one column per label, cells = sample counts.
p_df = kmeans_cluster.pivot_table(index='Cluster', columns='label', aggfunc='size', fill_value=0)
p_df

label,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,...,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cluster 0,0,0,0,0,0,0,0,0,0,0,...,0,6297,0,0,0,0,0,0,0,0
Cluster 1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cluster 10,0,0,0,0,0,686,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cluster 100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,627,0,0,0,0,0
Cluster 101,0,0,0,0,0,0,6149,0,0,0,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cluster 95,0,0,0,5594,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
Cluster 96,0,0,0,0,0,0,0,0,0,1,...,0,4,0,0,0,0,0,0,0,0
Cluster 97,0,0,0,4512,0,0,0,0,0,0,...,0,9,0,0,0,0,0,0,0,0
Cluster 98,0,0,0,0,1941,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
p_df.to_csv('PIVOT_TABLE.csv')

In [None]:
# prompt: read PIVOT_TABLE.csv into dataframe

p_df = pd.read_csv('PIVOT_TABLE.csv', index_col=0)

In [None]:
p_df

Unnamed: 0_level_0,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,...,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cluster 0,0,0,0,0,0,0,0,0,0,0,...,0,6297,0,0,0,0,0,0,0,0
Cluster 1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cluster 10,0,0,0,0,0,686,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cluster 100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,627,0,0,0,0,0
Cluster 101,0,0,0,0,0,0,6149,0,0,0,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cluster 95,0,0,0,5594,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
Cluster 96,0,0,0,0,0,0,0,0,0,1,...,0,4,0,0,0,0,0,0,0,0
Cluster 97,0,0,0,4512,0,0,0,0,0,0,...,0,9,0,0,0,0,0,0,0,0
Cluster 98,0,0,0,0,1941,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Boolean mask over the pivot table: True where a cluster holds more than 1 sample of a label.
# NOTE(review): the variable name says "10" but the threshold applied is 1 -- confirm
# which of the two is intended.
greater_than_10 = p_df.gt(1)

In [None]:
count_greater_than_10 = greater_than_10.sum(axis=1)

In [None]:
more_than_one_greater_than_10 = count_greater_than_10 > 1

In [None]:
more_than_one_greater_than_10.sum()

156

In [None]:
kmeans.cluster_centers_[0]

In [None]:
df['packet_dat'][8413]

'4 0 24 -1 80 53711 5880 5840 64 0 5 0 -1 112 120 59 10 10 32 32 32 32 102 111 110 116 45 115 105 122 101 58 32 49 56 48 37 59 10 32 32 32 32 102 111 110 116 45 119 101 105 103 104 116 58 32 98 111 108 100 59 10 32 32 125 10 10 32 32 100 105 118 46 112 97 103 101 95 104 101 97 100 101 114 32 105 109 103 32 123 10 32 32 32 32 109 97 114 103 105 110 58 32 51 112 120 32 48 112 120 32 48 112 120 32 52 48 112 120 59 10 10 32 32 32 32 98 111 114 100 101 114 58 32 48 112 120 32 48 112 120 32 48 112 120 59 10 32 32 125 10 10 32 32 100 105 118 46 116 97 98 108 101 95 111 102 95 99 111 110 116 101 110 116 115 32 123 10 32 32 32 32 99 108 101 97 114 58 32 108 101 102 116 59 10 10 32 32 32 32 109 105 110 45 119 105 100 116 104 58 32 50 48 48 112 120 59 10 10 32 32 32 32 109 97 114 103 105 110 58 32 51 112 120 32 51 112 120 32 51 112 120 32 51 112 120 59 10 10 32 32 32 32 98 97 99 107 103 114 111 117 110 100 45 99 111 108 111 114 58 32 35 70 70 70 70 70 70 59 10 10 32 32 32 32 116 101 120 116 45 97

In [None]:
get_embedding('Analysis')[0]

In [None]:
df[df['packet_dat'].str.len()<500]

Unnamed: 0,packet_dat,attack_cat,embedding
6163,A distributed denial-of-service (DDoS) attack ...,DDoS,"[0.3744314, -0.26912475, 0.13510273, -0.065331..."
6421,"As technology evolves, so do the techniques em...",DDoS,"[0.7172379, -0.39320922, 0.42899263, 0.3460503..."
6567,DDoS attacks can target various layers of the ...,DDoS,"[0.5198794, -0.28791603, 0.40043634, 0.0374435..."
6953,Mitigating DDoS attacks is a complex task that...,DDoS,"[0.5613602, -0.4123387, 0.17053483, 0.12550281..."
7162,DDoS attacks are not limited to large organiza...,DDoS,"[0.4610605, -0.4154755, 0.25910008, -0.0462436..."
7214,The consequences of a successful DDoS attack c...,DDoS,"[0.2706191, -0.306447, 0.020052975, 0.00434680..."
7636,"DDoS attacks have evolved over time, becoming ...",DDoS,"[0.4367774, -0.45091015, 0.26178655, -0.188619..."
7863,The motives behind DDoS attacks can vary. Some...,DDoS,"[0.14406481, -0.22542818, 0.11302145, 0.006903..."
8407,Combating DDoS attacks requires a collaborativ...,DDoS,"[0.29670325, -0.6804996, 0.21292919, -0.043914..."
8412,DDoS attacks are not only a concern for the pr...,DDoS,"[0.26654494, -0.30732167, -0.20270191, -0.0540..."


In [None]:
# Embed one sample (row 8413) and rank every cluster centre by cosine distance to it.
query_vec = get_embedding(df['packet_dat'][8413])[0].reshape(1, -1)
cosine_dist = cosine_distances(kmeans.cluster_centers_, query_vec)
# cosine_dist has one single-element row per cluster centre; round each distance to 4 places.
cosine_dist_formatted = [round(row[0], 4) for row in cosine_dist]
data = {'Cluster {}'.format(idx): dist for idx, dist in enumerate(cosine_dist_formatted)}
# Closest cluster first, with a fresh 0..n-1 index.
df_sorted = (
    pd.DataFrame(list(data.items()), columns=['Cluster', 'Cosine Distance'])
    .sort_values(by='Cosine Distance')
    .reset_index(drop=True)
)
df_sorted

Unnamed: 0,Cluster,Cosine Distance
0,Cluster 8,0.3654
1,Cluster 17,0.4455
2,Cluster 1,0.8363
3,Cluster 3,0.9002
4,Cluster 14,0.9208
5,Cluster 23,0.9272
6,Cluster 19,0.987
7,Cluster 5,1.0201
8,Cluster 15,1.0234
9,Cluster 13,1.0252


In [None]:
cosine_dist[0][0]

1.1381369

In [None]:
pd.set_option('display.max_columns', 25)
from google.colab.data_table import DataTable
DataTable.max_columns = 25

In [None]:
# Pooling variant: all tokens except [CLS] and [SEP].
# NOTE(review): this code is the same as the other cluster/label pivot cells and does
# not select the pooling variant itself; the output presumably reflects whichever
# get_embedding() variant was last used to build the embeddings -- confirm before
# comparing the tables.
kmeans_cluster = assign_model(kmeans).assign(label=df['attack_cat'].tolist())
# Contingency table: one row per cluster, one column per label, cells = sample counts.
kmeans_cluster.pivot_table(index='Cluster', columns='label', aggfunc='size', fill_value=0)

label,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,FTP Patator,Fuzzers,Generic,Heartbleed,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Cluster 0,1819,0,0,0,0,0,0,0,0,1,0,3,7,0,0,1,0,0,0,0,0,0,0,0
Cluster 1,0,0,0,0,0,0,0,0,3000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Cluster 10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3000,0,0,0,0,0
Cluster 11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3000,0,0,0
Cluster 12,0,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Cluster 13,0,0,0,0,15,0,0,0,0,2977,0,0,4,0,0,1,0,0,0,0,0,0,0,0
Cluster 14,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cluster 15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3000,0
Cluster 16,0,0,0,0,143,0,0,0,0,1,0,0,0,0,0,0,0,1751,0,0,0,0,0,0
Cluster 17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2990,0,0,0,0,0,0,0,0


In [None]:
# Pooling variant: all tokens.
# NOTE(review): this code is the same as the other cluster/label pivot cells and does
# not select the pooling variant itself; the output presumably reflects whichever
# get_embedding() variant was last used to build the embeddings -- confirm before
# comparing the tables.
kmeans_cluster = assign_model(kmeans).assign(label=df['attack_cat'].tolist())
# Contingency table: one row per cluster, one column per label, cells = sample counts.
kmeans_cluster.pivot_table(index='Cluster', columns='label', aggfunc='size', fill_value=0)

label,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,FTP Patator,Fuzzers,Generic,Heartbleed,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Cluster 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3000
Cluster 1,0,0,0,0,0,0,0,0,3000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Cluster 10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3000,0,0,0,0,0,0,6,0,0
Cluster 11,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cluster 12,0,0,0,3000,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0
Cluster 13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3000,0
Cluster 14,0,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Cluster 15,0,0,0,0,1,0,0,0,0,6,0,2996,2,0,0,1,0,0,0,0,0,0,0,0
Cluster 16,0,0,0,0,55,0,0,0,0,8,0,1,2980,0,0,0,0,0,0,0,0,0,0,0
Cluster 17,1819,0,0,0,0,0,0,0,0,1,0,3,7,0,0,1,0,0,0,0,0,0,0,0


In [None]:
# Pooling variant: [CLS] token only.
# NOTE(review): this code is the same as the other cluster/label pivot cells and does
# not select the pooling variant itself; the output presumably reflects whichever
# get_embedding() variant was last used to build the embeddings -- confirm before
# comparing the tables.
kmeans_cluster = assign_model(kmeans).assign(label=df['attack_cat'].tolist())
# Contingency table: one row per cluster, one column per label, cells = sample counts.
kmeans_cluster.pivot_table(index='Cluster', columns='label', aggfunc='size', fill_value=0)

label,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,FTP Patator,Fuzzers,Generic,Heartbleed,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Cluster 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,605,0,0,0,0,0,0,0
Cluster 1,0,0,0,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cluster 10,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3000
Cluster 11,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cluster 12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3000,0,0,0,0,0
Cluster 13,0,0,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cluster 14,0,0,0,0,56,0,0,0,0,8,0,1,2983,0,0,0,0,0,0,0,0,0,0,0
Cluster 15,0,0,0,0,12,0,0,0,0,2976,0,0,4,0,0,0,0,0,0,0,0,0,0,0
Cluster 16,0,0,0,0,0,0,0,0,0,7,0,2997,2,0,0,1,0,0,0,0,0,0,0,0
Cluster 17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3000,0,0,0,0,0,0,0,0,0


# Cluster Analysis

In [None]:
!pip install openai transformers

In [None]:
import pandas as pd
import openai
import os
import numpy as np
import pickle
import re
import time
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
pd.set_option('display.max_columns', 25)
from google.colab.data_table import DataTable
DataTable.max_columns = 25

In [None]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face Hub checkpoint shared by the tokenizer and the encoder.
CHECKPOINT = "rdpahalavan/bert-network-packet-v2"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# AutoModel loads the bare encoder only; the checkpoint's classifier / pre_classifier
# weights are not used (see the "were not used" notice in this cell's output).
model = AutoModel.from_pretrained(CHECKPOINT)

Some weights of the model checkpoint at rdpahalavan/bert-network-packet-v2 were not used when initializing DistilBertModel: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the pickled scaler that get_embedding() applies to every embedding.
# NOTE: pickle.load can execute arbitrary code from the file -- only load pickles you
# created yourself.
# NOTE(review): hard-coded path under /content/drive; presumably requires Google Drive
# to be mounted first -- confirm.
with open('/content/drive/MyDrive/NLP/SCALER.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
# Column labels '0'..'767' given to the one-row frame handed to the scaler
# (presumably the feature names the scaler was fitted with -- confirm).
scaler_col_names = [str(i) for i in range(768)]
def get_embedding(text):
  """Embed one text, mean-pool its inner token vectors and apply `scaler`.

  Args:
    text: input passed to `tokenizer`; `truncation=True` truncates over-long inputs.

  Returns:
    The first row of `scaler.transform(...)` for the pooled embedding. The pooled
    embedding is the mean of `last_hidden_state` over all positions except the
    first and the last (presumably [CLS] and [SEP] -- confirm against the tokenizer).
  """
  import torch  # local import: the import cell of this section does not import torch

  # Inference only: skip autograd bookkeeping instead of building a graph on every call.
  with torch.no_grad():
    # Send the inputs to whatever device the model lives on.
    tokens = tokenizer(text, truncation=True, return_tensors='pt').to(model.device)
    hidden = model(**tokens).last_hidden_state
    embedding = hidden[:, 1:-1, :].mean(dim=1).cpu().numpy()
  # One-row frame with named columns (named `features` so it does not shadow the
  # notebook-level `df`).
  features = pd.DataFrame(embedding[0].reshape(1, -1), columns=scaler_col_names)
  return scaler.transform(features)[0]

In [None]:
main_df = pd.read_csv('/content/drive/MyDrive/NLP/DATA_TRAIN.csv')

In [None]:
main_df

Unnamed: 0,packet_dat,attack_cat
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk
...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk


In [None]:
df = pd.read_csv('PIVOT_TABLE.csv', index_col=0)

In [None]:
df

Unnamed: 0_level_0,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,FTP Patator,Fuzzers,Generic,Heartbleed,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Cluster 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6297,0,0,0,0,0,0,0,0
Cluster 1,0,0,0,0,0,0,0,0,0,0,0,0,0,2601,0,0,0,0,0,0,0,0,0,0
Cluster 10,0,0,0,0,0,686,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cluster 100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,627,0,0,0,0,0
Cluster 101,0,0,0,0,0,0,6149,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cluster 95,0,0,0,5594,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0
Cluster 96,0,0,0,0,0,0,0,0,0,1,0,1112,0,0,0,4,0,0,0,0,0,0,0,0
Cluster 97,0,0,0,4512,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0
Cluster 98,0,0,0,0,1941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
def create_value_name_pair(row):
    """Join the keys of every entry in ``row`` whose value is not equal to 0.

    ``row`` only needs to expose ``.items()`` (a dict or a pandas row both
    work). Keys are kept in iteration order and separated by ``', '``; an
    empty string is returned when every value equals 0.
    """
    present = []
    for label, count in row.items():
        if count != 0:
            present.append(f"{label}")
    return ', '.join(present)

In [None]:
df['Cluster Name'] = df.apply(create_value_name_pair, axis=1)

In [None]:
df

Unnamed: 0_level_0,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,FTP Patator,Fuzzers,Generic,Heartbleed,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms,Cluster Name
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Cluster 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6297,0,0,0,0,0,0,0,0,Normal
Cluster 1,0,0,0,0,0,0,0,0,0,0,0,0,0,2601,0,0,0,0,0,0,0,0,0,0,Heartbleed
Cluster 10,0,0,0,0,0,686,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,DoS GoldenEye
Cluster 100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,627,0,0,0,0,0,SSH Patator
Cluster 101,0,0,0,0,0,0,6149,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,"DoS Hulk, Normal"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cluster 95,0,0,0,5594,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,"DDoS, Normal"
Cluster 96,0,0,0,0,0,0,0,0,0,1,0,1112,0,0,0,4,0,0,0,0,0,0,0,0,"Exploits, Fuzzers, Normal"
Cluster 97,0,0,0,4512,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,"DDoS, Normal"
Cluster 98,0,0,0,0,1941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,DoS


In [None]:
df[df['Cluster Name'].str.split(',').apply(lambda x: len(x)) > 1]

Unnamed: 0_level_0,Analysis,Backdoor,Bot,DDoS,DoS,DoS GoldenEye,DoS Hulk,DoS SlowHTTPTest,DoS Slowloris,Exploits,FTP Patator,Fuzzers,Generic,Heartbleed,Infiltration,Normal,Port Scan,Reconnaissance,SSH Patator,Shellcode,Web Attack - Brute Force,Web Attack - SQL Injection,Web Attack - XSS,Worms,Cluster Name
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Cluster 101,0,0,0,0,0,0,6149,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,"DoS Hulk, Normal"
Cluster 103,0,0,0,0,0,0,6188,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"DoS Hulk, Normal"
Cluster 106,0,0,0,0,140,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"DoS, Exploits"
Cluster 109,0,0,0,0,0,0,6353,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"DoS Hulk, Normal"
Cluster 11,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,351,0,0,0,0,0,0,"DoS, Fuzzers, Reconnaissance"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cluster 92,0,0,0,0,0,0,12466,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,"DoS Hulk, Normal"
Cluster 94,0,0,0,0,3,0,0,0,0,0,0,948,0,0,0,0,0,0,0,0,0,0,0,0,"DoS, Fuzzers"
Cluster 95,0,0,0,5594,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,"DDoS, Normal"
Cluster 96,0,0,0,0,0,0,0,0,0,1,0,1112,0,0,0,4,0,0,0,0,0,0,0,0,"Exploits, Fuzzers, Normal"


In [None]:
cluster_centers = np.load('/content/drive/MyDrive/NLP/KMEANS-CLUSTER-CENTERS.npy')
cluster_centers.shape

(516, 768)

In [None]:
def generate_cluster_name(row, top_k=50, threshold=0.97):
  """Extend a cluster's label set with the labels of its most similar clusters.

  The cluster id is parsed from ``row.name`` (expected form ``'Cluster <n>'``)
  and used to index the module-level ``cluster_centers`` array. The cosine
  similarity between that centre and every centre is rounded to 4 decimals.
  Among the ``top_k`` most similar clusters, those whose rounded similarity is
  strictly greater than ``threshold`` and whose ``'Cluster Name'`` (looked up
  in the module-level ``df``) differs from the best match's name contribute
  their labels.

  Args:
    row: a row with a ``'Cluster Name'`` entry and a ``'Cluster <n>'`` name.
    top_k: number of most-similar clusters to inspect (default 50).
    threshold: exclusive lower bound on the rounded similarity (default 0.97).

  Returns:
    The de-duplicated labels joined by ``', '``: the row's own labels first,
    then neighbours' labels in order of decreasing similarity.
  """
  center = cluster_centers[int(row.name.split('Cluster ')[1])].reshape(1, -1)
  similarity = [round(sim[0], 4) for sim in cosine_similarity(cluster_centers, center)]
  # Most similar first; sorted() is stable, so ties keep ascending cluster id.
  ranked = sorted(range(len(similarity)), key=lambda i: similarity[i], reverse=True)
  # Slicing (instead of range(50)) stays in bounds with fewer than top_k clusters,
  # and only the inspected clusters need a name lookup.
  ranked = ranked[:top_k]
  names = [df.loc['Cluster {}'.format(i), 'Cluster Name'] for i in ranked]

  labels = row['Cluster Name'].split(', ')
  for i, name in zip(ranked, names):
    if name != names[0] and similarity[i] > threshold:
      labels.extend(name.split(', '))
  # dict.fromkeys de-duplicates while keeping first-seen order; list(set(...))
  # produced an order that changed from run to run.
  return ', '.join(dict.fromkeys(labels))

In [None]:
df['Cluster Name'] = df.apply(lambda row: generate_cluster_name(row), axis=1)

In [None]:
# .copy() makes this an independent frame, so adding columns or sorting it
# later does not trigger SettingWithCopyWarning.
df = df[['Cluster Name']].copy()

In [None]:
df

Unnamed: 0_level_0,Cluster Name
Cluster,Unnamed: 1_level_1
Cluster 0,Normal
Cluster 1,Heartbleed
Cluster 10,DoS GoldenEye
Cluster 100,SSH Patator
Cluster 101,"Normal, DoS Hulk"
...,...
Cluster 95,"Normal, DDoS"
Cluster 96,"Exploits, Generic, Normal, Fuzzers, DoS"
Cluster 97,"Normal, DDoS"
Cluster 98,"Exploits, DoS"


In [None]:
# Numeric cluster id parsed from the 'Cluster <n>' index label, so clusters can
# be sorted numerically instead of lexicographically. assign() returns a new
# frame rather than writing a column into what may be a slice of another frame.
df = df.assign(id=df.index.to_series().apply(lambda x: int(x.split('Cluster ')[1])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df.index.to_series().apply(lambda x: int(x.split('Cluster ')[1]))


In [None]:
# Rebind instead of sorting in place: inplace=True on a frame derived from a
# slice raises SettingWithCopyWarning and makes the cell harder to re-run.
df = df.sort_values(by='id')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(by='id', inplace=True)


In [None]:
df = df.drop('id', axis=1)

In [None]:
df

Unnamed: 0_level_0,Cluster Name
Cluster,Unnamed: 1_level_1
Cluster 0,Normal
Cluster 1,Heartbleed
Cluster 2,"Normal, DoS Hulk"
Cluster 3,"Normal, DDoS"
Cluster 4,"Exploits, SSH Patator"
...,...
Cluster 511,"DoS, FTP Patator"
Cluster 512,"DoS GoldenEye, Normal"
Cluster 513,SSH Patator
Cluster 514,"Fuzzers, Exploits, Normal"


In [None]:
df['Cluster Name'].str.split(',').str.len().gt(1).sum()

340

In [None]:
combination = df['Cluster Name'].to_list()

In [None]:
len(combination)

516

In [None]:
combination[0]

'Normal'

In [None]:
cosine_dist = cosine_similarity(cluster_centers, cluster_centers[246].reshape(1, -1))
cosine_dist_formatted = [round(dist[0], 4) for dist in cosine_dist]
data = {'Cluster {}'.format(i): cosine_dist_formatted[i] for i in range(len(cosine_dist_formatted))}
df_sorted = pd.DataFrame(list(data.items()), columns=['Cluster', 'Cosine Distance'])
df_sorted['Cluster Name'] = df_sorted.Cluster.apply(lambda x: df.loc[x]['Cluster Name'])
df_sorted = df_sorted.sort_values(by='Cosine Distance', ascending=False)
df_sorted.reset_index(drop=True, inplace=True)
df_sorted.head(20)

Unnamed: 0,Cluster,Cosine Distance,Cluster Name
0,Cluster 246,1.0,Normal
1,Cluster 0,0.9956,Normal
2,Cluster 264,0.9937,Normal
3,Cluster 91,0.9892,Normal
4,Cluster 485,0.988,Normal
5,Cluster 426,0.9875,Normal
6,Cluster 156,0.9858,Normal
7,Cluster 73,0.9832,Normal
8,Cluster 342,0.9821,Normal
9,Cluster 339,0.98,Normal


In [None]:
df.loc['Cluster 156']['Cluster Name']

'Normal'

In [None]:
# SECURITY: never hardcode API keys in a notebook. Any key that was previously
# committed in this cell must be treated as compromised and revoked.
# The key is taken from the environment and only prompted for when it is unset.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
  os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
def get_explanation(label, count, max_attempts=2, retry_delay=2):
  """Ask gpt-3.5-turbo for 5 new names describing a label combination.

  Args:
    label: label combination text inserted into the user prompt.
    count: progress counter, only echoed in the ``Completed: ...`` message.
    max_attempts: total number of API attempts (default 2, i.e. one retry,
      matching the previous behaviour). Values below 1 are treated as 1.
    retry_delay: seconds to sleep between attempts (default 2).

  Returns:
    The text content of the first choice of the chat completion.

  Raises:
    Exception: whatever the final attempt raised once attempts are exhausted.
  """
  # Built once so the first attempt and every retry send the identical request.
  messages = [
      {"role": "system", "content": "You are a helpful assistant who gives 5 meaningful words or new labels or new attack names for the given label combination from this list of network packet labels: [DoS Hulk, Normal, DDoS, Heartbleed, SSH Patator, DoS, Exploits, Generic, FTP Patator, Fuzzers, Reconnaissance, DoS GoldenEye, Web Attack - Brute Force, Infiltration, Worms, DoS SlowHTTPTest, DoS Slowloris, Bot, Web Attack - XSS, Analysis, Backdoor, Shellcode, Port Scan, Web Attack - SQL Injection]"},
      {"role": "user", "content": f'For a given network packet label combination, give 5 meaningful words or new labels or new attack names (no explanation) that explain this combination: {label}'}
  ]
  max_attempts = max(1, max_attempts)
  for attempt in range(1, max_attempts + 1):
    try:
      response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
      )
      break
    except Exception:
      # Let the error of the last attempt propagate instead of hiding it.
      if attempt >= max_attempts:
        raise
      time.sleep(retry_delay)
  explanation = response['choices'][0]['message']['content']
  print(f'Completed: {count}')
  return explanation

In [None]:
combination

['Normal',
 'Heartbleed',
 'Normal, DoS Hulk',
 'Normal, DDoS',
 'Exploits, SSH Patator',
 'Exploits',
 'DoS',
 'FTP Patator',
 'Fuzzers, Normal',
 'Exploits, Generic',
 'DoS GoldenEye',
 'Reconnaissance, Fuzzers, DoS',
 'Reconnaissance, DoS',
 'Web Attack - Brute Force',
 'Normal, DoS Hulk',
 'Exploits, Worms',
 'DoS, DoS SlowHTTPTest',
 'Infiltration, Normal',
 'Heartbleed',
 'Normal, SSH Patator',
 'Web Attack - XSS, Normal',
 'DoS Hulk',
 'Generic, Normal, Bot',
 'DoS, DoS GoldenEye, Normal',
 'Exploits',
 'Shellcode, Exploits',
 'Generic, DoS',
 'Generic, DoS Slowloris, Normal',
 'Normal, Generic, DoS, Fuzzers, Exploits',
 'Exploits, DoS',
 'Analysis, Exploits, Generic, Normal',
 'DoS, Normal, DDoS',
 'Exploits, DoS',
 'Reconnaissance, Generic',
 'Normal, DDoS',
 'Normal, DoS Hulk',
 'Normal, DoS Hulk',
 'Port Scan, DoS Hulk',
 'Generic',
 'Normal',
 'Backdoor',
 'DoS, Exploits',
 'Normal, DoS Hulk',
 'Generic',
 'Normal',
 'Exploits, DoS, Generic',
 'Normal, DDoS',
 'SSH Patator'

In [None]:
# Only multi-label combinations are sent to the model; single labels get NaN
# so that combination_list stays aligned with combination.
combination_list = []
for i, text in enumerate(combination):
  labels = text.split(', ')
  if len(labels) <= 1:
    combination_list.append(np.nan)
    continue
  result = get_explanation(text, i)
  combination_list.append(result.split('\n'))

In [None]:
combination_df = pd.DataFrame({'Combination': combination, 'Combination List': combination_list})

In [None]:
combination_df

Unnamed: 0,Combination,Combination List
0,Normal,
1,Heartbleed,
2,"Normal, DoS Hulk","[1. DDoS Shield, 2. Steadyflow, 3. Regular Tra..."
3,"Normal, DDoS","[1. Coordinated Overload, 2. Cyber Storm, 3. N..."
4,"Exploits, SSH Patator","[1. Malicious SSH Exploiter, 2. Unauthorized S..."
...,...,...
511,"DoS, FTP Patator","[1. BruteFTP, 2. PayloadStorm, 3. PassiveFlood..."
512,"DoS GoldenEye, Normal","[1. DestructiveStrike, 2. PeakPerformance, 3. ..."
513,SSH Patator,
514,"Fuzzers, Exploits, Normal","[1. Vulnerability Probing, 2. Attack Simulatio..."


In [None]:
def check_own(cell):
    """Tell whether ``cell`` mentions 'stealthstrike', ignoring case.

    A string is searched directly; a list is searched item by item, each item
    being converted with ``str`` first. Any other value yields ``False``.
    """
    needle = 'stealthstrike'
    if isinstance(cell, str):
        return needle in cell.lower()
    if isinstance(cell, list):
        for item in cell:
            if needle in str(item).lower():
                return True
        return False
    return False

# apply the function to the 'Combination List' column
mask = combination_df['Combination List'].apply(check_own)

combination_df[mask]['Combination List']

389    [1. Exploits-DoS-Generic-Normal,    - Stealths...
Name: Combination List, dtype: object

In [None]:
combination_df.iloc[389]

Combination                            Exploits, DoS, Generic, Normal
Combination List    [1. Exploits-DoS-Generic-Normal,    - Stealths...
Name: 389, dtype: object

In [None]:
combination_df.to_csv('combination_df.csv', index=False)

In [None]:
np.isnan(combination_list[0])

True

In [None]:
def is_iterable(x):
    """Return True when ``iter(x)`` succeeds and False when it raises TypeError.

    Strings count as iterable; a float such as NaN does not.
    """
    try:
        iter(x)
        return True
    except TypeError:
        return False

In [None]:
combination_list = [item for sublist in combination_list if is_iterable(sublist) for item in sublist]

In [None]:
combination_list = [re.sub(r'\d+\.\s|\d+\)\s', '', item) for item in combination_list]

In [None]:
combination_list = list(set(combination_list))

In [None]:
[x for x in combination_list if x[0]==' ']

['   - SilentAssault',
 '   - Inconspicuous',
 '   - Stealthstrike',
 '   - SystemStorm',
 '   - Crossfire']

In [None]:
# Drop entries that start with a space (the indented "   - Name" sub-bullets
# listed by the previous cell). A single pass replaces the nested comprehension
# that rebuilt the exclusion list for every element, and startswith() does not
# raise on an empty string the way x[0] does.
combination_list = [name for name in combination_list if not name.startswith(' ')]

In [None]:
[x for x in combination_list if len(x.split(','))>3]

['Timebomb, Harmonia, Surge, PeacefulStorm, Tranquil',
 'Network Stability, Large Scale Attack, Traffic Anomaly, Infrastructure Overload, Unusual Traffic Load']

In [None]:
combination_df.loc[504, 'Combination List'] = ['1. Timebomb',
 '2. Harmonia',
 '3. Surge',
 '4. PeacefulStorm',
 '5. Tranquil']

In [None]:
combination_df.loc[101, 'Combination List'] = ['1. Network Stability',
 '2. Large Scale Attack',
 '3. Traffic Anomaly',
 '4. Infrastructure Overload',
 '5. Unusual Traffic Load']

In [None]:
get_explanation('Exploits, DoS, Generic, Normal', 0)

Completed: 0


'1. Cyber Assault\n2. Malicious Masquerade\n3. Covert Sabotage\n4. Unpredictable Intrusion\n5. Anomaly-neutral'

In [None]:
combination_df.loc[389, 'Combination List'] = ['1. Cyber Assault',
 '2. Malicious Masquerade',
 '3. Covert Sabotage',
 '4. Unpredictable Intrusion',
 '5. Anomaly-neutral']

In [None]:
temp_list = []
for i in [x for x in combination_list if len(x.split(','))>3]:
  temp_list.extend(i.split(', '))

In [None]:
temp_list = ['Timebomb',
 'Harmonia',
 'Surge',
 'PeacefulStorm',
 'Tranquil',
 'Network Stability',
 'Large Scale Attack',
 'Traffic Anomaly',
 'Infrastructure Overload',
 'Unusual Traffic Load',
             'Cyber Assault', 'Malicious Masquerade', 'Covert Sabotage', 'Unpredictable Intrusion', 'Anomaly-neutral']

In [None]:
combination_list += temp_list

In [None]:
'Timebomb' in combination_list

True

In [None]:
# Drop entries holding more than three comma-separated parts (several names
# returned on one line). Equivalent to the previous nested comprehension, but
# in a single pass instead of rebuilding the exclusion list per element.
combination_list = [name for name in combination_list if len(name.split(',')) <= 3]

In [None]:
[x for x in combination_list if len(x.split(','))>3]

[]

In [None]:
combination_list

['Software Abuse',
 'AggressiveExploit',
 'Seamless',
 'ICMP Flood',
 'Code Manipulation',
 'Coordinated Surge',
 'Autonomous',
 'Rampant Sweep',
 'Performance-based Filtering',
 'Zero-day Raid',
 'Covert Overwhelm',
 'Raging Force',
 'CrossScriptingAssault',
 'Generic Vulnerability',
 'Generic Flooding',
 'Mixed Network Activity',
 'Reconnaissance Exploit Attack',
 'ExploitPatator',
 'Attack Vector Amplification',
 'CrossScripter',
 'Intrusive Recon DoS',
 'Stealthy Infiltration',
 'Overloading',
 'Universal Attack',
 'Malicious Payload',
 'Exploit Repository',
 'Silent Approach',
 'Code Propagation',
 'Fuzzing Normal Traffic',
 'Normal SSH traffic',
 'Malicious SSH Login',
 'Vulnerability Rampage',
 'Vulnerability Scanner',
 'OverflowBlitz',
 'Brutal Assault',
 'Ineffective Attack',
 'Normal Network Behavior',
 'Fuzz-based DoS Attack',
 'ExploitForce',
 'Vulnerability Seekers ',
 'FloodStorm',
 'Secure Network Access',
 'FTP Password Cracker',
 'Exploit-as-a-Service',
 'BypassedFirew

In [None]:
len(combination_list)

1329

In [None]:
attack_explanation_df = pd.read_csv('/content/ATTACK_EXPLANATION.csv')

In [None]:
attack_explanation_df

Unnamed: 0,name,explanation
0,Pass the Hash,"In a ""Pass the Hash"" network packet attack, an..."
1,Timing Attack,A timing attack is a type of network packet at...
2,DNS Spoofing,DNS spoofing is a network packet attack that i...
3,Null Byte Injection,Null Byte Injection is a network packet attack...
4,HTTP Response Splitting,HTTP response splitting is a network packet at...
...,...,...
554,Covert Flooding,Covert Flooding is a network packet attack tha...
555,Exploit Camouflage,Exploit camouflage is a network packet attack ...
556,Hulk's Wrath,"""Hulk's Wrath"" is a network packet attack that..."
557,DoS Exploits,A Denial-of-Service (DoS) exploit is a network...


In [None]:
attack_explanation_df = attack_explanation_df.iloc[:251,:]

In [None]:
attack_explanation_df

Unnamed: 0,name,explanation
0,Pass the Hash,"In a ""Pass the Hash"" network packet attack, an..."
1,Timing Attack,A timing attack is a type of network packet at...
2,DNS Spoofing,DNS spoofing is a network packet attack that i...
3,Null Byte Injection,Null Byte Injection is a network packet attack...
4,HTTP Response Splitting,HTTP response splitting is a network packet at...
...,...,...
246,Typosquatting/URL Hijacking,"Typosquatting, also known as URL hijacking, is..."
247,Format String Attack,A format string attack is a type of network pa...
248,Cross-site Scripting (XSS),Cross-site scripting (XSS) is a network packet...
249,Typosquatting,Typosquatting is a network packet attack that ...


In [None]:
def get_attack_explanation(name, count, max_attempts=2, retry_delay=2):
  """Ask gpt-3.5-turbo for a technical explanation of a named attack.

  Args:
    name: attack name inserted (quoted) into the user prompt.
    count: progress counter, only echoed in the ``Completed: ...`` message.
    max_attempts: total number of API attempts (default 2, i.e. one retry,
      matching the previous behaviour). Values below 1 are treated as 1.
    retry_delay: seconds to sleep between attempts (default 2).

  Returns:
    The text content of the first choice, with every newline replaced by a
    single space.

  Raises:
    Exception: whatever the final attempt raised once attempts are exhausted.
  """
  # Built once so the first attempt and every retry send the identical request.
  messages = [
      {"role": "system", "content": "You are a helpful assistant who gives a technical explanation of network packet attacks in around 300 words in a single paragraph without any new lines."},
      {"role": "user", "content": f'Explain about "{name}" network packet attack in a technical manner.'}
  ]
  max_attempts = max(1, max_attempts)
  for attempt in range(1, max_attempts + 1):
    try:
      response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
      )
      break
    except Exception:
      # Let the error of the last attempt propagate instead of hiding it.
      if attempt >= max_attempts:
        raise
      time.sleep(retry_delay)
  explanation = response['choices'][0]['message']['content']
  print(f'Completed: {count}')
  return explanation.replace('\n', ' ')

In [None]:
len(combination_list)

1329

In [None]:
explanation_list = []

In [None]:
for i, name in enumerate(combination_list):
  explanation = get_attack_explanation(name, i)
  explanation_list.append(explanation)

In [None]:
len(explanation_list)

1329

In [None]:
explanation_list[211]

"A Flux Flood network packet attack, also known as a reflection amplification attack, is a type of DDoS (Distributed Denial of Service) attack that exploits certain network protocols to overwhelm a target system's resources. The attack is carried out by sending a large volume of forged source IP packets to vulnerable servers or devices that support protocols like DNS (Domain Name System), NTP (Network Time Protocol), or SNMP (Simple Network Management Protocol). The attacker spoofs the source IP address, making it appear as if the attack originates from the targeted system, causing the responses from the targeted server to be sent to the victim's IP address instead. This amplification effect allows the attacker to generate an overwhelming amount of traffic, creating a flood of packets that congest the victim's network and exhaust its resources. One reason why Flux Flood attacks are particularly effective is the use of protocols that support recursive querying, allowing a small request 

In [None]:
attack_exp_df = pd.DataFrame({'name': combination_list, 'explanation': explanation_list})

In [None]:
attack_exp_df

Unnamed: 0,name,explanation
0,Software Abuse,"A ""Software Abuse"" network packet attack is a ..."
1,AggressiveExploit,"The ""AggressiveExploit"" network packet attack ..."
2,Seamless,"A ""Seamless"" network packet attack, also known..."
3,ICMP Flood,An ICMP (Internet Control Message Protocol) Fl...
4,Code Manipulation,"Code Manipulation, also known as packet inject..."
...,...,...
1324,Cyber Assault,"A ""Cyber Assault"" network packet attack refers..."
1325,Malicious Masquerade,"A ""Malicious Masquerade"" network packet attack..."
1326,Covert Sabotage,Covert Sabotage refers to a network packet att...
1327,Unpredictable Intrusion,"An ""Unpredictable Intrusion"" network packet at..."


In [None]:
attack_explanation_df = pd.concat([attack_explanation_df, attack_exp_df], ignore_index=True)

In [None]:
attack_explanation_df

Unnamed: 0,name,explanation
0,Pass the Hash,"In a ""Pass the Hash"" network packet attack, an..."
1,Timing Attack,A timing attack is a type of network packet at...
2,DNS Spoofing,DNS spoofing is a network packet attack that i...
3,Null Byte Injection,Null Byte Injection is a network packet attack...
4,HTTP Response Splitting,HTTP response splitting is a network packet at...
...,...,...
1575,Cyber Assault,"A ""Cyber Assault"" network packet attack refers..."
1576,Malicious Masquerade,"A ""Malicious Masquerade"" network packet attack..."
1577,Covert Sabotage,Covert Sabotage refers to a network packet att...
1578,Unpredictable Intrusion,"An ""Unpredictable Intrusion"" network packet at..."


In [None]:
attack_explanation_df.to_csv('ATTACK_EXPLANATION.csv', index=False)

In [None]:
# The 24 primary labels; every unordered pair is enumerated below.
original_list = [
    'DDoS', 'Normal', 'DoS Hulk', 'DoS', 'Bot', 'Exploits', 'Fuzzers',
    'Reconnaissance', 'Web Attack - XSS', 'Heartbleed', 'SSH Patator',
    'DoS SlowHTTPTest', 'FTP Patator', 'Generic',
    'Web Attack - Brute Force', 'DoS GoldenEye', 'Analysis', 'Worms',
    'Infiltration', 'DoS Slowloris', 'Shellcode', 'Backdoor',
    'Port Scan', 'Web Attack - SQL Injection',
]

# Each label is paired with every label that follows it, so a pair appears
# once and in list order ("first, second").
new_list = [
    first + ', ' + second
    for position, first in enumerate(original_list)
    for second in original_list[position + 1:]
]

print(new_list)

['DDoS, Normal', 'DDoS, DoS Hulk', 'DDoS, DoS', 'DDoS, Bot', 'DDoS, Exploits', 'DDoS, Fuzzers', 'DDoS, Reconnaissance', 'DDoS, Web Attack - XSS', 'DDoS, Heartbleed', 'DDoS, SSH Patator', 'DDoS, DoS SlowHTTPTest', 'DDoS, FTP Patator', 'DDoS, Generic', 'DDoS, Web Attack - Brute Force', 'DDoS, DoS GoldenEye', 'DDoS, Analysis', 'DDoS, Worms', 'DDoS, Infiltration', 'DDoS, DoS Slowloris', 'DDoS, Shellcode', 'DDoS, Backdoor', 'DDoS, Port Scan', 'DDoS, Web Attack - SQL Injection', 'Normal, DoS Hulk', 'Normal, DoS', 'Normal, Bot', 'Normal, Exploits', 'Normal, Fuzzers', 'Normal, Reconnaissance', 'Normal, Web Attack - XSS', 'Normal, Heartbleed', 'Normal, SSH Patator', 'Normal, DoS SlowHTTPTest', 'Normal, FTP Patator', 'Normal, Generic', 'Normal, Web Attack - Brute Force', 'Normal, DoS GoldenEye', 'Normal, Analysis', 'Normal, Worms', 'Normal, Infiltration', 'Normal, DoS Slowloris', 'Normal, Shellcode', 'Normal, Backdoor', 'Normal, Port Scan', 'Normal, Web Attack - SQL Injection', 'DoS Hulk, DoS', 

In [None]:
combination

['Normal',
 'Heartbleed',
 'Normal, DoS Hulk',
 'Normal, DDoS',
 'Exploits, SSH Patator',
 'Exploits',
 'DoS',
 'FTP Patator',
 'Fuzzers, Normal',
 'Exploits, Generic',
 'DoS GoldenEye',
 'Reconnaissance, Fuzzers, DoS',
 'Reconnaissance, DoS',
 'Web Attack - Brute Force',
 'Normal, DoS Hulk',
 'Exploits, Worms',
 'DoS, DoS SlowHTTPTest',
 'Infiltration, Normal',
 'Heartbleed',
 'Normal, SSH Patator',
 'Web Attack - XSS, Normal',
 'DoS Hulk',
 'Generic, Normal, Bot',
 'DoS, DoS GoldenEye, Normal',
 'Exploits',
 'Shellcode, Exploits',
 'Generic, DoS',
 'Generic, DoS Slowloris, Normal',
 'Normal, Generic, DoS, Fuzzers, Exploits',
 'Exploits, DoS',
 'Analysis, Exploits, Generic, Normal',
 'DoS, Normal, DDoS',
 'Exploits, DoS',
 'Reconnaissance, Generic',
 'Normal, DDoS',
 'Normal, DoS Hulk',
 'Normal, DoS Hulk',
 'Port Scan, DoS Hulk',
 'Generic',
 'Normal',
 'Backdoor',
 'DoS, Exploits',
 'Normal, DoS Hulk',
 'Generic',
 'Normal',
 'Exploits, DoS, Generic',
 'Normal, DDoS',
 'SSH Patator'

In [None]:
len(new_list)

276

In [None]:
# Normalise every pair to a sorted tuple so "A, B" and "B, A" compare equal.
set1 = {tuple(sorted(pair.split(', '))) for pair in new_list}
set2 = {tuple(sorted(pair.split(', '))) for pair in combination if len(pair.split(',')) == 2}

# compute set difference
diff = set1 - set2

# Convert tuples back into comma-separated strings. sorted() fixes the order:
# iterating the set directly gave a different final_list order on each run,
# which changed the order of the API calls and of the saved CSV rows.
final_list = [', '.join(pair) for pair in sorted(diff)]

print(final_list)

['DoS SlowHTTPTest, Fuzzers', 'DoS, Worms', 'FTP Patator, Reconnaissance', 'DoS Hulk, Reconnaissance', 'Generic, SSH Patator', 'SSH Patator, Web Attack - Brute Force', 'Bot, Generic', 'Bot, Shellcode', 'Shellcode, Worms', 'Generic, Web Attack - XSS', 'Bot, DoS Slowloris', 'DoS GoldenEye, FTP Patator', 'DoS Slowloris, Web Attack - Brute Force', 'DoS Slowloris, Shellcode', 'Analysis, DoS GoldenEye', 'DoS SlowHTTPTest, Infiltration', 'Backdoor, Shellcode', 'DDoS, Worms', 'Shellcode, Web Attack - SQL Injection', 'Port Scan, Reconnaissance', 'Backdoor, DoS Slowloris', 'DoS Hulk, Generic', 'Bot, SSH Patator', 'Fuzzers, Worms', 'Web Attack - SQL Injection, Web Attack - XSS', 'Bot, Web Attack - XSS', 'DoS SlowHTTPTest, Reconnaissance', 'Analysis, Port Scan', 'DoS Slowloris, SSH Patator', 'DoS Hulk, FTP Patator', 'DoS Slowloris, Web Attack - XSS', 'DDoS, Web Attack - SQL Injection', 'Backdoor, SSH Patator', 'Generic, Web Attack - SQL Injection', 'Normal, Web Attack - Brute Force', 'Fuzzers, Web

In [None]:
len(final_list)

247

In [None]:
get_explanation(final_list[0], 1)

Completed: 1


'1. Denial-of-Service SlowHTTPTest Fuzzing\n2. HTTP Flooding Fuzzing Attack\n3. SlowHTTPTest Fuzzer DoS\n4. Fuzzing DoS SlowHTTPTest\n5. HTTP Slowloris Fuzzing'

In [None]:
final_combination_list = []
for i, text in enumerate(final_list):
  result = get_explanation(text, i)
  final_combination_list.append(result.split('\n'))

In [None]:
final_combination_df = pd.DataFrame({'Combination': final_list, 'Combination List': final_combination_list})

In [None]:
final_combination_df

Unnamed: 0,Combination,Combination List
0,"DoS SlowHTTPTest, Fuzzers","[1. Resource Exhaustion, 2. Slow POST Attack, ..."
1,"DoS, Worms","[1. Malicious Overload, 2. Infected Swarm, 3. ..."
2,"FTP Patator, Reconnaissance","[1. Password Hunter, 2. Credential Scraper, 3...."
3,"DoS Hulk, Reconnaissance","[1. Intense Mass Disruption, 2. Stealthy Infor..."
4,"Generic, SSH Patator","[1. Password Crack-SSH, 2. Common Credential A..."
...,...,...
242,"DDoS, Generic","[1. Malicious Storm, 2. Flood Fusion, 3. Avala..."
243,"Generic, Shellcode","[1. Malcode Shell, 2. Non-specific Exploit, 3...."
244,"Fuzzers, Generic","[1. CodeMangler, 2. ProtocolGambler, 3. BitScr..."
245,"DoS Hulk, Infiltration","[1. StealthFlood, 2. CovertIngress, 3. Overwhe..."


In [None]:
final_combination_df.to_csv('final_combination_df.csv', index=False)

In [None]:
# Flatten the per-combination answer lists into one list of names.
final_combination_list = [item for sublist in final_combination_list for item in sublist]
# Strip a leading "N." / "N)" list marker. \s* also covers a marker with no
# following space ("3.SQL Exploder" was previously left untouched), and (?!\d)
# keeps dotted numbers such as "802.11" intact. strip() removes the trailing
# spaces that made near-duplicate names.
final_combination_list = [re.sub(r'^\s*\d+[.)]\s*(?!\d)', '', item).strip() for item in final_combination_list]
# Drop blanks, de-duplicate, and sort so the order is the same on every run.
final_combination_list = sorted({item for item in final_combination_list if item})

In [None]:
final_combination_list

['DataExfiltrator',
 'Packet Analyzer',
 'Exploitative Bot',
 'GenericWebExploit',
 'File Transfer Assault',
 'Payload Multiplication',
 'NonspecificInfiltrator',
 'ConnectionStarvation',
 'Sabotage Injection',
 'Unthreatening',
 'BreachIngress ',
 'Layer 7 Denial of Service',
 'Port Flooding',
 'Golden Worm Onslaught',
 'Hulk XSS ',
 'PersistentPulse',
 'Automated FTP Crack',
 'Worm Propagation',
 'Resource Exhaustion Strike',
 'HTTP Heartbleed DoS',
 'SneakyTester',
 'SQLHack',
 'Mechanized Exploitation',
 'Web Invasion',
 'BrutalKnockout',
 'Malcode Shell',
 'Silent Infiltrator',
 'SQLInjectBot',
 'Tactical Paralysis',
 'Flood of Slowness',
 'clean',
 'DataTrickster',
 'Data breach',
 'Web-DoS XSS ',
 'Hostile Infestation',
 'Botnet Storm',
 'DoS Sweep',
 'RemoteAccessCompromise',
 'System Inundation',
 'Common Credential Attack',
 'Unseen Intruder',
 'Botware ',
 'Stealthy Access',
 'ShellStrike',
 'GoldenHacker',
 'JavaScript Vulnerability Exploits ',
 'Gateway Breach',
 'SSH Brut

In [None]:
combination_list = attack_explanation_df['name'].to_list()

In [None]:
combination_list

['Pass the Hash',
 'Timing Attack',
 'DNS Spoofing',
 'Null Byte Injection',
 'HTTP Response Splitting',
 'Security Misconfiguration',
 'Tag Tampering',
 'DNS Server Amplification',
 'Phishing',
 'Web Scraping',
 'Dumpster Diving',
 'Web Scripting Virus',
 'Podcasting',
 'Honeypot',
 'Traffic Analysis',
 'Keylogger',
 'Zeroday exploit',
 'Zip Bomb',
 'Cookie Poisoning',
 'SSL Hijacking',
 'Digital Forensics',
 'DoS SlowHTTPTest',
 'Email Bombing',
 'Teardrop Attack',
 'ICMP Flood',
 'Mail Bombing',
 'Pass the Hash Attacks',
 'Reconnaissance',
 'Online Fraud',
 'URL Redirector Abuse',
 'Directory Traversal Attack',
 'Unauthorized Mobile Charges',
 'Analysis',
 'Bluesnarfing',
 'Cyber Fraud',
 'Session Sidejacking',
 'Email Hijacking',
 'Waterhole Attacks',
 'Incident Response',
 'ARP Spoofing',
 'Integer Overflow',
 'DOS Device Attacks',
 'Crypto Shredding',
 'BEC (Business Email Compromise)',
 'SSI Injection',
 'Adware',
 'DNS Cache Poisoning',
 'Cyber Terrorism',
 'Cryptojacking',
 'P

In [None]:
# Keep only names that do not already have an explanation. A set gives
# constant-time membership tests instead of scanning the whole list per name.
known_names = set(combination_list)
final_combination_list = [x for x in final_combination_list if x not in known_names]

In [None]:
len(final_combination_list)

1060

In [None]:
final_combination_list[0]

'DataExfiltrator'

In [None]:
final_explanation_list = []

In [None]:
for i, name in enumerate(final_combination_list):
  explanation = get_attack_explanation(name, i)
  final_explanation_list.append(explanation)

Completed: 0
Completed: 1
Completed: 2
Completed: 3
Completed: 4
Completed: 5
Completed: 6
Completed: 7
Completed: 8
Completed: 9
Completed: 10
Completed: 11
Completed: 12
Completed: 13
Completed: 14
Completed: 15
Completed: 16
Completed: 17
Completed: 18
Completed: 19
Completed: 20
Completed: 21
Completed: 22
Completed: 23
Completed: 24
Completed: 25
Completed: 26
Completed: 27
Completed: 28
Completed: 29
Completed: 30
Completed: 31
Completed: 32
Completed: 33
Completed: 34
Completed: 35
Completed: 36
Completed: 37
Completed: 38
Completed: 39
Completed: 40
Completed: 41
Completed: 42
Completed: 43
Completed: 44
Completed: 45
Completed: 46
Completed: 47
Completed: 48
Completed: 49
Completed: 50
Completed: 51
Completed: 52
Completed: 53
Completed: 54
Completed: 55
Completed: 56
Completed: 57
Completed: 58
Completed: 59
Completed: 60
Completed: 61
Completed: 62
Completed: 63
Completed: 64
Completed: 65
Completed: 66
Completed: 67
Completed: 68
Completed: 69
Completed: 70
Completed: 71
Co

In [None]:
get_attack_explanation(final_combination_list[0], 1)

Completed: 1


'A "DataExfiltrator" network packet attack is a type of cyber attack that focuses on the unauthorized extraction or exfiltration of data from a network. This attack typically occurs when an attacker gains access to a network and begins to intercept and capture network packets containing sensitive or valuable data. The attacker can then analyze these packets to extract the desired information, which can include login credentials, financial data, or intellectual property. The DataExfiltrator attack usually involves various techniques such as packet sniffing, where the attacker monitors the network traffic to identify packets of interest. The attacker may also employ packet injection techniques to modify or create forged packets to facilitate data exfiltration. This can be done by sending packets with the stolen data to a remote server controlled by the attacker or by modifying existing packets to include hidden data. Various methods can be used during a DataExfiltrator attack, such as ex

In [None]:
attack_exp_df = pd.DataFrame({'name': final_combination_list, 'explanation': final_explanation_list})

In [None]:
attack_exp_df

Unnamed: 0,name,explanation
0,DataExfiltrator,"A network packet attack known as ""DataExfiltra..."
1,Packet Analyzer,"A packet analyzer, also known as a packet snif..."
2,Exploitative Bot,An exploitative bot network packet attack is a...
3,GenericWebExploit,"A ""GenericWebExploit"" network packet attack re..."
4,File Transfer Assault,"A ""File Transfer Assault"" network packet attac..."
...,...,...
1055,SSH Bruteforce,SSH Bruteforce is a network packet attack exec...
1056,Unleashed Fury,"""Unleashed Fury"" is a sophisticated network pa..."
1057,3.SQL Exploder,"The ""SQL Exploder"" network packet attack is a ..."
1058,Breach Mapping,"A ""Breach Mapping"" network packet attack is a ..."


In [None]:
attack_explanation_df = pd.concat([attack_explanation_df, attack_exp_df], ignore_index=True)

In [None]:
attack_explanation_df

Unnamed: 0,name,explanation
0,Pass the Hash,"In a ""Pass the Hash"" network packet attack, an..."
1,Timing Attack,A timing attack is a type of network packet at...
2,DNS Spoofing,DNS spoofing is a network packet attack that i...
3,Null Byte Injection,Null Byte Injection is a network packet attack...
4,HTTP Response Splitting,HTTP response splitting is a network packet at...
...,...,...
2635,SSH Bruteforce,SSH Bruteforce is a network packet attack exec...
2636,Unleashed Fury,"""Unleashed Fury"" is a sophisticated network pa..."
2637,3.SQL Exploder,"The ""SQL Exploder"" network packet attack is a ..."
2638,Breach Mapping,"A ""Breach Mapping"" network packet attack is a ..."


In [None]:
attack_explanation_df.to_csv('ATTACK_EXPLANATION.csv', index=False)

In [None]:
def get_similar_label(name, count, max_retries=1, retry_delay=2):
  """Ask gpt-3.5-turbo for five labels similar to a network packet label.

  Args:
    name: Label text inserted into the user prompt.
    count: Progress marker; it is only echoed in the "Completed" message.
    max_retries: Number of extra attempts after a failed request. The default
      of 1 keeps the previous behaviour (one retry).
    retry_delay: Seconds to sleep before each retry. The default of 2 keeps
      the previous behaviour.

  Returns:
    The text content of the first choice in the response. The prompt asks for
    five labels, but the format of the reply is not validated here.

  Raises:
    Exception: the error raised by the last attempt once retries are used up.
  """
  # Build the payload once so the first attempt and every retry send exactly
  # the same request.
  messages = [
      {"role": "system", "content": "You are a helpful assistant who gives 5 meaningful words or similar labels for the given network packet label: The primary labels are [DoS Hulk, Normal, DDoS, Heartbleed, SSH Patator, DoS, Exploits, Generic, FTP Patator, Fuzzers, Reconnaissance, DoS GoldenEye, Web Attack - Brute Force, Infiltration, Worms, DoS SlowHTTPTest, DoS Slowloris, Bot, Web Attack - XSS, Analysis, Backdoor, Shellcode, Port Scan, Web Attack - SQL Injection]."},
      {"role": "user", "content": f'Give 5 similar labels for this label: "{name}"'}
  ]
  attempt = 0
  while True:
    try:
      response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
      )
      break
    except Exception:
      # Out of retries: let the caller see the real error.
      if attempt >= max_retries:
        raise
      attempt += 1
      time.sleep(retry_delay)
  explanation = response['choices'][0]['message']['content']
  print(f'Completed: {count}')
  return explanation

In [None]:
get_similar_label('SQL Injection', 0)

Completed: 0


'1. Web Attack - SQL Inject\n2. Exploit - SQL Injection\n3. SQL Injection Attack\n4. SQL Inject Attempt\n5. Web Application Attack - SQL Injection'

In [None]:
combination_df

Unnamed: 0,Combination,Combination List
0,Normal,
1,Heartbleed,
2,"Normal, DoS Hulk","[1. DDoS Shield, 2. Steadyflow, 3. Regular Tra..."
3,"Normal, DDoS","[1. Coordinated Overload, 2. Cyber Storm, 3. N..."
4,"Exploits, SSH Patator","[1. Malicious SSH Exploiter, 2. Unauthorized S..."
...,...,...
511,"DoS, FTP Patator","[1. BruteFTP, 2. PayloadStorm, 3. PassiveFlood..."
512,"DoS GoldenEye, Normal","[1. DestructiveStrike, 2. PeakPerformance, 3. ..."
513,SSH Patator,
514,"Fuzzers, Exploits, Normal","[1. Vulnerability Probing, 2. Attack Simulatio..."


In [None]:
combination_df['Combination List'].isna().sum()

176

In [None]:
similar_label_list = []

In [None]:
# Query similar labels only for rows whose combination is a single label
# (no comma in the name); each reply is stored as a list of its lines.
for i in range(len(combination_df)):
  combination = combination_df['Combination'][i]
  if combination.count(',') > 0:
    continue
  result = get_similar_label(combination, i)
  similar_label_list.append(result.split('\n'))

Completed: 0
Completed: 1
Completed: 5
Completed: 6
Completed: 7
Completed: 10
Completed: 13
Completed: 18
Completed: 21
Completed: 24
Completed: 38
Completed: 39
Completed: 40
Completed: 43
Completed: 44
Completed: 47
Completed: 48
Completed: 49
Completed: 57
Completed: 59
Completed: 61
Completed: 63
Completed: 68
Completed: 73
Completed: 75
Completed: 80
Completed: 81
Completed: 83
Completed: 85
Completed: 86
Completed: 88
Completed: 90
Completed: 91
Completed: 93
Completed: 100
Completed: 104
Completed: 105
Completed: 107
Completed: 112
Completed: 113
Completed: 116
Completed: 118
Completed: 123
Completed: 125
Completed: 126
Completed: 129
Completed: 132
Completed: 133
Completed: 135
Completed: 136
Completed: 141
Completed: 145
Completed: 147
Completed: 148
Completed: 150
Completed: 155
Completed: 156
Completed: 161
Completed: 167
Completed: 174
Completed: 176
Completed: 179
Completed: 180
Completed: 185
Completed: 188
Completed: 190
Completed: 195
Completed: 204
Completed: 206
Comp

In [None]:
similar_label_list

[['1. Regular', '2. Typical', '3. Common', '4. Standard', '5. Regularized'],
 ['1. SSL vulnerability',
  '2. OpenSSL exploit',
  '3. Information disclosure',
  '4. TLS attack',
  '5. Memory leak vulnerability'],
 ['1. Vulnerability exploits',
  '2. Code exploits',
  '3. Software exploits',
  '4. Security exploits',
  '5. System exploits'],
 ['1. Denial of Service',
  '2. DDoS (Distributed Denial of Service)',
  '3. DoS attacks',
  '4. DoS flooding',
  '5. DoS amplification'],
 ['1. FTP Brute Force',
  '2. FTP Attack',
  '3. FTP Cracking',
  '4. FTP Password Guessing',
  '5. FTP Password Crack'],
 ['1. Denial of Service (DoS) Attack - GoldenEye',
  '2. GoldenEye DoS Attack',
  '3. Denial of Service - GoldenEye Variant',
  '4. DoS GoldenEye Exploit',
  '5. GoldenEye Denial of Service Attack'],
 ['1. Web Attack - Credential Stuffing',
  '2. Web Attack - Dictionary Attack',
  '3. Brute Force Login Attempts',
  '4. Password Guessing Attack',
  '5. Web Attack - Password Spraying'],
 ['1. SSL

In [None]:
len(similar_label_list)

176

In [None]:
similar_label_list.index(['The network packet labeled as "Normal" refers to a regular and expected network communication occurring between devices. It does not exhibit any suspicious or malicious behavior. The contents and behavior of the packet comply with the standard protocols and rules defined by the network and application. This label indicates that the packet is not associated with any known attacks, exploits, or abnormal activities. It is important to classify network packets as "Normal" to differentiate them from potentially harmful or suspicious packets. Monitoring network traffic for anomalies and identifying packets as "Normal" helps in maintaining a secure and efficient network environment.'])

150

In [None]:
# Entry 150 came back as a single prose paragraph instead of a numbered list
# (its position was looked up with .index(...) on that paragraph), so it is
# replaced by hand with five labels.
# NOTE(review): the hard-coded index presumably only matches that particular
# API run — confirm before re-running the generation loop.
similar_label_list[150] = ['1. Regular', '2. Standard', '3. Typical', '4. Common', '5. Routine']

In [None]:
# Rows whose combination is a single label, in the same order in which the
# generation loop appended to similar_label_list.
single_label_rows = [
    i for i in range(len(combination_df))
    if len(combination_df['Combination'][i].split(',')) == 1
]

# The two sequences are paired purely by position, so a length mismatch
# (e.g. the generation cell was run twice, or stopped half-way) would attach
# labels to the wrong rows. Fail loudly instead of writing misaligned data.
if len(single_label_rows) != len(similar_label_list):
  raise ValueError(
      f'{len(single_label_rows)} single-label rows but '
      f'{len(similar_label_list)} generated label lists'
  )

for i, labels in zip(single_label_rows, similar_label_list):
  # .at addresses exactly one cell, so the list is stored as a single object
  # rather than being treated as a sequence of values to spread over a selection.
  combination_df.at[i, 'Combination List'] = labels

In [None]:
combination_df

Unnamed: 0,Combination,Combination List
0,Normal,"[1. Regular, 2. Typical, 3. Common, 4. Standar..."
1,Heartbleed,"[1. SSL vulnerability, 2. OpenSSL exploit, 3. ..."
2,"Normal, DoS Hulk","[1. DDoS Shield, 2. Steadyflow, 3. Regular Tra..."
3,"Normal, DDoS","[1. Coordinated Overload, 2. Cyber Storm, 3. N..."
4,"Exploits, SSH Patator","[1. Malicious SSH Exploiter, 2. Unauthorized S..."
...,...,...
511,"DoS, FTP Patator","[1. BruteFTP, 2. PayloadStorm, 3. PassiveFlood..."
512,"DoS GoldenEye, Normal","[1. DestructiveStrike, 2. PeakPerformance, 3. ..."
513,SSH Patator,"[1. SSH Brute Force, 2. SSH Dictionary Attack,..."
514,"Fuzzers, Exploits, Normal","[1. Vulnerability Probing, 2. Attack Simulatio..."


In [None]:
combination_df.to_csv('COMBINATION_LIST.csv', index=False)

In [None]:
# Flatten the per-row label lists into one list of label strings.
final_combination_list = [item for sublist in similar_label_list for item in sublist]
# Strip only a *leading* "N. " enumeration prefix. The pattern is anchored so a
# digit-dot sequence in the middle of a label is left intact.
final_combination_list = [re.sub(r'^\d+\.\s', '', item) for item in final_combination_list]
# Drop blank entries (split('\n') yields '' for empty lines) and de-duplicate
# while keeping first-seen order, so the result is the same on every run
# instead of depending on set iteration order.
final_combination_list = list(dict.fromkeys(item for item in final_combination_list if item.strip()))

In [None]:
final_combination_list

['Generic Attack',
 'Vulnerability exploits',
 'Remote Code Execution',
 'OpenSSL Heartbeat Attack',
 'Standard Attack',
 'User enumeration',
 'Password Guessing Attack',
 'SSH password cracking',
 'Username enumeration',
 'FTP Authentication Attack',
 'SSH dictionary attacks',
 'Data breach',
 'Web Attack - Dictionary Attack',
 'FTP Brute Forcer',
 'Denial-of-Service',
 'Indiscriminate',
 'Innocuous',
 'Information disclosure',
 'Information Leak',
 'SSH Attack Tool',
 'Virus',
 'Password cracking',
 'Code Injection',
 'Botnet - SSH',
 'FTP Login Exploiter',
 'Common Threat',
 'OpenSSL bug',
 'Expected network behavior',
 'Indeterminate',
 'Brute Force Login Attempts',
 'Randomizers',
 'GoldenEye DDoS',
 'FTP Cracking',
 'Zero-Day Exploits',
 'Security exploits',
 'DoS Flood Attack',
 'Exploitation techniques',
 'Denial of Service (DoS) Attack - GoldenEye',
 'Password cracking - SSH',
 'FTP Brute Force',
 'Brute Force SSH',
 'Hulk Denial of Service',
 'DoS GoldenEye Exploit',
 'Non-ma

In [None]:
len(final_combination_list)

346

In [None]:
def get_label_explanation(name, count, max_retries=1, retry_delay=2):
  """Ask gpt-3.5-turbo for a one-paragraph explanation of a packet label.

  Args:
    name: Label text inserted into the user prompt.
    count: Progress marker; it is only echoed in the "Completed" message.
    max_retries: Number of extra attempts after a failed request. The default
      of 1 keeps the previous behaviour (one retry).
    retry_delay: Seconds to sleep before each retry. The default of 2 keeps
      the previous behaviour.

  Returns:
    The text content of the first choice in the response, with every newline
    replaced by a single space.

  Raises:
    Exception: the error raised by the last attempt once retries are used up.
  """
  # Build the payload once so the first attempt and every retry send exactly
  # the same request.
  messages = [
      {"role": "system", "content": "You are a helpful assistant who gives a technical explanation to the given network packet label in around 300 words in a single paragraph without any new lines. The primary labels are [DoS Hulk, Normal, DDoS, Heartbleed, SSH Patator, DoS, Exploits, Generic, FTP Patator, Fuzzers, Reconnaissance, DoS GoldenEye, Web Attack - Brute Force, Infiltration, Worms, DoS SlowHTTPTest, DoS Slowloris, Bot, Web Attack - XSS, Analysis, Backdoor, Shellcode, Port Scan, Web Attack - SQL Injection]."},
      {"role": "user", "content": f'Explain this network packet label: "{name}"'}
  ]
  attempt = 0
  while True:
    try:
      response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
      )
      break
    except Exception:
      # Out of retries: let the caller see the real error.
      if attempt >= max_retries:
        raise
      attempt += 1
      time.sleep(retry_delay)
  explanation = response['choices'][0]['message']['content']
  print(f'Completed: {count}')
  # The prompt asks for a single paragraph; flatten any newlines that still
  # come back.
  return explanation.replace('\n', ' ')

In [None]:
get_label_explanation(final_combination_list[0], 0)

Completed: 0


'The network packet label "Generic Attack" refers to a type of attack that does not fit into any specific category but encompasses various nefarious activities aimed at exploiting vulnerabilities in a computer system or network. It is a broad label used to classify and identify malicious activities that do not fall under well-known attack types or have unique characteristics. A "Generic Attack" may include a wide range of tactics such as attempting to gain unauthorized access, gaining control over devices or networks, compromising data integrity or confidentiality, or causing disruption to network services. This type of attack is often carried out by skilled hackers who use various techniques and tools to exploit weak points in a system\'s defenses. The motive behind a "Generic Attack" can vary, ranging from financial gain through data theft, to sabotage or espionage. Organizations need to be vigilant and implement robust security measures, such as firewalls, intrusion detection system

In [None]:
label_explanation_list = []

In [None]:
# One explanation per generated label, appended in the same order as
# final_combination_list so the two lists stay aligned by position.
for i, name in enumerate(final_combination_list):
  label_explanation_list.append(get_label_explanation(name, i))

Completed: 0
Completed: 1
Completed: 2
Completed: 3
Completed: 4
Completed: 5
Completed: 6
Completed: 7
Completed: 8
Completed: 9
Completed: 10
Completed: 11
Completed: 12
Completed: 13
Completed: 14
Completed: 15
Completed: 16
Completed: 17
Completed: 18
Completed: 19
Completed: 20
Completed: 21
Completed: 22
Completed: 23
Completed: 24
Completed: 25
Completed: 26
Completed: 27
Completed: 28
Completed: 29
Completed: 30
Completed: 31
Completed: 32
Completed: 33
Completed: 34
Completed: 35
Completed: 36
Completed: 37
Completed: 38
Completed: 39
Completed: 40
Completed: 41
Completed: 42
Completed: 43
Completed: 44
Completed: 45
Completed: 46
Completed: 47
Completed: 48
Completed: 49
Completed: 50
Completed: 51
Completed: 52
Completed: 53
Completed: 54
Completed: 55
Completed: 56
Completed: 57
Completed: 58
Completed: 59
Completed: 60
Completed: 61
Completed: 62
Completed: 63
Completed: 64
Completed: 65
Completed: 66
Completed: 67
Completed: 68
Completed: 69
Completed: 70
Completed: 71
Co

In [None]:
attack_exp_df = pd.DataFrame({'name': final_combination_list, 'explanation': label_explanation_list})

In [None]:
attack_exp_df

Unnamed: 0,name,explanation
0,Generic Attack,"The network packet labeled as ""Generic Attack""..."
1,Vulnerability exploits,"The network packet label ""Vulnerability exploi..."
2,Remote Code Execution,"The network packet label ""Remote Code Executio..."
3,OpenSSL Heartbeat Attack,"The network packet label ""OpenSSL Heartbeat At..."
4,Standard Attack,"The network packet label ""Standard Attack"" typ..."
...,...,...
341,OpenSSL exploit,"The network packet label ""OpenSSL exploit"" ref..."
342,SSH Bruteforce,"The network packet label ""SSH Bruteforce"" indi..."
343,Exploit techniques,"The label ""Exploit techniques"" pertains to net..."
344,Routine network behavior,"The label ""Routine network behavior"" is typica..."


In [None]:
attack_explanation_df = pd.concat([attack_explanation_df, attack_exp_df], ignore_index=True)

In [None]:
attack_explanation_df

Unnamed: 0,name,explanation
0,Pass the Hash,"In a ""Pass the Hash"" network packet attack, an..."
1,Timing Attack,A timing attack is a type of network packet at...
2,DNS Spoofing,DNS spoofing is a network packet attack that i...
3,Null Byte Injection,Null Byte Injection is a network packet attack...
4,HTTP Response Splitting,HTTP response splitting is a network packet at...
...,...,...
2981,OpenSSL exploit,"The network packet label ""OpenSSL exploit"" ref..."
2982,SSH Bruteforce,"The network packet label ""SSH Bruteforce"" indi..."
2983,Exploit techniques,"The label ""Exploit techniques"" pertains to net..."
2984,Routine network behavior,"The label ""Routine network behavior"" is typica..."


In [None]:
attack_explanation_df.to_csv('ATTACK_EXPLANATION.csv', index=False)

In [None]:
original_list

['DDoS',
 'Normal',
 'DoS Hulk',
 'DoS',
 'Bot',
 'Exploits',
 'Fuzzers',
 'Reconnaissance',
 'Web Attack - XSS',
 'Heartbleed',
 'SSH Patator',
 'DoS SlowHTTPTest',
 'FTP Patator',
 'Generic',
 'Web Attack - Brute Force',
 'DoS GoldenEye',
 'Analysis',
 'Worms',
 'Infiltration',
 'DoS Slowloris',
 'Shellcode',
 'Backdoor',
 'Port Scan',
 'Web Attack - SQL Injection']

In [None]:
label_explanation_list = []

In [None]:
# One explanation per primary label, appended in the same order as
# original_list so the two lists stay aligned by position.
for i, name in enumerate(original_list):
  label_explanation_list.append(get_label_explanation(name, i))

Completed: 0
Completed: 1
Completed: 2
Completed: 3
Completed: 4
Completed: 5
Completed: 6
Completed: 7
Completed: 8
Completed: 9
Completed: 10
Completed: 11
Completed: 12
Completed: 13
Completed: 14
Completed: 15
Completed: 16
Completed: 17
Completed: 18
Completed: 19
Completed: 20
Completed: 21
Completed: 22
Completed: 23


In [None]:
attack_exp_df = pd.DataFrame({'name': original_list, 'explanation': label_explanation_list})

In [None]:
attack_exp_df

Unnamed: 0,name,explanation
0,DDoS,"The network packet label ""DDoS"" stands for Dis..."
1,Normal,"The network packet label ""Normal"" indicates th..."
2,DoS Hulk,"The network packet label ""DoS Hulk"" refers to ..."
3,DoS,"The network packet label ""DoS"" refers to a Den..."
4,Bot,"The network packet label ""Bot"" refers to netwo..."
5,Exploits,"The ""Exploits"" network packet label refers to ..."
6,Fuzzers,"The network packet label ""Fuzzers"" refers to a..."
7,Reconnaissance,"The ""Reconnaissance"" network packet label refe..."
8,Web Attack - XSS,"The network packet label ""Web Attack - XSS"" re..."
9,Heartbleed,"The network packet label ""Heartbleed"" refers t..."


In [None]:
attack_explanation_df = pd.concat([attack_explanation_df, attack_exp_df], ignore_index=True)

In [None]:
attack_explanation_df

Unnamed: 0,name,explanation
0,Pass the Hash,"In a ""Pass the Hash"" network packet attack, an..."
1,Timing Attack,A timing attack is a type of network packet at...
2,DNS Spoofing,DNS spoofing is a network packet attack that i...
3,Null Byte Injection,Null Byte Injection is a network packet attack...
4,HTTP Response Splitting,HTTP response splitting is a network packet at...
...,...,...
3005,DoS Slowloris,"The network packet label ""DoS Slowloris"" indic..."
3006,Shellcode,"The network packet label ""Shellcode"" indicates..."
3007,Backdoor,"A ""Backdoor"" network packet label refers to a ..."
3008,Port Scan,"A network packet labeled as ""Port Scan"" refers..."


In [None]:
attack_explanation_df.to_csv('ATTACK_EXPLANATION.csv', index=False)

In [None]:
# Dump every (name, explanation) pair to a plain-text corpus, one value per
# line. The encoding is pinned to UTF-8 so the file does not depend on the
# platform's default encoding when the text contains non-ASCII characters.
with open('Corpus.txt', 'w', encoding='utf-8') as f:
    for name, explanation in zip(attack_explanation_df['name'], attack_explanation_df['explanation']):
        f.write(str(name) + "\n")
        f.write(str(explanation) + "\n")

In [None]:
df

Unnamed: 0_level_0,Cluster Name
Cluster,Unnamed: 1_level_1
Cluster 0,Normal
Cluster 1,Heartbleed
Cluster 2,"Normal, DoS Hulk"
Cluster 3,"Normal, DDoS"
Cluster 4,"Exploits, SSH Patator"
...,...
Cluster 511,"DoS, FTP Patator"
Cluster 512,"DoS GoldenEye, Normal"
Cluster 513,SSH Patator
Cluster 514,"Fuzzers, Exploits, Normal"


In [None]:
combination_df

Unnamed: 0,Combination,Combination List
0,Normal,"[1. Regular, 2. Typical, 3. Common, 4. Standar..."
1,Heartbleed,"[1. SSL vulnerability, 2. OpenSSL exploit, 3. ..."
2,"Normal, DoS Hulk","[1. DDoS Shield, 2. Steadyflow, 3. Regular Tra..."
3,"Normal, DDoS","[1. Coordinated Overload, 2. Cyber Storm, 3. N..."
4,"Exploits, SSH Patator","[1. Malicious SSH Exploiter, 2. Unauthorized S..."
...,...,...
511,"DoS, FTP Patator","[1. BruteFTP, 2. PayloadStorm, 3. PassiveFlood..."
512,"DoS GoldenEye, Normal","[1. DestructiveStrike, 2. PeakPerformance, 3. ..."
513,SSH Patator,"[1. SSH Brute Force, 2. SSH Dictionary Attack,..."
514,"Fuzzers, Exploits, Normal","[1. Vulnerability Probing, 2. Attack Simulatio..."


In [None]:
combination_df['Combination List'][0]

['1. Regular', '2. Typical', '3. Common', '4. Standard', '5. Regularized']

In [None]:
def remove_prefixes(lst):
    """Strip a leading "N. " enumeration prefix from each string in ``lst``.

    Only a prefix made of digits, a dot and one space at the very start of a
    string is removed (e.g. ``'1. Regular'`` -> ``'Regular'``); strings
    without that exact prefix are returned unchanged.

    Args:
        lst: Iterable of strings.

    Returns:
        A new list with the prefixes removed, in the same order.
    """
    # Raw string: the pattern is unchanged, but "\d" and "\." are no longer
    # invalid escape sequences in a normal string literal.
    return [re.sub(r'^\d+\. ', '', s) for s in lst]

In [None]:
combination_df['sample'] = combination_df['Combination List'].apply(remove_prefixes)

In [None]:
combination_df

Unnamed: 0,Combination,Combination List,sample
0,Normal,"[1. Regular, 2. Typical, 3. Common, 4. Standar...","[Regular, Typical, Common, Standard, Regularized]"
1,Heartbleed,"[1. SSL vulnerability, 2. OpenSSL exploit, 3. ...","[SSL vulnerability, OpenSSL exploit, Informati..."
2,"Normal, DoS Hulk","[1. DDoS Shield, 2. Steadyflow, 3. Regular Tra...","[DDoS Shield, Steadyflow, Regular Traffic, Bru..."
3,"Normal, DDoS","[1. Coordinated Overload, 2. Cyber Storm, 3. N...","[Coordinated Overload, Cyber Storm, Network As..."
4,"Exploits, SSH Patator","[1. Malicious SSH Exploiter, 2. Unauthorized S...","[Malicious SSH Exploiter, Unauthorized SSH Int..."
...,...,...,...
511,"DoS, FTP Patator","[1. BruteFTP, 2. PayloadStorm, 3. PassiveFlood...","[BruteFTP, PayloadStorm, PassiveFlooder, Conne..."
512,"DoS GoldenEye, Normal","[1. DestructiveStrike, 2. PeakPerformance, 3. ...","[DestructiveStrike, PeakPerformance, RuthlessB..."
513,SSH Patator,"[1. SSH Brute Force, 2. SSH Dictionary Attack,...","[SSH Brute Force, SSH Dictionary Attack, SSH P..."
514,"Fuzzers, Exploits, Normal","[1. Vulnerability Probing, 2. Attack Simulatio...","[Vulnerability Probing, Attack Simulation, Sta..."


In [None]:
cluster_info_df = combination_df.copy()

In [None]:
cluster_info_df['Cluster Centroid'] = cluster_centers.tolist()

In [None]:
cluster_info_df = cluster_info_df.drop('Combination List', axis=1)

In [None]:
cluster_info_df = cluster_info_df.rename(columns={'Combination': 'Cluster Name', 'sample': 'Combination List'})

In [None]:
cluster_info_df

Unnamed: 0,Cluster Name,Combination List,Cluster Centroid
0,Normal,"[Regular, Typical, Common, Standard, Regularized]","[0.04477231577038765, -1.1067426204681396, 0.6..."
1,Heartbleed,"[SSL vulnerability, OpenSSL exploit, Informati...","[0.20892228186130524, -1.8042747974395752, 2.1..."
2,"Normal, DoS Hulk","[DDoS Shield, Steadyflow, Regular Traffic, Bru...","[-0.6773971915245056, 0.3524855971336365, -0.2..."
3,"Normal, DDoS","[Coordinated Overload, Cyber Storm, Network As...","[1.542575716972351, 1.337799310684204, -0.9742..."
4,"Exploits, SSH Patator","[Malicious SSH Exploiter, Unauthorized SSH Int...","[0.8602568507194519, 0.048649415373802185, -0...."
...,...,...,...
511,"DoS, FTP Patator","[BruteFTP, PayloadStorm, PassiveFlooder, Conne...","[0.5987561941146851, -0.7863489985466003, -0.7..."
512,"DoS GoldenEye, Normal","[DestructiveStrike, PeakPerformance, RuthlessB...","[0.7857335209846497, 0.16938543319702148, -2.4..."
513,SSH Patator,"[SSH Brute Force, SSH Dictionary Attack, SSH P...","[0.5677225589752197, 0.7569266557693481, -0.43..."
514,"Fuzzers, Exploits, Normal","[Vulnerability Probing, Attack Simulation, Sta...","[-1.5628975629806519, 1.0497198104858398, -0.8..."


In [None]:
cluster_info_df.to_csv('CLUSTER_INFORMATION.csv', index=False)

In [None]:
# Rank every cluster centroid against the centroid of cluster 515.
# Note: the values come from cosine_similarity, so higher means more alike
# even though the column is titled 'Cosine Distance'.
query_centroid = cluster_info_df['Cluster Centroid'][515]
cosine_dist = cosine_similarity(cluster_info_df['Cluster Centroid'].to_list(), [query_centroid])
cosine_dist_formatted = [round(row[0], 4) for row in cosine_dist]
data = {f'Cluster {position}': score for position, score in enumerate(cosine_dist_formatted)}


def _cluster_name(cluster_label):
  # 'Cluster 42' -> 42 -> name stored for that row label.
  return cluster_info_df['Cluster Name'][int(cluster_label.split('Cluster ')[1])]


df_sorted = pd.DataFrame(list(data.items()), columns=['Cluster', 'Cosine Distance'])
df_sorted['Cluster Name'] = df_sorted.Cluster.apply(_cluster_name)
df_sorted = df_sorted.sort_values(by='Cosine Distance', ascending=False).reset_index(drop=True)
df_sorted.head(20)

Unnamed: 0,Cluster,Cosine Distance,Cluster Name
0,Cluster 515,1.0,"DoS GoldenEye, DoS, Normal"
1,Cluster 218,0.9779,"DoS GoldenEye, DoS, Normal"
2,Cluster 366,0.9777,"DoS, DoS GoldenEye, Normal"
3,Cluster 292,0.9747,"DoS GoldenEye, Normal"
4,Cluster 247,0.9708,"DoS, DoS GoldenEye, Normal"
5,Cluster 512,0.9688,"DoS GoldenEye, Normal"
6,Cluster 365,0.9682,"DoS GoldenEye, Normal"
7,Cluster 168,0.9667,"DoS, DoS GoldenEye, Normal"
8,Cluster 60,0.9667,"DoS GoldenEye, Normal"
9,Cluster 78,0.9632,"DoS GoldenEye, Normal"


# Similar Words

In [None]:
!pip install transformers python-docx wikipedia pycaret

Collecting pycaret
  Downloading pycaret-3.0.4-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.4/484.4 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyod>=1.0.8 (from pycaret)
  Downloading pyod-1.1.0.tar.gz (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.4/153.4 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting importlib-metadata>=4.12.0 (from pycaret)
  Downloading importlib_metadata-6.7.0-py3-none-any.whl (22 kB)
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting xxhash (from pycaret)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("/content/CLUSTER_INFORMATION.csv")

In [None]:
df

Unnamed: 0,Cluster Name,Combination List,Cluster Centroid
0,Normal,"['Regular', 'Typical', 'Common', 'Standard', '...","[0.04477231577038765, -1.1067426204681396, 0.6..."
1,Heartbleed,"['SSL vulnerability', 'OpenSSL exploit', 'Info...","[0.20892228186130524, -1.8042747974395752, 2.1..."
2,"Normal, DoS Hulk","['DDoS Shield', 'Steadyflow', 'Regular Traffic...","[-0.6773971915245056, 0.3524855971336365, -0.2..."
3,"Normal, DDoS","['Coordinated Overload', 'Cyber Storm', 'Netwo...","[1.542575716972351, 1.337799310684204, -0.9742..."
4,"Exploits, SSH Patator","['Malicious SSH Exploiter', 'Unauthorized SSH ...","[0.8602568507194519, 0.048649415373802185, -0...."
...,...,...,...
511,"DoS, FTP Patator","['BruteFTP', 'PayloadStorm', 'PassiveFlooder',...","[0.5987561941146851, -0.7863489985466003, -0.7..."
512,"DoS GoldenEye, Normal","['DestructiveStrike', 'PeakPerformance', 'Ruth...","[0.7857335209846497, 0.16938543319702148, -2.4..."
513,SSH Patator,"['SSH Brute Force', 'SSH Dictionary Attack', '...","[0.5677225589752197, 0.7569266557693481, -0.43..."
514,"Fuzzers, Exploits, Normal","['Vulnerability Probing', 'Attack Simulation',...","[-1.5628975629806519, 1.0497198104858398, -0.8..."


In [None]:
import torch
from transformers import BertModel, AutoTokenizer, logging
import numpy as np
import warnings
import docx
import re
import nltk
import wikipedia
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
# Use the notebook progress bar when get_ipython() is callable (it is only
# defined inside IPython/Jupyter); otherwise fall back to the console bar.
# Catch Exception rather than using a bare except so KeyboardInterrupt and
# SystemExit are not swallowed.
try:
  ipython = get_ipython()
  from tqdm.notebook import tqdm
except Exception:
  from tqdm import tqdm

logging.set_verbosity_error()
logging.disable_progress_bar()
warnings.filterwarnings('ignore')
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('tagsets', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [None]:
class BERTSimilarWords:

    def __init__(self, max_heading_length = 10, max_document_length = 300, exclude_stopwords=[]):
        """
        Load the `bert-base-cased` tokenizer and model and initialise the (empty) vocabulary state.

        Parameters
        ----------
        max_heading_length : lines/paragraphs with at most this many words are treated as headings (integer) (default: 10)

        max_document_length : word budget used when paragraphs are merged into or split into documents (integer) (default: 300)

        exclude_stopwords : words that must NOT be treated as stop words even though NLTK lists them (list of strings) (default: [])

        """

        # Advance the bar after each real load instead of running an empty loop up front.
        with tqdm(total=2, unit=' it', desc='Initializing', postfix='Tokenizer and Model') as progress:
            self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
            progress.update()
            self.model = BertModel.from_pretrained('bert-base-cased')
            progress.update()
        self.lemmatizer = WordNetLemmatizer()
        self.min_max_scaler = MinMaxScaler()
        if torch.cuda.is_available():
            self.processor = 'GPU'
            self.cuda_current_device = torch.cuda.current_device()
            self.model = self.model.to(self.cuda_current_device)
        else:
            self.processor = 'CPU'
        self.max_document_length = max_document_length
        self.max_heading_length = max_heading_length
        self.max_ngram = 10
        self.wikipedia_dataset_info = {}
        self.document_list = []
        self.bert_words = []
        self.bert_vectors = []
        self.bert_documents = []
        self.continous_words = []
        self.temporary_ngram_words = []
        self.count_vectorizer_words = []
        self.cv_counts = []
        self.cv_words = []
        self.count_vectorizer = CountVectorizer(analyzer=self._custom_analyzer)
        # stopwords.words() without a language returns the lists of every language NLTK ships.
        kept_words = set(exclude_stopwords)
        self.stop_words = [word for word in stopwords.words() if word not in kept_words]
        # Raw strings: the values are unchanged, but the backslashes are no longer
        # (deprecated) invalid escape sequences.
        self.punctuations = r'''!"#$%&'()*+,-./:—;<=>−?–@[\]^_`{|}~'''
        self.doc_regex = r"[\([][0-9]+[\])]|[”“‘’‛‟]|\d+\s"
        # Punctuation that does not interrupt a run of consecutive words (see _tokenize_and_embeddings).
        self.punctuations_continuity_exclude = '''—-–,−'''
        self.pos_tags_info = nltk.help.upenn_tagset
        # Slot k holds the (k + 1)-gram words, their vectors and their document ids.
        self.bert_words_ngram = [[] for _ in range(self.max_ngram)]
        self.bert_vectors_ngram = [[] for _ in range(self.max_ngram)]
        self.bert_documents_ngram = [[] for _ in range(self.max_ngram)]

    def load_dataset(self, dataset_path=None, wikipedia_query=None, wikipedia_query_limit=10, wikipedia_page_list=None):

        """
        This method extracts and processes the text content and generates word embeddings using the BERT model. Either one of the (dataset_path,wikipedia_query,wikipedia_page_list) parameters should be given.

        Parameters
        ----------
        dataset_path : the dataset paths of the text files either as a string (one file) or a list of strings (multiple files) (supported files: .docx / .txt)

        wikipedia_query : the Wikipedia search queries either as a string (one query) or a list of strings (multiple queries)

        wikipedia_query_limit : maximum number of pages to extract for each query (only when wikipedia_query is given)

        wikipedia_page_list : the list of names of Wikipedia pages to be extracted

        """

        if wikipedia_query is not None or wikipedia_page_list is not None:
            if wikipedia_query is not None:
                query_results = []
                if type(wikipedia_query) == str:
                    wikipedia_query = [wikipedia_query]
                for query in wikipedia_query:
                    query_results += wikipedia.search(query, results=wikipedia_query_limit)
            else:
                query_results = wikipedia_page_list
            page_content = []
            for result in tqdm(query_results, unit=' pages', desc='Extracting', postfix='Data from Wikipedia'):
                if '(disambiguation)' not in result and result not in self.wikipedia_dataset_info.keys():
                    try:
                        page = wikipedia.page(result, auto_suggest=False)
                    except:
                        continue
                    page_content += ['== New page =='] + page.content.split('\n\n\n')
                    self.wikipedia_dataset_info[page.title] = page.url
            self.document_list = self._process_wikipedia_dataset(page_content)
        elif dataset_path is not None:
            if type(dataset_path) == str:
                dataset_path = [dataset_path]
            for path in dataset_path:
                if path.endswith('.docx'):
                    docx_content = docx.Document(path)
                    self.document_list += self._process_docx_dataset(docx_content)
                elif path.endswith('.txt'):
                    self.document_list += self._process_txt_dataset(path)
                else:
                    raise ValueError("Files supported: .docx / .txt")
        for words, vectors, document, continous in self._tokenize_and_embeddings(self.document_list):
            self.temporary_ngram_words = []
            for i in range(len(words)):
                self._generate_n_grams(i, words, vectors, document, continous)
            self.bert_words.extend(words)
            self.bert_vectors.extend(vectors)
            self.bert_documents.extend(document)
            self.continous_words.extend(continous)
            self.count_vectorizer_words.append(words + self.temporary_ngram_words)
        self.bert_words_ngram[0] = self.bert_words
        self.bert_vectors_ngram[0] = self.bert_vectors
        self.bert_documents_ngram[0] = self.bert_documents
        self.cv_counts = self.count_vectorizer.fit_transform(self.count_vectorizer_words)
        self.cv_words = self.count_vectorizer.get_feature_names_out()
        return self

    def _process_wikipedia_dataset(self, page_content):

        document_list = []
        for section in page_content:
            if not any(exclude in section for exclude in
                       ['== Further reading ==', '== References ==', '== External links ==', '== See also ==',
                        '== Notes ==']):
                if "==" in section[:self.max_heading_length] and "===" not in section[:self.max_heading_length]:
                    flag = 0
                paragraph = section.split('\n')
                for sentence in paragraph:
                    sentence_words = sentence.split()
                    sentence_length = len(sentence_words)
                    if sentence_length > self.max_heading_length:
                        if len(document_list) != 0 and flag == 1 and len(
                                document_list[-1].split() + sentence_words) < self.max_document_length:
                            document_list[-1] += ' ' + sentence
                        else:
                            document_list = self._process_dataset_long_paragraph(document_list, sentence,
                                                                                 sentence_length)
                            flag = 1
        return document_list

    def _process_docx_dataset(self, docx_content):

        document_list = []
        for paragraph in tqdm(docx_content.paragraphs, unit=' paragraphs', desc='Extracting',
                              postfix='Data from Dataset'):
            if 'Heading' in str(paragraph.style):
                text = re.sub(self.doc_regex, '', paragraph.text)
                if len(document_list) != 0 and len(document_list[-1].split()) <= self.max_heading_length:
                    document_list[-1] = text + '.'
                else:
                    document_list.append(text + '.')
            if 'Body Text' in str(paragraph.style):
                sentence = re.sub(self.doc_regex, '', paragraph.text)
                sentence_length = len(sentence.split())
                if sentence_length > self.max_heading_length:
                    if len(document_list) != 0 and len(
                            document_list[-1].split()) + sentence_length < self.max_document_length:
                        document_list[-1] += ' ' + sentence
                    else:
                        document_list = self._process_dataset_long_paragraph(document_list, sentence, sentence_length)
        return document_list

    def _process_txt_dataset(self, path):

        document_list = []
        with open(path) as file:
            for line in tqdm(file.readlines(), unit=' paragraphs', desc='Extracting', postfix='Data from Dataset'):
                line_text = line.strip()
                line_text = re.sub(self.doc_regex, '', line_text)
                line_length = len(line_text.split())
                if 0 < line_length <= self.max_heading_length:
                    if len(document_list) != 0 and len(document_list[-1].split()) <= self.max_heading_length:
                        document_list[-1] = line_text + '.'
                    else:
                        document_list.append(line_text + '.')
                elif line_length > self.max_heading_length:
                    if len(document_list) != 0 and len(
                            document_list[-1].split()) + line_length <= self.max_document_length:
                        document_list[-1] += ' ' + line_text
                    else:
                        document_list = self._process_dataset_long_paragraph(document_list, line_text, len(
                            document_list[-1].split()) + line_length)
        return document_list

    def _process_dataset_long_paragraph(self, document_list, sentence, sentence_length):

        if sentence_length > self.max_document_length:
            for i in range(2, sentence_length):
                div = sentence_length / i
                if div < self.max_document_length:
                    break
            temp_sent = ''
            sm_sent = sent_tokenize(sentence)

            for sent in sm_sent:
                if len(temp_sent.split() + sent.split()) > div:
                    if len(document_list[-1].split()) <= self.max_heading_length:
                        document_list[-1] += ' ' + temp_sent
                    else:
                        document_list.append(temp_sent)
                    temp_sent = ''
                temp_sent = temp_sent + sent

            if len(document_list[-1].split() + temp_sent.split()) < self.max_document_length:
                document_list[-1] += ' ' + temp_sent
            else:
                document_list.append(temp_sent)
        else:
            document_list.append(sentence)
        return document_list

    def _tokenize_and_embeddings(self, document_list):

        continous_index = 0
        document_index = 0
        for document in tqdm(document_list, unit=' documents', desc='Processing', postfix='Word Embeddings'):
            if self.processor == 'GPU':
                tokens = self.tokenizer(document, truncation=True, return_tensors='pt').to(self.cuda_current_device)
            else:
                tokens = self.tokenizer(document, truncation=True, return_tensors='pt')
            words = self.tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
            word_ids = tokens.word_ids()
            output = self.model(**tokens)
            if self.processor == 'GPU':
                vectors = output.last_hidden_state[0].cpu().detach().numpy()
            else:
                vectors = output.last_hidden_state[0].detach().numpy()
            word_list = []
            vector_list = []
            continous_words = []
            word_index = -1
            for i in range(len(words)):
                if word_ids[i] is None or words[i] in self.punctuations:
                    if words[i] in self.punctuations_continuity_exclude:
                        pass
                    else:
                        continous_index = continous_index + 1
                    continue
                if word_ids[i] > word_index:
                    if len(word_list) != 0 and word_list[-1].lower() in self.stop_words:
                        word_list.pop()
                        vector_list.pop()
                        continous_words.pop()
                        continous_index = continous_index + 1
                    word_list.append(words[i])
                    vector_list.append(vectors[i])
                    continous_words.append(continous_index)
                    word_index = word_ids[i]
                elif word_ids[i] == word_index:
                    sub_word = words[i].replace('##', "")
                    word_list[-1] = word_list[-1] + sub_word
                    vector_list[-1] = (vector_list[-1] + vectors[i])
                    if word_ids[i + 1] != word_ids[i]:
                        vector_list[-1] = vector_list[-1] / word_ids.count(word_index)
            yield word_list, vector_list, [document_index] * len(word_list), continous_words
            document_index += 1

    def _generate_n_grams(self, i, words, vectors, document, continous, n=1):

        if i > n - 1 and n < self.max_ngram and continous[i] == continous[i - n]:
            temp_word = ''
            temp_vector = np.zeros([len(vectors[i])])
            for j in range(n, -1, -1):
                temp_word = temp_word + ' ' + words[i - j]
                temp_vector = temp_vector + vectors[i - j]
            self.temporary_ngram_words.append(temp_word.strip())
            self.bert_words_ngram[n].append(temp_word.strip())
            self.bert_vectors_ngram[n].append(temp_vector / (n + 1))
            self.bert_documents_ngram[n].append(document[i])
            self._generate_n_grams(i, words, vectors, document, continous, n=n + 1)
        return

    def _custom_analyzer(self, words):

        final_list = []
        for word in words:
            final_list.append(word)
            lemmatized_word = ' '.join([self.lemmatizer.lemmatize(token.lower()) for token in word.split()])
            if word != lemmatized_word:
                final_list.append(lemmatized_word)
        return final_list

    def _context_similarity_measurement(self, features, context_length):

        context_total = 0
        word_total = 0
        for i in range(context_length):
            if features[i] != 0:
                context_total += 1
        for i, x in enumerate(features[context_length:]):
            if x != 0:
                word_total += 1
        word_mean = 0.5 * np.mean(features[context_length:])
        if len(features[:context_length]) == 0:
            context_mean = 0
        else:
            context_mean = 0.5 * np.mean(features[:context_length])
        return int(str(context_total) + str(word_total)) + context_mean + word_mean

    def _get_article_words_vectors(self, similar_documents, similarity_scores, similarity_factor, input_words_max):

        document_words = []
        document_vectors = []
        # print(max(similarity_scores), similarity_factor, input_words_max, sorted(similarity_scores, reverse=True)[:20]) # TESTING

        for article in similar_documents:
            if similarity_scores[article] < similarity_scores[similar_documents[0]] - similarity_factor:
                break
            if article == len(similar_documents) - 1:
                for i in range(input_words_max):
                    document_words += self.bert_words_ngram[i][self.bert_documents_ngram[i].index(article):]
                    document_vectors += self.bert_vectors_ngram[i][self.bert_documents_ngram[i].index(article):]
            else:
                for i in range(input_words_max):
                    document_words += self.bert_words_ngram[i][
                                      self.bert_documents_ngram[i].index(article):self.bert_documents_ngram[i].index(
                                          article + 1)]
                    document_vectors += self.bert_vectors_ngram[i][
                                        self.bert_documents_ngram[i].index(article):self.bert_documents_ngram[i].index(
                                            article + 1)]
            # print('looping...') # TESTING
        return document_words, document_vectors

    def _calculate_input_word_embedding(self, input_words, document_words, document_vectors, uncased_lemmatization):

        average_list = np.zeros([len(input_words), len(document_vectors[0])])
        mean_index = []
        for i_index, i_word in enumerate(input_words):
            a_count = 0
            for a_index, a_word in enumerate(document_words):
                if uncased_lemmatization and i_word == self.lemmatizer.lemmatize(a_word.lower()):
                    average_list[i_index] += document_vectors[a_index]
                    a_count = a_count + 1
                elif i_word == a_word:
                    average_list[i_index] += document_vectors[a_index]
                    a_count = a_count + 1
            if average_list[i_index].any():
                average_list[i_index] = average_list[i_index] / a_count
                mean_index.append(i_index)
        average = np.mean(average_list[mean_index], axis=0)
        return average

    def _context_similarity_document_scores(self, input_context_words, input_context_length, input_words_length,
                                            context_similarity_factor):

        cv_list = []
        cv_counts = self.cv_counts.toarray()
        index = [i for i in np.searchsorted(self.cv_words, input_context_words) if
                 self.cv_words[i] in input_context_words]

        for i in range(len(self.document_list)):
            cv_list.append(cv_counts[i][index].tolist())

        cv_list = self.min_max_scaler.fit_transform(cv_list)
        similarity_scores = [self._context_similarity_measurement(counts, input_context_length) for counts in cv_list]
        similarity_factor = context_similarity_factor * input_words_length
        similar_documents = np.flip(np.argsort(similarity_scores))
        return similar_documents, similarity_scores, similarity_factor

    def _find_nearest_cosine_words(self, input_context_words, cosine_sim, cosine_words, pos_to_exclude,
                                   max_output_words, output_filter_factor):

        output_dict = {}
        sorted_list = np.flip(np.argsort(cosine_sim))
        lemmatized_words = {self.lemmatizer.lemmatize(token.lower()) for word in input_context_words for token in
                            word.split()}

        for i in range(len(cosine_words)):
            stop = 0
            pop_list = []
            original_word = cosine_words[sorted_list[i]]
            pos_tags = [pos[1] for pos in nltk.pos_tag(original_word.split())]
            lemmatized_word = {self.lemmatizer.lemmatize(token.lower()) for token in original_word.split()}
            if len(lemmatized_words.intersection(lemmatized_word)) > output_filter_factor * len(original_word.split()):
                continue
            if any(pos in pos_tags for pos in pos_to_exclude):
                continue
            if original_word not in output_dict.keys():
                for word in output_dict.keys():
                    if original_word in word:
                        stop = 1
                        break
                    elif word in original_word:
                        pop_list.append(word)
                        stop = 0
                if stop == 0:
                    pop = [output_dict.pop(key) for key in pop_list]
                    output_dict[original_word] = cosine_sim[sorted_list[i]]
                    if len(output_dict.keys()) == max_output_words:
                        break
        return output_dict

    def _process_input_context_words(self, input_context, input_words, single_word_split, uncased_lemmatization):

        if single_word_split:
            input_context_split = input_context.split()
            input_words_split = list(itertools.chain.from_iterable([word.split() for word in input_words]))
            input_words_max = 1
        else:
            input_context_split = [] if input_context == '' else [input_context]
            input_words_split = input_words
            input_words_max = max([len(word.split()) for word in input_words])
        if uncased_lemmatization:
            input_context_split = [' '.join([self.lemmatizer.lemmatize(token.lower()) for token in word.split()]) for
                                   word in input_context_split]
            input_words_split = [' '.join([self.lemmatizer.lemmatize(token.lower()) for token in word.split()]) for word
                                 in input_words_split]
        input_context_words = input_context_split + input_words_split
        input_context_words_max = max([len(word.split()) for word in input_context_words])
        return input_context_split, input_words_split, input_words_max, input_context_words, input_context_words_max

    def find_similar_words(self,
                           input_context='',
                           input_words=[],
                           input_embedding = [],
                           output_words_ngram=1,
                           pos_to_exclude=[],
                           max_output_words=10,
                           context_similarity_factor=0.25,
                           output_filter_factor=0.5,
                           single_word_split=True,
                           uncased_lemmatization=True
                           ):
        """
        This method calculates the cosine similarity between the average of the input words based on the given context and all the words present in the given vocabulary.

        Parameters
        ----------
        input_context : the input context (string) (optional) (default: None)

        input_words : the input words as (list of strings)

        output_words_ngram : n-gram words expected as output (integer) (optional) (default: 1)

        pos_to_exclude : the words are ignored in the output if these part of speech tags are present in it (list of strings) (optional) (default: None)

        max_output_words : the maximum number of output words to be generated (integer) (optional) (default: 10)

        context_similarity_factor : uses to tune the context-matching process, find the best paragraphs related to the given input words (float) (optional) (default: 0.25) (Range: 0 to 1)

        output_filter_factor : uses to ignore words that are similar to the given input in the output (float) (optional) (default: 0.5) (Range: 0 to 1)

        single_word_split : whether to split n-gram words when given as input (boolean) (optional) (default: True)

        uncased_lemmatization : whether to uncase and lemmatize the input (boolean) (optional) (default: True)

        """

        if input_embedding == []:

            input_context_split, input_words_split, input_words_max, input_context_words, input_context_words_max = self._process_input_context_words(
                input_context, input_words, single_word_split, uncased_lemmatization)

            similar_documents, similarity_scores, similarity_factor = self._context_similarity_document_scores(
                input_context_words, len(input_context_split), len(input_words_split), context_similarity_factor)

            document_words, document_vectors = self._get_article_words_vectors(similar_documents, similarity_scores,
                                                                              similarity_factor, input_words_max)

            input_embedding = self._calculate_input_word_embedding(input_words_split, document_words, document_vectors,
                                                                  uncased_lemmatization)
        else:
            input_context_words = []

        if output_words_ngram == 0:
            cosine_sim = cosine_similarity(list(itertools.chain.from_iterable(self.bert_vectors_ngram)),
                                           [input_embedding]).flatten()
            cosine_words = list(itertools.chain.from_iterable(self.bert_words_ngram))
        else:
            cosine_sim = cosine_similarity(self.bert_vectors_ngram[output_words_ngram - 1], [input_embedding]).flatten()
            cosine_words = self.bert_words_ngram[output_words_ngram - 1]

        output_dictionary = self._find_nearest_cosine_words(input_context_words, cosine_sim, cosine_words,
                                                            pos_to_exclude, max_output_words, output_filter_factor)
        return output_dictionary, input_embedding

In [None]:
similar = BERTSimilarWords(max_document_length=375, exclude_stopwords=['dos']).load_dataset(dataset_path='CORPUS.txt')

Initializing:   0%|          | 0/2 [00:00<?, ? it/s, Tokenizer and Model]

Extracting:   0%|          | 0/6020 [00:00<?, ? paragraphs/s, Data from Dataset]

Processing:   0%|          | 0/3010 [00:00<?, ? documents/s, Word Embeddings]

In [None]:
df.head(10)

Unnamed: 0,Cluster Name,Combination List,Cluster Centroid
0,Normal,"['Regular', 'Typical', 'Common', 'Standard', '...","[0.04477231577038765, -1.1067426204681396, 0.6..."
1,Heartbleed,"['SSL vulnerability', 'OpenSSL exploit', 'Info...","[0.20892228186130524, -1.8042747974395752, 2.1..."
2,"Normal, DoS Hulk","['DDoS Shield', 'Steadyflow', 'Regular Traffic...","[-0.6773971915245056, 0.3524855971336365, -0.2..."
3,"Normal, DDoS","['Coordinated Overload', 'Cyber Storm', 'Netwo...","[1.542575716972351, 1.337799310684204, -0.9742..."
4,"Exploits, SSH Patator","['Malicious SSH Exploiter', 'Unauthorized SSH ...","[0.8602568507194519, 0.048649415373802185, -0...."
5,Exploits,"['Vulnerability exploits', 'Code exploits', 'S...","[0.40389955043792725, -0.9180182814598083, 2.4..."
6,DoS,"['Denial of Service', 'DDoS (Distributed Denia...","[1.6399495601654053, 1.146186113357544, 0.2847..."
7,FTP Patator,"['FTP Brute Force', 'FTP Attack', 'FTP Crackin...","[0.40312013030052185, -0.7873877882957458, -0...."
8,"Fuzzers, Normal","['Packet Anonymizer', 'Traffic Generator', 'Pr...","[-1.376491665840149, 0.5139791369438171, -2.09..."
9,"Exploits, Generic","['Vulnerability Blast', 'Exploit Toolkit', 'Ge...","[-1.019413948059082, -1.3146640062332153, -0.9..."


In [None]:
df['Combination List'][333]

"['Denial of Service Hulk', 'Hulk DoS attack', 'Hulk-based DoS', 'DoS flooding (Hulk)', 'Hulk-powered DoS']"

In [None]:
tags, emb = similar.find_similar_words(input_words=['Cyber Storm', 'Unleashed Fury', 'Stable Connection', 'Network Overwhelming', 'Non Disruptive'], context_similarity_factor=0.05, output_words_ngram=0, uncased_lemmatization=True, single_word_split=False, output_filter_factor=1)
tags

{'DoS Hulk Overload Rampage network packet attack': 0.8898892839709713,
 'Cyber Extortion network packet attack': 0.8895624879582851,
 'Silent Storm network packet attack': 0.8893445347271336,
 'Hulk Resistant Traffic network packet attack': 0.8879038542781112,
 'Web Attack Brute Force network packet attack': 0.8875854981426132,
 'Disruptive Hulk network packet attack': 0.8842772151379651,
 'Fiery Onslaught network packet attack': 0.8837479605292713,
 'Common Exploit Behavior network packet attack': 0.8835440717617162,
 'Multipronged Deluge network packet attack': 0.8824595343390846,
 'Scanning Fury attack network administrators': 0.8823776611684171}

In [None]:
tags, emb = similar.find_similar_words(input_embedding=np.dot(cluster_centers[6], W), context_similarity_factor=0.05, output_words_ngram=0, uncased_lemmatization=True, single_word_split=True, output_filter_factor=1)
tags

{'DoS Breach attack': 0.8628840506477807,
 'DoS GoldenEye attack network administrators employ countermeasures': 0.8620861520492,
 'DDoS Amplification attack': 0.8617234378834395,
 'DoS Exploiter attack': 0.8608997947098893,
 'launch DoS Explosion attacks include UDP flood ICMP': 0.8606501305424887,
 'DoS attack amplifying': 0.8598369234301583,
 'DoS Explosion attacks include UDP flood ICMP flood': 0.8595873032722927,
 'Layer DoS attacks include HTTP flooding': 0.8584698009102507,
 'GoldenEye DDoS attack': 0.8584104964164232,
 'DoS Explosion attack network administrators': 0.8580101154260882}

In [None]:
embedding_list = []

In [None]:
import ast  # literal_eval parses the stringified lists without executing arbitrary code

# Options shared by both attempts below.
SIMILAR_WORDS_OPTIONS = dict(context_similarity_factor=0.05, output_words_ngram=0,
                             uncased_lemmatization=True, output_filter_factor=1)

# One embedding per row of df, in row order. Starting at len(embedding_list) makes the cell
# resumable after an interruption and correct on a fresh kernel (where the list is empty),
# instead of relying on a hard-coded row number.
for i in range(len(embedding_list), len(df)):
  combination = ast.literal_eval(df['Combination List'][i])
  try:
    tags, embed = similar.find_similar_words(input_words=combination, single_word_split=False, **SIMILAR_WORDS_OPTIONS)
  except Exception:
    # The multi-word lookup failed for this row: retry with the phrases split into single words.
    tags, embed = similar.find_similar_words(input_words=combination, single_word_split=True, **SIMILAR_WORDS_OPTIONS)
  embedding_list.append(embed)
  print(f'Completed: {i}')

In [None]:
# NOTE(review): this appends the most recent `embed` a second time, so the list contains a
# duplicated embedding. Presumably a manual stand-in for a row that could not be processed -
# confirm that embedding_list still lines up row-for-row with df before saving it.
embedding_list.append(embed)

In [None]:
len(embedding_list)

516

In [None]:
np.save('TAGS-NAMES-EMBEDDINGS.npy', embedding_list)

In [None]:
tags_embeddings = np.load('/content/drive/MyDrive/NLP/TAGS-NAMES-EMBEDDINGS.npy')

In [None]:
cluster_centers = np.load('/content/drive/MyDrive/NLP/KMEANS-CLUSTER-CENTERS.npy')

In [None]:
tags_embeddings.shape

(516, 768)

In [None]:
cluster_centers.shape

(516, 768)

In [None]:
from scipy.linalg import orthogonal_procrustes

In [None]:
R, _ = orthogonal_procrustes(cluster_centers, tags_embeddings)

In [None]:
transformed_embedding = np.dot(reduced_embeddings1[1999], R)

In [None]:
W, _, _, _ = np.linalg.lstsq(cluster_centers, tags_embeddings, rcond=None)

# Testing

In [None]:
import pandas as pd
from pycaret.clustering import *

In [None]:
mdf = pd.read_csv('/content/drive/MyDrive/NLP/DATA_TRAIN.csv')

In [None]:
mdf

Unnamed: 0,packet_dat,attack_cat
0,0 0 141 -1 80 63713 2960 2920 64 0 5 0 -1 119 ...,DDoS
1,1190 1582 3526815 -1 80 50095 1500 1460 118 0 ...,Normal
2,0 0 4 -1 80 41471 4420 4380 64 0 5 0 -1 72 84 ...,DDoS
3,0 0 176 -1 80 45284 2948 2896 64 0 8 0 -1 72 8...,DoS Hulk
4,0 0 128 -1 80 46654 1500 1448 64 0 8 0 -1 72 8...,DoS Hulk
...,...,...
1187776,14492 14492 0 -1 51328 22 164 112 62 0 8 3 -1 ...,SSH Patator
1187777,14 98 131788 -1 80 52067 1500 1460 253 0 5 0 -...,DoS
1187778,1 2 397 -1 47188 22 692 640 62 0 8 3 -1 0 0 2 ...,SSH Patator
1187779,2063 0 0 -1 80 32768 1500 1448 64 0 8 0 -1 32 ...,DoS Hulk


In [None]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("rdpahalavan/bert-network-packet-v2")

model = AutoModel.from_pretrained("rdpahalavan/bert-network-packet-v2")

In [None]:
model = model.to(0)

In [None]:
import pickle
with open('/content/drive/MyDrive/NLP/SCALER.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [None]:
scaler_col_names = [str(i) for i in range(768)]
def get_embedding(text):
  """
  Embed one packet string with the fine-tuned BERT model and apply the fitted scaler.

  The token vectors of the last hidden layer are averaged, leaving out the first and the last
  position (presumably the [CLS] / [SEP] tokens - confirm), and the mean vector is passed
  through `scaler` under the column names '0'..'767'.

  Returns the scaled embedding as a 1-D array.
  """
  # Follow the model to whatever device it lives on instead of assuming device 0.
  tokens = tokenizer(text, truncation=True, return_tensors='pt').to(model.device)
  # Inference only: do not record the autograd graph.
  with torch.no_grad():
    output = model(**tokens)
  embedding = output.last_hidden_state[:, 1:-1, :].mean(dim=1).cpu().numpy()
  # Named `features` so that the notebook-level `df` is not shadowed inside the function.
  features = pd.DataFrame(embedding[0].reshape(1, -1), columns=scaler_col_names)
  return scaler.transform(features)[0]

In [None]:
# Load the saved PyCaret clustering pipeline (K-Means, judging by the path)
# that predict_model uses below.
kmeans = load_model('/content/drive/MyDrive/NLP/K-MEANS')

Transformation Pipeline and Model Successfully Loaded


In [None]:
# Inspect row 372 of `df` (the cluster table; not defined in this section).
# In the recorded output it is named 'SSH Patator', and the prediction cell
# below assigns the sample packet to 'Cluster 372'.
df.loc[372]

Cluster Name                                              SSH Patator
Combination List    ['SSH Brute Force', 'SSH Attack', 'Password Gu...
Cluster Centroid    [0.5667868256568909, 0.78568035364151, -0.6356...
Name: 372, dtype: object

In [None]:
# Embed the packet at row 1187776 of mdf (labelled 'SSH Patator' in the frame
# preview above).
embed = get_embedding(mdf['packet_dat'][1187776])

In [None]:
# Wrap the single embedding in a one-row frame whose columns are named
# feature_1 … feature_768, the layout passed to predict_model below.
feature_names = [f'feature_{idx}' for idx in range(1, 769)]
df1 = pd.DataFrame(embed.reshape(1, -1), columns=feature_names)

In [None]:
# Assign the sample to a cluster with the loaded pipeline; the 'Cluster' column
# of the result holds the predicted cluster label.
kmeans_pred = predict_model(kmeans, data=df1)
kmeans_pred['Cluster']

0    Cluster 372
Name: Cluster, dtype: object

In [None]:
# Build a small evaluation set: up to SAMPLES_PER_CLASS packets per attack
# category (all of them when a category has fewer). The sample is seeded so a
# Restart & Run All picks the same rows every time; the original call was
# unseeded and produced a different subset on each run.
SAMPLES_PER_CLASS = 20
SAMPLE_SEED = 42
sdf = (
    mdf.groupby('attack_cat')
    .apply(lambda group: group.sample(min(len(group), SAMPLES_PER_CLASS), random_state=SAMPLE_SEED))
    .reset_index(drop=True)  # flat 0..n-1 index so later cells can address rows by position
)

In [None]:
# Spot check of the Procrustes mapping: embed packet 0 (labelled 'DDoS' in the
# frame preview above), rotate it with R, and ask `similar` for the closest
# tags. See the BERTSimilarWords documentation for the keyword arguments.
tags, emb = similar.find_similar_words(input_embedding=np.dot(get_embedding(mdf['packet_dat'][0]), R), context_similarity_factor=0.05, output_words_ngram=0, uncased_lemmatization=True, single_word_split=True, output_filter_factor=1)
tags

{'Traffic Congestion': 0.36719542023081386,
 'overwhelm': 0.3523911624177617,
 'Traffic flood': 0.33464621844603915,
 'Traffic Flood': 0.32134055336272704,
 'Traffic Surge': 0.3204894597217703,
 'flood burst': 0.31777126234853137,
 'traffic overload': 0.3167699399694465,
 'traffic surge': 0.3160530558571246,
 'onslaught': 0.31011136260923,
 'traffic flood': 0.3100854835209114}

In [None]:
# Tag every sampled packet: embed it, map the embedding with the least-squares
# matrix W, and collect the tags returned for the mapped vector, in row order.
tag_array = []
for i, packet in enumerate(sdf['packet_dat']):
  embed = get_embedding(packet)
  projected = np.dot(embed, W)
  tags, emb = similar.find_similar_words(
    input_embedding=projected,
    context_similarity_factor=0.05,
    output_words_ngram=0,
    uncased_lemmatization=True,
    single_word_split=True,
    output_filter_factor=1,
  )
  tag_array.append(tags)
  print(f'Completed: {i}')

Completed: 0
Completed: 1
Completed: 2
Completed: 3
Completed: 4
Completed: 5
Completed: 6
Completed: 7
Completed: 8
Completed: 9
Completed: 10
Completed: 11
Completed: 12
Completed: 13
Completed: 14
Completed: 15
Completed: 16
Completed: 17
Completed: 18
Completed: 19
Completed: 20
Completed: 21
Completed: 22
Completed: 23
Completed: 24
Completed: 25
Completed: 26
Completed: 27
Completed: 28
Completed: 29
Completed: 30
Completed: 31
Completed: 32
Completed: 33
Completed: 34
Completed: 35
Completed: 36
Completed: 37
Completed: 38
Completed: 39
Completed: 40
Completed: 41
Completed: 42
Completed: 43
Completed: 44
Completed: 45
Completed: 46
Completed: 47
Completed: 48
Completed: 49
Completed: 50
Completed: 51
Completed: 52
Completed: 53
Completed: 54
Completed: 55
Completed: 56
Completed: 57
Completed: 58
Completed: 59
Completed: 60
Completed: 61
Completed: 62
Completed: 63
Completed: 64
Completed: 65
Completed: 66
Completed: 67
Completed: 68
Completed: 69
Completed: 70
Completed: 71
Co

In [None]:
# Attach the results: tag_array holds one entry per row of sdf, in row order.
sdf['tags'] = tag_array

In [None]:
# Write the sampled packets with their tags to test.csv in the working
# directory, without the index column.
sdf.to_csv('test.csv', index=False)

In [None]:
# Embed the single word 'Normal' with `similar`'s own tokenizer and model:
# take the hidden state at position 1, i.e. the first token after the leading
# special token (presumably [CLS]). If 'Normal' is split into several word
# pieces, only the first piece is used.
# NOTE(review): the resulting `embedding` is not read by any cell visible in
# this section.
tokens = similar.tokenizer('Normal', truncation=True, return_tensors='pt').to(0)
output = similar.model(**tokens)
embedding = output.last_hidden_state[:,1,:].cpu().detach().numpy()

In [None]:
# Nearest tags for a raw embedding, with context similarity, lemmatization and
# word splitting switched off.
# NOTE(review): this passes `embed` (last assigned by a get_embedding call
# earlier in the notebook), not the `embedding` computed for 'Normal' in the
# preceding cell — confirm which one was intended.
similar.find_similar_words(input_embedding=embed, context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=False, single_word_split=False, output_filter_factor=1)

In [None]:
import ast  # stdlib; imported here so this cell runs on its own

# The 'Combination List' column stores each list as its text representation
# (e.g. "['SSH Brute Force', 'SSH Attack', ...]"). It is parsed with
# ast.literal_eval instead of eval: literal_eval only accepts Python literals,
# so text coming from the CSV can never execute code.
combination_words = ast.literal_eval(df['Combination List'][513])
similar.find_similar_words(input_words=combination_words, context_similarity_factor=0, output_words_ngram=0, uncased_lemmatization=False, single_word_split=False, output_filter_factor=1)

In [None]:
# Peek at documents 929–933; index 929 is the shortest document according to
# the check in the next cell.
similar.document_list[929:934]

In [None]:
# Locate the shortest document: pair every document with its index and keep
# the pair whose text has the fewest characters. Prints an (index, text) tuple.
indexed_documents = enumerate(similar.document_list)
shortest_entry = min(indexed_documents, key=lambda pair: len(pair[1]))
print(shortest_entry)

(929, 'BruteForceDoS. "BruteForceDoS" is a type of network packet attack that involves overwhelming a target system by flooding it with a massive volume of network packets. These packets are specifically designed to exploit the system\'s weaknesses and limitations, with the goal of causing the system to become unresponsive or crash. This attack is carried out by an attacker who initiates a brute force approach, sending an enormous number of packets to the target system without any regard for the integrity or validity of the packets. The attacker often uses automated tools or scripts to generate and send these packets rapidly, making it difficult for the target system to handle the influx of data.')


In [None]:
# Document length limit held by `similar` (360 in the recorded run; the unit is
# presumably tokens — confirm against the BERTSimilarWords documentation).
similar.max_document_length

360

In [None]:
# Tokenize one full document without truncation so its token count can be
# checked in the next cell. The text is kept verbatim, including its embedded
# line break.
aa = similar.tokenizer('''"DroneStorm" is a network packet attack that utilizes a large number of compromised devices, known as a botnet, to flood a target network with an overwhelming volume of network packets. These network packets are small units of data transmitted over a network, containing information such as the source address, destination address, and payload. In the case of "DroneStorm", the objective is to exhaust the target network's resources by inundating it with an excessive number of network packets, causing a disruption of service and rendering the network unavailable to legitimate users.  To orchestrate a "DroneStorm" attack, the attacker first recruits and controls a vast number of compromised devices, typically through malware or other malicious techniques. These compromised devices, often referred to as "drones" or "zombies," can include computers, servers, routers, IoT devices, and even smartphones. Once under the attacker's control, these devices act as obedient agents, ready to execute their instructions.  The attacker then directs the botnet to send an enormous volume of network packets to the target network. These packets can belong to various protocols, such as TCP (Transmission Control Protocol) or UDP (User Datagram Protocol), and can target specific network ports or flood the entire range of available ports. By overwhelming the target network with an excessive amount of incoming traffic, the attacker aims to consume the network's resources, such as bandwidth, processing power, or memory.  The impact of a "DroneStorm" attack can be devastating. The excessive network traffic can lead to network congestion, packet loss, and increased response times. In some cases, the target network may become completely inaccessible, resulting in a denial-of-service (DoS) or distributed denial-of-service (DDoS) scenario. 
Furthermore, such attacks can also serve as a distraction or cover for other malicious activities, such as data theft or unauthorized access attempts.  Defending against a "DroneStorm" attack requires a multi-layered approach, involving network monitoring, traffic filtering, and the implementation of robust security measures. This can include deploying firewalls, Intrusion Detection Systems (IDS), and Intrusion Prevention Systems (IPS), as well as regularly patching and updating devices to prevent compromise. Additionally, cooperation and communication between network administrators, Internet Service Providers (ISPs), and law enforcement agencies are crucial to identify and take down these botnets, mitigating the impact of such attacks on the internet infrastructure."''')

In [None]:
# Number of token ids produced for the sample document (550 in the recorded
# run, which exceeds similar.max_document_length of 360).
len(aa['input_ids'])

550

In [None]:
# Check whether the term "DroneStorm" is present in similar.bert_words.
"DroneStorm" in similar.bert_words

True

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE
import numpy as np

In [None]:
import pandas as pd
# One row per entry of similar.bert_words, stored in a single 'word' column.
main_df = pd.DataFrame(similar.bert_words, columns=['word'])

In [None]:
# Store each word's vector next to it. Assumes similar.bert_vectors has one
# entry per word, in the same order as similar.bert_words — TODO confirm.
main_df['Embedding'] = similar.bert_vectors

In [None]:
# Visualize the word vectors held by `similar` in 2-D: stack the per-word
# vectors into one matrix, reduce it with t-SNE (seeded for reproducibility),
# and draw a labelled scatter plot.
embedding_array = np.stack(main_df['Embedding'].values)
tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=75, random_state=42)
ne = tsne.fit_transform(embedding_array)
# NOTE(review): this rebinds `df`, which earlier cells use for the cluster table
# ('Combination List', 'Cluster Centroid'); re-running those cells after this
# one will fail. The name is kept because later cells may depend on it —
# consider renaming it (e.g. tsne_df) once that is confirmed.
df = pd.DataFrame(ne, columns=['Component 1', 'Component 2'])
df['Word'] = main_df['word']
fig = px.scatter(data_frame=df, x='Component 1', y='Component 2', text='Word', template='ggplot2')
fig.update_traces(textposition="bottom center", marker=dict(size=8))
fig.update_layout(
    title={
        # The plotted vectors come from similar.bert_vectors, so the title names
        # BERT; the earlier "Word2vec Embeddings" label was a mislabel.
        'text': "BERT Word Embeddings (t-SNE)",
        'x': 0.5,  # Set the x position to 0.5 for center alignment
        'xanchor': 'center',  # Specify the anchor point for the x position
        'yanchor': 'top'  # Align the title from the top
    }
)
fig.show()