In [None]:
## Synthetic data generation using GReaT for the CICIDS training dataset.

In [7]:
from be_great import GReaT
import pandas as pd
import os
import random
import numpy as np
import torch

# Turn off the tokenizer parallelism warning.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Seed every random number generator used below so runs are reproducible.
SEED = 4200
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
del _seed_rng
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Read the training dataset.
train_df = pd.read_csv("./Data/CICIDS_data/CICIDS_clean_train.csv")

def change(df):
    """Normalise the column names and ``Label`` values of ``df`` in place.

    Column names are stripped of surrounding whitespace and have their spaces
    replaced by underscores. Every distinct value of the ``Label`` column gets
    the same treatment and additionally loses any non-ASCII character.

    Args:
        df: DataFrame that has a ``Label`` column once its column names are
            normalised; its label values must be strings.

    Returns:
        The same ``df`` object, modified in place.
    """
    # Column names: trim whitespace, spaces -> underscores.
    renamed_columns = {
        original: original.strip().replace(" ", "_") for original in df.columns
    }
    df.rename(columns=renamed_columns, inplace=True)

    # Labels: same normalisation, then keep only characters with code point < 128.
    label_map = {}
    for raw_label in df["Label"].unique().tolist():
        underscored = raw_label.strip().replace(" ", "_")
        label_map[raw_label] = "".join(ch for ch in underscored if ord(ch) < 128)
    df["Label"] = df["Label"].replace(to_replace=label_map)
    return df

# Normalise column names and labels so the underscore-style names below can be used.
train_df = change(train_df)

# Features to be used
# NOTE(review): this list contains 'Flow_IAT_Min' but not 'Flow_IAT_Mean', whereas the
# rf_features list in the knowledge-base cell further down contains 'Flow_IAT_Mean'
# but not 'Flow_IAT_Min' -- confirm which feature set is intended.
rf_features = ['Flow_Duration', 'Total_Length_of_Bwd_Packets', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Mean', 'Flow_Bytes/s', 'Flow_Packets/s',
       'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min',
       'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max',
       'Fwd_IAT_Min', 'Fwd_Packets/s', 'Bwd_Packets/s', 'Max_Packet_Length',
       'Packet_Length_Mean', 'Average_Packet_Size', 'Avg_Bwd_Segment_Size',
       'Subflow_Bwd_Bytes', 'min_seg_size_forward']

# Small classes for synthetic data generation
small_classes = ['Web_Attack__XSS', 'Web_Attack__Brute_Force', 'Web_Attack__Sql_Injection',
                 'Infiltration', 'Heartbleed', 'Bot']

# Keep only the rows of the selected classes and the selected feature columns plus the label.
filtered_data = train_df.loc[train_df['Label'].isin(small_classes), rf_features + ['Label']]

# Initialize GReaT with a random seed
# NOTE(review): a single GReaT instance is fitted on every class in turn; presumably each
# fit() continues from the weights left by the previous class -- verify against be_great
# and create the model inside the loop if per-class independence is required.
model = GReaT(llm='distilgpt2', batch_size=32, epochs=200, fp16=True, seed=SEED)

# Generate synthetic data for each class. The per-class frames are collected in a list and
# concatenated once, instead of repeatedly concatenating onto an (initially empty) DataFrame.
synthetic_frames = []
for label in small_classes:
    class_data = filtered_data.loc[filtered_data['Label'] == label, rf_features]
    model.fit(class_data)
    synthetic_class_data = model.sample(n_samples=1000, max_length=2000)
    synthetic_class_data['Label'] = label
    synthetic_frames.append(synthetic_class_data)

synthetic_data_all = pd.concat(synthetic_frames, ignore_index=True)

# Label data as original or synthetic. assign() builds a new frame, so re-running this part
# never depends on a previously mutated filtered_data.
filtered_data = filtered_data.assign(Data_Type='Original')
synthetic_data_all['Data_Type'] = 'Synthetic'

# Combine original and synthetic data
combined_data = pd.concat([filtered_data, synthetic_data_all], ignore_index=True)

# Save combined data to a CSV file
combined_data.to_csv("./Data/CICIDS_data/Great_Synthetic_1000_R3.csv", index=False)

# Display the first few rows of the combined dataset
print(combined_data.head())


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.8902
1000,0.6972
1500,0.6058
2000,0.5369
2500,0.4963
3000,0.4765


1033it [00:35, 29.42it/s]                                                                                                                
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.7174
1000,0.6519
1500,0.6047
2000,0.5579
2500,0.5174
3000,0.4842
3500,0.4579
4000,0.4379
4500,0.4225
5000,0.4117


1012it [00:25, 39.36it/s]                                                                                                                
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


1014it [00:26, 39.00it/s]                                                                                                                
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


1033it [00:30, 33.50it/s]                                                                                                                
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


1076it [00:32, 32.77it/s]                                                                                                                
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.7151
1000,0.6138
1500,0.5711
2000,0.5306
2500,0.4907
3000,0.4594
3500,0.4336
4000,0.4132
4500,0.3981
5000,0.3857


1010it [00:23, 42.30it/s]                                                                                                                

   Flow_Duration  Total_Length_of_Bwd_Packets  Bwd_Packet_Length_Max  \
0      5907512.0                          0.0                    0.0   
1      5185197.0                          0.0                    0.0   
2      5446804.0                          0.0                    0.0   
3      5754952.0                          0.0                    0.0   
4        72994.0                        134.0                  128.0   

   Bwd_Packet_Length_Mean  Flow_Bytes/s  Flow_Packets/s  Flow_IAT_Std  \
0                0.000000      0.000000        0.677104  3.409869e+06   
1                0.000000      0.000000        0.771427  2.992884e+06   
2                0.000000      0.000000        0.734376  3.143948e+06   
3                0.000000      0.000000        0.695054  3.321858e+06   
4               44.666667   4685.316601       95.898293  2.885074e+04   

   Flow_IAT_Max  Flow_IAT_Min  Fwd_IAT_Total  ...  Fwd_Packets/s  \
0     5906548.0         211.0      5907512.0  ...       0.50




In [None]:
## Synthetic data generation on the CICIDS training dataset using the knowledge base

In [1]:

from sdv.single_table import CTGANSynthesizer, TVAESynthesizer
#from sdv.tabular import CTGAN
from sdv.metadata import Metadata
import pandas as pd
import numpy as np
import json
import pickle
import random
from collections import defaultdict
#import pdb;pdb.set_trace()

from be_great import GReaT
import pandas as pd
import os
import random
import numpy as np
import torch
from transformers import TrainingArguments

# Turn off the tokenizer parallelism warning.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Seed every random number generator used below so runs are reproducible.
SEED = 4200
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
del _seed_rng
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Use this feature list.
rf_features = [
    'Flow_Duration', 'Total_Length_of_Bwd_Packets', 'Bwd_Packet_Length_Max',
    'Bwd_Packet_Length_Mean', 'Flow_Bytes/s', 'Flow_Packets/s',
    'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
    'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max',
    'Fwd_IAT_Min', 'Fwd_Packets/s', 'Bwd_Packets/s', 'Max_Packet_Length',
    'Packet_Length_Mean', 'Average_Packet_Size', 'Avg_Bwd_Segment_Size',
    'Subflow_Bwd_Bytes', 'min_seg_size_forward',
]

# Read the training dataset.
real_data = pd.read_csv("./Data/CICIDS_data/CICIDS_clean_train.csv")


def change(df):
    """Normalise the column names and ``Label`` values of ``df`` in place.

    Column names are stripped of surrounding whitespace and have their spaces
    replaced by underscores. Every distinct value of the ``Label`` column gets
    the same treatment and additionally loses any non-ASCII character.

    Args:
        df: DataFrame that has a ``Label`` column once its column names are
            normalised; its label values must be strings.

    Returns:
        The same ``df`` object, modified in place.
    """
    column_names = list(df.columns)
    new_column_names = [name.strip().replace(" ", "_") for name in column_names]
    df.rename(columns=dict(zip(column_names, new_column_names)), inplace=True)

    labels = df["Label"].unique().tolist()
    new_labels = [
        ''.join(ch for ch in label.strip().replace(" ", "_") if ord(ch) < 128)
        for label in labels
    ]
    # Assign the result back instead of calling replace(inplace=True) on the
    # temporary Series returned by df["Label"]: that chained in-place call is
    # deprecated and does not update df under pandas Copy-on-Write.
    df["Label"] = df["Label"].replace(to_replace=dict(zip(labels, new_labels)))
    return df


# Normalise column names and labels so the underscore-style names in rf_features can index the frame.
real_data = change(real_data)

def clean_dataset(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Cleaning steps, in order:
      1. Drop rows that have a missing value in any column or a non-finite
         value (NaN, +inf, -inf) in any numeric column.
      2. In every numeric column, replace negative entries with the mean of
         that column's non-negative entries.
      3. Drop columns whose values are all zero.
      4. Drop rows that became NaN in step 2 (only possible when a column had
         no non-negative entry to take a mean of).

    Non-finite rows are removed *before* the means are computed, so a single
    +inf can no longer turn a column mean into inf (which would cause every
    negative-valued row to be discarded afterwards), and -inf rows are
    dropped rather than silently replaced by the mean.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame holding the surviving rows (original index labels
        kept) and the surviving columns.

    Raises:
        AssertionError: if ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    finite_rows = np.isfinite(df[numeric_columns]).all(axis=1)
    # Explicit copy: the column assignments below must never write through to
    # (or warn about) the caller's frame.
    cleaned = df.loc[finite_rows].dropna().copy()

    for column in numeric_columns:
        values = cleaned[column]
        negative = values < 0
        if negative.any():
            cleaned[column] = values.mask(negative, values[~negative].mean())

    cleaned = cleaned.loc[:, (cleaned != 0).any(axis=0)]
    return cleaned.dropna()

# Keep only the selected feature columns plus the label, then clean that subset.
filtered_data = clean_dataset(real_data[rf_features+['Label']])

#real_data = clean_dataset(real_data)
#real_data = real_data[rf_features]



# Load the rule knowledge base from JSON; clean_kb and process_data below look it up by class label.
with open("./DT_KB_CIC_2.json", "r") as file:
    rules = json.load(file)


def clean_kb(kb):
    """Return a copy of the knowledge base with duplicate entries removed per class.

    For every class key the entries are de-duplicated by equality while keeping
    the order of first occurrence. The number of entries before and after
    de-duplication is printed for each class.

    Args:
        kb: Mapping from class key to a list of entries.

    Returns:
        A new dict with the same keys, each mapped to its de-duplicated list.
    """
    deduplicated = {}
    for cls, entries in kb.items():
        print(f"Before: {len(entries)}")
        unique_entries = []
        for entry in entries:
            # Entries are compared by equality, so a linear membership test is used.
            if entry in unique_entries:
                continue
            unique_entries.append(entry)
        print(f"After: {len(unique_entries)}")
        deduplicated[cls] = unique_entries
    return deduplicated

# Remove duplicate entries per class from the knowledge base (prints before/after counts).
rules = clean_kb(rules)


def process_data(x, cls, kb=None):
    """Keep only the rows of ``x`` that satisfy at least one rule path.

    A rule path is a sequence of ``(f, r, t)`` conditions where ``f`` is the
    positional column index into a row, ``r`` is the relation and ``t`` the
    threshold. A row satisfies a path when every ``'>'`` condition has
    ``row[f] > t`` and every ``'<='`` condition has ``row[f] <= t``.
    Conditions with any other relation are not checked (they count as
    satisfied), matching the previous behaviour.

    Because ``f`` is positional, the column order of ``x`` must be the one the
    rules were built with -- NOTE(review): confirm against the code that
    produced the knowledge base.

    Args:
        x: DataFrame of candidate rows.
        cls: Class key used to look up the paths in the module-level ``rules``
            when ``kb`` is not given.
        kb: Optional list of rule paths to use instead of ``rules[cls]``.

    Returns:
        A new DataFrame with the non-conforming rows dropped and the index
        reset. The number of dropped rows is printed.
    """
    if kb is None:
        kb = rules[cls]

    def _satisfies(row, path):
        # True when no '>' / '<=' condition on the path is violated by the row.
        for f, r, t in path:
            if r == '>' and not (row.iloc[f] > t):
                return False
            if r == '<=' and not (row.iloc[f] <= t):
                return False
        return True

    indices_to_remove = [
        i for i, row in x.iterrows()
        if not any(_satisfies(row, path) for path in kb)
    ]
    print(f"Data Dropped: {len(indices_to_remove)}")
    return x.drop(indices_to_remove).reset_index(drop=True)

selected_classes = ["Web_Attack__Sql_Injection","Web_Attack__XSS", "Web_Attack__Brute_Force", "Heartbleed", "Infiltration", "Bot"]

# NOTE(review): a single GReaT instance is fitted on every class in turn; presumably each
# fit() continues from the weights left by the previous class -- verify against be_great
# and create the model inside the loop if per-class independence is required.
model = GReaT(
    llm='distilgpt2',
    batch_size=32,
    epochs=1000,
    fp16=True,
    seed=SEED,
    save_total_limit=5,  # Limit the number of checkpoints
)

# Number of rule-conforming synthetic rows to produce per class.
target = 1000
# Upper bound on rejection-sampling rounds per class, so a knowledge base that rejects
# every generated row cannot keep the loop (and the GPU) running forever.
max_sampling_rounds = 50

synthetic_frames = []

for label in selected_classes:
    # NOTE(review): class_data still contains the constant 'Label' column, so the model is
    # fitted on it as well (the first cell fits on rf_features only) -- confirm intended.
    class_data = filtered_data[filtered_data['Label'] == label]

    model.fit(class_data)

    accepted_frames = []
    accepted_rows = 0
    sampling_rounds = 0

    # Rejection sampling: request only the rows still missing, keep the ones that satisfy
    # the knowledge base, and repeat until the target is reached.
    while accepted_rows < target:
        if sampling_rounds >= max_sampling_rounds:
            raise RuntimeError(
                f"Only {accepted_rows}/{target} rule-conforming rows for class "
                f"'{label}' after {max_sampling_rounds} sampling rounds."
            )
        sampling_rounds += 1

        sampled = model.sample(n_samples=target - accepted_rows, max_length=2000)

        print(len(sampled))
        cons_sampled = process_data(sampled, label)

        accepted_rows += len(cons_sampled)
        accepted_frames.append(cons_sampled)

    syn_data = pd.concat(accepted_frames, ignore_index=True)
    syn_data['Label'] = label
    synthetic_frames.append(syn_data)

# Concatenate once instead of growing a DataFrame inside the loop.
synthetic_data_all = pd.concat(synthetic_frames, ignore_index=True)

# Save the synthetic data to a CSV file and report the path that was actually written.
output_path = "./Data/CICIDS_data/Great_Synthetic_1000_R1_N.csv"
synthetic_data_all.to_csv(output_path, index=False)
print(f"Synthetic data has been saved to '{output_path}'.")


  from .autonotebook import tqdm as notebook_tqdm


Before: 227
After: 217
Before: 337
After: 242
Before: 14
After: 12
Before: 13
After: 13
Before: 1
After: 1
Before: 649
After: 131


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4569
1000,0.2296


  0%|                                                                                                                                 | 0/1000 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
1004it [03:58,  4.21it/s]                                                                                                                                      


1000
Data Dropped: 22


28it [00:05,  5.59it/s]                                                                                                                                        


22
Data Dropped: 0


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.7724
1000,0.6422
1500,0.5143
2000,0.4217
2500,0.3683
3000,0.3371
3500,0.3164
4000,0.3001
4500,0.2874
5000,0.2765


1003it [00:26, 38.46it/s]                                                                                                                                      


1000
Data Dropped: 3


92it [00:02, 38.60it/s]                                                                                                                                        


3
Data Dropped: 0


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.6796
1000,0.5922
1500,0.5264
2000,0.4559
2500,0.3988
3000,0.359
3500,0.332
4000,0.3131
4500,0.2997
5000,0.2884


1069it [00:26, 40.60it/s]                                                                                                                                      


1000
Data Dropped: 98


192it [00:04, 40.28it/s]                                                                                                                                       


98
Data Dropped: 15


97it [00:02, 39.14it/s]                                                                                                                                        


15
Data Dropped: 2


99it [00:02, 42.74it/s]                                                                                                                                        


2
Data Dropped: 0


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.3228
1000,0.1824


1059it [00:27, 38.23it/s]                                                                                                                                      


1000
Data Dropped: 3


97it [00:02, 38.99it/s]                                                                                                                                        


3
Data Dropped: 0


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.364
1000,0.2009


1054it [00:28, 37.05it/s]                                                                                                                                      


1000
Data Dropped: 142


192it [00:04, 40.20it/s]                                                                                                                                       


142
Data Dropped: 25


94it [00:02, 39.06it/s]                                                                                                                                        


25
Data Dropped: 1


99it [00:02, 41.79it/s]                                                                                                                                        


1
Data Dropped: 1


97it [00:02, 40.34it/s]                                                                                                                                        


1
Data Dropped: 0


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.7116
1000,0.5897
1500,0.5357
2000,0.4781
2500,0.4242
3000,0.3853
3500,0.359
4000,0.3411
4500,0.3271
5000,0.3158


1081it [00:25, 41.90it/s]                                                                                                                                      


1000
Data Dropped: 1


99it [00:02, 42.27it/s]                                                                                                                                        

1
Data Dropped: 0
Synthetic data has been saved to 'synthetic_data.csv'.





In [2]:
# Show the generated synthetic rows as the cell output.
synthetic_data_all

Unnamed: 0,Flow_Duration,Total_Length_of_Bwd_Packets,Bwd_Packet_Length_Max,Bwd_Packet_Length_Mean,Flow_Bytes/s,Flow_Packets/s,Flow_IAT_Mean,Flow_IAT_Std,Flow_IAT_Max,Fwd_IAT_Total,...,Fwd_IAT_Min,Fwd_Packets/s,Bwd_Packets/s,Max_Packet_Length,Packet_Length_Mean,Average_Packet_Size,Avg_Bwd_Segment_Size,Subflow_Bwd_Bytes,min_seg_size_forward,Label
0,43.0,0.0,0.0,0.000000,0.000000,46511.627910,43.0000,0.000000e+00,43.0,0.0,...,0.0,23255.813950,23255.813950,0.0,0.000000,0.000000,0.000000,0.0,32.0,Web_Attack__Sql_Injection
1,71.0,0.0,0.0,0.000000,0.000000,28169.014080,71.0000,0.000000e+00,71.0,0.0,...,0.0,14084.507040,14084.507040,0.0,0.000000,0.000000,0.000000,0.0,32.0,Web_Attack__Sql_Injection
2,5005388.0,2021.0,2021.0,673.666667,523.435945,1.598278,715055.4286,1.889529e+04,5000145.0,6119.0,...,4.0,0.799014,0.799014,2021.0,291.111111,327.500000,673.666667,2021.0,32.0,Web_Attack__Sql_Injection
3,5005168.0,2021.0,2021.0,673.666667,933.644955,1.597616,715055.4286,1.765197e+06,5000160.0,52746.0,...,4.0,0.798455,0.798455,2021.0,291.111111,327.500000,673.666667,2021.0,32.0,Web_Attack__Sql_Injection
4,5012479.0,4149.0,2701.0,1037.250000,933.644955,1.995021,556942.1111,1.666210e+06,5000160.0,5012479.0,...,3.0,1.197012,0.798008,2701.0,431.727273,474.900000,1037.250000,4149.0,32.0,Web_Attack__Sql_Injection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,83.0,6.0,6.0,6.000000,144578.313300,24096.385540,83.0000,0.000000e+00,83.0,0.0,...,0.0,12048.192770,12048.192770,6.0,6.000000,9.000000,6.000000,6.0,20.0,Bot
5996,50.0,6.0,6.0,6.000000,240000.000000,40000.000000,50.0000,0.000000e+00,50.0,0.0,...,0.0,20000.000000,20000.000000,6.0,6.000000,9.000000,6.000000,6.0,20.0,Bot
5997,69188.0,134.0,128.0,44.666667,4958.030705,101.298062,11586.0000,2.713479e+04,67223.0,69188.0,...,25.0,57.540710,43.516101,194.0,42.500000,48.571429,44.666667,134.0,20.0,Bot
5998,20.0,6.0,6.0,6.000000,600000.000000,100000.000000,20.0000,0.000000e+00,20.0,0.0,...,0.0,50000.000000,50000.000000,6.0,6.000000,9.000000,6.000000,6.0,20.0,Bot
