In [4]:
import sys
sys.path.append('/home/sharedrive/nafi/traffic/lib/python3.9/site-packages')

import pandas as pd
from chronos import ChronosPipeline
import torch
from table_evaluator import TableEvaluator
import matplotlib.pyplot as plt
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer
from sdv.metadata import Metadata
import numpy as np
from ctgan import CTGAN
from ctgan import load_demo
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
from table_evaluator import TableEvaluator
import pickle
import os
import time
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Load the CICIDS train and test splits from CSV.
cic_train = pd.read_csv("/home/sharedrive/nafi/trafficp3/cicidis/datasets/clean_cicids_neg_train.csv")
cic_test = pd.read_csv("/home/sharedrive/nafi/trafficp3/cicidis/datasets/clean_cicids_neg_test.csv")
# Last expression of the cell: shows (rows, columns) of both frames.
cic_train.shape, cic_test.shape

((1979513, 23), (848363, 23))

In [6]:
# Shared preprocessing objects. ctgan_train / tvae_train re-fit both of them
# on every call, so their state always reflects the most recent run.
label_encoder = LabelEncoder()
scaler = StandardScaler()

In [7]:
def generate_data_for_classes_ctgan(df, classes, num_samples, num_epochs=500):
    """Fit one CTGAN synthesizer per class and return the sampled rows.

    Args:
        df: Real data; must contain a ``Label`` column. The SDV metadata is
            detected once from this full frame and shared by all synthesizers.
        classes: Iterable of ``Label`` values to synthesize.
        num_samples: Number of rows to sample for each class.
        num_epochs: Training epochs for each synthesizer.

    Returns:
        A DataFrame holding the samples of all classes in the order given by
        ``classes`` (each class keeps the index produced by ``sample``).
        An empty DataFrame is returned when nothing was generated.
    """
    metadata = Metadata.detect_from_dataframe(data=df)

    per_class_samples = []
    for cls in classes:
        selected_df = df[df["Label"] == cls]
        if selected_df.empty:
            # A synthesizer cannot learn from zero rows; skip the class
            # instead of aborting after earlier classes were already trained.
            print(f"Skipping class {cls!r}: no rows with this label in the input data")
            continue

        synthesizer = CTGANSynthesizer(metadata, epochs=num_epochs, verbose=True)
        synthesizer.fit(selected_df)
        per_class_samples.append(synthesizer.sample(num_samples))

    if not per_class_samples:
        # pd.concat refuses an empty list; keep the "empty frame" contract.
        return pd.DataFrame()

    # Single concat at the end instead of re-copying the accumulated frame
    # on every loop iteration.
    return pd.concat(per_class_samples)

In [10]:
# Labels for which synthetic rows are generated (ctgan_train passes this list
# to generate_data_for_classes_ctgan).
small_classes = ['Web_Attack__XSS', 'Web_Attack__Brute_Force',
       'Web_Attack__Sql_Injection', 'Infiltration', 'Heartbleed', 'Bot']
# Root output directory; the code below reads/writes its `datasets/` and
# `models/` subfolders.
save_path = "/home/sharedrive/nafi/trafficp3/cicidis"

def save_label_encoder(le, file_name):
    """Pickle ``le`` to ``{save_path}/models/{file_name}_label_encoder.pkl``.

    Args:
        le: Object to serialise (the callers in this notebook pass the fitted
            label encoder).
        file_name: Prefix of the pickle file name.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    target_path = f'{save_path}/models/{file_name}_label_encoder.pkl'
    # Create the parent folder on demand so a fresh output tree does not
    # fail with FileNotFoundError.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'wb') as le_file:
        pickle.dump(le, le_file)

def save_scaler(scaler, file_name):
    """Pickle ``scaler`` to ``{save_path}/models/{file_name}_scaler.pkl``.

    Args:
        scaler: Object to serialise (the callers in this notebook pass the
            fitted feature scaler).
        file_name: Prefix of the pickle file name.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    target_path = f'{save_path}/models/{file_name}_scaler.pkl'
    # Create the parent folder on demand so a fresh output tree does not
    # fail with FileNotFoundError.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    
def ctgan_train(train_df, epochi, file_name, reduced_test_df, rounds=0, model_name=None, sample_sizes=(1000,)):
    """Train a decision tree on real + CTGAN-synthetic rows and print its test report.

    For each entry ``n`` of ``sample_sizes`` the synthetic rows are read from
    ``{save_path}/datasets/{file_name}sample{n}epoch{epochi}rounds{rounds}.csv``
    when that file exists; otherwise they are generated for ``small_classes``
    with ``generate_data_for_classes_ctgan`` and written to that path first.

    Side effects:
        * Re-fits the module-level ``label_encoder`` and ``scaler`` and pickles
          them through ``save_label_encoder`` / ``save_scaler``. Those files
          are keyed by ``file_name`` only, so every pass overwrites them.
        * Pickles the fitted tree to
          ``{save_path}/models/{file_name}sample{n}epoch{epochi}rounds{rounds}{model_name}.pkl``.
        * Prints progress messages and the classification report.

    Args:
        train_df: Real training frame; must contain a ``Label`` column.
        epochi: Number of synthesizer epochs; also part of the file names.
        file_name: Prefix for every file written by this function.
        reduced_test_df: Evaluation frame; must contain a ``Label`` column.
        rounds: Run identifier; only used in messages and file names.
        model_name: Optional suffix appended to the model pickle name.
        sample_sizes: Synthetic rows to draw per class in ``small_classes``;
            one full train/evaluate pass is run per entry.

    Returns:
        None.
    """
    model_suffix = f"{model_name}" if model_name else ""

    # Make sure both output folders exist up front, so a missing directory
    # cannot abort the run after the synthesizer training has finished.
    for subdir in ("datasets", "models"):
        os.makedirs(f"{save_path}/{subdir}", exist_ok=True)

    for num_sample in sample_sizes:
        print(f"{file_name} epoch={epochi} samples={num_sample} round={rounds}")

        run_tag = f"{file_name}sample{num_sample}epoch{epochi}rounds{rounds}"
        synthetic_file_path = f"{save_path}/datasets/{run_tag}.csv"

        if os.path.exists(synthetic_file_path):
            print(f"Synthetic data already exists: {synthetic_file_path}")
            synthetic_data_df = pd.read_csv(synthetic_file_path)
        else:
            start_time = time.time()
            print("Generating synthetic data...")
            synthetic_data = generate_data_for_classes_ctgan(train_df, small_classes, num_sample, epochi)

            # Persist, then reload from disk, so a fresh run consumes exactly
            # the same CSV content a later cached run will read.
            synthetic_data.to_csv(synthetic_file_path, index=False)
            synthetic_data_df = pd.read_csv(synthetic_file_path)

            elapsed_time = time.time() - start_time
            print(f"Execution time for generating data: {elapsed_time:.2f} seconds")

        # Combine real and synthetic rows into one training frame.
        combined_train_synth_df = pd.concat([train_df, synthetic_data_df], axis=0, ignore_index=True)

        x_train = combined_train_synth_df.drop(columns=["Label"])
        x_test = reduced_test_df.drop(columns=["Label"])

        # Encoder and scaler are fitted on the augmented training data only
        # and then applied unchanged to the test split.
        y_train = label_encoder.fit_transform(combined_train_synth_df["Label"])
        y_test = label_encoder.transform(reduced_test_df["Label"])
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        save_label_encoder(label_encoder, file_name)
        save_scaler(scaler, file_name)

        model = DecisionTreeClassifier(random_state=1002)
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)

        # Context manager guarantees the pickle file handle is closed.
        model_path = f"{save_path}/models/{run_tag}{model_suffix}.pkl"
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        print(f"Classification Report for Sample Size {num_sample}:")
        # Explicit `labels` keeps the report aligned with `target_names` even
        # when a class is missing from both the test labels and predictions
        # (otherwise classification_report raises on the length mismatch).
        class_ids = list(range(len(label_encoder.classes_)))
        print(classification_report(y_test, predictions, digits=4, labels=class_ids,
                                    target_names=label_encoder.classes_))

In [11]:
# CTGAN-augmented decision-tree run; `rounds=11` only tags the cached CSV and saved model names.
ctgan_train(train_df=cic_train, epochi=1500, file_name='synthetic_from_cicids_ctgan', reduced_test_df=cic_test, rounds=11, model_name='DT')


synthetic_from_cicids_ctgan epoch=1500 samples=1000 round=11
Synthetic data already exists: /home/sharedrive/nafi/trafficp3/cicidis/datasets/synthetic_from_cicids_ctgansample1000epoch1500rounds11.csv
Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9953    0.9900    0.9927    681396
                      Bot     0.7661    0.5077    0.6107       587
                     DDoS     0.9984    0.9981    0.9983     38408
            DoS_GoldenEye     0.9777    0.9780    0.9778      3088
                 DoS_Hulk     0.9161    0.9750    0.9447     69037
         DoS_Slowhttptest     0.9469    0.9509    0.9489      1650
            DoS_slowloris     0.9948    0.9914    0.9931      1739
              FTP-Patator     0.9970    0.9874    0.9922      2380
               Heartbleed     1.0000    0.6667    0.8000         3
             Infiltration     0.5714    0.7273    0.6400        11
                 P

In [12]:
# CTGAN-augmented decision-tree run; `rounds=22` only tags the cached CSV and saved model names.
ctgan_train(train_df=cic_train, epochi=1500, file_name='synthetic_from_cicids_ctgan', reduced_test_df=cic_test, rounds=22, model_name='DT')


synthetic_from_cicids_ctgan epoch=1500 samples=1000 round=22
Synthetic data already exists: /home/sharedrive/nafi/trafficp3/cicidis/datasets/synthetic_from_cicids_ctgansample1000epoch1500rounds22.csv
Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9953    0.9900    0.9926    681396
                      Bot     0.7564    0.5026    0.6039       587
                     DDoS     0.9985    0.9982    0.9983     38408
            DoS_GoldenEye     0.9758    0.9773    0.9765      3088
                 DoS_Hulk     0.9163    0.9750    0.9447     69037
         DoS_Slowhttptest     0.9458    0.9509    0.9483      1650
            DoS_slowloris     0.9960    0.9931    0.9945      1739
              FTP-Patator     0.9979    0.9874    0.9926      2380
               Heartbleed     1.0000    0.6667    0.8000         3
             Infiltration     0.5833    0.6364    0.6087        11
                 P

In [13]:
# CTGAN-augmented decision-tree run; `rounds=33` only tags the cached CSV and saved model names.
ctgan_train(train_df=cic_train, epochi=1500, file_name='synthetic_from_cicids_ctgan', reduced_test_df=cic_test, rounds=33, model_name='DT')


synthetic_from_cicids_ctgan epoch=1500 samples=1000 round=33
Synthetic data already exists: /home/sharedrive/nafi/trafficp3/cicidis/datasets/synthetic_from_cicids_ctgansample1000epoch1500rounds33.csv
Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9953    0.9900    0.9927    681396
                      Bot     0.7413    0.5077    0.6026       587
                     DDoS     0.9985    0.9981    0.9983     38408
            DoS_GoldenEye     0.9786    0.9777    0.9781      3088
                 DoS_Hulk     0.9165    0.9751    0.9449     69037
         DoS_Slowhttptest     0.9464    0.9624    0.9543      1650
            DoS_slowloris     0.9954    0.9919    0.9937      1739
              FTP-Patator     0.9975    0.9874    0.9924      2380
               Heartbleed     1.0000    0.6667    0.8000         3
             Infiltration     0.6154    0.7273    0.6667        11
                 P

In [14]:
def generate_data_for_classes_tvae(df, classes, num_samples, num_epochs=500):
    """Fit one TVAE synthesizer per class and return the sampled rows.

    Args:
        df: Real data; must contain a ``Label`` column. The SDV metadata is
            detected once from this full frame and shared by all synthesizers.
        classes: Iterable of ``Label`` values to synthesize.
        num_samples: Number of rows to sample for each class.
        num_epochs: Training epochs for each synthesizer.

    Returns:
        A DataFrame holding the samples of all classes in the order given by
        ``classes`` (each class keeps the index produced by ``sample``).
        An empty DataFrame is returned when nothing was generated.
    """
    metadata = Metadata.detect_from_dataframe(data=df)

    per_class_samples = []
    for cls in classes:
        selected_df = df[df["Label"] == cls]
        if selected_df.empty:
            # A synthesizer cannot learn from zero rows; skip the class
            # instead of aborting after earlier classes were already trained.
            print(f"Skipping class {cls!r}: no rows with this label in the input data")
            continue

        synthesizer = TVAESynthesizer(metadata, epochs=num_epochs, verbose=True)
        synthesizer.fit(selected_df)
        per_class_samples.append(synthesizer.sample(num_samples))

    if not per_class_samples:
        # pd.concat refuses an empty list; keep the "empty frame" contract.
        return pd.DataFrame()

    # Single concat at the end instead of re-copying the accumulated frame
    # on every loop iteration.
    return pd.concat(per_class_samples)


# Labels for which synthetic rows are generated (tvae_train passes this list
# to generate_data_for_classes_tvae). Same values as the earlier definition
# in this notebook.
small_classes = ['Web_Attack__XSS', 'Web_Attack__Brute_Force',
       'Web_Attack__Sql_Injection', 'Infiltration', 'Heartbleed', 'Bot']
# Root output directory; the code below reads/writes its `datasets/` and
# `models/` subfolders.
save_path = "/home/sharedrive/nafi/trafficp3/cicidis"

def save_label_encoder(le, file_name):
    """Pickle ``le`` to ``{save_path}/models/{file_name}_label_encoder.pkl``.

    Args:
        le: Object to serialise (the callers in this notebook pass the fitted
            label encoder).
        file_name: Prefix of the pickle file name.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    target_path = f'{save_path}/models/{file_name}_label_encoder.pkl'
    # Create the parent folder on demand so a fresh output tree does not
    # fail with FileNotFoundError.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'wb') as le_file:
        pickle.dump(le, le_file)

def save_scaler(scaler, file_name):
    """Pickle ``scaler`` to ``{save_path}/models/{file_name}_scaler.pkl``.

    Args:
        scaler: Object to serialise (the callers in this notebook pass the
            fitted feature scaler).
        file_name: Prefix of the pickle file name.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    target_path = f'{save_path}/models/{file_name}_scaler.pkl'
    # Create the parent folder on demand so a fresh output tree does not
    # fail with FileNotFoundError.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    
def tvae_train(train_df, epochi, file_name, reduced_test_df, rounds=0, model_name=None, sample_sizes=(1000,)):
    """Train a decision tree on real + TVAE-synthetic rows and print its test report.

    For each entry ``n`` of ``sample_sizes`` the synthetic rows are read from
    ``{save_path}/datasets/{file_name}sample{n}epoch{epochi}rounds{rounds}.csv``
    when that file exists; otherwise they are generated for ``small_classes``
    with ``generate_data_for_classes_tvae`` and written to that path first.

    Side effects:
        * Re-fits the module-level ``label_encoder`` and ``scaler`` and pickles
          them through ``save_label_encoder`` / ``save_scaler``. Those files
          are keyed by ``file_name`` only, so every pass overwrites them.
        * Pickles the fitted tree to
          ``{save_path}/models/{file_name}sample{n}epoch{epochi}rounds{rounds}{model_name}.pkl``.
        * Prints progress messages and the classification report.

    Args:
        train_df: Real training frame; must contain a ``Label`` column.
        epochi: Number of synthesizer epochs; also part of the file names.
        file_name: Prefix for every file written by this function.
        reduced_test_df: Evaluation frame; must contain a ``Label`` column.
        rounds: Run identifier; only used in messages and file names.
        model_name: Optional suffix appended to the model pickle name.
        sample_sizes: Synthetic rows to draw per class in ``small_classes``;
            one full train/evaluate pass is run per entry.

    Returns:
        None.
    """
    model_suffix = f"{model_name}" if model_name else ""

    # Make sure both output folders exist up front, so a missing directory
    # cannot abort the run after the synthesizer training has finished.
    for subdir in ("datasets", "models"):
        os.makedirs(f"{save_path}/{subdir}", exist_ok=True)

    for num_sample in sample_sizes:
        print(f"{file_name} epoch={epochi} samples={num_sample} round={rounds}")

        run_tag = f"{file_name}sample{num_sample}epoch{epochi}rounds{rounds}"
        synthetic_file_path = f"{save_path}/datasets/{run_tag}.csv"

        if os.path.exists(synthetic_file_path):
            print(f"Synthetic data already exists: {synthetic_file_path}")
            synthetic_data_df = pd.read_csv(synthetic_file_path)
        else:
            start_time = time.time()
            print("Generating synthetic data...")
            synthetic_data = generate_data_for_classes_tvae(train_df, small_classes, num_sample, epochi)

            # Persist, then reload from disk, so a fresh run consumes exactly
            # the same CSV content a later cached run will read.
            synthetic_data.to_csv(synthetic_file_path, index=False)
            synthetic_data_df = pd.read_csv(synthetic_file_path)

            elapsed_time = time.time() - start_time
            print(f"Execution time for generating data: {elapsed_time:.2f} seconds")

        # Combine real and synthetic rows into one training frame.
        combined_train_synth_df = pd.concat([train_df, synthetic_data_df], axis=0, ignore_index=True)

        x_train = combined_train_synth_df.drop(columns=["Label"])
        x_test = reduced_test_df.drop(columns=["Label"])

        # Encoder and scaler are fitted on the augmented training data only
        # and then applied unchanged to the test split.
        y_train = label_encoder.fit_transform(combined_train_synth_df["Label"])
        y_test = label_encoder.transform(reduced_test_df["Label"])
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        save_label_encoder(label_encoder, file_name)
        save_scaler(scaler, file_name)

        model = DecisionTreeClassifier(random_state=1002)
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)

        # Context manager guarantees the pickle file handle is closed.
        model_path = f"{save_path}/models/{run_tag}{model_suffix}.pkl"
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        print(f"Classification Report for Sample Size {num_sample}:")
        # Explicit `labels` keeps the report aligned with `target_names` even
        # when a class is missing from both the test labels and predictions
        # (otherwise classification_report raises on the length mismatch).
        class_ids = list(range(len(label_encoder.classes_)))
        print(classification_report(y_test, predictions, digits=4, labels=class_ids,
                                    target_names=label_encoder.classes_))

In [15]:
# TVAE-augmented decision-tree run; `rounds=11` only tags the cached CSV and saved model names.
tvae_train(train_df=cic_train, epochi=1500, file_name='synthetic_from_cicids_tvae', reduced_test_df=cic_test, rounds=11, model_name="DT")


synthetic_from_cicids_tvae epoch=1500 samples=1000 round=11
Synthetic data already exists: /home/sharedrive/nafi/trafficp3/cicidis/datasets/synthetic_from_cicids_tvaesample1000epoch1500rounds11.csv
Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9953    0.9900    0.9927    681396
                      Bot     0.7500    0.5111    0.6079       587
                     DDoS     0.9986    0.9981    0.9983     38408
            DoS_GoldenEye     0.9758    0.9780    0.9769      3088
                 DoS_Hulk     0.9163    0.9749    0.9447     69037
         DoS_Slowhttptest     0.9476    0.9648    0.9562      1650
            DoS_slowloris     0.9925    0.9902    0.9914      1739
              FTP-Patator     0.9979    0.9870    0.9924      2380
               Heartbleed     1.0000    0.6667    0.8000         3
             Infiltration     0.4444    0.7273    0.5517        11
                 Por

In [16]:
# TVAE-augmented decision-tree run; `rounds=22` only tags the cached CSV and saved model names.
tvae_train(train_df=cic_train, epochi=1500, file_name='synthetic_from_cicids_tvae', reduced_test_df=cic_test, rounds=22, model_name="DT")


synthetic_from_cicids_tvae epoch=1500 samples=1000 round=22
Synthetic data already exists: /home/sharedrive/nafi/trafficp3/cicidis/datasets/synthetic_from_cicids_tvaesample1000epoch1500rounds22.csv
Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9953    0.9900    0.9927    681396
                      Bot     0.7402    0.5145    0.6070       587
                     DDoS     0.9985    0.9981    0.9983     38408
            DoS_GoldenEye     0.9770    0.9786    0.9778      3088
                 DoS_Hulk     0.9162    0.9750    0.9446     69037
         DoS_Slowhttptest     0.9458    0.9630    0.9544      1650
            DoS_slowloris     0.9925    0.9908    0.9917      1739
              FTP-Patator     0.9979    0.9870    0.9924      2380
               Heartbleed     1.0000    0.6667    0.8000         3
             Infiltration     0.5000    0.7273    0.5926        11
                 Por

In [17]:
# TVAE-augmented decision-tree run; `rounds=33` only tags the cached CSV and saved model names.
tvae_train(train_df=cic_train, epochi=1500, file_name='synthetic_from_cicids_tvae', reduced_test_df=cic_test, rounds=33, model_name="DT")


synthetic_from_cicids_tvae epoch=1500 samples=1000 round=33
Synthetic data already exists: /home/sharedrive/nafi/trafficp3/cicidis/datasets/synthetic_from_cicids_tvaesample1000epoch1500rounds33.csv
Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9953    0.9900    0.9927    681396
                      Bot     0.7519    0.5111    0.6085       587
                     DDoS     0.9984    0.9982    0.9983     38408
            DoS_GoldenEye     0.9764    0.9777    0.9770      3088
                 DoS_Hulk     0.9164    0.9749    0.9447     69037
         DoS_Slowhttptest     0.9471    0.9648    0.9559      1650
            DoS_slowloris     0.9880    0.9902    0.9891      1739
              FTP-Patator     0.9975    0.9870    0.9922      2380
               Heartbleed     1.0000    0.6667    0.8000         3
             Infiltration     0.5294    0.8182    0.6429        11
                 Por