In [1]:
import sys
sys.path.append('/home/sharedrive/nafi/traffic/lib/python3.9/site-packages')

import pandas as pd
from chronos import ChronosPipeline
import torch
from table_evaluator import TableEvaluator
import matplotlib.pyplot as plt
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata
import numpy as np
from ctgan import CTGAN
from ctgan import load_demo
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
from table_evaluator import TableEvaluator
import pickle
import os

In [2]:
# Load the feature-selected ITU test split from CSV; the trailing expression
# displays (rows, columns) as the cell output.
itu_test_df = pd.read_csv("/home/sharedrive/nafi/trafficp3/datasets/feature_selected_itu_test.csv")
itu_test_df.shape

(511540, 25)

In [3]:
# Remove two feature columns from the test frame.
# Re-assigning (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the columns are already gone.
itu_test_df = itu_test_df.drop(
    columns=["Init_Win_bytes_forward", "Flow_IAT_Min"],
    errors="ignore",
)
itu_test_df.shape

(511540, 23)

In [4]:
def replace_negatives_with_positive_mean(df):
    """Return a copy of ``df`` with negative numeric values replaced.

    For every numeric column except ``"Label"``, each value below zero is
    replaced by the mean of that column's non-negative (``>= 0``) values.
    The input frame is never modified.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame. Integer columns that needed a replacement are
        converted to a floating dtype so the (generally fractional) mean can
        be stored; columns without negatives keep their dtype.

    Notes:
        When a column has no non-negative values at all there is nothing to
        average, so a message is printed and the column is left unchanged.
    """
    df_copy = df.copy()

    for col in df_copy.columns:
        if col == "Label" or not pd.api.types.is_numeric_dtype(df_copy[col]):
            continue

        values = df_copy[col]
        non_negative = values[values >= 0]
        if non_negative.empty:
            print(f"Column '{col}' has no positive values. Negatives remain unchanged.")
            continue

        negative_mask = values < 0
        if not negative_mask.any():
            # Nothing to replace; leave the column (and its dtype) alone.
            continue

        if pd.api.types.is_integer_dtype(values.dtype):
            # Writing a float into an integer column via .loc relies on silent
            # upcasting, which pandas deprecated (PDEP-6). Cast explicitly.
            float_dtype = (
                "Float64"
                if pd.api.types.is_extension_array_dtype(values.dtype)
                else "float64"
            )
            df_copy[col] = values.astype(float_dtype)

        df_copy.loc[negative_mask, col] = non_negative.mean()

    return df_copy

# Clean the test set: negatives in numeric feature columns are replaced by the
# mean of that column's non-negative values ("Label" is skipped).
itu_test_df = replace_negatives_with_positive_mean(itu_test_df)

import pandas as pd
from tqdm import tqdm

def analyze_negative_values(df, show_values=False, sample=10):
    """Count the negative entries in every numeric column of ``df``.

    Args:
        df: DataFrame to inspect; only the columns picked by
            ``select_dtypes(include=['number'])`` are examined.
        show_values: When True, columns that contain negatives also get a
            "Sample Values" entry listing the first ``sample`` of them.
        sample: Maximum number of negative values listed per column.

    Returns:
        DataFrame with one row per numeric column, holding "Feature" and
        "Negative Count" (plus "Sample Values" where it was recorded).
    """
    rows = []
    columns_to_scan = df.select_dtypes(include=['number']).columns

    # tqdm wraps the iteration so a progress bar is shown while scanning.
    for name in tqdm(columns_to_scan, desc="Processing columns"):
        is_negative = df[name] < 0
        n_negative = is_negative.sum()

        row = {"Feature": name, "Negative Count": n_negative}
        if show_values and n_negative > 0:
            row["Sample Values"] = df[name][is_negative].head(sample).tolist()
        rows.append(row)

    return pd.DataFrame(rows)

# Display the per-column negative counts for the cleaned test frame.
analyze_negative_values(itu_test_df)

Processing columns: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 1395.97it/s]


Unnamed: 0,Feature,Negative Count
0,Flow_Duration,0
1,Total_Length_of_Bwd_Packets,0
2,Bwd_Packet_Length_Max,0
3,Bwd_Packet_Length_Mean,0
4,Flow_Bytes/s,0
5,Flow_Packets/s,0
6,Flow_IAT_Mean,0
7,Flow_IAT_Std,0
8,Flow_IAT_Max,0
9,Fwd_IAT_Total,0


In [7]:
def load_label_encoder(file_name):
    """Unpickle the label encoder saved under ``file_name``.

    Reads ``<save_path>/models/<file_name>_label_encoder.pkl`` and returns
    whatever object that pickle contains.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    save_path = "/home/sharedrive/nafi/trafficp3/cicidis"
    encoder_path = f'{save_path}/models/{file_name}_label_encoder.pkl'
    with open(encoder_path, 'rb') as handle:
        encoder = pickle.load(handle)
    return encoder

def load_scaler(file_name):
    """Unpickle the feature scaler saved under ``file_name``.

    Reads ``<save_path>/models/<file_name>_scaler.pkl`` and returns whatever
    object that pickle contains.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    save_path = "/home/sharedrive/nafi/trafficp3/cicidis"
    scaler_path = f'{save_path}/models/{file_name}_scaler.pkl'
    with open(scaler_path, 'rb') as handle:
        scaler = pickle.load(handle)
    return scaler

def load_model(file_name, rounds, num_samples=1000, epochi=1500):
    """Unpickle the model saved for a given training configuration.

    The artifact name is built from all four arguments:
    ``<file_name>sample<num_samples>epoch<epochi>rounds<rounds>.pkl`` inside
    ``<save_path>/models``.

    Args:
        file_name: Prefix of the artifact file name.
        rounds: Value embedded after ``rounds`` in the file name.
        num_samples: Value embedded after ``sample`` in the file name.
        epochi: Value embedded after ``epoch`` in the file name.

    Returns:
        The object stored in the pickle.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    save_path = "/home/sharedrive/nafi/trafficp3/cicidis"
    model_path = f'{save_path}/models/{file_name}sample{num_samples}epoch{epochi}rounds{rounds}.pkl'
    with open(model_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


def inference(test_df, load_file_name, rounds, num_sample=1000):
    """Evaluate a saved model on ``test_df`` and print a classification report.

    The scaler, label encoder and model are loaded through ``load_scaler``,
    ``load_label_encoder`` and ``load_model``. Features are every column of
    ``test_df`` except ``"Label"``; ``"Label"`` holds the ground truth.

    Args:
        test_df: DataFrame containing the feature columns and a "Label" column.
        load_file_name: Artifact prefix shared by scaler, encoder and model.
        rounds: Forwarded to ``load_model`` to select the model file.
        num_sample: Sample size of the model to load; also shown in the
            report header.

    Returns:
        None. The report is printed.
    """
    features = test_df.drop(columns=["Label"])
    labels = test_df["Label"]

    scaler = load_scaler(load_file_name)
    le = load_label_encoder(load_file_name)
    # Forward num_sample so the model that is loaded matches the sample size
    # printed in the header (previously the default model was always loaded).
    RF_model = load_model(load_file_name, rounds, num_samples=num_sample)

    x_test = scaler.transform(features)
    y_test = le.transform(labels)
    print("scaler and label encoder loaded & applied!!")

    pred_rf = RF_model.predict(x_test)

    # Pass the full label set explicitly so the report still works when some
    # class is absent from both y_test and the predictions; otherwise the
    # number of target_names would not match the labels found in the data.
    # Assumes le.transform encodes classes as 0..len(le.classes_)-1
    # (sklearn LabelEncoder behaviour) -- TODO confirm for the pickled encoder.
    class_ids = list(range(len(le.classes_)))

    print(f"Classification Report for Sample Size {num_sample}:")
    print(
        classification_report(
            y_test,
            pred_rf,
            labels=class_ids,
            digits=4,
            target_names=le.classes_,
        )
    )

In [8]:
# Evaluate the 'synthetic_from_cicids_tvae' artifacts saved with rounds=11 on the ITU test frame.
inference(itu_test_df, 'synthetic_from_cicids_tvae', 11)


scaler and label encoder loaded & applied!!


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    3.0s


Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9600    0.9573    0.9587    410865
                      Bot     0.0263    0.9774    0.0512       354
                     DDoS     0.6391    0.9956    0.7784     23160
            DoS_GoldenEye     0.9897    0.9833    0.9865      1861
                 DoS_Hulk     0.8580    0.6623    0.7476     41626
         DoS_Slowhttptest     0.9432    0.9527    0.9479       994
            DoS_slowloris     0.9949    0.9284    0.9605      1048
              FTP-Patator     0.9944    0.9861    0.9902      1436
               Heartbleed     1.0000    1.0000    1.0000         2
             Infiltration     1.0000    0.5000    0.6667         6
                 PortScan     0.9992    0.4955    0.6625     28728
              SSH-Patator     1.0000    0.4902    0.6579      1067
  Web_Attack__Brute_Force     1.0000    0.1029    0.1867       272
Web_Attack__Sql_I

In [9]:
# Evaluate the 'synthetic_from_cicids_tvae' artifacts saved with rounds=22 on the ITU test frame.
inference(itu_test_df, 'synthetic_from_cicids_tvae', 22)


scaler and label encoder loaded & applied!!


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    3.0s


Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9890    0.9584    0.9734    410865
                      Bot     0.0292    0.9802    0.0568       354
                     DDoS     0.9931    0.9380    0.9648     23160
            DoS_GoldenEye     0.9908    0.9850    0.9879      1861
                 DoS_Hulk     0.8880    0.9147    0.9012     41626
         DoS_Slowhttptest     0.9519    0.8763    0.9125       994
            DoS_slowloris     0.9949    0.9265    0.9595      1048
              FTP-Patator     0.9944    0.9861    0.9902      1436
               Heartbleed     1.0000    1.0000    1.0000         2
             Infiltration     1.0000    0.5000    0.6667         6
                 PortScan     0.9939    0.9931    0.9935     28728
              SSH-Patator     1.0000    0.4883    0.6562      1067
  Web_Attack__Brute_Force     1.0000    0.1066    0.1927       272
Web_Attack__Sql_I

In [10]:
# Evaluate the 'synthetic_from_cicids_tvae' artifacts saved with rounds=33 on the ITU test frame.
inference(itu_test_df, 'synthetic_from_cicids_tvae', 33)


scaler and label encoder loaded & applied!!


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    3.2s


Classification Report for Sample Size 1000:
                           precision    recall  f1-score   support

                   BENIGN     0.9959    0.9906    0.9932    410865
                      Bot     0.9427    0.6045    0.7367       354
                     DDoS     0.9998    0.9994    0.9996     23160
            DoS_GoldenEye     0.9872    0.9914    0.9893      1861
                 DoS_Hulk     0.9183    0.9777    0.9470     41626
         DoS_Slowhttptest     0.9812    0.9970    0.9890       994
            DoS_slowloris     0.9990    1.0000    0.9995      1048
              FTP-Patator     0.9951    0.9868    0.9909      1436
               Heartbleed     1.0000    1.0000    1.0000         2
             Infiltration     0.8333    0.8333    0.8333         6
                 PortScan     0.9940    0.9999    0.9969     28728
              SSH-Patator     0.9981    0.4958    0.6625      1067
  Web_Attack__Brute_Force     0.9160    0.8419    0.8774       272
Web_Attack__Sql_I