In [None]:
import sys, os
sys.path.append(os.path.abspath('../'))

In [2]:
from src.utils.flow import *
from src.utils.flow_calculations import *
from src.utils.restoration import *
from src.utils.truncated_packet import *
from src.operations.size_perturbation_logic import *
from src.operations.timing_perturbation_logic import *
from src.operations.calculate_fitness import *

In [3]:
def apply_best_on_packets(packets, max_fitness, flow_id):
    """Apply the winning perturbation described by ``max_fitness`` to ``packets``.

    Parameters:
    - packets: the packet list handed to the perturbation helpers.
    - max_fitness: indexable whose element ``[1]`` is a record laid out as
      ``[scores, operations, winner_index]``; only ``[1][1]`` and ``[1][2]``
      are read here.
    - flow_id: identifier forwarded unchanged to the perturbation helpers.

    Returns:
    - ``packets`` itself when the winner index is 0 (no operation) or is not
      one of 1-4; otherwise whatever the selected helper returns.
    """
    best_record = max_fitness[1]
    winner = best_record[2]

    # Winner 0 means the unmodified packets scored best: nothing to apply.
    if winner == 0:
        return packets

    # Winner k (1-based) uses the k-th decoded operation.
    params = max_fitness[1][1][winner - 1]

    # 1 / 2 -> packet-size perturbation with a normal / uniform distribution.
    if winner in (1, 2):
        distribution = 'normal' if winner == 1 else 'uniform'
        return adjust_packet_size(
            packets,
            flow_id,
            params['direction'],
            distribution,
            params['scaling_factor'],
            params['focus_point'],
        )

    # 3 / 4 -> timing perturbation with a normal / uniform distribution.
    if winner in (3, 4):
        distribution = 'normal' if winner == 3 else 'uniform'
        return apply_time_perturbation_with_focus(
            packets,
            flow_id,
            distribution,
            params['focus_point'],
            params['scaling_factor'],
        )

    return packets

In [4]:
import pygad
import copy

class FlowOptimizer:
    """Genetic-algorithm search for the perturbation that maximises a flow's fitness.

    One genome encodes four candidate operations (size/normal, size/uniform,
    time/normal, time/uniform) that share a direction and a focus point.  The
    fitness of a genome is the best score among the unmodified packets and the
    four candidates, where a score is ``1.0 - prediction``.

    ``max_fitness`` is a record ``[[fitness, target_fitness], operations, winner_index]``
    where ``winner_index`` 0 means "no operation" and 1-4 select the entry of
    ``operations`` at ``winner_index - 1``.
    """

    # Position i matches operations[i] from decode_operations and winner index i + 1.
    _OP_TYPES = ('size_norm', 'size_uni', 'time_norm', 'time_uni')

    def __init__(self, packets, model_name, flow_id, max_fitness, target):
        """Store the search inputs.

        Parameters:
        - packets: packet list the candidates are derived from.
        - model_name: name forwarded to the prediction helpers.
        - flow_id: flow whose statistics are scored.
        - max_fitness: best record so far (see the class docstring).
        - target: when truthy, candidates are ranked by the target-model score.
        """
        self.packets = packets
        self.model_name = model_name
        self.flow_id = flow_id
        self.ga_instance = None  # set by optimize_for_flow()
        self.max_fitness = max_fitness
        self.target = target

    def apply_best_solution(self):
        """Rebuild the packets produced by the GA's best genome.

        The best genome is decoded into its four candidate operations, every
        candidate (plus the unmodified packets) is scored again, and the
        highest-scoring packet list is kept.

        Returns:
        - ``(best_modified_packets, best_solution_fitness)``.

        Raises:
        - RuntimeError: if ``optimize_for_flow`` has not been run yet.
        """
        if self.ga_instance is None:
            raise RuntimeError("optimize_for_flow() must be called before apply_best_solution()")

        best_solution, best_solution_fitness, _ = self.ga_instance.best_solution()
        best_operations = self.decode_operations(best_solution)

        # Apply the best operations to the packets: keep the first candidate
        # with the highest score, mirroring max()/index() in fitness_function.
        best_packets, best_score = self.packets, None
        for packets, fitness, target_fitness in self._iter_candidates(best_operations):
            score = target_fitness if self.target else fitness
            if best_score is None or score > best_score:
                best_packets, best_score = packets, score

        self.best_modified_packets = best_packets
        return self.best_modified_packets, best_solution_fitness

    def on_generation(self, ga_instance):
        """Per-generation callback registered with PyGAD; intentionally a no-op."""
        pass

    def on_fitness(self, ga_instance, population_fitness=None):
        """Plot the fitness curve of ``ga_instance``.

        ``population_fitness`` is optional and unused; it is accepted so the
        method can also be registered as PyGAD's ``on_fitness`` callback, which
        presumably passes the population fitness as a second argument (confirm
        against the installed PyGAD version).
        """
        ga_instance.plot_fitness()

    def _score(self, packets):
        """Return ``(fitness, target_fitness)`` for ``packets`` on this flow."""
        sizing_stats = prepare_size_stats(packets, self.flow_id)
        timing_stats = prepare_timing_stats(packets, self.flow_id)
        fitness = 1.0 - predict_single_flow(self.model_name, sizing_stats, timing_stats)
        target_fitness = 1.0 - predict_single_flow_target(self.model_name, sizing_stats, timing_stats)
        return fitness, target_fitness

    def _iter_candidates(self, operations):
        """Yield ``(packets, fitness, target_fitness)`` for no-op and each operation.

        Implemented as a generator so only one perturbed copy of the packets
        needs to be alive at a time.
        """
        # Always check the no-operation baseline first (winner index 0).
        fitness, target_fitness = self._score(self.packets)
        yield self.packets, fitness, target_fitness

        for op_type, operation in zip(self._OP_TYPES, operations):
            modified_packets = self.apply_operations_to_packets(operation, op_type=op_type)
            fitness, target_fitness = self._score(modified_packets)
            yield modified_packets, fitness, target_fitness

    def fitness_function(self, ga_instance, solution, solution_idx):
        """PyGAD fitness callback: best score among the genome's candidates.

        Side effect: when the best score beats ``self.max_fitness[0][0]``, the
        record ``self.max_fitness`` is replaced with this genome's scores,
        decoded operations and winner index.

        Returns:
        - the highest candidate score (target-model score when ``self.target``
          is truthy, otherwise the regular fitness score).
        """
        operations = self.decode_operations(solution)

        fitnesses = []
        target_fitnesses = []
        for _, fitness, target_fitness in self._iter_candidates(operations):
            fitnesses.append(fitness)
            target_fitnesses.append(target_fitness)

        ranked_scores = target_fitnesses if self.target else fitnesses
        max_fitness_value = max(ranked_scores)
        max_fitness_index = ranked_scores.index(max_fitness_value)
        max_fitness_target = target_fitnesses[max_fitness_index]

        if max_fitness_value > self.max_fitness[0][0]:
            self.max_fitness = [[max_fitness_value, max_fitness_target], operations, max_fitness_index]

        return max_fitness_value

    def optimize_for_flow(self):
        """Run the genetic algorithm and return ``(ga_instance, self.max_fitness)``.

        The GA instance is also stored on ``self.ga_instance`` so that
        ``apply_best_solution`` can query it afterwards.
        """
        ga_instance = pygad.GA(num_generations=2,
                               num_parents_mating=1,
                               fitness_func=self.fitness_function,
                               sol_per_pop=3,
                               mutation_probability=0.7,
                               suppress_warnings= True,
                               on_generation=self.on_generation,
                               num_genes=6, # operations number
                               gene_space = [{'low': 1.1, 'high': 5.0},  # size normal
                                             {'low': 0.25, 'high': 5.0}, # packet size uniform
                                             {'low': 1.1, 'high': 3.0}, # time normal
                                             {'low': 0.4, 'high': 2.4}, # time uniform
                                            [0,1,2],  # direction
                                            [0,1,2]]) # focus point

        self.ga_instance = ga_instance
        ga_instance.run()
        return ga_instance, self.max_fitness

    def decode_operations(self, genotype):
        """Turn a 6-gene genotype into the four candidate operation dicts.

        Genes 0-3 are the scaling factors of size/normal, size/uniform,
        time/normal and time/uniform; gene 4 is the shared direction and gene 5
        selects the shared focus point (0 start, 1 middle, 2 end, else None).

        Returns:
        - ``[size_norm, size_uni, time_norm, time_uni]``, each a dict with the
          keys ``scaling_factor``, ``direction`` and ``focus_point``.
        """
        focus_point_map = {0: "start", 1: "middle", 2: "end"}
        focus_point = focus_point_map.get(genotype[5], None)
        direction = genotype[4]

        return [
            {
                'scaling_factor': genotype[gene_index],
                'direction': direction,
                'focus_point': focus_point,
            }
            for gene_index in range(len(self._OP_TYPES))
        ]

    def apply_operations_to_packets(self, operation, op_type):
        """Return a perturbed copy of ``self.packets`` for one operation.

        Parameters:
        - operation: dict with ``scaling_factor`` and optionally ``direction``
          and ``focus_point``.
        - op_type: one of ``'size_norm'``, ``'size_uni'``, ``'time_norm'``,
          ``'time_uni'``.

        Raises:
        - ValueError: if ``op_type`` is not one of the four supported values.
        """
        scaling_factor = operation['scaling_factor']
        direction = operation.get('direction')
        focus_point = operation.get('focus_point')

        if op_type == 'size_norm':
            packets = adjust_packet_size_deepcopy(self.packets, self.flow_id, direction, 'normal', scaling_factor, focus_point)

        elif op_type == 'size_uni':
            packets = adjust_packet_size_deepcopy(self.packets, self.flow_id, direction, 'uniform', scaling_factor, focus_point)

        elif op_type == 'time_norm':
            packets = apply_time_perturbation_with_focus_deepcopy(self.packets, self.flow_id, 'normal', focus_point, scaling_factor)

        elif op_type == 'time_uni':
            packets = apply_time_perturbation_with_focus_deepcopy(self.packets, self.flow_id, 'uniform', focus_point, scaling_factor)

        else:
            # Previously this fell through and failed with UnboundLocalError.
            raise ValueError(f"Unknown op_type: {op_type!r}")

        return packets
    

            
        

    #print(optimizer.best_solution())
    


In [5]:
import os
import pickle

def save_object_pickle(obj, filepath):
    """
    Saves an object to a specified filepath using pickle.

    Parameters:
    - obj: The object to save.
    - filepath: The path to the file where the object should be saved.
      Missing parent directories are created; a bare file name (no directory
      component) is written to the current working directory.
    """
    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'wb') as pickle_file:  # 'wb': pickle writes bytes
        pickle.dump(obj, pickle_file)

def load_object_pickle(filepath):
    """
    Reads back an object that was stored with pickle.

    Unpickling can execute arbitrary code, so only use this on files from a
    trusted source.

    Parameters:
    - filepath: Location of the pickle file to read.

    Returns:
    - The object reconstructed from the file.
    """
    with open(filepath, 'rb') as source:  # binary mode: pickle data is bytes
        return pickle.load(source)


In [6]:
# Capture names processed by this notebook; each name is interpolated into the
# pcap, model and output paths used by the cells below.
_CAPTURE_SUFFIXES = (
    "20110811-neris",
    "20110812-rbot",
    "20110815-fast-flux-2",
    "20110815-fast-flux",
    "20110815-rbot-dos",
    "20110816-donbot",
    "20110816-qvod",
    "20110816-sogou",
    "20110817-bot",
    "20110818-bot-2",
    "20110819-bot",
)
captures = [f"botnet-capture-{suffix}" for suffix in _CAPTURE_SUFFIXES]


In [7]:
def save_truncated(file_name, truncated_packets_list):
    """Pickle ``truncated_packets_list`` into ``file_name`` and report the path.

    Parameters:
    - file_name: destination path, opened for binary writing (overwritten).
    - truncated_packets_list: the object to serialise.
    """
    with open(file_name, 'wb') as sink:
        sink.write(pickle.dumps(truncated_packets_list))
        print(f'Saved: {file_name}')


In [None]:
import copy
import os
import pickle

def process_flow_id(flow_id, packets, model, target=False):
    """Run five optimisation rounds for one flow, keeping each improvement.

    Works on a deep copy of ``packets``.  Every round builds a FlowOptimizer
    seeded with the best record found so far; when the round's best fitness
    exceeds the best seen before, the winning perturbation is applied to the
    working packets.

    Parameters:
    - flow_id: flow to optimise.
    - packets: packet list; not modified by this function directly (it is
      deep-copied first).
    - model: model name forwarded to FlowOptimizer.
    - target: forwarded to FlowOptimizer (rank by the target-model score).

    Returns:
    - ``(flow_id, per_round_best_fitness, working_packets)``.
    """
    print(f"Starting work on flow_id: {flow_id}")

    working_packets = copy.deepcopy(packets)
    fitness_history = []
    # Best record so far: [[fitness, target_fitness], operations, winner_index].
    best_record = [[0, 0], 0, 0]
    best_seen = 0

    for _ in range(5):
        optimizer = FlowOptimizer(working_packets, model, flow_id, best_record, target)
        ga_result = optimizer.optimize_for_flow()
        best_record = ga_result[1]

        round_best = best_record[0][0]
        fitness_history.append(round_best)

        # Only re-apply a perturbation when this round actually improved.
        if round_best > best_seen:
            working_packets = apply_best_on_packets(working_packets, ga_result, flow_id)
            best_seen = round_best

    return flow_id, fitness_history, working_packets

def _run_perturbation_pass(target):
    """Perturb every capture once and return the perturbed packet lists.

    For each entry of the module-level ``captures`` list this loads the pcap,
    assigns flow ids, optimises every flow with ``process_flow_id``, writes the
    perturbed pcap and pickles the per-flow fitness histories.

    Parameters:
    - target: when True the flows are ranked by the target-model score and all
      outputs get a ``_target`` suffix; when False the plain names are used.

    Returns:
    - list with one perturbed packet list per capture, in ``captures`` order.
    """
    suffix = "_target" if target else ""
    label = "(Target) " if target else ""
    perturbed_per_capture = []

    for capture in captures:
        print(f'{label}Working on capture: {capture}')

        modified_pcap_path = f"../data/interim/{capture}_perturbed{suffix}.pcapng"
        pcap_file_path = f'../data/raw/filtered/{capture}.pcap'
        save_object_path = f"../data/metadata/ga_rf_fitness_{capture}_perturbed{suffix}.pkl"
        model_name = f'{capture}'

        truncated_packets = create_truncated_packets_from_pcap(pcap_file_path)
        print(f"Total packets loaded: {len(truncated_packets)}")
        truncated_packets = assign_flow_ids_to_packets(truncated_packets)

        flow_ids = set(packet.flow_id for packet in truncated_packets)
        extern = []
        # Start with the original packet list; each flow builds on the
        # perturbations already applied for the previous flows.
        updated_truncated_packets = truncated_packets

        for fid in flow_ids:
            flow_id, inter, updated_truncated_packets = process_flow_id(
                fid, updated_truncated_packets, model_name, target=target)
            extern.append(inter)
            print(f"Completed processing flow_id {flow_id}, final results: {inter}")

        modify_and_write_packets_one_by_one(pcap_file_path, modified_pcap_path, updated_truncated_packets)
        perturbed_per_capture.append(updated_truncated_packets)

        with open(save_object_path, 'wb') as f:
            pickle.dump(extern, f)

        print(f"Finished and saved: {capture}")

    return perturbed_per_capture


def main():
    """Run the fitness-model pass, then the target-model pass, over all captures."""
    # FITNESS FUNCTION RF
    save_truncated("../data/metadata/truncated_list_fitness", _run_perturbation_pass(target=False))

    # FITNESS FUNCTION TARGET MODEL
    save_truncated("../data/metadata/truncated_list_target", _run_perturbation_pass(target=True))

if __name__ == "__main__":
    main()


Working on capture: botnet-capture-20110811-neris
Total packets loaded: 4955
hash groups: 593
Starting work on flow_id: 1
Completed processing flow_id 1, final results: [0.148018868600873, 0.2080194822274305, 0.20926018030148952, 0.21335378889877465, 0.21335378889877465]
Starting work on flow_id: 2
Completed processing flow_id 2, final results: [0.08528348087665605, 0.10878278692751864, 0.1115551000400613, 0.1115551000400613, 0.1115551000400613]
Starting work on flow_id: 3
Completed processing flow_id 3, final results: [0.040148350018459955, 0.08665082384874467, 0.11639819239382643, 0.11674891791631581, 0.11674891791631581]
Starting work on flow_id: 4
Completed processing flow_id 4, final results: [0.058190477427759624, 0.06610414945111631, 0.08459979866945222, 0.1007700871901801, 0.11063632070890339]
Starting work on flow_id: 5
Completed processing flow_id 5, final results: [0.10770731907846232, 0.11455509140797804, 0.12771687567933754, 0.1310350486382229, 0.14102797900673014]
Startin

In [None]:
# NOTE(review): in the cells shown here `truncated_packets` is only assigned inside
# main() and process_flow_id(), i.e. as a local name; presumably this cell raises
# NameError on a fresh kernel unless an import defines it. Confirm or remove.
truncated_packets

In [None]:
# TODO: evaluation processing pipeline with all commands

In [None]:
# Display the capture names that the following cells iterate over.
captures

In [None]:
# processing each capture (perturbed and raw) with cicflowmeter tool
env_name = 'cicflowmeter_env'

for capture in captures:
    raw_command = f'cicflowmeter -f ../data/raw/filtered/{capture}.pcap -c ../data/processed/{capture}_orig_flows.csv'
    perturbed_command = f'cicflowmeter -f ../data/interim/{capture}_perturbed.pcapng -c ../data/processed/{capture}_perturbed_flows.csv'
    perturbed_target_command = f'cicflowmeter -f ../data/interim/{capture}_perturbed_target.pcapng -c ../data/processed/{capture}_perturbed_target_flows.csv'

    print(f'Running: {raw_command}')
    !conda run -n {env_name} {raw_command}

    print(f'Running: {perturbed_command}')
    !conda run -n {env_name} {perturbed_command}

    print(f'Running: {perturbed_target_command}')
    !conda run -n {env_name} {perturbed_target_command}

print("Finished running the commands")


In [None]:
import pickle

def load_truncated_packets_from_pickle(file_path):
    """Return the object pickled in ``file_path``.

    Unpickling can execute arbitrary code; only load files you trust.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    return pickle.loads(raw_bytes)

# NOTE(review): no cell shown in this notebook writes "updated_truncated_packets_pickle"
# (main() saves to ../data/metadata/truncated_list_fitness and truncated_list_target);
# presumably this is a leftover path. Confirm the file exists before running.
file_path = "updated_truncated_packets_pickle"
loaded_packets_list = load_truncated_packets_from_pickle(file_path)
print(f"Loaded {len(loaded_packets_list)} lists of TruncatedPacket objects.")


In [None]:
def load_truncated_packets_from_pickle(file_path):
    """Load and return the pickled object stored at ``file_path``.

    Only use on trusted files: unpickling can run arbitrary code.
    """
    with open(file_path, mode='rb') as pickle_stream:
        return pickle.load(pickle_stream)

# Paths written by main() via save_truncated(): one pickle per optimisation mode.
fitness_file_path = "../data/metadata/truncated_list_fitness"
target_file_path = "../data/metadata/truncated_list_target"
 
# Packet lists produced by the target-model pass.
loaded_packets_list_target = load_truncated_packets_from_pickle(target_file_path)
print(f"Loaded {len(loaded_packets_list_target)} lists of TruncatedPacket objects for target.")

# Packet lists produced by the fitness-model pass.
loaded_packets_list_fitness = load_truncated_packets_from_pickle(fitness_file_path)
print(f"Loaded {len(loaded_packets_list_fitness)} lists of TruncatedPacket objects for fitness.")

# the above is equivalent to fitness function calculations (so probably not necessary)


In [None]:
# Load the packets from both perturbed pcap files of every capture and assign flow IDs.

loaded_packets_fitness = []
loaded_packets_target = []

for capture in captures:
    modified_pcap_path_fitness = f"../data/interim/{capture}_perturbed.pcapng"
    modified_pcap_path_target = f"../data/interim/{capture}_perturbed_target.pcapng"

    # Fitness variant first, then target variant, each appended to its own list.
    for pcap_path, destination in (
        (modified_pcap_path_fitness, loaded_packets_fitness),
        (modified_pcap_path_target, loaded_packets_target),
    ):
        print(f"loading: {pcap_path}...")
        packets_from_pcap = create_truncated_packets_from_pcap(pcap_path)
        destination.append(assign_flow_ids_to_packets(packets_from_pcap))

In [None]:
# make statistical calculations

# One prediction list per capture.  The previous `[[]]` held a single inner
# list, so indexing with i >= 1 raised IndexError from the second capture on.
fitness_predictions = [[] for _ in captures]

# fitness
for i, capture in enumerate(captures):

    print(f"Processing fitness capture {capture}")

    unique_flow_ids = list({packet.flow_id for packet in loaded_packets_fitness[i]})
    # `flow_id` instead of `id`, which shadowed the builtin at notebook scope.
    for flow_id in unique_flow_ids:

        sizing_stats = prepare_size_stats(loaded_packets_fitness[i], flow_id)
        timing_stats = prepare_timing_stats(loaded_packets_fitness[i], flow_id)

        fitness_predictions[i].append(predict_single_flow_target(f"{capture}", sizing_stats, timing_stats))

target_predictions = [[] for _ in captures]

# target
for i, capture in enumerate(captures):

    print(f"Processing target capture {capture}")

    unique_flow_ids = list({packet.flow_id for packet in loaded_packets_target[i]})
    for flow_id in unique_flow_ids:

        sizing_stats = prepare_size_stats(loaded_packets_target[i], flow_id)
        timing_stats = prepare_timing_stats(loaded_packets_target[i], flow_id)

        target_predictions[i].append(predict_single_flow_target(f"{capture}", sizing_stats, timing_stats))


In [None]:
# numpy is not visibly imported before this cell, so bring it in here.
import numpy as np

# Mean prediction per capture for both perturbation variants.
for i, capture in enumerate(captures):

    # Was `np.meanfitness_predictions[i]` (AttributeError): the call parentheses were missing.
    print(f"FITNESS {capture} : {np.mean(fitness_predictions[i])}")
    # Report the same statistic for the target variant so the two lines are comparable.
    print(f"TARGET {capture} : {np.mean(target_predictions[i])}")

# TODO: the results are inaccurate; they still need to be remapped

In [None]:
# testing 

#captures = [
    #"botnet-capture-20110811-neris",
    #"botnet-capture-20110816-donbot",
    #"botnet-capture-20110816-qvod"
#]

In [None]:
# # before PERTURBATIONS

import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor

import sys, os
sys.path.append(os.path.abspath('../'))
from src.data.cic_preprocess import *

import matplotlib.pyplot as plt  # hoisted: was re-imported inside the per-model loop

# Directory paths
models_dir = "../models"
data_dir = "../data/processed"

# Loading models: one regressor per '*.cbm' file, keyed by the file name up to '_model.cbm'
models = {file.split('_model.cbm')[0]: CatBoostRegressor().load_model(os.path.join(models_dir, file))
          for file in os.listdir(models_dir) if file.endswith('.cbm')}

for capture in captures:

    # Mean raw score of every regressor tested for this capture (scalars, not arrays)
    accuracies = []
    for model_name, model in models.items():
        if not (capture in model_name and "regressor" in model_name):
            continue

        # File name of the test data corresponds to the capture name
        test_file_name = f"{capture}_orig_flows.csv"
        test_file_path = os.path.join(data_dir, test_file_name)

        # Loading test data
        X_test = preprocess_traffic(test_file_path)
        y_test = np.ones(X_test.shape[0])  # Assuming all samples are malicious; not used below

        # Model prediction
        raw_predictions = np.asarray(model.predict(X_test))

        # "Accuracy" here is the mean raw regressor score, computed before clipping
        accuracy = np.mean(raw_predictions)
        accuracies.append(accuracy)

        # Keep a plain ndarray so the counts below print as numbers, not as a pandas Series
        predictions = np.clip(raw_predictions, 0, 1)

        # Detailed logging
        print(f"Testing {model_name}:")
        print(f"Accuracy: {accuracy:.5%}")
        print(f"Predicted distribution: {predictions.mean()}")
        print(f"Number of samples: {len(predictions)}")
        print(f"Number of malicious (1) predictions: {np.sum(predictions > 0.5)}")
        print(f"Number of benign (0) predictions: {np.sum(predictions <= 0.5)}\n")

        plt.figure(figsize=(10, 6))
        plt.hist(predictions, bins=30, color='skyblue', edgecolor='black', log=False)
        # The y axis is linear (log=False), so the title no longer claims a log scale
        plt.title(f'Predictions Histogram for {capture}_orig_flows', fontsize=15)
        plt.xlabel('Prediction Value', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.grid(axis='y', alpha=0.75)
        plt.xlim(0, 1.0)

        plt.show()

    # Average over the models tested for this capture; skipped when none matched
    if accuracies:
        mean_accuracy = np.mean(accuracies)
        print(f"\nAverage accuracy across all test cases: {mean_accuracy:.5%}")


In [None]:
# # after fitness PERTURBATIONS

import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor

import sys, os
sys.path.append(os.path.abspath('../'))
from src.data.cic_preprocess import *

import matplotlib.pyplot as plt  # hoisted: was re-imported inside the per-model loop

# Directory paths
models_dir = "../models"
data_dir = "../data/processed"

# Loading models: one regressor per '*.cbm' file, keyed by the file name up to '_model.cbm'
models = {file.split('_model.cbm')[0]: CatBoostRegressor().load_model(os.path.join(models_dir, file))
          for file in os.listdir(models_dir) if file.endswith('.cbm')}

for capture in captures:

    # Mean raw score of every regressor tested for this capture.  The list used to
    # stay empty, so np.mean(accuracies) printed nan.
    accuracies = []
    for model_name, model in models.items():
        if not (capture in model_name and "regressor" in model_name):
            continue

        # File name of the test data corresponds to the capture name
        test_file_name = f"{capture}_perturbed_flows.csv"
        test_file_path = os.path.join(data_dir, test_file_name)

        # Loading test data
        X_test = preprocess_traffic(test_file_path)
        y_test = np.ones(X_test.shape[0])  # Assuming all samples are malicious; not used below

        # Model prediction
        raw_predictions = np.asarray(model.predict(X_test))

        # "Accuracy" here is the mean raw regressor score, computed before clipping
        accuracy = np.mean(raw_predictions)
        accuracies.append(accuracy)

        # Keep a plain ndarray so the counts below print as numbers, not as a pandas Series
        predictions = np.clip(raw_predictions, 0, 1)

        # Detailed logging
        print(f"Testing {model_name}:")
        print(f"Accuracy: {accuracy:.5%}")
        print(f"Predicted distribution: {predictions.mean()}")
        print(f"Number of samples: {len(predictions)}")
        print(f"Number of malicious (1) predictions: {np.sum(predictions > 0.5)}")
        print(f"Number of benign (0) predictions: {np.sum(predictions <= 0.5)}\n")

        plt.figure(figsize=(10, 6))
        plt.hist(predictions, bins=30, color='skyblue', edgecolor='black', log=False)
        # The y axis is linear (log=False), so the title no longer claims a log scale
        plt.title(f'Predictions Histogram for {capture}_perturbed', fontsize=15)
        plt.xlabel('Prediction Value', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.grid(axis='y', alpha=0.75)
        plt.xlim(0, 1.0)

        plt.show()

    # Average over the models tested for this capture; skipped when none matched
    if accuracies:
        mean_accuracy = np.mean(accuracies)
        print(f"\nAverage accuracy across all test cases: {mean_accuracy:.5%}")


In [None]:
# # after target PERTURBATIONS

import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor

import sys, os
sys.path.append(os.path.abspath('../'))
from src.data.cic_preprocess import *

import matplotlib.pyplot as plt  # hoisted: was re-imported inside the per-model loop

# Directory paths
models_dir = "../models"
data_dir = "../data/processed"

# Loading models: one regressor per '*.cbm' file, keyed by the file name up to '_model.cbm'
models = {file.split('_model.cbm')[0]: CatBoostRegressor().load_model(os.path.join(models_dir, file))
          for file in os.listdir(models_dir) if file.endswith('.cbm')}

for capture in captures:

    # Mean raw score of every regressor tested for this capture.  The list used to
    # stay empty, so np.mean(accuracies) printed nan.
    accuracies = []
    for model_name, model in models.items():
        if not (capture in model_name and "regressor" in model_name):
            continue

        # File name of the test data corresponds to the capture name
        test_file_name = f"{capture}_perturbed_target_flows.csv"
        test_file_path = os.path.join(data_dir, test_file_name)

        # Loading test data
        X_test = preprocess_traffic(test_file_path)
        y_test = np.ones(X_test.shape[0])  # Assuming all samples are malicious; not used below

        # Model prediction
        raw_predictions = np.asarray(model.predict(X_test))

        # "Accuracy" here is the mean raw regressor score, computed before clipping
        accuracy = np.mean(raw_predictions)
        accuracies.append(accuracy)

        # Keep a plain ndarray so the counts below print as numbers, not as a pandas Series
        predictions = np.clip(raw_predictions, 0, 1)

        # Detailed logging
        print(f"Testing {model_name}:")
        print(f"Accuracy: {accuracy:.5%}")
        print(f"Predicted distribution: {predictions.mean()}")
        print(f"Number of samples: {len(predictions)}")
        print(f"Number of malicious (1) predictions: {np.sum(predictions > 0.5)}")
        print(f"Number of benign (0) predictions: {np.sum(predictions <= 0.5)}\n")

        plt.figure(figsize=(10, 6))
        plt.hist(predictions, bins=30, color='skyblue', edgecolor='black', log=False)
        # This cell plots the *_perturbed_target_flows data, so the title names that variant
        plt.title(f'Predictions Histogram for {capture}_perturbed_target', fontsize=15)
        plt.xlabel('Prediction Value', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.grid(axis='y', alpha=0.75)
        plt.xlim(0, 1.0)

        plt.show()

    # Average over the models tested for this capture; skipped when none matched
    if accuracies:
        mean_accuracy = np.mean(accuracies)
        print(f"\nAverage accuracy across all test cases: {mean_accuracy:.5%}")


In [None]:
# TODO: visualization showing the drop after applying the fitness function []

In [None]:
# TODO: check the results without zero-length flows [] (?)