#### Import Packages

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.nonparametric.kde import KDEUnivariate

#### Define auxiliary functions

In [2]:
# Define reusable functions
def load_data(input_folder):
    """Load every CSV file in a folder and combine them into a single DataFrame.

    Parameters:
    - input_folder: Path of the directory to scan (non-recursively) for entries
      whose name ends with '.csv'.

    Returns:
    - DataFrame with the rows of all CSV files stacked in file-name order and a
      fresh 0..n-1 index.

    Raises:
    - ValueError: If the folder contains no '.csv' entries.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible across runs and platforms.
    csv_files = sorted(name for name in os.listdir(input_folder) if name.endswith('.csv'))
    if not csv_files:
        # pd.concat([]) would fail with a generic "No objects to concatenate";
        # report the offending folder instead.
        raise ValueError(f"No CSV files found in folder: {input_folder!r}")

    dataframes = [pd.read_csv(os.path.join(input_folder, name)) for name in csv_files]
    return pd.concat(dataframes, ignore_index=True)

def calculate_statistics(data, features, group_column):
    """Summarise the selected features separately for every value of a grouping column.

    For each group, the `describe()` summary of `features` is transposed so that
    every feature becomes a row, and a `variance` column is appended. The
    per-group tables are stacked under a "Label <value>" outer index key.
    """
    per_label_tables = {}
    for label, group in data.groupby(group_column)[features]:
        summary = group.describe().T
        per_label_tables[f"Label {label}"] = summary.assign(variance=group.var())
    return pd.concat(per_label_tables, axis=0)

def save_statistics(stats, filename):
    """Echo the statistics table as tab-separated text, then write it to `filename` as CSV."""
    tab_separated = stats.to_csv(sep="\t")
    print(tab_separated)
    # Comma-separated copy on disk, index included.
    stats.to_csv(filename, index=True)
def plot_distributions(data, features, group_column, is_multiclass=False, output_folder="plots"):
    """
    Plot KDE distributions of features for binary or multiclass labels and save them to a folder.

    One figure is produced per feature, with one filled KDE curve per label of
    `group_column`, and saved as "<feature>_distribution.png" in `output_folder`.
    A label whose values for a feature have at most one distinct value is
    skipped (with a printed message). Rows whose label is missing are ignored.

    Parameters:
    - data: DataFrame containing the data.
    - features: List of feature names to plot.
    - group_column: Column name used for grouping (e.g., 'binary_label' or 'multiclass_label').
    - is_multiclass: Boolean indicating whether it's multiclass (True) or binary (False).
      Label 0 is always named "Normal"; other labels are named "Attack Type <n>"
      when True and "Attack" when False.
    - output_folder: Path to the folder where plots will be saved.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # The labels do not depend on the feature, so compute them once. Missing
    # labels are dropped: they never match any row, and int(NaN) would raise
    # while building the multiclass legend name.
    labels = data[group_column].dropna().unique()

    for feature in features:
        fig, ax = plt.subplots(figsize=(12, 7))
        try:
            has_curve = False

            for label in labels:
                subset = data.loc[data[group_column] == label, feature].dropna()  # Drop NaNs for clean data
                # Label naming to differentiate normal and attack types
                label_name = "Normal" if label == 0 else (f"Attack Type {int(label)}" if is_multiclass else "Attack")

                # A KDE needs spread in the data; skip subsets with at most one distinct value
                if subset.nunique() > 1:
                    sns.kdeplot(data=subset, label=label_name, fill=True, alpha=0.5, ax=ax)
                    has_curve = True
                else:
                    print(f"Skipping {feature} for {label_name} due to zero variance.")

            ax.set_title(f'Distribution of {feature} Across {group_column} Labels')
            ax.set_xlabel(feature)
            ax.set_ylabel('Density')
            # Only build a legend when something was drawn, otherwise matplotlib
            # warns that there are no labelled artists.
            if has_curve:
                ax.legend()

            # Save the plot
            plot_filename = os.path.join(output_folder, f"{feature}_distribution.png")
            fig.savefig(plot_filename)
        finally:
            # Close the figure even if plotting or saving failed, to free memory
            plt.close(fig)
        print(f"Saved plot: {plot_filename}")

In [3]:
# Apply Seaborn's "darkgrid" axes style to the plots created below
sns.set_style("darkgrid")  # Use Seaborn's darkgrid style directly

def plot_kde_with_gaussian_kernel(data, features, group_column, bandwidth_methods=("scott", "silverman"), output_folder="kde_kernel_comparison_plots", is_multiclass=False):
    """
    Plot KDEs with the Gaussian kernel and different bandwidth methods, comparing the labels on the same plot,
    with filled areas under each curve for better comparison.

    One figure is produced per (feature, bandwidth method) pair and saved as
    "<feature>_comparison_gaussian_bw_<bw>.png" in `output_folder`. A label whose
    values for a feature have at most one distinct value is skipped (with a
    printed message). Rows whose label is missing are ignored. The matplotlib
    settings used for these figures are applied only while this function runs.

    Parameters:
    - data: DataFrame containing the data.
    - features: List of feature names to plot.
    - group_column: Column name used for grouping (e.g., 'binary_label' or 'multiclass_label').
    - bandwidth_methods: Iterable of bandwidth methods to apply (e.g., 'scott', 'silverman').
    - output_folder: Path to the folder where plots will be saved.
    - is_multiclass: When False (default), label 0 is drawn as "Normal" in steelblue and
      every other label as "Attack" in coral. When True, each non-zero label is named
      "Attack Type <n>" and gets its own colour from matplotlib's colour cycle.
    """

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Settings for a high-quality appearance
    rc_overrides = {
        "figure.dpi": 150,  # High resolution for display
        "axes.titlesize": 14,
        "axes.labelsize": 12,
        "legend.fontsize": 10,
        "lines.linewidth": 1.8,
    }

    # The labels depend neither on the feature nor on the bandwidth, so compute
    # them once. Missing labels are dropped because they never match any row.
    labels = data[group_column].dropna().unique()

    # rc_context restores the previous rcParams on exit, so plots created by
    # other code after this call keep their own settings.
    with plt.rc_context(rc_overrides):
        for feature in features:
            for bw_method in bandwidth_methods:
                fig, ax = plt.subplots(figsize=(10, 6))
                try:
                    has_area = False

                    for position, label in enumerate(labels):
                        subset = data.loc[data[group_column] == label, feature].dropna()

                        if label == 0:
                            label_name, color = "Normal", "steelblue"
                        elif is_multiclass:
                            label_name = f"Attack Type {int(label)}"
                            # "C1".."C9" are entries of matplotlib's colour cycle
                            color = f"C{1 + position % 9}"
                        else:
                            label_name, color = "Attack", "coral"

                        # nunique() also catches empty and single-sample subsets,
                        # whose variance is NaN and would slip past `var() == 0`.
                        if subset.nunique() <= 1:
                            print(f"Skipping {feature} for {label_name} due to zero variance.")
                            continue

                        # Fit KDE with Gaussian kernel and specified bandwidth
                        kde = KDEUnivariate(subset)
                        try:
                            kde.fit(kernel="gau", bw=bw_method)
                        except RuntimeError:
                            print(f"Failed to estimate density for {feature} - {label_name} with bw={bw_method}.")
                            continue

                        # Filled area under the estimated density for this label
                        ax.fill_between(kde.support, kde.density, color=color, alpha=0.5, label=f"{label_name} - BW: {bw_method}")
                        has_area = True

                    # Customizing the title, labels, and legend
                    ax.set_title(f'Distribution of {feature} Across {group_column} Labels (Gaussian Kernel, BW: {bw_method})')
                    ax.set_xlabel(feature)
                    ax.set_ylabel("Density")
                    # Only build a legend when something was drawn, otherwise
                    # matplotlib warns that there are no labelled artists.
                    if has_area:
                        ax.legend(loc="upper right", frameon=True, shadow=True)

                    # Save the plot with a high-quality format
                    plot_filename = os.path.join(output_folder, f"{feature}_comparison_gaussian_bw_{bw_method}.png")
                    fig.savefig(plot_filename, format="png", dpi=300, bbox_inches="tight")
                finally:
                    # Close the figure even if fitting or saving failed, to free memory
                    plt.close(fig)
                print(f"Saved high-quality plot with filled area: {plot_filename}")

#### Define features for analysis

In [4]:
# Features included in every plot and statistics table below
key_features = [
    'bearer_0_dl_total_bytes',
    'bearer_0_ul_total_bytes',
    'bearer_1_dl_total_bytes',
    'bearer_1_ul_total_bytes',
    'dl_bitrate',
    'ul_bitrate',
    'ul_retx',
    'ul_err',
    'ul_mcs',
    'ul_path_loss',
    'ul_phr',
    'dl_err',
    'dl_mcs',
    'dl_retx',
    'dl_tx',
    'cqi',
    'epre',
    'p_ue',
    'pusch_snr',
    'turbo_decoder_avg',
    'ul_tx',
]

#### EDA for the normal final datasets

In [5]:
# Load the main dataset and split it into normal (0) and attack (1) rows
input_folder = 'Per_UE_Datasets_final'
combined_data = load_data(input_folder)
normal_data = combined_data[combined_data["binary_label"] == 0]
attack_data = combined_data[combined_data["binary_label"] == 1]

In [6]:
# KDE plot per feature, one curve for each binary label
plot_distributions(
    combined_data,
    key_features,
    group_column="binary_label",
    is_multiclass=False,
    output_folder='binary_dist_plots',
)

Saved plot: binary_dist_plots\bearer_0_dl_total_bytes_distribution.png
Saved plot: binary_dist_plots\bearer_0_ul_total_bytes_distribution.png
Saved plot: binary_dist_plots\bearer_1_dl_total_bytes_distribution.png
Saved plot: binary_dist_plots\bearer_1_ul_total_bytes_distribution.png
Saved plot: binary_dist_plots\dl_bitrate_distribution.png
Saved plot: binary_dist_plots\ul_bitrate_distribution.png
Saved plot: binary_dist_plots\ul_retx_distribution.png
Saved plot: binary_dist_plots\ul_err_distribution.png
Saved plot: binary_dist_plots\ul_mcs_distribution.png
Saved plot: binary_dist_plots\ul_path_loss_distribution.png
Saved plot: binary_dist_plots\ul_phr_distribution.png
Saved plot: binary_dist_plots\dl_err_distribution.png
Saved plot: binary_dist_plots\dl_mcs_distribution.png
Saved plot: binary_dist_plots\dl_retx_distribution.png
Saved plot: binary_dist_plots\dl_tx_distribution.png
Saved plot: binary_dist_plots\cqi_distribution.png
Saved plot: binary_dist_plots\epre_distribution.png
Save

In [7]:
# Gaussian-kernel KDE comparison per feature and bandwidth method, grouped by binary label
plot_kde_with_gaussian_kernel(
    data=combined_data,
    features=key_features,
    group_column="binary_label",
    bandwidth_methods=["scott", "silverman"],
    output_folder="kde_gaussian_kernel_experiments_binary"
)

In [8]:
# Per-label descriptive statistics (plus variance) for the binary labels;
# the table is printed as tab-separated text and written to a CSV file
binary_stats_combined = calculate_statistics(combined_data, key_features, "binary_label")
save_statistics(binary_stats_combined, "binary_stats_combined.csv")

		count	mean	std	min	25%	50%	75%	max	variance
Label 0	bearer_0_dl_total_bytes	674570.0	6799738888.840869	17763902926.13668	88.0	5147481.0	8694907.0	14194781.0	87991056255.0	3.155562471692073e+20
Label 0	bearer_0_ul_total_bytes	674570.0	29853010769.196995	80579429065.1857	0.0	538821.0	1001362.0	2052056.0	425728090140.0	6.493044388471295e+21
Label 0	bearer_1_dl_total_bytes	674570.0	6914896072.0263605	18027516051.309536	0.0	3805959.0	10727921.5	23466751.5	90169667041.0	3.24991334980223e+20
Label 0	bearer_1_ul_total_bytes	674570.0	40677478717.37499	112949432101.05373	0.0	1037923.75	15845324.5	11211046130.0	732967598835.0	1.2757574211950548e+22
Label 0	dl_bitrate	674570.0	1156557.9124256934	1573068.8532443282	0.0	3611.0	5538.0	3209316.0	71486108.0	2474545617047.426
Label 0	ul_bitrate	674570.0	5323119.854922395	8441077.288567143	0.0	3070.0	81975.5	10236734.0	69480666.0	71251785791564.03
Label 0	ul_retx	674570.0	56.6773440858621	74.79692622591163	0.0	0.0	24.0	118.0	656.0	5594.580172844

In [9]:
# KDE plot per feature, one curve for each multiclass label
plot_distributions(
    combined_data,
    key_features,
    group_column="multiclass_label",
    is_multiclass=True,
    output_folder='mclass_dist_plots',
)

Saved plot: mclass_dist_plots\bearer_0_dl_total_bytes_distribution.png
Saved plot: mclass_dist_plots\bearer_0_ul_total_bytes_distribution.png
Saved plot: mclass_dist_plots\bearer_1_dl_total_bytes_distribution.png
Saved plot: mclass_dist_plots\bearer_1_ul_total_bytes_distribution.png
Saved plot: mclass_dist_plots\dl_bitrate_distribution.png
Saved plot: mclass_dist_plots\ul_bitrate_distribution.png
Saved plot: mclass_dist_plots\ul_retx_distribution.png
Skipping ul_err for Attack Type 1 due to zero variance.
Saved plot: mclass_dist_plots\ul_err_distribution.png
Saved plot: mclass_dist_plots\ul_mcs_distribution.png
Saved plot: mclass_dist_plots\ul_path_loss_distribution.png
Saved plot: mclass_dist_plots\ul_phr_distribution.png
Skipping dl_err for Attack Type 1 due to zero variance.
Saved plot: mclass_dist_plots\dl_err_distribution.png
Saved plot: mclass_dist_plots\dl_mcs_distribution.png
Saved plot: mclass_dist_plots\dl_retx_distribution.png
Saved plot: mclass_dist_plots\dl_tx_distribution

In [10]:
# Gaussian-kernel KDE comparison per feature and bandwidth method, grouped by multiclass label
# NOTE(review): as called here, every non-zero label is drawn as "Attack" in the
# same colour, so the individual attack types cannot be told apart in the legend.
plot_kde_with_gaussian_kernel(
    data=combined_data,
    features=key_features,
    group_column="multiclass_label",
    bandwidth_methods=["scott", "silverman"],
    output_folder="kde_gaussian_kernel_experiments_mclass"
)

Skipping ul_err for Attack due to zero variance.
Skipping ul_err for Attack due to zero variance.
Skipping dl_err for Attack due to zero variance.
Skipping dl_err for Attack due to zero variance.


In [11]:
# Per-label descriptive statistics (plus variance) for the multiclass labels;
# the table is printed as tab-separated text and written to a CSV file
multiclass_stats_combined = calculate_statistics(combined_data, key_features, "multiclass_label")
save_statistics(multiclass_stats_combined, "multiclass_stats_combined.csv")

		count	mean	std	min	25%	50%	75%	max	variance
Label 0	bearer_0_dl_total_bytes	674570.0	6799738888.840869	17763902926.13668	88.0	5147481.0	8694907.0	14194781.0	87991056255.0	3.155562471692073e+20
Label 0	bearer_0_ul_total_bytes	674570.0	29853010769.196995	80579429065.1857	0.0	538821.0	1001362.0	2052056.0	425728090140.0	6.493044388471295e+21
Label 0	bearer_1_dl_total_bytes	674570.0	6914896072.0263605	18027516051.309536	0.0	3805959.0	10727921.5	23466751.5	90169667041.0	3.24991334980223e+20
Label 0	bearer_1_ul_total_bytes	674570.0	40677478717.37499	112949432101.05373	0.0	1037923.75	15845324.5	11211046130.0	732967598835.0	1.2757574211950548e+22
Label 0	dl_bitrate	674570.0	1156557.9124256934	1573068.8532443282	0.0	3611.0	5538.0	3209316.0	71486108.0	2474545617047.426
Label 0	ul_bitrate	674570.0	5323119.854922395	8441077.288567143	0.0	3070.0	81975.5	10236734.0	69480666.0	71251785791564.03
Label 0	ul_retx	674570.0	56.6773440858621	74.79692622591163	0.0	0.0	24.0	118.0	656.0	5594.580172844

#### EDA for the rate of change final datasets

In [12]:
# Load the rate-of-change dataset and split it into normal (0) and attack (1) rows
input_folder_roc = 'Per_UE_Datasets_final_roc'
combined_data_roc = load_data(input_folder_roc)
normal_data_roc = combined_data_roc[combined_data_roc["binary_label"] == 0]
attack_data_roc = combined_data_roc[combined_data_roc["binary_label"] == 1]

In [16]:
# KDE plot per rate-of-change feature, one curve for each binary label
plot_distributions(
    combined_data_roc,
    key_features,
    group_column="binary_label",
    is_multiclass=False,
    output_folder='binary_dist_plots_roc',
)

Saved plot: binary_dist_plots_roc\bearer_0_dl_total_bytes_distribution.png
Saved plot: binary_dist_plots_roc\bearer_0_ul_total_bytes_distribution.png
Saved plot: binary_dist_plots_roc\bearer_1_dl_total_bytes_distribution.png
Saved plot: binary_dist_plots_roc\bearer_1_ul_total_bytes_distribution.png
Saved plot: binary_dist_plots_roc\dl_bitrate_distribution.png
Saved plot: binary_dist_plots_roc\ul_bitrate_distribution.png
Saved plot: binary_dist_plots_roc\ul_retx_distribution.png
Saved plot: binary_dist_plots_roc\ul_err_distribution.png
Saved plot: binary_dist_plots_roc\ul_mcs_distribution.png
Saved plot: binary_dist_plots_roc\ul_path_loss_distribution.png
Saved plot: binary_dist_plots_roc\ul_phr_distribution.png
Saved plot: binary_dist_plots_roc\dl_err_distribution.png
Saved plot: binary_dist_plots_roc\dl_mcs_distribution.png
Saved plot: binary_dist_plots_roc\dl_retx_distribution.png
Saved plot: binary_dist_plots_roc\dl_tx_distribution.png
Saved plot: binary_dist_plots_roc\cqi_distribut

In [None]:
# Gaussian-kernel KDE comparison for the rate-of-change data, grouped by binary label
plot_kde_with_gaussian_kernel(
    data=combined_data_roc,
    features=key_features,
    group_column="binary_label",
    bandwidth_methods=["scott", "silverman"],
    output_folder="kde_gaussian_kernel_experiments_binary_roc"
)

In [None]:
# Per-label descriptive statistics (plus variance) for the binary labels of the
# rate-of-change data; printed as tab-separated text and written to a CSV file
binary_stats_combined_roc = calculate_statistics(combined_data_roc, key_features, "binary_label")
save_statistics(binary_stats_combined_roc, "binary_stats_combined_roc.csv")

In [None]:
# KDE plot per rate-of-change feature, one curve for each multiclass label
plot_distributions(
    combined_data_roc,
    key_features,
    group_column="multiclass_label",
    is_multiclass=True,
    output_folder='mclass_dist_plots_roc',
)

In [None]:
# Gaussian-kernel KDE comparison for the rate-of-change data, grouped by multiclass label
# NOTE(review): as called here, every non-zero label is drawn as "Attack" in the
# same colour, so the individual attack types cannot be told apart in the legend.
plot_kde_with_gaussian_kernel(
    data=combined_data_roc,
    features=key_features,
    group_column="multiclass_label",
    bandwidth_methods=["scott", "silverman"],
    output_folder="kde_gaussian_kernel_experiments_mclass_roc"
)

In [None]:
# Per-label descriptive statistics (plus variance) for the multiclass labels of the
# rate-of-change data; printed as tab-separated text and written to a CSV file
multiclass_stats_combined_roc = calculate_statistics(combined_data_roc, key_features, "multiclass_label")
save_statistics(multiclass_stats_combined_roc, "multiclass_stats_combined_roc.csv")