#### Import Packages

In [1]:
import pandas as pd
import os
from datetime import datetime

#### Define Paths

In [2]:
# Folder that is scanned for the per-UE input CSV files
input_folder = 'Per_UE_Datasets'
# Destination folders: one for the labeled records, one for the labeled
# per-event differences ("roc" = rate of change)
output_folder1 = 'Per_UE_Datasets_final'
output_folder2 = 'Per_UE_Datasets_final_roc'
# Make sure both destinations exist; folders that are already there are left alone
for _out_dir in (output_folder1, output_folder2):
    os.makedirs(_out_dir, exist_ok=True)

#### Method to find the gaps where a re-connection occurs (a jump in the time interval together with a large change in bytes transferred)

In [3]:
def identify_events(df, massive_change_threshold=0.5, time_threshold=10):
    """
    Tags every record with the number of the reconnection event it belongs to.

    Rows are sorted by '_time'. A new event starts at a row when, compared with
    the previous row, BOTH conditions hold:
      * the time gap is larger than `time_threshold` seconds, and
      * the relative change of 'bearer_0_dl_total_bytes' OR
        'bearer_0_ul_total_bytes' is larger than `massive_change_threshold`
        in absolute value.
    The first row can never start an event (it has no predecessor), so event
    numbering starts at 0 and increases by one at every detected reconnection.

    Parameters:
    - df (DataFrame): The input DataFrame containing UE data. Must contain the
      columns '_time', 'bearer_0_dl_total_bytes' and 'bearer_0_ul_total_bytes'.
      The caller's DataFrame is not modified.
    - massive_change_threshold (float): Minimum absolute relative change
      (0.5 == 50%) in DL or UL total bytes that counts as a massive change.
    - time_threshold (float): Minimum gap, in seconds, between consecutive
      records that counts as a jump in time.

    Returns:
    - DataFrame: A new DataFrame sorted by '_time' with a fresh 0..n-1 index,
      '_time' converted to datetime, and an integer 'event' column appended.
    """
    # Build a new frame instead of writing into the caller's one
    df = df.assign(_time=pd.to_datetime(df['_time']))
    # Stable sort keeps the input order of records that share a timestamp
    df = df.sort_values(by='_time', kind='stable').reset_index(drop=True)

    # Differences with respect to the previous record; kept as local Series so
    # no helper columns have to be added to (and later dropped from) the frame
    time_gap = df['_time'].diff()
    dl_change = df['bearer_0_dl_total_bytes'].pct_change().fillna(0)
    ul_change = df['bearer_0_ul_total_bytes'].pct_change().fillna(0)

    # The first row has a NaT gap, which compares as False -> never an event start
    long_gap = time_gap > pd.Timedelta(seconds=time_threshold)
    massive_change = (
        (dl_change.abs() > massive_change_threshold)
        | (ul_change.abs() > massive_change_threshold)
    )

    # Running count of event starts == event number of each row
    df['event'] = (long_gap & massive_change).cumsum().astype('int64')
    return df

#### Method that computes the rate of change between records in the same event

In [4]:
def compute_rate_of_change(df):
    """
    Computes row-to-row differences of the feature columns within each event.

    Note that the "rate of change" produced here is the plain difference
    between consecutive records of the same event (DataFrame.diff); it is not
    divided by the elapsed time.

    Parameters:
    - df (DataFrame): The input DataFrame containing UE data with an 'event'
      column. Every column other than '_time', 'imeisv', 'attack',
      'attack_number' and 'event' is differenced, so those columns must be
      numeric. The caller's DataFrame is not modified.

    Returns:
    - DataFrame: A new DataFrame with the same column order as `df`, in which
      the feature columns hold the within-event differences and the excluded
      columns keep their original values. Every row containing a NaN is
      removed: this drops the first record of each event (it has no
      predecessor) and also any record that has a NaN in any column. The index
      is reset to 0..n-1.
    """
    # Columns that are carried over unchanged instead of being differenced
    features_to_exclude = ['_time', 'imeisv', 'attack', 'attack_number', 'event']

    features_to_compute = [col for col in df.columns if col not in features_to_exclude]
    # Only carry over the excluded columns that are actually present
    features_to_keep = [col for col in features_to_exclude if col in df.columns]

    if features_to_compute:
        # One grouped diff instead of concatenating per-group frames in a loop;
        # the result keeps df's index, so the columns added below line up with
        # their rows even when df does not have a default 0..n-1 index
        rate_of_change_df = df.groupby('event')[features_to_compute].diff()
    else:
        # Nothing to difference: start from an index-only frame
        rate_of_change_df = pd.DataFrame(index=df.index)

    # Add back the excluded features
    for feature in features_to_keep:
        rate_of_change_df[feature] = df[feature]

    # Rearrange columns to match the original DataFrame order
    rate_of_change_df = rate_of_change_df[df.columns]

    # Drop the NaN rows (first record of every event, plus any incomplete record)
    return rate_of_change_df.dropna().reset_index(drop=True)

#### Method to label the records, since only some UEs contribute to each attack (see table)

In [5]:
# Attack catalogue keyed by attack id. Each entry holds:
#   'attack_type' - the value compared against the 'attack_number' column
#   'timeframe'   - (start, end) of the attack as 'YYYY-MM-DD HH:MM' strings
#   'ue_imeisv'   - IMEISVs of the UEs that take part in the attack
attack_info = {
    1: {
        'attack_type': 1,
        'timeframe': ('2024-08-18 07:00', '2024-08-18 08:00'),
        'ue_imeisv': [8642840401612300, 8642840401624200],
    },
    2: {
        'attack_type': 2,
        'timeframe': ('2024-08-19 07:00', '2024-08-19 09:41'),
        'ue_imeisv': [8642840401612300, 8642840401624200],
    },
    3: {
        'attack_type': 3,
        'timeframe': ('2024-08-19 17:00', '2024-08-19 18:00'),
        'ue_imeisv': [8642840401612300, 8642840401624200],
    },
    4: {
        'attack_type': 4,
        'timeframe': ('2024-08-21 12:00', '2024-08-21 13:00'),
        'ue_imeisv': [8642840401612300, 8642840401624200],
    },
    5: {
        'attack_type': 5,
        'timeframe': ('2024-08-21 17:00', '2024-08-21 18:00'),
        'ue_imeisv': [
            8642840401612300,
            8642840401624200,
            8642840401594200,
            8677660403123800,
            3557821101183501,
        ],
    },
}

def parse_filename(filename):
    """
    Splits a per-UE dataset filename into its IMEISV and its file type.

    Parameters:
    - filename (str): The filename in the format "ue_<imeisv>_<type>.csv"

    Returns:
    - tuple: (imeisv, attack_type), both str. `imeisv` is the second
      underscore-separated token; `attack_type` is the third token cut at its
      first dot (e.g., "malicious" or "normal").
    """
    tokens = filename.split('_')
    # tokens[0] is the "ue" prefix; everything from the first dot of the third
    # token onwards (the extension) is discarded
    return tokens[1], tokens[2].partition('.')[0]

def add_labels(df, filename):
    """
    Adds binary and multiclass labels to the DataFrame.

    The file type is taken from the filename (see parse_filename):
      * "normal"    -> every row keeps label 0.
      * "malicious" -> for each entry of `attack_info`, rows whose
        'attack_number' equals the entry's 'attack_type' AND whose 'imeisv' is
        one of the entry's participating UEs get binary_label = 1 and
        multiclass_label = <attack id>. All other rows keep label 0.
      * anything else -> every row keeps label 0 and a warning is printed.

    The 'timeframe' stored in `attack_info` is NOT consulted; rows are matched
    through the 'attack_number' column only.

    Parameters:
    - df (DataFrame): The input DataFrame to which labels are added. It is
      modified in place (and also returned). For malicious files it must
      contain the columns '_time', 'attack_number' and 'imeisv'.
    - filename (str): The filename to check for 'normal' or 'malicious'.

    Returns:
    - DataFrame: The same DataFrame with two new columns, 'binary_label' and 'multiclass_label'.
    """
    # Extract imeisv and type from filename (imeisv is only used for the log line)
    imeisv, file_type = parse_filename(filename)
    print(imeisv, file_type)
    # Every row starts out as benign
    df['binary_label'] = 0
    df['multiclass_label'] = 0

    # Normal files: labels remain 0
    if file_type == "normal":
        return df

    # Unknown type: keep the all-zero labels, but do not do so silently
    if file_type != "malicious":
        print(f"Warning: unrecognized file type '{file_type}' in {filename}; all labels left as 0")
        return df

    # Kept from the original implementation: malicious frames are returned
    # with '_time' as datetime
    df['_time'] = pd.to_datetime(df['_time'])

    # Loop over each attack in the attack info dictionary
    for attack_id, attack_data in attack_info.items():
        attack_type = attack_data['attack_type']
        participating_ues = attack_data['ue_imeisv']

        # Rows recorded during this attack by a UE that takes part in it
        mask = (
            (df['attack_number'] == attack_type) &
            (df['imeisv'].isin(participating_ues))
        )
        # Print the count of True values in the mask
        true_count = mask.sum()
        print(f"Number of True values in the mask: {true_count}")

        # Apply labels only to rows matching the criteria
        df.loc[mask, 'binary_label'] = 1
        df.loc[mask, 'multiclass_label'] = attack_id

    return df

#### Loop through each UE dataset in the folder

In [6]:
# Run the full pipeline on every CSV found in the input folder. A failure in one
# file is reported and does not stop the remaining files from being processed.
for filename in os.listdir(input_folder):
    if not filename.endswith('.csv'):
        continue  # anything that is not a CSV is ignored

    file_path = os.path.join(input_folder, filename)
    try:
        # Load the raw per-UE records
        print(f"Reading {filename}")
        df = pd.read_csv(file_path)

        print("Identifying reconnection events")
        df_event = identify_events(df)

        # The differences are computed BEFORE labeling: add_labels writes its
        # label columns into the frame it receives, and those columns must not
        # end up among the differenced features
        print("Create rate of change dataframe")
        df_roc = compute_rate_of_change(df_event)

        print("Put labels to dataframes")
        df_labeled = add_labels(df_event, filename)
        df_roc_labeled = add_labels(df_roc, filename)

        # Output names reuse the input name without its extension
        stem = filename.rsplit('.', 1)[0]
        output_path1 = os.path.join(output_folder1, f"{stem}_labeled.csv")
        output_path2 = os.path.join(output_folder2, f"{stem}_roc_labeled.csv")

        # Write both labeled frames, without the index column
        df_labeled.to_csv(output_path1, index=False)
        df_roc_labeled.to_csv(output_path2, index=False)

        print(f"DataFrames saved to {output_path1} and {output_path2}")
    except Exception as e:
        print(f"Error in {filename}: {e}")

Reading ue_3557821101183501_malicious.csv
Identifying reconnection events
Create rate of change dataframe
Put labels to dataframes
3557821101183501 malicious
Number of True values in the mask: 0
Number of True values in the mask: 0
Number of True values in the mask: 0
Number of True values in the mask: 0
Number of True values in the mask: 699
3557821101183501 malicious
Number of True values in the mask: 0
Number of True values in the mask: 0
Number of True values in the mask: 0
Number of True values in the mask: 0
Number of True values in the mask: 699
DataFrames saved to Per_UE_Datasets_final\ue_3557821101183501_malicious_labeled.csv and Per_UE_Datasets_final_roc\ue_3557821101183501_malicious_roc_labeled.csv
Reading ue_8609960468879057_normal.csv
Identifying reconnection events
Create rate of change dataframe
Put labels to dataframes
8609960468879057 normal
8609960468879057 normal
DataFrames saved to Per_UE_Datasets_final\ue_8609960468879057_normal_labeled.csv and Per_UE_Datasets_fina