# FEATURE EXTRACTION

In [1]:
# Install pandas and scikit-learn by shelling out to pip.
# NOTE(review): `%pip install` targets the running kernel's environment, and pinning versions
# makes re-runs reproducible - confirm against the project's environment setup.
!pip install pandas scikit-learn



In [9]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import glob
import os
import numpy as np
from scipy.stats import entropy

In [10]:
# Classify a packet as 'forward', 'backward' or 'None' from its Rx/Tx address fields
def determine_direction(row):
    """Return the direction label for one packet.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'Rx Address' and 'Tx Address', e.g. a DataFrame row.

    Returns
    -------
    str
        'backward' when 'Rx Address' is 'Random' or 'Public' (packet received by the device),
        'forward' when 'Tx Address' is 'Random' or 'Public' (packet transmitted by the device),
        otherwise the literal string 'None'.
    """
    device_address_kinds = ('Random', 'Public')

    # Rx is tested first, so it wins when both fields match
    if row['Rx Address'] in device_address_kinds:
        return 'backward'

    # 'Tx Address' is only looked up when the Rx test did not match
    if row['Tx Address'] in device_address_kinds:
        return 'forward'

    # This is the text 'None', not the None object.
    # NOTE(review): recent pandas versions list 'None' among read_csv's default missing-value
    # markers, so this label may come back as NaN when the CSV is re-read - confirm that is acceptable.
    return 'None'

In [11]:
# Statistics computed for every per-address feature family, in output-column order
_GROUP_STATISTICS = ('min', 'max', 'sum', 'mean', 'std', 'var')
_DELTA_TIME_COLUMN = 'Delta time ( end to start)'


def _add_group_statistics(df, keys, values, column_names):
    """Add min, max, sum, mean, std and var of `values` per `keys` group as row-aligned columns of `df`.

    `column_names` gives the six target column names in the order of `_GROUP_STATISTICS`.
    """
    grouped = values.groupby(keys)
    for statistic, column_name in zip(_GROUP_STATISTICS, column_names):
        # One value per group, broadcast back to every row of that group
        df[column_name] = keys.map(getattr(grouped, statistic)())


def _directional_delta_statistic(group, direction, statistic):
    """Return one statistic of the delta time over the rows of `group` with the given packet direction."""
    selected = group.loc[group['Packet Direction'] == direction, _DELTA_TIME_COLUMN]
    return getattr(selected, statistic)()


# Extracts features per advertising address and adds them as new columns in a new CSV file
def feature_advertising_address(input_file_path, output_file_path, N=5):
    """Compute per-advertising-address features for one capture and write them to a new CSV.

    The input is read as a ';'-separated, ISO-8859-1 encoded CSV. Rows with an empty 'label'
    are dropped. Every feature is computed once per 'Advertising Address' group and then
    repeated on each row of that group. The result (original columns plus the new feature
    columns) is written to `output_file_path` with the same separator and encoding.

    Parameters
    ----------
    input_file_path : str
        Path of the labeled CSV. The code reads the columns 'label', 'Advertising Address',
        'Time', 'Length.1', 'Length of payload', 'RSSI', 'Delta time ( end to start)',
        and (through `determine_direction`) 'Rx Address' and 'Tx Address'.
    output_file_path : str
        Path of the CSV to write.
    N : int, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    None
    """
    # `on_bad_lines='skip'` is the replacement for `error_bad_lines=False`, which was
    # deprecated in pandas 1.3 and removed in pandas 2.0 (where it raises TypeError).
    df = pd.read_csv(input_file_path, delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')
    # Remove packets where the "label" field is empty; copy so the new columns are added to an independent frame
    df = df.dropna(subset=['label']).copy()

    adv = df['Advertising Address']

    # First signed integer found in the RSSI text, as a float
    df['RSSI float'] = df['RSSI'].str.extract(r'(-?\d+)', expand=False).astype(float)

    # Duration of each advertising address: last minus first 'Time' of the group
    time_by_adv = df['Time'].groupby(adv)
    df['Duration 1'] = adv.map(time_by_adv.max() - time_by_adv.min())

    # Number of Packets, Packets per Second and Time per Packet features.
    # Duration is 0 when an address has a single timestamp, so the rates below can be
    # inf/NaN for such addresses; this is left as-is so the output matches earlier runs.
    df['Number of Packets 1'] = adv.map(df.groupby('Advertising Address').size())
    df['Packets per Second 1'] = df['Number of Packets 1'] / df['Duration 1']
    df['Time per Packet 1'] = df['Duration 1'] / df['Number of Packets 1']

    # Bytes per Second feature: summed 'Length.1' of the group divided by its duration
    df['Bytes per Second 1'] = adv.map(df['Length.1'].groupby(adv).sum()) / df['Duration 1']

    # Min, Max, Sum, Avg, Std, Var of RSSI
    _add_group_statistics(df, adv, df['RSSI float'], (
        'Min RSSI 1', 'Max RSSI 1', 'Sum RSSI 1',
        'Average RSSI 1', 'Standard Deviation RSSI 1', 'Variance RSSI 1',
    ))

    # Min, Max, Sum, Avg, Std, Var of Packet Length
    _add_group_statistics(df, adv, df['Length.1'], (
        'Min Packet Length 1', 'Max Packet Length 1', 'Sum Packet Length 1',
        'Average Packet Length 1', 'Standard Deviation Packet Length 1', 'Variance Packet Length 1',
    ))

    # Min, Max, Sum, Avg, Std, Var of Length of payload
    _add_group_statistics(df, adv, df['Length of payload'], (
        'Min Payload Length 1', 'Max Payload Length 1', 'Sum Payload Length 1',
        'Average Payload Length 1', 'Standard Deviation Payload Length 1', 'Variance Payload Length 1',
    ))

    # Min, Max, Sum, Avg, Std, Var of Delta time
    _add_group_statistics(df, adv, df[_DELTA_TIME_COLUMN], (
        'Min Delta Time 1', 'Max Delta Time 1', 'Sum of Delta Time 1',
        'Average Delta Time 1', 'Standard Deviation Delta Time 1', 'Variance Delta Time 1',
    ))

    # Packet Direction feature
    df['Packet Direction'] = df.apply(determine_direction, axis=1)

    # Total and average number of forward/backward packets per address.
    # Flags are cast to integers so the per-group sum is an integer count.
    is_forward = (df['Packet Direction'] == 'forward').astype('int64')
    is_backward = (df['Packet Direction'] == 'backward').astype('int64')
    df['Nr Forward Packets 1'] = adv.map(is_forward.groupby(adv).sum())
    df['Nr Backward Packets 1'] = adv.map(is_backward.groupby(adv).sum())
    df['Average Nr Forward Packet 1'] = adv.map(is_forward.groupby(adv).mean())
    df['Average Nr Backward Packet 1'] = adv.map(is_backward.groupby(adv).mean())

    # Min, Max, Sum, Avg, Std, Var of delta time for forward/backward packets.
    # The grouping is built on a two-column snapshot taken after 'Packet Direction' exists,
    # so it cannot miss that column and is unaffected by the columns added in the loop.
    directional = df[['Packet Direction', _DELTA_TIME_COLUMN]].groupby(adv)
    for direction_value, direction_label in (('forward', 'Forward'), ('backward', 'Backward')):
        for statistic, prefix in zip(_GROUP_STATISTICS, ('Min', 'Max', 'Sum', 'Avg', 'Std', 'Var')):
            per_address = directional.apply(
                _directional_delta_statistic, direction=direction_value, statistic=statistic
            )
            df[f'{prefix} {direction_label} Delta Time 1'] = adv.map(per_address)

    # Save the new added features to a new CSV file
    df.to_csv(output_file_path, index=False, sep=';', encoding='ISO-8859-1')

In [12]:
directory_path = 'csv 20 minutes/labeled2/'
output_directory = os.path.join(directory_path, 'features_final')
# exist_ok=True replaces the separate existence check and its check-then-create race
os.makedirs(output_directory, exist_ok=True)

files = glob.glob(os.path.join(directory_path, '*_labeled.csv'))

# For each file ending with "_labeled.csv", extract features and save them as "<name>_features1.csv"
for file in files:
    file_name = os.path.basename(file)
    # Replace only the trailing extension; str.replace would also rewrite a '.csv' occurring earlier in the name
    output_file_name = os.path.splitext(file_name)[0] + '_features1.csv'
    output_file_path = os.path.join(output_directory, output_file_name)
    feature_advertising_address(file, output_file_path)
    # Printed after processing, so the log lists the files that completed
    print(file)

csv 20 minutes/labeled2\2 hp, 1 apple laptop, bose & samsung headphones_labeled.csv
csv 20 minutes/labeled2\3 ipads_labeled.csv
csv 20 minutes/labeled2\4 airtags_labeled.csv
csv 20 minutes/labeled2\airfryer_labeled.csv
csv 20 minutes/labeled2\airpods, charge2 smartwatch, xiaomi smartphone_labeled.csv
csv 20 minutes/labeled2\apple smartwatch_labeled.csv
csv 20 minutes/labeled2\camera canon_labeled.csv
csv 20 minutes/labeled2\dell laptops_labeled.csv
csv 20 minutes/labeled2\google smartphone_labeled.csv
csv 20 minutes/labeled2\gopro corina_labeled.csv
csv 20 minutes/labeled2\gopro5.1_labeled.csv
csv 20 minutes/labeled2\gopro5_labeled.csv
csv 20 minutes/labeled2\gopro9_labeled.csv
csv 20 minutes/labeled2\LG tv_labeled.csv
csv 20 minutes/labeled2\lh wh 3 & 4 & linkbuds headphones_labeled.csv
csv 20 minutes/labeled2\mindi smartwatch_labeled.csv
csv 20 minutes/labeled2\mixer_labeled.csv
csv 20 minutes/labeled2\ramize mindi iphone, huawei smartphone, fitbit smartwatch_labeled.csv
csv 20 minut

# MERGE ALL FEATURE CSV FILES

In [13]:
directory = 'csv 20 minutes/labeled2/features_final'
# Print all files ending with "_features1.csv"
csv_files = [file for file in os.listdir(directory) if file.endswith('_features1.csv')]
print(csv_files)

# Iterate through each CSV file and collect its labeled rows
labeled_frames = []
for csv_file in csv_files:
    # `on_bad_lines='skip'` replaces `error_bad_lines=False` (removed in pandas 2.0)
    df = pd.read_csv(os.path.join(directory, csv_file), delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')
    df_with_labels = df[df['label'].notna()]
    labeled_frames.append(df_with_labels)

# Concatenate once instead of growing the frame inside the loop; with no input files the result is an empty frame
merged_df = pd.concat(labeled_frames) if labeled_frames else pd.DataFrame()

# Save to a new CSV file, creating the target directory if it does not exist yet.
# Separator and encoding match the per-file feature CSVs and the reader used by check_identical_columns.
merged_output_path = 'csv 20 minutes/labeled2/features_final/together/all_data_final.csv'
os.makedirs(os.path.dirname(merged_output_path), exist_ok=True)
merged_df.to_csv(merged_output_path, index=False, sep=';', encoding='ISO-8859-1')

['2 hp, 1 apple laptop, bose & samsung headphones_labeled_features1.csv', '3 ipads_labeled_features1.csv', '4 airtags_labeled_features1.csv', 'airfryer_labeled_features1.csv', 'airpods, charge2 smartwatch, xiaomi smartphone_labeled_features1.csv', 'apple smartwatch_labeled_features1.csv', 'camera canon_labeled_features1.csv', 'dell laptops_labeled_features1.csv', 'google smartphone_labeled_features1.csv', 'gopro corina_labeled_features1.csv', 'gopro5.1_labeled_features1.csv', 'gopro5_labeled_features1.csv', 'gopro9_labeled_features1.csv', 'LG tv_labeled_features1.csv', 'lh wh 3 & 4 & linkbuds headphones_labeled_features1.csv', 'mindi smartwatch_labeled_features1.csv', 'mixer_labeled_features1.csv', 'ramize mindi iphone, huawei smartphone, fitbit smartwatch_labeled_features1.csv', 'samsung smartphone, oralB toothbrush, rollei, apple smartwatch_labeled_features1.csv', 'smartwatch huawei_labeled_features1.csv', 'wasserkocher ipad 20min_labeled_features1.csv', 'ülkü_labeled_features1.csv']

Check whether any columns in the merged file are identical

In [14]:
# Check whether any columns of a CSV file hold exactly the same values
def check_identical_columns(input_file):
    """Print every pair of columns in `input_file` whose contents are identical.

    The file is read as a ';'-separated, ISO-8859-1 encoded CSV. Each column is compared
    with every later column using `Series.equals`. Pairs are reported in column order;
    if there are none, a single "No identical columns found." line is printed.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to inspect.

    Returns
    -------
    None
    """
    # `on_bad_lines='skip'` replaces `error_bad_lines=False` (deprecated in pandas 1.3, removed in 2.0)
    df = pd.read_csv(input_file, delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')

    column_names = list(df.columns)
    # Compare each column only with the columns after it, so every pair is tested once
    identical_columns = [
        (first, second)
        for position, first in enumerate(column_names)
        for second in column_names[position + 1:]
        if df[first].equals(df[second])
    ]

    # Print the identical columns
    if identical_columns:
        print("Identical columns found:")
        for first, second in identical_columns:
            print(f"{first} is identical to {second}")
    else:
        print("No identical columns found.")

In [15]:
# Report identical column pairs in the all_data_final.csv file
check_identical_columns("csv 20 minutes/labeled2/features_final/together/all_data_final.csv")

Identical columns found:
Standard Deviation Packet Length 1 is identical to Standard Deviation Payload Length 1
Variance Packet Length 1 is identical to Variance Payload Length 1
