# Feature Selection Methods

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from prettytable import PrettyTable

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the combined CSV (semicolon-separated, ISO-8859-1 / Latin-1 encoded).
input_csv_file = 'csv 20 minutes/labeled2/features_final/together/all_data_final.csv'
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where
# passing it raises a TypeError. `on_bad_lines='warn'` keeps the intent of the
# old `error_bad_lines=False`: malformed rows are skipped instead of aborting
# the read, and each skipped row is reported rather than dropped silently.
df = pd.read_csv(
    input_csv_file,
    delimiter=';',
    on_bad_lines='warn',
    encoding='ISO-8859-1',
)

In [3]:
# Restore the previously saved X / y objects and their train/test partitions.
# The files are read in the order the names appear on the left-hand side.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point this at
# files from a trusted source.
X, y, X_train, X_test, y_train, y_test = (
    joblib.load(pickle_path)
    for pickle_path in (
        'csv 20 minutes/labeled2/features_final/together/X.pkl',
        'csv 20 minutes/labeled2/features_final/together/y.pkl',
        'csv 20 minutes/labeled2/features_final/together/X_train.pkl',
        'csv 20 minutes/labeled2/features_final/together/X_test.pkl',
        'csv 20 minutes/labeled2/features_final/together/y_train.pkl',
        'csv 20 minutes/labeled2/features_final/together/y_test.pkl',
    )
)

# Filter Selection Methods (SelectKBest with ANOVA F-value)

In [4]:
# Filter method: score every column of X_train against y_train with the ANOVA
# F-test and keep the 30 highest-scoring columns.
selectkbest = SelectKBest(
    score_func=f_classif,
    k=30,
)
selectkbest.fit(X_train, y_train)
X_train_filtered = selectkbest.transform(X_train)

# Integer positions of the kept columns, then their scores and names.
selected_indices_filter = selectkbest.get_support(indices=True)
selected_scores_filter = selectkbest.scores_[selected_indices_filter]
selected_features_filter = X_train.columns[selected_indices_filter]

In [5]:
# Rank the selected features from highest to lowest score. The (score, name)
# pairs are sorted as tuples, so equal scores fall back to comparing names.
selected_features_filter_sorted = [
    pair[1]
    for pair in sorted(
        zip(selected_scores_filter, selected_features_filter),
        reverse=True,
    )
]

# The scores are sorted separately, in the same descending order.
selected_scores_filter_sorted = list(selected_scores_filter)
selected_scores_filter_sorted.sort(reverse=True)

In [6]:
# Report the 30 top-ranked filter features, one "name: score" line each.
print("\nFilter Method - Selected Features and Their Scores:")
for feature, score in list(
    zip(selected_features_filter_sorted, selected_scores_filter_sorted)
)[:30]:
    print(f"{feature}: {score}")


Filter Method - Selected Features and Their Scores:
Nr Forward Packets 1: 1012683.6623321977
Number of Packets 1: 994208.3568160227
Sum Payload Length 1: 781632.4871798268
Sum Packet Length 1: 650533.5279791678
Sum RSSI 1: 571983.935119351
Nr Backward Packets 1: 312498.04507695563
Min Payload Length 1: 173258.3405350631
Min Packet Length 1: 173258.34053506234
Min RSSI 1: 158535.4443244083
Info_ADV_NONCONN_IND: 150512.39303565852
PDU Type_0x2: 150101.3912182887
LE Limited Discoverable Mode_True: 128495.8293858528
BR/EDR Not Supported_True: 103749.10977480549
Duration 1: 92221.75277381923
Average RSSI 1: 78899.10585553886
Average Packet Length 1: 75266.98976958526
Average Payload Length 1: 75266.98976746222
Sum of Delta Time 1: 74557.63687589618
Sum Forward Delta Time 1: 74439.12414096775
LE Bluetooth Device Address: 69129.10674056465
Standard Deviation RSSI 1: 68867.29263698489
Flags: 68182.59561816981
16-bit Service Class UUIDs (incomplete): 66783.67496246415
Tx Address_Random: 65373.

# Embedded Method (SelectFromModel with Random Forest)

In [7]:
# Embedded method: SelectFromModel wraps a random forest and decides which
# columns of X_train to keep.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,  # fixed seed so the forest is reproducible
)
selectfrommodel = SelectFromModel(estimator=rf_classifier, prefit=False)

# Fit the selector on the training split and reduce X_train to the kept columns.
X_train_embedded = selectfrommodel.fit(X_train, y_train).transform(X_train)

# Integer positions of the kept columns and their names.
selected_indices_embedded = selectfrommodel.get_support(indices=True)
selected_features_embedded = X_train.columns[selected_indices_embedded]

In [8]:
# Fit the forest on the reduced matrix only, so the importances read from it
# afterwards refer to the columns of X_train_embedded rather than all of X_train.
# NOTE(review): this is the same estimator object that was handed to
# SelectFromModel; confirm that refitting it here is intended.
rf_classifier.fit(X_train_embedded, y_train)

In [9]:
# Importances of the forest fitted on X_train_embedded; presumably one value per
# selected column, in the same order as selected_features_embedded -- TODO confirm.
feature_importances = rf_classifier.feature_importances_

In [None]:
# Rank the embedded-method features from most to least important. The
# (importance, name) pairs are sorted as tuples, so equal importances fall
# back to comparing names.
selected_features_embedded_sorted = [
    pair[1]
    for pair in sorted(
        zip(feature_importances, selected_features_embedded),
        reverse=True,
    )
]

# The importances are sorted separately, in the same descending order.
feature_importances_sorted = list(feature_importances)
feature_importances_sorted.sort(reverse=True)

In [10]:
# Print the 30 best selected features and their importances
print("Embedded Method - Selected Features and Their Importances:")
for feature, importance in zip(selected_features_embedded_sorted[:30], feature_importances_sorted[:30]):
    print(f"{feature}: {importance}")


Embedded Method - Selected Features and Their Importances:
Sum Payload Length 1: 0.0602249414264112
Sum Packet Length 1: 0.04863143131054751
Average Payload Length 1: 0.04682261107021593
Nr Forward Packets 1: 0.044695699130995185
Sum of Delta Time 1: 0.04381323688890218
Number of Packets 1: 0.043152293537454776
Sum RSSI 1: 0.03981353762079623
Average Packet Length 1: 0.03972767131112157
Time per Packet 1: 0.036958669115969886
Min RSSI 1: 0.03372065962241686
Nr Backward Packets 1: 0.03321398552435287
Sum Forward Delta Time 1: 0.03265474566287405
Min Packet Length 1: 0.02850865391848031
Min Payload Length 1: 0.027675304213762228
Avg Backward Delta Time 1: 0.026046077493249477
Average Delta Time 1: 0.020231987695904386
Average RSSI 1: 0.019397715524994148
Sum Backward Delta Time 1: 0.01870632126165314
Standard Deviation Delta Time 1: 0.018569515770239006
Max Packet Length 1: 0.01849640712992318
Std Forward Delta Time 1: 0.01834526603894691
Var Forward Delta Time 1: 0.01821780335621765
Du