# Random Forest on 80/20 Data Splitting Approach with selected Features

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import re
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from prettytable import PrettyTable

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics

LOAD TRAINING AND TEST DATASETS

In [3]:
# Every serialized artifact of this experiment lives in the same directory;
# name it once so the location can be changed in a single place.
DATA_DIR = 'csv 20 minutes/labeled2/features_final/together'

# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only load .pkl files that come from a trusted source.

# Presumably the complete feature matrix and target -- TODO confirm.
X = joblib.load(f'{DATA_DIR}/X.pkl')
y = joblib.load(f'{DATA_DIR}/y.pkl')

# Previously saved train/test partitions (the "1" suffix is part of the file names).
X_train = joblib.load(f'{DATA_DIR}/X_train1.pkl')
X_test = joblib.load(f'{DATA_DIR}/X_test1.pkl')
y_train = joblib.load(f'{DATA_DIR}/y_train1.pkl')
y_test = joblib.load(f'{DATA_DIR}/y_test1.pkl')

The data has to be preprocessed again here because the file 'all_data_final' is too large to open.

In [2]:
# Semicolon-separated, ISO-8859-1 encoded CSV; per its file name this is the
# data set as it was before one-hot encoding.
input_csv_file = 'csv 20 minutes/labeled2/features_final/together/all_data_final_before_one_hot_encoded.csv'
data = pd.read_csv(
    input_csv_file,
    sep=';',
    encoding='ISO-8859-1',
)

def get_feature_types(df):
    """Group the column names of ``df`` by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list, list)
        ``(numerical, categorical, boolean)`` column names, each in column
        order: ``int64``/``float64`` columns, ``object`` columns and ``bool``
        columns. Columns of any other dtype appear in none of the lists.
    """
    numeric_cols, object_cols, bool_cols = [], [], []
    for name, dtype in df.dtypes.items():
        # Pick the destination list first, then append once.
        if dtype in ('int64', 'float64'):
            bucket = numeric_cols
        elif dtype == 'object':
            bucket = object_cols
        elif dtype == 'bool':
            bucket = bool_cols
        else:
            # Any other dtype is not tracked.
            continue
        bucket.append(name)
    return numeric_cols, object_cols, bool_cols

# Classify every column of the raw frame by dtype; the categorical list is
# what gets passed to the one-hot encoding step further down.
numerical_features, categorical_features, boolean_features = get_feature_types(data)

def one_hot_encode_features(data, categorical_features):
    """One-hot encode the categorical columns of ``data``, except the targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode.
    categorical_features : list of str
        Names of the categorical columns. ``'label'`` and ``'subcategory'``
        are skipped if present. The list itself is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each encoded column is replaced by its dummy
        columns; the first level of every column is dropped
        (``drop_first=True``).
    """
    target_columns = ('label', 'subcategory')
    # Build a filtered copy instead of calling ``.remove`` on the argument, so
    # the caller's list is left intact and the function can be re-run safely.
    columns_to_encode = [
        column for column in categorical_features if column not in target_columns
    ]
    return pd.get_dummies(data, columns=columns_to_encode, drop_first=True)

# Replace the categorical columns with their dummy-encoded versions; the
# helper leaves 'label' and 'subcategory' un-encoded.
data = one_hot_encode_features(data, categorical_features)

# Fill every remaining missing value with 0, modifying the frame in place.
data.fillna(0, inplace=True)

['PDU Type', 'Info', 'Channel Selection Algorithm', 'Tx Address', 'Rx Address', 'Simultaneous LE and BR/EDR to Same Device Capable (Host)', 'Simultaneous LE and BR/EDR to Same Device Capable (Controller)', 'BR/EDR Not Supported', 'LE General Discoverable Mode', 'LE Limited Discoverable Mode', 'label', 'subcategory', 'Packet Direction']
Total number of rows: 901623
Rows with NaN values:


Obtain the selected features using the embedded feature selection method (SelectFromModel with a Random Forest).

In [None]:
# Embedded feature selection: SelectFromModel fits the forest on the training
# data and keeps the features that pass its (default) importance threshold.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
selectfrommodel = SelectFromModel(rf_classifier, prefit=False)
X_train_embedded = selectfrommodel.fit_transform(X_train, y_train)

# Translate the selector's support into column names of the training frame.
selected_indices_embedded = selectfrommodel.get_support(indices=True)
selected_features_embedded = X_train.columns[selected_indices_embedded]

# Fit the forest on the reduced matrix so the importances below refer to the
# selected features only.
rf_classifier.fit(X_train_embedded, y_train)
feature_importances = rf_classifier.feature_importances_

# Rank the selected feature names by importance, highest first; the second
# list holds the matching importances in the same descending order.
selected_features_embedded_sorted = [
    feature
    for _, feature in sorted(
        zip(feature_importances, selected_features_embedded),
        reverse=True,
    )
]
feature_importances_sorted = sorted(feature_importances, reverse=True)

## Train the RF classifier on the top-ranked selected features and find the number of features that gives the highest test accuracy

In [5]:
def evaluate_top_features(data, selected_features, n_features,
                          label_column='label', test_size=0.2,
                          random_state=42, n_estimators=100):
    """Train a random forest on the first ``n_features`` features and score it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the label column.
    selected_features : sequence of str
        Feature names ordered from most to least important.
    n_features : int
        How many leading entries of ``selected_features`` to use. If it
        exceeds ``len(selected_features)``, all of them are used.
    label_column : str, default 'label'
        Name of the target column in ``data``.
    test_size : float, default 0.2
        Share of the rows held out for testing.
    random_state : int, default 42
        Seed used for both the split and the forest.
    n_estimators : int, default 100
        Number of trees in the forest.

    Returns
    -------
    tuple of (float, float)
        ``(train_accuracy, test_accuracy)``.

    Raises
    ------
    ValueError
        If ``n_features`` is smaller than 1.
    """
    # A non-positive count would either select nothing or, for negative
    # values, silently slice from the end of the ranking -- reject it early.
    if n_features < 1:
        raise ValueError(f'n_features must be at least 1, got {n_features}')

    # Materialise as a list so any sequence type is treated as a column list.
    top_n_features = list(selected_features[:n_features])
    X_top = data[top_n_features]
    y_top = data[label_column]

    X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(
        X_top, y_top, test_size=test_size, random_state=random_state
    )

    rf_model_top = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    rf_model_top.fit(X_train_top, y_train_top)

    # Score on both partitions so over-fitting shows up as a gap between them.
    train_accuracy = accuracy_score(y_train_top, rf_model_top.predict(X_train_top))
    test_accuracy = accuracy_score(y_test_top, rf_model_top.predict(X_test_top))

    return train_accuracy, test_accuracy

# Upper bound on how many of the top-ranked features are tried.
MAX_FEATURES_TO_TRY = 30
# Never request more features than were selected: beyond that point every run
# would retrain on the same feature set under a misleading "n" label.
n_candidates = min(MAX_FEATURES_TO_TRY, len(selected_features_embedded_sorted))

results = []
for n in range(1, n_candidates + 1):
    train_acc, test_acc = evaluate_top_features(data, selected_features_embedded_sorted, n)
    print(f'Number of features: {n}, Training Accuracy: {train_acc}, Test Accuracy: {test_acc}')
    results.append((n, train_acc, test_acc))

if not results:
    raise ValueError('No selected features available to evaluate')

# NOTE(review): the feature count is chosen by test accuracy, so the reported
# best test accuracy is optimistically biased; consider a separate validation
# split or cross-validation for this choice.
# On ties, max() keeps the first entry, i.e. the smallest feature count.
best_result = max(results, key=lambda result: result[2])
best_n_features, best_train_accuracy, best_test_accuracy = best_result

print(f'Best number of features: {best_n_features}')
print(f'Best Training Accuracy: {best_train_accuracy}')
print(f'Best Test Accuracy: {best_test_accuracy}')

print("All results:")
for n, train_acc, test_acc in results:
    print(f'Top {n} Features - Training Accuracy: {train_acc}, Test Accuracy: {test_acc}')

Number of features: 1, Training Accuracy: 0.9991834165629185, Test Accuracy: 0.9990350755580203
Number of features: 2, Training Accuracy: 0.9992568952083605, Test Accuracy: 0.9991071676140302
Number of features: 3, Training Accuracy: 0.9992568952083605, Test Accuracy: 0.9991071676140302
Number of features: 4, Training Accuracy: 0.999262440766507, Test Accuracy: 0.9991238042423403
Number of features: 5, Training Accuracy: 0.9999084982905817, Test Accuracy: 0.9994787189796202
Number of features: 6, Training Accuracy: 0.9999084982905817, Test Accuracy: 0.9994842645223901
Number of features: 7, Training Accuracy: 0.9999514763662176, Test Accuracy: 0.9995341744073201
Number of features: 8, Training Accuracy: 0.9999514763662176, Test Accuracy: 0.9994953556079301
Number of features: 9, Training Accuracy: 0.9999528627557542, Test Accuracy: 0.9995397199500902
Number of features: 10, Training Accuracy: 0.9999528627557542, Test Accuracy: 0.9994842645223901
Number of features: 11, Training Accurac