## *Prep*

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np

# The import/reload lines below ensure that the helper modules (preprocessing,
# util, parameters) are reloaded every time this cell is run
import importlib
import helper_files.preprocessing as preprocFuncts
import helper_files.util as util
import parameters as MyParams
importlib.reload(preprocFuncts)
importlib.reload(util)
importlib.reload(MyParams)

# MODEL PARAMETERS
# (none are pulled from the parameters module in this cell)

# DERIVATION PARAMETERS
# Window length, measured in IQ samples.
WINDOW_LEN = MyParams.WINDOW_LEN
OVERLAP = MyParams.OVERLAP
NUM_FEATURES = MyParams.NUM_FEATURES
FEATURES_TO_USE = MyParams.FEATURES_TO_USE

# DATA PARAMETERS
TRAINING_DATASET = MyParams.TRAINING_DATASET
EVAL_DATASET = MyParams.EVAL_DATASET
# How many files are in the saved numpy data for training.
NUM_TRAINING_FILES = MyParams.NUM_TRAINING_FILES
# How many files are in the saved numpy data for evaluation.
NUM_EVALUATION_FILES = MyParams.NUM_EVALUATION_FILES
# If not using the saved numpy data, this is the max number of files to intake.
MAX_FILES = MyParams.MAX_FILES
# True = use the saved .npy file data instead of re-deriving the features.
USE_SAVED_DATA = MyParams.USE_SAVED_DATA
SAVE_METRICS_TO_FILE = MyParams.SAVE_METRICS_TO_FILE

## *Loading training dataset*

In [None]:
# Reload the helper modules so edits made to them since the last run are used.
# NOTE: reloading MyParams does not refresh the constants (WINDOW_LEN, OVERLAP,
# ...) that were copied out of it in the Prep cell; re-run that cell for that.
importlib.reload(preprocFuncts)
importlib.reload(util)
importlib.reload(MyParams)

# carlos' Mac
# data_dir = '/Users/carlos_1/Documents/GitHub/RFML-Code/RFML_Combined_Dataset_2025/RFML_Drone_Dataset_2025/old_drone_full_annotated_dataset/RFML_Old_Drone_Training_Dataset/*'
# uav-cyberlab-rfml laptop
data_dir = f'/home/uav-cyberlab-rfml/RFML/test_{TRAINING_DATASET}_training'
print("Pulling from directory: ", data_dir)

# Build the postfix pieces outside the f-string. Nesting double quotes inside a
# double-quoted f-string only parses on Python 3.12+, so the original one-liner
# was a SyntaxError on older interpreters. The resulting string is unchanged.
overlap_tag = '0' + str(int(OVERLAP * 100))
train_dataset_tag = "_" + TRAINING_DATASET if TRAINING_DATASET != "" else ""
train_postfix = (
    f"train_{NUM_FEATURES}ftrs_{NUM_TRAINING_FILES}files_"
    f"{WINDOW_LEN}win_{overlap_tag}over{train_dataset_tag}"
)

training_derived_samples, training_labels = preprocFuncts.preprocessFiles(
    data_dir,
    postfix=train_postfix,
    features_to_use=FEATURES_TO_USE,
    window_len=WINDOW_LEN,
    overlap=OVERLAP,
    saved_data=USE_SAVED_DATA,
    max_files=MAX_FILES,
)

## *Preprocessing training dataset*

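The following cell balances the training set and then removes labels that should not be used, such as annotations that were not labeled (the empty-string label) and the 'Burst' label. Other labels, like the Ruko F11 Pro labels that do not appear in the evaluation dataset, can also be excluded by uncommenting them in `remove_labels`.

The labels are kept as strings in this cell; the encoder used for fitting and predicting is applied later, in the fitting cell.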

In [None]:
# Show the training set as loaded, before any balancing.
print("\n==BEFORE BALANCING========")
util.display(training_derived_samples, training_labels)

training_derived_samples, training_labels = preprocFuncts.balanceByMedian(
    training_derived_samples,
    training_labels,
    unlabeled_downsampling=70_000,
)

# Show the training set again after balancing, with sample/label counts.
print("\n==AFTER BALANCING========")
util.display(training_derived_samples, training_labels)


print(f"Number of samples: {training_derived_samples.shape[0]:,}")
print(f"Number of labels: {len(training_labels):,}")

# Labels to drop from the training set. Further candidates that are currently
# left in: 'Ruko_F11_pro_UL', 'HS100_Downlink'.
remove_labels = ['Burst', '']

# Keep-mask: True where the label is NOT one of the labels to remove.
mask = np.isin(training_labels, remove_labels, invert=True)

# Filter samples and labels with the same mask so they stay aligned.
training_derived_samples, training_labels = (
    training_derived_samples[mask],
    training_labels[mask],
)
# Keep a handle on the string labels for use by later cells.
training_labels_strings = training_labels

print("\nAfter removing unnecessary labels:")
print(f"Number of samples: {training_derived_samples.shape[0]:,}")
print(f"Number of labels: {len(training_labels):,}")

## *Loading evaluation dataset*

In [None]:
# Reload the helper modules so edits made to them since the last run are used.
# NOTE: reloading MyParams does not refresh the constants (WINDOW_LEN, OVERLAP,
# ...) that were copied out of it in the Prep cell; re-run that cell for that.
importlib.reload(preprocFuncts)
importlib.reload(util)
importlib.reload(MyParams)

# carlos' Mac
# data_dir = '/Users/carlos_1/Documents/GitHub/RFML-Code/RFML_Combined_Dataset_2025/RFML_Drone_Dataset_2025/old_drone_full_annotated_dataset/RFML_Old_Drone_Eval_data/*'
# uav-cyberlab-rfml laptop
data_dir = f'/home/uav-cyberlab-rfml/RFML/test_{EVAL_DATASET}_eval'
print("Pulling from directory: ", data_dir)

# Build the postfix pieces outside the f-string. Nesting double quotes inside a
# double-quoted f-string only parses on Python 3.12+, so the original one-liner
# was a SyntaxError on older interpreters. The resulting string is unchanged.
overlap_tag = '0' + str(int(OVERLAP * 100))
eval_dataset_tag = "_" + EVAL_DATASET if EVAL_DATASET != "" else ""
eval_postfix = (
    f"eval_{NUM_FEATURES}ftrs_{NUM_EVALUATION_FILES}files_"
    f"{WINDOW_LEN}win_{overlap_tag}over{eval_dataset_tag}"
)

test_derived_samples, test_labels = preprocFuncts.preprocessFiles(
    data_dir,
    postfix=eval_postfix,
    features_to_use=FEATURES_TO_USE,
    window_len=WINDOW_LEN,
    overlap=OVERLAP,
    saved_data=USE_SAVED_DATA,
    max_files=MAX_FILES,
)

## *Preprocessing evaluation dataset*
First I need to remove labels that were in the evaluation set but not in the training set.

In [None]:
# Show the evaluation set as loaded, before any balancing.
print("\n==BEFORE BALANCING========")
util.display(test_derived_samples, test_labels)

test_derived_samples, test_labels = preprocFuncts.balanceByMedian(test_derived_samples, test_labels, unlabeled_downsampling=7_000)
print("\n==AFTER BALANCING========")
# Fixed: display the evaluation labels here. The original passed
# `training_labels`, pairing evaluation samples with the training label array.
util.display(test_derived_samples, test_labels)


print(f"Number of samples: {test_derived_samples.shape[0]:,}")
print(f"Number of labels: {test_labels.shape[0]:,}")

# Labels that occur in the evaluation set but never in the (filtered) training
# set cannot be encoded or predicted, so they are removed.
remove_labels = np.setdiff1d(test_labels, training_labels_strings).tolist()
print(remove_labels)

# Build mask: True = keep, False = remove
mask = ~np.isin(test_labels, remove_labels)

# Apply the same mask to samples and labels so they stay aligned
test_derived_samples = test_derived_samples[mask]
test_labels = test_labels[mask]

print("\nAfter removing unnecessary labels:")
print(f"Number of samples: {test_derived_samples.shape[0]:,}")
print(f"Number of labels: {test_labels.shape[0]:,}")

y_strings = test_labels # store the string version of the labels before they get encoded

## *Fitting and prediction*

In [None]:
# Sanity check: compare the sets of classes present in training and evaluation.

unique_train_classes = set(np.unique(training_labels))
unique_num_test_classes = set(np.unique(test_labels))

# Classes present in both sets, and those exclusive to one side.
shared_classes = unique_train_classes & unique_num_test_classes
test_only_classes = sorted(unique_num_test_classes - unique_train_classes)
train_only_classes = sorted(unique_train_classes - unique_num_test_classes)

print("Number of training classes:\t", len(unique_train_classes))
print("Number of testing classes:\t", len(unique_num_test_classes))
print("Overlap:\t\t\t", len(shared_classes))
# Only the first 20 names on each side are shown.
print("Only-in-test :", test_only_classes[:20])
print("Only-in-train:", train_only_classes[:20])

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Ensure arrays
X_train = np.asarray(training_derived_samples)
y_train = np.asarray(training_labels)
X_test = np.asarray(test_derived_samples)
y_test = np.asarray(test_labels)

# Encode string labels to integers. The encoder is fitted on the training
# labels only; the evaluation labels were already restricted to classes seen in
# training, so transform() does not meet unknown labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Pick the objective and metric from the number of classes. The original
# hard-coded objective="binary:logistic" together with the multiclass metric
# 'mlogloss', which is inconsistent.
is_multiclass = len(le.classes_) > 2

model = XGBClassifier(
    objective="multi:softprob" if is_multiclass else "binary:logistic",
    eval_metric="mlogloss" if is_multiclass else "logloss",
    max_depth=4,
    learning_rate=0.3,
    n_estimators=100,
    random_state=42
)

# Quick look at the training inputs before fitting.
print(f"Training samples: {X_train[0][:2]}")
print(f"Training labels: {y_train[:5]}")
print(f"Training samples type: {X_train.dtype}")
print(f"Training labels type: {y_train.dtype}")
print(f"Training samples size: {len(X_train)}")
print(f"Training labels size: {len(y_train)}")
print(f"Training samples shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")

model.fit(X_train, y_train)


y_pred = model.predict(X_test)

# accuracy_score returns a fraction in [0, 1]; scale it before printing with a
# '%' sign (the original printed e.g. "0.95%" for 95% accuracy).
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")


## *Metrics*

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

importlib.reload(preprocFuncts)
importlib.reload(util)

base_name = 'xgb'
# NOTE(review): this rebinds `model` from the fitted classifier to a display
# name. The heatmap cell reads this string, but re-running a cell that calls
# model.predict() after this one requires re-running the fitting cell first.
model = "XGBoost"

# Score the predictions made in the fitting cell.
accuracy = accuracy_score(y_test, y_pred)
perc_accuracy = accuracy * 100
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Keyword arguments shared by the print and save calls; only `notes` differs.
metric_args = dict(
    base_name=base_name + "_metrics",
    model=model,
    perc_accuracy=perc_accuracy,
    notes="",
    cr=cr,
    cm=cm,
    labels=y_strings,
    max_files=MAX_FILES,
    window_size=WINDOW_LEN,
    overlap=OVERLAP,
    num_features=NUM_FEATURES,
    num_training_files=NUM_TRAINING_FILES,
    num_evaluation_files=NUM_EVALUATION_FILES,
    training_dataset=TRAINING_DATASET,
    eval_dataset=EVAL_DATASET,
)

util.printMetrics(**metric_args)


if SAVE_METRICS_TO_FILE:
    # Same arguments as above, with a note attached to the saved record.
    util.saveMetricsToFile(
        **{**metric_args, "notes": "Using the fixed length sample size"}
    )

In [None]:
# Create a heatmap of the confusion matrix
import os

# confusion_matrix orders its rows/columns by the sorted union of the class ids
# found in y_test and y_pred. Take the tick labels from that same union, decoded
# by the label encoder, so they always line up with the matrix. The original
# used np.unique(y_strings), which is misaligned when a class is predicted but
# absent from the evaluation labels.
cm_class_ids = np.union1d(y_test, y_pred).astype(int)
cm_class_names = le.inverse_transform(cm_class_ids)

plt.figure(figsize=(7, 7))
sn.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=cm_class_names, yticklabels=cm_class_names)

# Labels and title
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title(f'Confusion Matrix Heatmap for {model} Predictions')

if SAVE_METRICS_TO_FILE:
    # Make sure the output directory exists before saving the figure.
    os.makedirs("./metrics", exist_ok=True)
    # NOTE(review): perc_accuracy is already a percentage, so the file name
    # carries it times 100 (e.g. 95.34% -> "9534") - confirm this is intended.
    plt.savefig(f"./metrics/{base_name}_cm_{(perc_accuracy * 100):.0f}.png", bbox_inches='tight', dpi=200)
plt.show()

The following cell prints Cohen's Kappa and the Matthews Correlation Coefficient.
* Cohen's Kappa (κ) is a statistic measuring agreement between two categorical raters (or one rater at two times) beyond what's expected by chance, correcting for random agreement, and is used for inter-rater reliability in fields like machine learning. It ranges from -1 (total disagreement) to +1 (perfect agreement), with 0 meaning agreement is purely random.

* The Matthews Correlation Coefficient (MCC) is a robust metric in machine learning for evaluating binary/multiclass classification, measuring correlation between actual and predicted classes, ranging from -1 (perfect inverse) to +1 (perfect prediction), with 0 being random; it's especially valuable for imbalanced datasets as it uses all four confusion matrix values (TP, TN, FP, FN) for a balanced performance score, unlike simpler metrics that can be misleading.

In [None]:
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef

# Chance-corrected agreement scores between the encoded true labels and the
# predictions.
kappa = cohen_kappa_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print("Cohen's Kappa:", kappa)
print("MCC:", mcc)
