# Feature Based Fingerprinting
In this notebook, we train a Decision Tree and a Random Forest classifier to identify our devices from the features we have extracted.

In this example, we evaluate the baseline scenario (3x Arduino Unos, 3x STM32s, 1 m distance, 5 different samples of data collected at different times/days). This notebook reports the test accuracy and other metrics (precision, recall, F1 score) for the baseline test.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import csv
from sklearn.metrics import confusion_matrix

# Apply seaborn's default theme to the figures drawn below.
sns.set()

## Step 1: Create Dataframe

In [None]:
# Load the five baseline captures. Every row is one sample and every column is
# one feature; one of the columns holds the target output.
baseline_csv_files = ['day1.csv', 'day2.csv', 'morning.csv', 'noon.csv', 'afternoon.csv']
samples1, samples2, samples3, samples4, samples5 = (
    pd.read_csv(csv_file) for csv_file in baseline_csv_files
)

# Stack the captures into a single frame (each file's own row labels are kept).
frames = [samples1, samples2, samples3, samples4, samples5]
samples = pd.concat(frames)
samples

## Step 2a: Decision Tree Model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of all samples, then cut that hold-out in half to obtain the
# validation and test sets. Both splits use a fixed seed so they are repeatable.
train_data, holdout_data = train_test_split(samples, random_state=5, test_size=0.3)
validation_data, test_data = train_test_split(holdout_data, random_state=5, test_size=0.50)

In [None]:
# Replace missing (NaN) values with zero in the columns listed below, for each
# of the three splits.
#
# The previous row-by-row version wrote through chained indexing
# (frame[col].iloc[i] = 0), which pandas does not guarantee reaches the frame,
# and it addressed the misspelled columns 'alhpa' / 'alhpa_2', so a NaN in
# 'alpha' or 'alpha_2' raised a KeyError instead of being filled.
nan_fill_columns = [
    'freq_MHz_2',
    'strDiff_dB_2',
    'relStr_dB_2',
    'relDist_Hz_2',
    'width_Hz_2',
    'autoCorrelation_lag-2',
    'autoCorrelation_lag-2_2',
    'alpha',
    'alpha_2',
]


def fill_missing_with_zero(frame, columns):
    """Return a copy of ``frame`` with NaNs in ``columns`` replaced by 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split to clean; it is not modified.
    columns : list of str
        Column labels whose NaNs are filled.

    Returns
    -------
    pandas.DataFrame
        New frame; columns not listed in ``columns`` are left untouched.

    Raises
    ------
    KeyError
        If any label in ``columns`` is missing from ``frame``.
    """
    # Fail loudly on a misspelled or absent column: a dict-valued fillna
    # would otherwise skip it silently.
    missing = [column for column in columns if column not in frame.columns]
    if missing:
        raise KeyError(f"columns not found in frame: {missing}")
    return frame.fillna({column: 0 for column in columns})


# Rebinding the names keeps the cell idempotent: re-running it on frames that
# are already filled changes nothing.
train_data = fill_missing_with_zero(train_data, nan_fill_columns)
validation_data = fill_missing_with_zero(validation_data, nan_fill_columns)
test_data = fill_missing_with_zero(test_data, nan_fill_columns)

In [None]:
# Input columns used by the models, followed by the target column.
features = [
    # raw frequency of clock in first / second band
    'freq_MHz',
    'freq_MHz_2',
    # width of clock in first / second band
    'width_Hz',
    'width_Hz_2',
    # autocorrelation of first / second band
    'autoCorrelation_lag-2',
    'autoCorrelation_lag-2_2',
    # frequency snr ratio relationship between clocks
    'strDiff_dB_2',
    # cyclic frequency of band 1 / band 2
    'alpha',
    'alpha_2',
    # frequency difference between clock harmonics
    'relDist_Hz_2',
]

# Which device is this? (can be either between devices, or within devices)
target = 'Device'

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Tree hyperparameters: maximum depth, and the minimum number of samples
# required at a leaf node.
max_depth = 15
min_samples_leaf = 1

# Fixed random_state keeps the fitted tree repeatable between runs.
decision_tree_model = DecisionTreeClassifier(
    random_state=6,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
)
train_inputs = train_data[features]
train_labels = train_data[target]
decision_tree_model.fit(train_inputs, train_labels)

In [None]:
import graphviz
from sklearn import tree

# Render a fitted tree in the notebook output
def draw_tree(tree_model, features):
    """
    Draw a fitted decision tree as a graphviz diagram.

    tree_model: fitted tree estimator; its ``classes_`` label the leaves
    features:   feature names used to label the split nodes
    """
    dot_source = tree.export_graphviz(
        tree_model,
        out_file=None,  # get the DOT text back instead of writing a file
        feature_names=features,
        class_names=tree_model.classes_.astype(str),
        filled=True,
        impurity=False,
    )
    display(graphviz.Source(dot_source))


draw_tree(decision_tree_model, features)

## Step 2b: Decision Tree Train/Validation Accuracy

In [None]:
# Training accuracy: fraction of training rows the decision tree labels correctly
y_pred_training = decision_tree_model.predict(train_data[features])
y_true = list(train_data[target])
correctNum = sum(
    1 for predicted, actual in zip(y_pred_training, y_true) if predicted == actual
)
decision_train_accuracy = correctNum / len(y_true)
print("decision train accuracy: ", decision_train_accuracy)

# Validation accuracy: same measure on the validation split.
# y_true is left holding the validation labels after this cell.
y_pred_valid = decision_tree_model.predict(validation_data[features])
y_true = list(validation_data[target])
correctNum = sum(
    1 for predicted, actual in zip(y_pred_valid, y_true) if predicted == actual
)
decision_validation_accuracy = correctNum / len(y_true)
print("decision validation accuracy: ", decision_validation_accuracy)

In [None]:
def plot_confusion_matrix(y_true, y_pred, fig_name, to_norm, large_scale, program_idle):
    """
    Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated
    heatmap and save the figure to ``fig_name``.

    y_true:       actual labels
    y_pred:       predicted labels
    fig_name:     output path handed to ``plt.savefig``
    to_norm:      1 -> cells show each row normalised and scaled by 100;
                  any other value -> raw counts
    large_scale:  1 -> axes are labelled ard1 .. ard10
    program_idle: 1 -> axes are labelled ard1I, ard1P, ... stm3I, stm3P
                  (only consulted when large_scale is not 1)

    When neither flag is 1 the axes are labelled ard1..ard3, stm1..stm3.
    The tick labels are fixed lists; they are not derived from the data.
    """
    matrix = (
        confusion_matrix(y_true, y_pred, normalize='true') * 100
        if to_norm == 1
        else confusion_matrix(y_true, y_pred)
    )

    # Pick the axis labels for the requested experiment layout.
    if large_scale == 1:
        tick_labels = ['ard1', 'ard2', 'ard3', 'ard4', 'ard5',
                       'ard6', 'ard7', 'ard8', 'ard9', 'ard10']
    elif program_idle == 1:
        tick_labels = ['ard1I', 'ard1P', 'ard2I', 'ard2P', 'ard3I', 'ard3P',
                       'stm1I', 'stm1P', 'stm2I', 'stm2P', 'stm3I', 'stm3P']
    else:
        tick_labels = ['ard1', 'ard2', 'ard3', 'stm1', 'stm2', 'stm3']

    # figsize sets width and height in inches; dpi sets the resolution.
    plt.figure(figsize=(6, 6), dpi=600)

    # annot_kws sets the font size of the numbers written inside the cells.
    sns.heatmap(
        matrix,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        cmap="Blues",
        annot=True,
        square=True,
        fmt='.0f',
        cbar=False,
        annot_kws={"size": 12},
    )

    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=0)
    plt.yticks(rotation=0)
    plt.tick_params(axis='both', labelsize=10)  # tick label font size on x and y
    plt.savefig(fig_name)

In [None]:
# All flags off: raw counts with the six-device ard/stm axis labels.
to_norm = large_scale = program_idle = 0
plot_confusion_matrix(
    y_true,
    y_pred_valid,
    'dt_valid_matrix.jpg',
    to_norm,
    large_scale,
    program_idle,
)

In [None]:
# Test accuracy: fraction of test rows the decision tree labels correctly
y_pred_test = decision_tree_model.predict(test_data[features])
y_true = list(test_data[target])
correctNum = sum(
    1 for predicted, actual in zip(y_pred_test, y_true) if predicted == actual
)
decision_test_accuracy = correctNum / len(y_true)
print("decision test accuracy: ", decision_test_accuracy)

# Macro-averaged metrics: each class contributes equally to the average.
precision = precision_score(y_true, y_pred_test, average='macro')
recall = recall_score(y_true, y_pred_test, average='macro')
f1 = f1_score(y_true, y_pred_test, average='macro')
print("precision: ", precision)
print("recall: ", recall)
print("f1 score: ", f1)

In [None]:
# All flags off: raw counts with the six-device ard/stm axis labels.
to_norm = large_scale = program_idle = 0
plot_confusion_matrix(
    y_true,
    y_pred_test,
    'matrix_decision_tree_baseline.pdf',
    to_norm,
    large_scale,
    program_idle,
)

## Step 2c: Decision Tree Parameter Tuning

In [None]:
# Grid Search
from sklearn.model_selection import GridSearchCV
# Candidate values explored for each decision tree hyperparameter.
hyperparameters = {'min_samples_leaf': [1, 10, 50, 100, 200, 300], 'max_depth': [1, 5, 10, 15, 20]}

# Seed the estimator with the same random_state as the other models in this
# notebook. An unseeded tree permutes the features randomly at every split,
# so the reported best parameters could differ from run to run.
estimator = DecisionTreeClassifier(random_state=6)

# 6-fold cross-validated search over every combination of the values above,
# fitted on the training split only.
search = GridSearchCV(estimator=estimator, param_grid=hyperparameters, cv=6, return_train_score=True)
search.fit(train_data[features], train_data[target])
result = search.best_params_
print("result: ", result)

## Step 3a: Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# Random forest settings; the fixed random_state keeps the fit repeatable.
forest_settings = {'max_depth': 15, 'min_samples_leaf': 1, 'random_state': 6}
clf = RandomForestClassifier(**forest_settings)
clf.fit(train_data[features], train_data[target])

## Step 3b: Random Forest Train/Validation Accuracy

In [None]:
# Training accuracy: fraction of training rows the random forest labels correctly.
# Results are stored under forest_* names and printed with a random forest
# label; this cell used to reuse the decision tree's names and label, which
# overwrote the decision tree's figures and mislabelled the output.
y_pred_training = clf.predict(train_data[features])
y_true = list(train_data[target])
correctNum = sum(
    1 for predicted, actual in zip(y_pred_training, y_true) if predicted == actual
)
forest_train_accuracy = correctNum / len(y_true)
print("random forest train accuracy: ", forest_train_accuracy)

# Validation accuracy: same measure on the validation split.
y_pred_valid = clf.predict(validation_data[features])
y_true = list(validation_data[target])
correctNum = sum(
    1 for predicted, actual in zip(y_pred_valid, y_true) if predicted == actual
)
forest_validation_accuracy = correctNum / len(y_true)
print("random forest validation accuracy: ", forest_validation_accuracy)

In [None]:
# Test accuracy: fraction of test rows the random forest labels correctly.
# Stored as forest_test_accuracy and printed with a random forest label; this
# cell used to reuse the decision tree's name and label for the forest's score.
y_pred_test = clf.predict(test_data[features])
y_true = list(test_data[target])
correctNum = sum(
    1 for predicted, actual in zip(y_pred_test, y_true) if predicted == actual
)
forest_test_accuracy = correctNum / len(y_true)
print("random forest test accuracy: ", forest_test_accuracy)

# Macro-averaged metrics: each class contributes equally to the average.
precision = precision_score(y_true, y_pred_test, average='macro')
print("precision: ", precision)
recall = recall_score(y_true, y_pred_test, average='macro')
print("recall: ", recall)
f1 = f1_score(y_true, y_pred_test, average='macro')
print("f1 score: ", f1)

In [None]:
# All flags off: raw counts with the six-device ard/stm axis labels.
to_norm = large_scale = program_idle = 0
plot_confusion_matrix(
    y_true,
    y_pred_test,
    'matrix_random_forest_baseline.pdf',
    to_norm,
    large_scale,
    program_idle,
)

#### 