In [None]:
import functools
import json
import os
import sys

import numpy as np
import pandas as pd
import torch
from torch import nn

sys.path.append('../')

from dataloader.dataloader import *
from training.training import *
from models.rae import *
from utils.utils import *
from visualizations.visualizations import *
from evaluation.evaluation import *

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("CUDA is available. Training on GPU.")
else:
    print("CUDA is not available. Training on CPU.")

In [None]:
# Relative path of the folder that holds the input CSV loaded further down.
data_folder = "../Data/Data v5"

In [None]:
# `data_folder` comes from the configuration cell above; it is deliberately not
# re-assigned here so the data location has a single source of truth.
dataset_used = 'smoothed_scaled'

# The file name is derived from `dataset_used` so the label and the file that
# is actually read cannot drift apart.
dataset_path = os.path.join(data_folder, f"amari_ue_data_final_v5_{dataset_used}.csv")

# Read, order by device and time, then store the device id as a string
# (the id is compared against string literals in the filters below).
df = (
    pd.read_csv(dataset_path)
    .sort_values(["imeisv", "_time"], ascending=True)
    .assign(imeisv=lambda frame: frame["imeisv"].astype(str))
)

In [None]:
# Time window (both bounds inclusive) from which benign training rows are taken.
benign_data_starting_point = "2024-03-20 14:14:50.19"
benign_data_ending_point = "2024-03-23 16:26:19.00"

# Devices left out of the benign training selection.
ues_excluded_from_benign_training = [
    '8642840401594200',
    '8642840401612300',
    '8642840401624200',
    '3557821101183501',
]

# 1: inside the time window, 2: not an excluded device, 3: labelled 0.
benign_filter_1 = (
    (df['_time'] >= benign_data_starting_point)
    & (df['_time'] <= benign_data_ending_point)
)
benign_filter_2 = ~df['imeisv'].isin(ues_excluded_from_benign_training)
benign_filter_3 = df['label'].eq(0)

benign_data_filter = benign_filter_1 & benign_filter_2 & benign_filter_3

In [None]:
# Benign training set: the rows kept by `benign_data_filter`, ordered by device and time.
benign_data_train = (
    df.loc[benign_data_filter]
    .copy()
    .sort_values(by=['imeisv', '_time'])
)
print(len(benign_data_train))

In [None]:
# Benign test set: rows labelled 0 (benign_filter_3) from the listed devices,
# recorded at or after the start of the test period.
benign_data_test_period_start = "2024-03-24 01:20:00.19"
benign_devices_for_testing = [
    '8609960468879057',
    '8628490433231157',
    '8677660403123800',
]

benign_filter_4 = df['_time'].ge(benign_data_test_period_start)
benign_filter_5 = df['imeisv'].isin(benign_devices_for_testing)
benign_data_filter_test = benign_filter_3 & benign_filter_4 & benign_filter_5

benign_data_test = (
    df.loc[benign_data_filter_test]
    .copy()
    .sort_values(by=['imeisv', '_time'])
)
print(len(benign_data_test))

In [None]:
# Malicious data: rows labelled 1 that fall inside either attack window
# (bounds inclusive), leaving out the devices listed for that window.
attck_1_start = "2024-03-23 21:26:00"
attck_1_end = "2024-03-23 22:23:00"
ues_to_exclude_in_1st_attck = [
    '8628490433231157',
    '8609960480666910',
    '3557821101183501',
]  # disabled entries: '8677660403123800', '8642840401594200'

attck_2_start = "2024-03-23 22:56:00"
attck_2_end = "2024-03-23 23:56:00"
ues_to_exclude_in_2nd_attck = ['8609960480666910', '8642840401612300']

# First attack window.
in_1st_attck_window = df['_time'].between(attck_1_start, attck_1_end)
kept_in_1st_attck = ~df['imeisv'].isin(ues_to_exclude_in_1st_attck)
mal_filter_1 = in_1st_attck_window & kept_in_1st_attck

# Second attack window.
in_2nd_attck_window = df['_time'].between(attck_2_start, attck_2_end)
kept_in_2nd_attck = ~df['imeisv'].isin(ues_to_exclude_in_2nd_attck)
mal_filter_2 = in_2nd_attck_window & kept_in_2nd_attck

mal_filter_3 = df['label'].eq(1)

malicious_data = (
    df.loc[(mal_filter_1 | mal_filter_2) & mal_filter_3]
    .copy()
    .sort_values(by=['imeisv', '_time'])
)
print(len(malicious_data))

In [None]:
# Load the experiment registry: a JSON object keyed by experiment id.
# The context manager closes the file handle as soon as parsing finishes
# (the previous bare open() left it open for the lifetime of the kernel).
with open("../results/experiments_metadata.json") as f:
    exp_metadata = json.load(f)

In [None]:
# Top-level metadata entries copied into the summary table:
# output column name -> key in the experiment record.
_summary_fields = {
    'epochs_trained': 'epochs_trained',
    'results_file': 'results_file',
    'timestamp': 'timestamp',
    'min_train_loss': 'min_train_loss',
    'min_val_loss': 'min_val_loss',
    'min_train_val_gap': 'min_train_val_gap',
    'features': 'feature_columns',
    'rolling_avg': 'rolling_avg',
    'dataset_used': 'dataset_used',
}

# One flat record per experiment: its hyper-parameters first, then the
# summary fields (which win if a hyper-parameter shares the same name).
data_for_df = {}
for exp_id, exp_info in exp_metadata.items():
    flattened_info = dict(exp_info['parameters'])
    flattened_info.update(
        (column, exp_info[source_key])
        for column, source_key in _summary_fields.items()
    )
    data_for_df[exp_id] = flattened_info

# Rows are experiments (indexed by experiment id), columns are the flattened fields.
exp_df = pd.DataFrame.from_dict(data_for_df, orient='index')

In [None]:
# Show the five experiments with the lowest `min_train_val_gap`.
exp_df.sort_values(by='min_train_val_gap').head(5)

In [None]:
# Metadata record of the experiment that the following cells evaluate.
selected_experiment_id = '5bfa52f8-e8c6-4899-963d-3ebd80be60f9'
exp_parameters = exp_metadata[selected_experiment_id]

In [None]:
# Display the metadata record of the selected experiment.
exp_parameters

In [None]:
# History object of the selected experiment; its `train_losses`, `val_losses`
# and `model_weights` attributes are read by the cells below.
# Keep the id in this path in sync with the experiment chosen for `exp_parameters`.
# NOTE(review): the helper name suggests the file is unpickled; only load
# history files from a trusted source.
history_file = '../results/5bfa52f8-e8c6-4899-963d-3ebd80be60f9_history.pkl'
exp_hist = load_history_from_pickle(history_file, device)

### Plot training and validation loss

In [None]:
# Plot the training and validation losses recorded in the loaded history object.
plot_train_val_loss(exp_hist.train_losses, exp_hist.val_losses)

### Plot reconstruction loss as a scatter plot

In [None]:
test_batch_size = 1

# Build one loader for the benign test rows and one for the malicious rows,
# restricted to the feature columns the selected experiment was trained on.
# NOTE(review): 120 and 30 are passed positionally - presumably the window
# length and the stride; confirm against create_test_ds_loaders and against
# exp_parameters['parameters']['window_size'].
benign_test_data_loader, mal_test_data_loader = create_test_ds_loaders(
    benign_data_test, malicious_data, 120, 30,
    features=exp_parameters['feature_columns'],
    batch_size=test_batch_size,
)

In [None]:
# Rebuild the autoencoder with the hyper-parameters stored for the selected
# experiment; input and output width both equal the number of feature columns.
model_hyperparams = exp_parameters['parameters']
model_feature_count = len(exp_parameters['feature_columns'])

rae_model = LSTMAutoencoder(
    input_dim=model_feature_count,
    hidden_dim1=model_hyperparams['hidden_dim1'],
    hidden_dim2=model_hyperparams['hidden_dim2'],
    output_dim=model_feature_count,
    dropout=model_hyperparams['dropout'],
    layer_norm_flag=model_hyperparams['layer_norm_flag'],
)

# Restore the trained weights and move the model to the selected device.
rae_model.load_state_dict(exp_hist.model_weights)
rae_model.to(device)

# The model is only used for inference from here on. Switch it to evaluation
# mode right away so dropout is disabled for every later cell, not just the
# reconstruction cell near the end of the notebook. This is idempotent, so it
# is harmless if `evaluate` also sets the mode itself.
rae_model.eval()

# Same loss family as in training: L1 when the experiment says 'L1Loss',
# MSE for any other value.
if model_hyperparams['loss_function'] == 'L1Loss':
    criterion = nn.L1Loss()
else:
    criterion = nn.MSELoss()

In [None]:
# Run the restored model over both test loaders; the two results are the
# losses for the benign and for the malicious samples respectively.
benign_test_losses, mal_test_losses = evaluate(
    rae_model,
    criterion,
    benign_test_data_loader,
    mal_test_data_loader,
    device,
)

In [None]:
# Scatter plot of the benign versus malicious test losses computed above.
plot_scatter_plot_rec_loss(benign_test_losses, mal_test_losses)

### Plot ROC curve

In [None]:
# ROC quantities and the selected threshold, derived from the two loss sets.
roc_results = calculate_threshold(benign_test_losses, mal_test_losses)
fpr, tpr, thresholds, roc_auc, optimal_threshold = roc_results

In [None]:
# Draw the ROC curve from the values returned by calculate_threshold.
plot_roc_curve(fpr, tpr, thresholds , roc_auc)

### Threshold selection & Inference

In [None]:
# Display the threshold returned by calculate_threshold; it is passed to infer() below.
optimal_threshold

In [None]:
# Classify the test losses against the selected threshold and keep the
# eight metrics that infer() returns.
(
    accuracy, precision, recall, f1,
    tp_rate, tn_rate, fp_rate, fn_rate,
) = infer(benign_test_losses, mal_test_losses, optimal_threshold)

### Original vs. reconstructed time series

In [None]:
imeisv = "8628490433231157"

# All rows of the chosen device in chronological order, restricted to the
# feature columns of the selected experiment.
imeisv_series = (
    df.loc[df['imeisv'] == imeisv]
    .sort_values('_time', ascending=True)[exp_parameters['feature_columns']]
)

window_length = exp_parameters['parameters']['window_size']

# Cut the series into consecutive, non-overlapping windows of `window_length`
# rows; a trailing remainder shorter than one window is dropped.
split_arrays = [
    imeisv_series.iloc[start:start + window_length].values
    for start in range(0, len(imeisv_series) - window_length + 1, window_length)
]
if not split_arrays:
    # Fail with an explicit message instead of an obscure error from the
    # concatenation further down.
    raise ValueError(
        f"Device {imeisv} has {len(imeisv_series)} rows, "
        f"fewer than one window of {window_length} rows."
    )

rae_model.to(device)
rae_model.eval()  # evaluation mode for the reconstruction pass

imeisv_original = []
imeisv_rec = []
with torch.no_grad():  # no gradients are needed for reconstruction
    for ar in split_arrays:
        ar_tensor = torch.from_numpy(ar).to(device).float()
        imeisv_rec.append(rae_model(ar_tensor).to('cpu').numpy())
        imeisv_original.append(ar_tensor.to('cpu').numpy())

# Join all windows with a single concatenation (linear in the number of
# windows, unlike a pairwise reduce) and flatten to 1-D.
# Flattening is row-major: each input window is (rows, features), so with more
# than one feature the values of the different features are interleaved.
imeisv_original = np.concatenate(imeisv_original).flatten()
imeisv_rec = np.concatenate(imeisv_rec).flatten()

In [None]:
plotted_features = exp_parameters['feature_columns']
plotted_feature_count = len(plotted_features)

# `imeisv_original` / `imeisv_rec` are row-major flattened, so with several
# features their values are interleaved. Passing the flat arrays for every
# metric would draw the same mixed series under each metric name. Restore the
# (rows, features) layout and plot each metric from its own column; with a
# single feature this yields exactly the flat arrays again.
# NOTE(review): assumes the model output keeps the feature axis last, like
# its input windows - confirm against LSTMAutoencoder.
original_by_feature = np.asarray(imeisv_original).reshape(-1, plotted_feature_count)
rec_by_feature = np.asarray(imeisv_rec).reshape(-1, plotted_feature_count)

for feature_idx, metric in enumerate(plotted_features):
    plot_original_vs_rec(
        original_by_feature[:, feature_idx],
        rec_by_feature[:, feature_idx],
        imeisv,
        metric,
    )