In [None]:
import os
import random
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import torch
from tqdm import tqdm

# Make the project packages importable from the notebook's directory.
sys.path.append('../')

from dataloader.dataloader import *
from training.training import *
from models.rae import *
from utils.utils import *
from visualizations.visualizations import *
from evaluation.evaluation import *

In [None]:
# Load the measurements and order them per device and time.
data_folder = "../Data/Data v5"
df = pd.read_csv(os.path.join(data_folder, "amari_ue_data_final_v5_no_outliers_scaled.csv"))
df = df.sort_values(["imeisv", "_time"], ascending = True)

benign_data_starting_point = "2024-03-20 14:14:50.19"
benign_data_ending_point = "2024-03-23 16:26:19.00"

# Devices whose rows are kept even when they fall inside the window above.
kept_imeisv = ['8642840401594200', '8642840401612300','8642840401624200','3557821101183501']

filter_1 = df['_time'].between(benign_data_starting_point, benign_data_ending_point)
# Compare as strings: read_csv may infer an integer dtype for the all-digit
# imeisv column, and isin() against string literals would then never match,
# silently dropping the devices that are meant to be kept.
filter_2 = ~df['imeisv'].astype(str).isin(kept_imeisv)

# Boolean selection instead of an in-place drop; removes the same rows
# (every row inside the window that does not belong to a kept device).
df = df[~(filter_1 & filter_2)]

# Split by label (0 -> benign, 1 -> malicious) and report the row counts.
benign_data = df[df['label'] == 0].copy()
benign_data = benign_data.sort_values(['imeisv','_time'])
print(benign_data.shape[0])
malicious_data = df[df['label'] == 1].copy()
malicious_data = malicious_data.sort_values(['imeisv','_time'])
print(malicious_data.shape[0])

In [None]:
# Columns handed to the test loaders as the feature set for the analysis.
feature_columns = [
    'dl_bitrate',
    'ul_bitrate',
    'cell_x_dl_retx',
    'cell_x_dl_tx',
    'cell_x_ul_retx',
    'cell_x_ul_tx',
    'ul_total_bytes_non_incr',
    'dl_total_bytes_non_incr',
]

### Intra vs Inter class variability

In [None]:
def calculate_euclidean_distances(batch1, batch2, num_of_features):
    """Return the per-feature Euclidean (L2) distance between two batches.

    Both inputs are flattened to ``(-1, num_of_features)`` so that each
    column holds every value of one feature; the distance for a feature is
    the L2 norm of the difference between the two corresponding columns.

    Args:
        batch1: Tensor whose element count is a multiple of ``num_of_features``.
        batch2: Tensor to compare against ``batch1``.
        num_of_features: Size of the feature (last) dimension.

    Returns:
        A 1-D CPU tensor of length ``num_of_features`` in the default dtype,
        or ``nan`` (a float) when the flattened shapes differ.
    """
    batch1_reshaped = batch1.view(-1, num_of_features)
    batch2_reshaped = batch2.view(-1, num_of_features)

    if batch1_reshaped.shape != batch2_reshaped.shape:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    # One vectorised norm over the row axis replaces the per-feature Python
    # loop; copying into a zeros tensor keeps the result on the CPU in the
    # default dtype.
    distances = torch.zeros(num_of_features)
    distances.copy_(torch.norm(batch1_reshaped - batch2_reshaped, p=2, dim=0))

    return distances

In [None]:
test_batch_size = 10

# NOTE(review): 120 and 30 are positional; presumably window length and
# step size -- confirm against create_test_ds_loaders.
test_loaders = create_test_ds_loaders(
    benign_data, malicious_data, 120, 30,
    features=feature_columns,
    batch_size=test_batch_size,
)
benign_data_loader, mal_data_loader = test_loaders

In [None]:
benign_data = [*map(lambda x: x[0], benign_data_loader)]
mal_data = [*map(lambda x: x[0], mal_data_loader)]
benign_data_sampled = random.sample(benign_data, len(mal_data))

In [None]:
num_of_features = len(feature_columns)

In [None]:
%%time

A = torch.zeros((len(benign_data), len(benign_data_sampled), num_of_features))

for i, benign_i in enumerate(benign_data):
    for j, benign_j in enumerate(benign_data_sampled):
        if i != j:
            A[i, j] = calculate_euclidean_distances(benign_i, benign_j, num_of_features)

In [None]:
%%time

B = torch.zeros((len(benign_data), len(mal_data), num_of_features))

for i, benign_tensor in enumerate(benign_data):
    for j, mal_tensor in enumerate(mal_data):
        B[i, j] = calculate_euclidean_distances(benign_tensor, mal_tensor, num_of_features)

In [None]:
A.shape, B.shape

In [None]:
matrices_A = [A[:,:,i].numpy().copy() for i in range(num_of_features)]
matrices_B = [B[:,:,i].numpy().copy() for i in range(num_of_features)]

In [None]:
forb_norm_matrices_A = [*map(lambda X: np.linalg.norm(X[:-1,:-1], 'fro'), matrices_A)] 
forb_norm_matrices_B = [*map(lambda X: np.linalg.norm(X[:-1,:-1], 'fro'), matrices_B)] 

In [None]:
# A-based norms come from benign-vs-benign distances (within class); B-based
# norms come from benign-vs-malicious distances, i.e. the between-class
# ("inter class") variability, and are labelled accordingly.
for feature_name, within_norm, between_norm in zip(feature_columns, forb_norm_matrices_A, forb_norm_matrices_B):
    print(f"feature {feature_name}, within class variability:{within_norm:.2f}, inter class variability:{between_norm:.2f}")

### Numerical Analysis of NN outputs

In [None]:
import torch.nn as nn
import torch

In [None]:
benign_test_data_loader, mal_test_data_loader = create_test_ds_loaders(benign_data, malicious_data, 120, 120, ['ul_bitrate'], 32)

In [None]:
benign_ts_data = torch.cat([batch[0] for batch in [*benign_test_data_loader]])
mal_ts_data = torch.cat([batch[0] for batch in [*mal_test_data_loader]], dim=0)

benign_ts_data_norm1 = np.linalg.norm(benign_ts_data, ord=1, axis = 1)
benign_ts_data_norm2 = np.linalg.norm(benign_ts_data, axis = 1)

mal_ts_data_norm1 = np.linalg.norm(mal_ts_data, ord=1, axis = 1)
mal_ts_data_norm1_sampled = np.random.choice(mal_ts_data_norm1, size=benign_ts_data_norm1.shape[0])

mal_ts_data_norm2 = np.linalg.norm(mal_ts_data, axis = 1)
mal_ts_data_norm2_sampled = np.random.choice(mal_ts_data_norm2, size=benign_ts_data_norm2.shape[0])

In [None]:
plot_dist(
    benign_ts_data_norm1,
    mal_ts_data_norm1_sampled,
    'Distribution of Benign & Malicious TS Data Norm1',
    x_axis_range=[None, None],
)

In [None]:
#plot_dist(benign_ts_data_norm2, mal_ts_data_norm2_sampled, 'Distribution of Benign & Malicious TS Data Norm2', x_axis_range = [None, None])

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
print(
    "CUDA is available. Training on GPU."
    if use_gpu
    else "CUDA is not available. Training on CPU."
)

# Autoencoder that is never trained in this notebook; no weights are loaded into it.
random_model_config = dict(
    input_dim=1,
    hidden_dim1=50,
    hidden_dim2=100,
    output_dim=1,
    dropout=0.2,
    layer_norm_flag=False,
)
random_model = LSTMAutoencoder(**random_model_config)

In [None]:
random_model.to(device)
random_model.eval()

# Run every window through the untrained model with gradients disabled and
# keep each output as a flat numpy vector on the CPU.
with torch.no_grad():
    benign_random_model_outputs = [
        random_model(window.view(120, 1).to(device)).view(-1).to('cpu').numpy()
        for window in benign_ts_data
    ]
    mal_random_model_outputs = [
        random_model(window.view(120, 1).to(device)).view(-1).to('cpu').numpy()
        for window in mal_ts_data
    ]

In [None]:
# Stack the per-window outputs once, then take row-wise norms.
benign_random_out = np.array(benign_random_model_outputs)
mal_random_out = np.array(mal_random_model_outputs)

benign_ts_random_model_out_norm1 = np.linalg.norm(benign_random_out, ord=1, axis=1)
mal_ts_random_model_out_norm1 = np.linalg.norm(mal_random_out, ord=1, axis=1)
benign_ts_random_model_out_norm2 = np.linalg.norm(benign_random_out, axis=1)
mal_ts_random_model_out_norm2 = np.linalg.norm(mal_random_out, axis=1)

# Resample the malicious norms (with replacement) to the benign sample size.
mal_ts_random_model_out_norm1_sampled = np.random.choice(
    mal_ts_random_model_out_norm1, size=len(benign_ts_random_model_out_norm1)
)
mal_ts_random_model_out_norm2_sampled = np.random.choice(
    mal_ts_random_model_out_norm2, size=len(benign_ts_random_model_out_norm2)
)

In [None]:
plot_dist(
    benign_ts_random_model_out_norm1, 
    mal_ts_random_model_out_norm1_sampled, 
    'Distribution of Benign & Malicious TS Random Model Output Norm1',
    nbins = 50, 
    x_axis_range = [None, None]
)

In [None]:
plot_dist(
    benign_ts_random_model_out_norm2, 
    mal_ts_random_model_out_norm2_sampled, 
    'Distribution of Benign & Malicious TS Random Model Output Norm2',
    nbins = 50, 
    x_axis_range = [None, None]
)

In [None]:
# Settings and summary metrics of the experiment whose history is loaded below.
exp_parameters = {
    'window_size': 120,
    'batch_size': 32,
    'hidden_dim1': 25,
    'hidden_dim2': 50,
    'dropout': 0.2,
    'layer_norm_flag': False,
    'loss_function': 'L1Loss',
    'lr': 0.001,
    'num_epochs': 52,
    'epochs_trained': 37,
    'results_file': './results/17d829f2-3999-4513-9844-462191858daa_history.pkl',
    'timestamp': '2024-04-08 23:04:20.306661',
    'min_train_loss': 0.085,
    'min_val_loss': 0.1056,
    'min_train_val_gap': -0.0079,
}

# NOTE(review): judging by its name this helper unpickles the file; only load
# history files from a trusted source.
exp_hist = load_history_with_pickle('../results/17d829f2-3999-4513-9844-462191858daa_history.pkl')

# Rebuild the network with the recorded architecture settings and restore its weights.
architecture_keys = ('hidden_dim1', 'hidden_dim2', 'dropout', 'layer_norm_flag')
model = LSTMAutoencoder(
    input_dim=1,
    output_dim=1,
    **{key: exp_parameters[key] for key in architecture_keys},
)
model.load_state_dict(exp_hist.model_weights)
model.to(device)

# Loss matching the recorded setting; anything other than 'L1Loss' selects MSE.
if exp_parameters['loss_function'] == 'L1Loss':
    criterion = nn.L1Loss()
else:
    criterion = nn.MSELoss()

In [None]:
model.to(device)
model.eval()

# Run every window through the restored model with gradients disabled and
# keep each output as a flat numpy vector on the CPU.
with torch.no_grad():
    benign_data_model_outputs = [
        model(window.view(120, 1).to(device)).view(-1).to('cpu').numpy()
        for window in benign_ts_data
    ]
    mal_data_model_outputs = [
        model(window.view(120, 1).to(device)).view(-1).to('cpu').numpy()
        for window in mal_ts_data
    ]

In [None]:
# Stack the per-window outputs once, then take row-wise L1 and L2 norms.
benign_model_out = np.array(benign_data_model_outputs)
mal_model_out = np.array(mal_data_model_outputs)

benign_ts_model_out_norm1 = np.linalg.norm(benign_model_out, ord=1, axis=1)
benign_ts_model_out_norm2 = np.linalg.norm(benign_model_out, axis=1)
mal_ts_model_out_norm1 = np.linalg.norm(mal_model_out, ord=1, axis=1)
mal_ts_model_out_norm2 = np.linalg.norm(mal_model_out, axis=1)

# Resample the malicious norms (with replacement) to the benign sample size.
mal_ts_model_out_norm1_sampled = np.random.choice(
    mal_ts_model_out_norm1, size=len(benign_ts_model_out_norm1)
)
mal_ts_model_out_norm2_sampled = np.random.choice(
    mal_ts_model_out_norm2, size=len(benign_ts_model_out_norm2)
)

In [None]:
plot_dist(
    benign_ts_model_out_norm1, 
    mal_ts_model_out_norm1_sampled, 
    'Distribution of Benign & Malicious TS Trained Model Output Norm1',
    nbins = 50, 
    x_axis_range = [None, None]
)

In [None]:
plot_dist(
    benign_ts_model_out_norm2, 
    mal_ts_model_out_norm2_sampled, 
    'Distribution of Benign & Malicious TS Trained Model Output Norm2',
    nbins = 50, 
    x_axis_range = [None, None]
)