In [1]:
import pandas as pd


import os

import sys
sys.path.append('../')

import torch.nn as nn
import torch

from dataloader.dataloader import *
from training.training import *
from models.rae import *
from utils.utils import *
from visualizations.visualizations import *
from evaluation.evaluation import *

In [2]:
# Select the compute device once: the GPU when CUDA is usable, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "CUDA is available. Training on GPU."
    if device.type == "cuda"
    else "CUDA is not available. Training on CPU."
)

CUDA is not available. Training on CPU.


In [4]:
# Load the smoothed/scaled UE export and order it per device, then by time.
data_folder = "../Data/Data v5"
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. That is the remedy pandas recommends for the
# mixed-type DtypeWarning, which is most likely the warning this read was
# emitting (NOTE(review): confirm against the full warning text).
df = pd.read_csv(
    os.path.join(data_folder, "amari_ue_data_final_v5_smoothed_scaled.csv"),
    low_memory=False,
)
df = df.sort_values(["imeisv", "_time"], ascending=True)
# The device filters further down compare imeisv against string literals.
df['imeisv'] = df['imeisv'].astype(str)

  df = pd.read_csv(os.path.join(data_folder, "amari_ue_data_final_v5_smoothed_scaled.csv"))


In [5]:
# Columns fed to the model; len(feature_columns) also sets the autoencoder's
# input and output width further down.
feature_columns = [
    'dl_bitrate',
    'ul_bitrate',
    'cell_x_dl_retx',
    'cell_x_dl_tx',
    'cell_x_ul_retx',
    'cell_x_ul_tx',
    'ul_total_bytes_non_incr',
    'dl_total_bytes_non_incr',
]

# Smaller feature sets kept for reference (disabled):
#   ['dl_bitrate', 'ul_bitrate', 'ul_total_bytes_non_incr', 'dl_total_bytes_non_incr']
#   ['ul_bitrate']

In [6]:
# Inclusive time window from which the benign training rows are drawn.
# _time is compared against these bounds as-is (the bounds are plain strings).
benign_data_starting_point = "2024-03-20 14:14:50.19"
benign_data_ending_point = "2024-03-23 16:26:19.00"

# Row masks over df: inside the window, not one of the listed devices, label 0.
benign_filter_1 = (
    (df['_time'] >= benign_data_starting_point)
    & (df['_time'] <= benign_data_ending_point)
)
benign_filter_2 = ~df['imeisv'].isin([
    '8642840401594200',
    '8642840401612300',
    '8642840401624200',
    '3557821101183501',
])
benign_filter_3 = df['label'].eq(0)

# All three conditions must hold for a row to enter the benign training split.
benign_data_filter = benign_filter_1 & benign_filter_2 & benign_filter_3

In [7]:
# Benign training split: rows selected by benign_data_filter, ordered per
# device and then by time. The row count is printed as a sanity check.
benign_data_train = (
    df.loc[benign_data_filter]
    .copy()
    .sort_values(['imeisv', '_time'])
)
print(len(benign_data_train))

260051


In [8]:
# Benign test split: label-0 rows from the listed devices, taken from
# benign_data_test_period_start onwards.
benign_data_test_period_start = "2024-03-24 01:20:00.19"
benign_devices_for_testing = [
    '8609960468879057',
    '8628490433231157',
    '8677660403123800',
]

benign_filter_4 = df['_time'].ge(benign_data_test_period_start)
benign_filter_5 = df['imeisv'].isin(benign_devices_for_testing)
# benign_filter_3 (label == 0) is reused from the training-filter cell.
benign_data_filter_test = benign_filter_3 & benign_filter_4 & benign_filter_5

benign_data_test = (
    df.loc[benign_data_filter_test]
    .copy()
    .sort_values(['imeisv', '_time'])
)
print(len(benign_data_test))

90102


In [9]:
# Malicious split: label-1 rows that fall inside either attack window.
# Each window has its own list of UEs that are left out of it.
attck_1_start = "2024-03-23 21:26:00"
attck_1_end = "2024-03-23 22:23:00"
ues_to_exclude_in_1st_attck = [
    '8628490433231157',
    '8609960480666910',
    '3557821101183501',
]  # '8677660403123800' '8642840401594200'

attck_2_start = "2024-03-23 22:56:00"
attck_2_end = "2024-03-23 23:56:00"
ues_to_exclude_in_2nd_attck = [
    '8609960480666910',
    '8642840401612300',
]

# Window masks use inclusive bounds on _time and drop the excluded UEs.
mal_filter_1 = (
    (df['_time'] >= attck_1_start)
    & (df['_time'] <= attck_1_end)
    & ~df['imeisv'].isin(ues_to_exclude_in_1st_attck)
)

mal_filter_2 = (
    (df['_time'] >= attck_2_start)
    & (df['_time'] <= attck_2_end)
    & ~df['imeisv'].isin(ues_to_exclude_in_2nd_attck)
)

mal_filter_3 = df['label'].eq(1)

# Keep a row when it is in window 1 or window 2, and is labelled 1.
malicious_data = (
    df.loc[(mal_filter_1 | mal_filter_2) & mal_filter_3]
    .copy()
    .sort_values(['imeisv', '_time'])
)
print(len(malicious_data))

10971


In [None]:
# Hyper-parameter record; the cells visible in this notebook read only
# params['parameters'] from it.
# NOTE(review): 'feature_columns' below lists only 'ul_bitrate', while the
# notebook's feature_columns has eight entries -- confirm that this record
# belongs to the configuration being trained here.
params = {
    'parameters': {
        'window_size': 120,
        'step_size': 40,
        'batch_size': 32,
        'hidden_dim1': 50,
        'hidden_dim2': 100,
        'dropout': 0.2,
        'layer_norm_flag': False,
        'loss_function': 'L1Loss',
        'lr': 0.001,
        'num_epochs': 52,
    },
    'min_train_loss': 0.2222,
    'min_val_loss': 0.348,
    'min_train_val_gap': 0.1235,
    'epochs_trained': 44,
    'results_file': '../results/5bfa52f8-e8c6-4899-963d-3ebd80be60f9_history.pkl',
    'timestamp': '2024-04-16 00:52:07.473140',
    'rolling_avg': False,
    'feature_columns': ['ul_bitrate'],
    'dataset_used': 'no_outliers_scaled',
}

In [None]:
# Build the three loaders used during training from the benign training split
# and the malicious split. Windowing and batching values come from
# params['parameters'].
(
    train_data_loader,
    val_data_loader,
    mal_data_loader,
) = create_ds_loader(
    benign_data_train,
    malicious_data,
    params['parameters']['window_size'],
    params['parameters']['step_size'],
    feature_columns,
    params['parameters']['batch_size'],
)

## Recurrent Autoencoder

In [None]:
# Instantiate the LSTM autoencoder. Input and output widths both equal the
# number of feature columns; layer sizes and dropout come from params.
rae_model = LSTMAutoencoder(
    input_dim=len(feature_columns),
    output_dim=len(feature_columns),
    hidden_dim1=params['parameters']['hidden_dim1'],
    hidden_dim2=params['parameters']['hidden_dim2'],
    dropout=params['parameters']['dropout'],
    layer_norm_flag=params['parameters']['layer_norm_flag'],
)

rae_model.to(device)

# Early-stopping helper and the loss used for both training and evaluation.
early_stopping = EarlyStopping(patience=7, min_delta=0.)
criterion = nn.L1Loss()


In [None]:
# Train the autoencoder; the returned object is kept as history for plotting.
# NOTE(review): num_epochs is fixed at 64 here (params records 52), and
# early_stopping is None even though an EarlyStopping instance is created in
# the model cell -- confirm both are intentional.
history = rae_model.train_model(
    train_data_loader=train_data_loader,
    val_data_loader=val_data_loader,
    mal_data_loader=mal_data_loader,
    criterion=criterion,
    lr=0.001,
    num_epochs=64,
    early_stopping=None,
    device=device,
)

In [None]:
# Plot the training and validation losses stored on the training history.
plot_train_val_loss(history.train_losses, history.val_losses)

In [None]:
# Build the evaluation loaders from the benign test split and the malicious
# split, with one item per batch.
test_batch_size = 1
benign_test_data_loader, mal_test_data_loader = create_test_ds_loaders(
    benign_data_test,
    malicious_data,
    # Previously a hard-coded 120. Reading it from params keeps the test
    # windows in step with the training loaders (presumably this positional
    # slot is the window size, as in create_ds_loader -- TODO confirm).
    params['parameters']['window_size'],
    # Kept at 10 as before, whereas the training loaders use
    # params['parameters']['step_size'].
    10,
    features=feature_columns,
    batch_size=test_batch_size,
)

In [None]:
# Run the trained model over the benign and malicious test loaders with the
# training criterion; evaluate() hands back one result per loader
# (presumably per-item reconstruction losses -- TODO confirm).
benign_test_losses, mal_test_losses = evaluate(rae_model, criterion, benign_test_data_loader, mal_test_data_loader, device)
# Disabled alternative: evaluate on the validation / training-time malicious loaders instead.
#benign_test_losses, mal_test_losses = evaluate(rae_model, criterion, val_data_loader, mal_data_loader, device)

In [None]:
# Scatter-plot the benign and malicious test losses to compare the two groups.
plot_scatter_plot_rec_loss(benign_test_losses, mal_test_losses)

In [None]:
# Derive the ROC quantities and a decision threshold from the two sets of test losses.
fpr, tpr, thresholds, roc_auc, optimal_threshold = calculate_threshold(benign_test_losses, mal_test_losses)

In [None]:
# Plot the ROC curve from the values computed by calculate_threshold.
plot_roc_curve(fpr, tpr, thresholds , roc_auc)

In [None]:
# Display the threshold returned by calculate_threshold.
optimal_threshold

In [None]:
# Classify the test losses against optimal_threshold and collect the resulting metrics.
# NOTE(review): the threshold was derived from these same test losses, so the
# metrics may be optimistic -- consider choosing it on a separate split.
accuracy, precision, recall, f1, tp_rate, tn_rate, fp_rate, fn_rate = infer(benign_test_losses, mal_test_losses, optimal_threshold)