In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

import os
from hampel import hampel
import sys
sys.path.append('../')

from dataloader.dataloader import *
from training.training import *
from utils.utils import *
from visualizations.visualizations import *
from evaluation.evaluation import *

In [None]:
# Folder holding the input CSV; the CSVs written further down go here as well.
data_folder = "../Data/Data v5"

# Read the input CSV and order the rows by imeisv, then by _time within each
# imeisv (sort_values sorts ascending by default).
df = pd.read_csv(
    os.path.join(data_folder, "amari_ue_data_final_v5.csv")
).sort_values(by=["imeisv", "_time"])

In [None]:
# Columns treated as features by the filtering and scaling steps below.
feature_columns = [
    'dl_bitrate',
    'ul_bitrate',
    'cell_x_dl_retx',
    'cell_x_dl_tx',
    'cell_x_ul_retx',
    'cell_x_ul_tx',
    'ul_total_bytes_non_incr',
    'dl_total_bytes_non_incr',
]

# Identifier columns followed by every feature column.
store_columns = ['_time', 'imeisv', *feature_columns]

### Remove Outliers using Hampel filter

In [None]:
%%time

# Hampel filter settings, passed unchanged to every hampel() call below.
window_size = 10
n_sigma = 3.0

# Columns kept in each per-imeisv frame: identifiers, features, and the label.
selected_columns = ['_time', 'imeisv'] + feature_columns + ['label']

# Maps str(imeisv) -> that imeisv's frame with each feature column replaced by
# the filter's `filtered_data` output.
imeisv_df_no_outliers = {}

for imeisv, group in df.groupby('imeisv'):
    # Each feature column is filtered independently of the others.
    filtered_columns = {
        col: hampel(
            group[col], window_size=window_size, n_sigma=n_sigma
        ).filtered_data.values
        for col in feature_columns
    }
    # assign() returns a new frame, so `df` itself is left untouched.
    imeisv_df_no_outliers[str(imeisv)] = group[selected_columns].assign(**filtered_columns)

In [None]:
# Stack the per-imeisv frames row-wise into one table, in dict insertion order.
final_df = pd.concat([frame for frame in imeisv_df_no_outliers.values()], axis=0)

In [None]:
# Write the combined table into the data folder; the row index is not written.
no_outliers_csv_path = os.path.join(data_folder, "amari_ue_data_final_v5_no_outliers.csv")
final_df.to_csv(no_outliers_csv_path, index=False)

### Scaling TS (per imeisv)

In [None]:
# Standardize the feature columns of each imeisv with its own StandardScaler.
#
# Each frame is copied before scaling so the frames stored in
# `imeisv_df_no_outliers` keep their unscaled values; other cells read that
# dict and would otherwise silently see scaled data. The loop variable is
# deliberately not called `df`, which is the name of the raw input frame.
imeisv_df_no_outliers_scaled = {}

for imeisv, unscaled_df in imeisv_df_no_outliers.items():
    scaled_df = unscaled_df.copy()

    # A fresh scaler per imeisv: statistics are never shared across imeisv values.
    scaler = StandardScaler()
    scaled_df[feature_columns] = scaler.fit_transform(scaled_df[feature_columns])

    imeisv_df_no_outliers_scaled[imeisv] = scaled_df

In [None]:
# Combine the per-imeisv scaled frames row-wise into a single table.
scaled_frames = [frame for frame in imeisv_df_no_outliers_scaled.values()]
final_df = pd.concat(scaled_frames)

In [None]:
# Write the scaled table into the data folder; the row index is not written.
scaled_csv_path = os.path.join(data_folder, "amari_ue_data_final_v5_no_outliers_scaled.csv")
final_df.to_csv(scaled_csv_path, index=False)

### Scaling TS (benign and malicious rows separately, per imeisv)

In [None]:
# Standardize the feature columns of each imeisv separately for rows with
# label == 0 (benign) and label == 1 (malicious), each with its own scaler.
#
# StandardScaler.fit_transform raises on zero rows, so a label with no rows
# for a given imeisv is passed through as an empty frame instead of being
# scaled. The loop variable is deliberately not called `df`, which is the name
# of the raw input frame.
imeisv_df_no_outliers_scaled_sep = {}

for imeisv, clean_df in imeisv_df_no_outliers.items():
    scaled_parts = []

    for label_value in (0, 1):
        # .copy() so the assignment below never writes into `clean_df`.
        part_df = clean_df[clean_df['label'] == label_value].copy()

        if part_df.shape[0] > 0:
            part_scaler = StandardScaler()
            part_df[feature_columns] = part_scaler.fit_transform(part_df[feature_columns])

        scaled_parts.append(part_df)

    # Re-interleave the two label groups back into time order.
    imeisv_df_no_outliers_scaled_sep[imeisv] = pd.concat(
        scaled_parts, axis = 0
        ).sort_values(['imeisv','_time'], ascending= True)

In [None]:
# Combine the per-imeisv, per-label scaled frames row-wise into a single table.
scaled_sep_frames = [frame for frame in imeisv_df_no_outliers_scaled_sep.values()]
final_df = pd.concat(scaled_sep_frames)

In [None]:
# Write the per-label scaled table into the data folder; the row index is not written.
scaled_sep_csv_path = os.path.join(data_folder, "amari_ue_data_final_v5_no_outliers_scaled_sep.csv")
final_df.to_csv(scaled_sep_csv_path, index=False)