In [None]:
import os

import pandas as pd
import numpy as np

from functools import reduce
from datetime import datetime, timedelta

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Folder holding every CSV this notebook reads or writes. The path is relative,
# so it resolves against the working directory the notebook is run from.
data_folder = "../Data/Data v5"

## Data Parsing

In [None]:
# Input files for the eNB being processed. Exactly one of the two groups below
# should be active; comment out the other group before running the notebook.

# --- classic eNB (active) ---
amari_ue_data_path = os.path.join(data_folder, "nwdaf-classic/amari_ue_data.csv")
# Fixed: this path used to repeat amari_ue_data.csv (copy/paste slip). It now
# follows the same layout as the mini configuration below.
enb_counters_path = os.path.join(data_folder, "nwdaf-classic/enb_counters.csv")
enb_name = "classic"

# --- mini eNB (uncomment these three lines and comment out the classic ones) ---
# amari_ue_data_path = os.path.join(data_folder, "nwdaf-mini/amari_ue_data.csv")
# enb_counters_path = os.path.join(data_folder, "nwdaf-mini/enb_counters.csv")
# enb_name = "mini"

In [None]:
%%time

amari_ue_data_df = pd.read_csv(amari_ue_data_path, skiprows = 3)

amari_ue_data_df.drop(['Unnamed: 0', 'result'], axis = 1, inplace = True)
amari_ue_data_df.dropna(how = 'all', inplace = True)
amari_ue_data_df.drop(
    amari_ue_data_df[
        ~(amari_ue_data_df['table'].astype(str).str.isnumeric())
    ].index, 
    inplace = True
)

amari_ue_data_df['_time'] = pd.to_datetime(amari_ue_data_df['_time'], format = 'ISO8601')
amari_ue_data_df['imeisv'] = amari_ue_data_df['imeisv'].astype(str)

imeisv_dfs = {}
for imeisv, indices in amari_ue_data_df.groupby('imeisv').groups.items():
    imeisv_dfs[imeisv] = (
        amari_ue_data_df.loc[indices].copy()
        .pivot(
            index = ['_time','imeisv'], 
            columns = '_field', 
            values = '_value'
        )
        .reset_index()
    )
    
amari_ue_data_ds = pd.concat(list(imeisv_dfs.values()), axis = 0)

In [None]:
# Writes the pivoted table for the eNB selected above to
# amari_ue_data_<enb_name>_tabular.csv. The Data Merging section reads files with
# exactly these names for "classic" and "mini", so this line appears to be the
# step that produces them: enable it once per eNB configuration.
#amari_ue_data_ds.to_csv(os.path.join(data_folder,f'./amari_ue_data_{enb_name}_tabular.csv'), index = False)

## Data Merging

In [None]:
# Load the wide (one row per timestamp and device) tables of both eNBs.
classic_ds = pd.read_csv(os.path.join(data_folder, 'amari_ue_data_classic_tabular.csv'))
mini_ds = pd.read_csv(os.path.join(data_folder, 'amari_ue_data_mini_tabular.csv'))

In [None]:
# Stack both tables row-wise and renumber the rows 0..n-1. Columns present in
# only one of the tables are kept and hold NaN for the other table's rows.
merged_ds = pd.concat([classic_ds, mini_ds], axis = 0, ignore_index = True)

In [None]:
# Metric names reported per cell. The exported columns are named
# cell_<n>_<metric>, so both column lists are generated from this one list
# (order matters: later cells pair the two lists element by element).
_cell_metric_suffixes = [
    'cell_id', 'cqi', 'dl_bitrate', 'dl_err', 'dl_mcs', 'dl_retx', 'dl_tx',
    'epre', 'initial_ta', 'p_ue', 'pusch_snr', 'ri',
    'turbo_decoder_avg', 'turbo_decoder_max', 'turbo_decoder_min',
    'ul_bitrate', 'ul_err', 'ul_mcs', 'ul_n_layer', 'ul_path_loss',
    'ul_phr', 'ul_rank', 'ul_retx', 'ul_tx',
]

cell_1_cols = [f'cell_1_{suffix}' for suffix in _cell_metric_suffixes]
cell_2_cols = [f'cell_2_{suffix}' for suffix in _cell_metric_suffixes]

In [None]:
def _metric_name(column, cell_number):
    """Return the part of `column` that follows its first `cell_number` token.

    The column name is split on '_' and everything after the first token equal
    to `cell_number` is joined back together, e.g. 'cell_1_dl_tx' -> 'dl_tx'.
    """
    tokens = column.split('_')
    return '_'.join(tokens[tokens.index(cell_number) + 1:])


cell_1_metrics = [_metric_name(column, '1') for column in cell_1_cols]
cell_2_metrics = [_metric_name(column, '2') for column in cell_2_cols]

# Sanity check: print the metrics that exist for only one of the two cells.
# Two empty sets mean both cells report the same metrics.
print(set(cell_1_metrics).difference(cell_2_metrics))
print(set(cell_2_metrics).difference(cell_1_metrics))

In [None]:
cell_metrics_dict = {cell_1_metric: cell_2_metric for cell_1_metric, cell_2_metric in  zip(cell_1_cols, cell_2_cols)}

In [None]:
i = 0
for col1, col2 in cell_metrics_dict.items():
    merged_ds[f'cell_x_{cell_1_metrics[i]}'] = np.where(pd.isnull(merged_ds[col1]), merged_ds[col2], merged_ds[col1])
    merged_ds.drop(col1, axis = 1, inplace = True)
    merged_ds.drop(col2, axis = 1, inplace = True)
    i+= 1

In [None]:
# Check whether the 'bearer_1_ipv6' column exists in each source table; the
# cell displays a (mini, classic) tuple of booleans.
'bearer_1_ipv6' in list(mini_ds.columns), 'bearer_1_ipv6' in list(classic_ds.columns)

In [None]:
# Save the merged table (without the DataFrame index) for the labeling step.
merged_ds.to_csv(os.path.join(data_folder, "amari_ue_data_merged_v5.csv"), index= False)

### Labeling

In [None]:
# Reload the merged table from disk so the labeling section can be run on its own.
merged_ds = pd.read_csv(os.path.join(data_folder, "amari_ue_data_merged_v5.csv"))

In [None]:
# Per-device summary table; the labeling below reads its ROLE and IMEISV columns.
summary_df = pd.read_csv(os.path.join(data_folder, "summary_df.csv"))

In [None]:
# Start and end of the two attack windows used for labeling.
# NOTE(review): the strings carry no timezone information - confirm they are
# expressed in the same timezone as the `_time` column.
attck_1_start = "2024-03-23 21:26:00"
attck_1_end = "2024-03-23 22:23:00"

attck_2_start = "2024-03-23 22:56:00"
attck_2_end = "2024-03-23 23:56:00"

In [None]:
# The CSV round trip loses dtypes: keep the device identifier as a string and
# parse the timestamps again.
merged_ds['imeisv'] = merged_ds['imeisv'].astype(str)
merged_ds['_time'] = pd.to_datetime(merged_ds['_time'], format = 'ISO8601')

In [None]:
# IMEISVs of the devices whose ROLE in the summary table is "MALICIOUS USER".
malicious_imeisvs = (
    summary_df.loc[summary_df['ROLE'] == "MALICIOUS USER", 'IMEISV']
    .astype(str)
    .tolist()
)
is_malicious_ue = merged_ds['imeisv'].isin(malicious_imeisvs)

# Rows whose timestamp falls inside each attack window (both endpoints included).
in_attack_1_window = merged_ds['_time'].between(attck_1_start, attck_1_end, inclusive='both')
in_attack_2_window = merged_ds['_time'].between(attck_2_start, attck_2_end, inclusive='both')

# A row belongs to an attack when it is inside the window AND was produced by a
# malicious device.
attack_1_filter = in_attack_1_window & is_malicious_ue
attack_2_filter = in_attack_2_window & is_malicious_ue

In [None]:
# Number of rows selected by the first attack filter (each True counts as 1).
attack_1_filter.sum()

In [None]:
# Number of rows selected by the second attack filter.
attack_2_filter.sum()

In [None]:
merged_ds['label'] = np.where((attack_1_filter | attack_2_filter), 1, 0)

In [None]:
merged_ds.drop(merged_ds[
    merged_ds['imeisv'] == '8642840402500000'
].index, inplace = True)

In [None]:
merged_ds.to_csv(os.path.join(data_folder, "amari_ue_data_merged_labeled_v5.csv"), index= False)

## Fix interruption of periodicity

In [None]:
# Reload the labeled dataset from disk so this section can be run on its own.
merged_ds = pd.read_csv(os.path.join(data_folder, "amari_ue_data_merged_labeled_v5.csv"))

In [None]:
# Keep the device identifier as a string after the CSV round trip.
merged_ds['imeisv'] = merged_ds['imeisv'].astype(str)

In [None]:
# Parse the timestamp column again (it is plain text after the CSV round trip).
merged_ds['_time'] = pd.to_datetime(merged_ds['_time'], format = "ISO8601")

In [None]:
# Step between two generated timestamps when gaps in a device's samples are filled.
periodicity = timedelta(seconds=5)

In [None]:
# Display the value for a quick visual check.
periodicity

In [None]:
# Order rows by device and then chronologically, so that consecutive rows of
# the same device are adjacent. The shift/diff steps below rely on this order.
merged_ds_sorted = merged_ds.sort_values(['imeisv','_time'], ascending = True)

In [None]:
merged_ds_sorted['time_diff'] = merged_ds_sorted['_time'].diff().dt.total_seconds()

In [None]:
merged_ds_sorted['prev_imeisv'] = merged_ds_sorted['imeisv'].shift(1)

In [None]:
merged_ds_sorted['prev_imeisv'].fillna('', inplace = True)
merged_ds_sorted['prev_imeisv'] = merged_ds_sorted['prev_imeisv'].astype(str)

In [None]:
merged_ds_sorted['prev_time'] = merged_ds_sorted['_time'].shift(1)

In [None]:
periodicity_breaks = merged_ds_sorted[
   (merged_ds_sorted['time_diff'] >= 10)
&  ~ (merged_ds_sorted['imeisv'] != merged_ds_sorted['prev_imeisv'])
].copy().sort_values(['imeisv','_time'], ascending = True)

In [None]:
periodicity_breaks['num_of_periods'] = periodicity_breaks['time_diff']/5

In [None]:
periodicity_breaks['date_range'] = periodicity_breaks.apply(
    lambda row: pd.date_range(
        start=row['prev_time'], 
        periods=row['num_of_periods'], 
        freq=periodicity
    ),
    axis = 1
)

In [None]:
# Measurement columns that are blanked (set to NaN) on the placeholder rows
# generated for missing samples in the cells that follow.
# Note: `feature_columns` is assigned a different, shorter list further down in
# this notebook, so this definition only applies to the cells in between.
feature_columns = [
       '5g_tmsi', 'amf_ue_id', 'bearer_0_apn',
       'bearer_0_dl_total_bytes', 'bearer_0_ip', 'bearer_0_pdu_session_id',
       'bearer_0_qos_flow_id', 'bearer_0_sst', 'bearer_0_ul_total_bytes',
       'dl_bitrate', 'ran_id', 'ran_plmn', 'ran_ue_id', 'registered', 'rnti',
       't3512', 'tac', 'tac_plmn', 'ue_aggregate_max_bitrate_dl',
       'ue_aggregate_max_bitrate_ul', 'ul_bitrate', 'bearer_1_apn',
       'bearer_1_dl_total_bytes', 'bearer_1_ip', 'bearer_1_ipv6',
       'bearer_1_pdu_session_id', 'bearer_1_qos_flow_id', 'bearer_1_sst',
       'bearer_1_ul_total_bytes', 'bearer_0_ipv6', 'cell_x_cell_id',
       'cell_x_cqi', 'cell_x_dl_bitrate', 'cell_x_dl_err', 'cell_x_dl_mcs',
       'cell_x_dl_retx', 'cell_x_dl_tx', 'cell_x_epre', 'cell_x_initial_ta',
       'cell_x_p_ue', 'cell_x_pusch_snr', 'cell_x_ri',
       'cell_x_turbo_decoder_avg', 'cell_x_turbo_decoder_max',
       'cell_x_turbo_decoder_min', 'cell_x_ul_bitrate', 'cell_x_ul_err',
       'cell_x_ul_mcs', 'cell_x_ul_n_layer', 'cell_x_ul_path_loss',
       'cell_x_ul_phr', 'cell_x_ul_rank', 'cell_x_ul_retx', 'cell_x_ul_tx'
]

In [None]:
periodicity_breaks_exploded = periodicity_breaks.explode('date_range')

In [None]:
periodicity_breaks_exploded.loc[
    periodicity_breaks_exploded.index.duplicated(keep='first'),
    feature_columns
] = np.NaN

In [None]:
rows_to_insert = periodicity_breaks_exploded[
    periodicity_breaks_exploded.index.duplicated(keep='first')
].copy()

In [None]:
rows_to_insert['_time'] = rows_to_insert['date_range']

In [None]:
merged_ds_expanded = pd.concat(
    [merged_ds_sorted, rows_to_insert], ignore_index=False
).sort_values('_time', ascending = True)

In [None]:
merged_ds_expanded['prev_imeisv'].fillna('', inplace = True)
merged_ds_expanded['prev_imeisv'] = merged_ds_expanded['prev_imeisv'].astype(str)

#### Some Data Preprocessing

In [None]:
merged_ds_expanded = merged_ds_expanded.sort_values(['imeisv','_time'], ascending = True)

In [None]:
merged_ds_expanded['bearer_0_ul_total_bytes_non_incr'] = merged_ds_expanded['bearer_0_ul_total_bytes'].diff()
merged_ds_expanded['bearer_1_ul_total_bytes_non_incr'] = merged_ds_expanded['bearer_1_ul_total_bytes'].diff()

merged_ds_expanded['bearer_0_dl_total_bytes_non_incr'] = merged_ds_expanded['bearer_0_dl_total_bytes'].diff()
merged_ds_expanded['bearer_1_dl_total_bytes_non_incr'] = merged_ds_expanded['bearer_1_dl_total_bytes'].diff()

In [None]:
merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_0_ul_total_bytes_non_incr'
] = merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_0_ul_total_bytes'
]

merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_1_ul_total_bytes_non_incr'
] = merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_1_ul_total_bytes'
]

merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_0_dl_total_bytes_non_incr'
] = merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_0_dl_total_bytes'
]

merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_1_dl_total_bytes_non_incr'
] = merged_ds_expanded.loc[
    (merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']),
    'bearer_1_dl_total_bytes'
]

In [None]:
merged_ds_expanded.drop(merged_ds_expanded[
    merged_ds_expanded['imeisv'] != merged_ds_expanded['prev_imeisv']
].index, axis = 0, inplace = True)

In [None]:
# Treat missing differences as 0 bytes. Each column is reassigned instead of
# calling fillna(..., inplace=True) on the column selection: that chained
# in-place form is deprecated in pandas and does not update the frame once
# Copy-on-Write is enabled.
for non_incr_col in (
    'bearer_0_ul_total_bytes_non_incr',
    'bearer_1_ul_total_bytes_non_incr',
    'bearer_0_dl_total_bytes_non_incr',
    'bearer_1_dl_total_bytes_non_incr',
):
    merged_ds_expanded[non_incr_col] = merged_ds_expanded[non_incr_col].fillna(0.0)

# One value per direction: use bearer 0, falling back to bearer 1 on rows where
# the bearer 0 difference is exactly 0.
merged_ds_expanded['ul_total_bytes_non_incr'] = np.where(
    merged_ds_expanded['bearer_0_ul_total_bytes_non_incr'] == 0.0,
    merged_ds_expanded['bearer_1_ul_total_bytes_non_incr'],
    merged_ds_expanded['bearer_0_ul_total_bytes_non_incr']
)
merged_ds_expanded['dl_total_bytes_non_incr'] = np.where(
    merged_ds_expanded['bearer_0_dl_total_bytes_non_incr'] == 0.0,
    merged_ds_expanded['bearer_1_dl_total_bytes_non_incr'],
    merged_ds_expanded['bearer_0_dl_total_bytes_non_incr']
)

In [None]:
# Spot-check one device: raw downlink counter next to its per-row difference.
inspected_imeisv = '8628490433231157'
merged_ds_expanded.loc[
    merged_ds_expanded['imeisv'] == inspected_imeisv,
    ['_time', 'imeisv', 'prev_imeisv', 'bearer_0_dl_total_bytes', 'bearer_0_dl_total_bytes_non_incr'],
]

In [None]:
# Reduced feature set: bitrates, per-cell (re)transmission counters and the two
# per-row byte differences computed above.
# Note: this rebinds `feature_columns`, which earlier in the notebook held the
# full list of columns blanked on placeholder rows.
feature_columns = [
    'dl_bitrate','ul_bitrate', 
    'cell_x_dl_retx', 'cell_x_dl_tx',
    'cell_x_ul_retx', 'cell_x_ul_tx',
    'ul_total_bytes_non_incr', 'dl_total_bytes_non_incr'
    ]

# Identifier columns followed by the features.
# NOTE(review): `store_columns` is not referenced in the cells shown here; the
# final CSV is written with every column of the frame - confirm whether the
# export was meant to be restricted to these columns.
store_columns = ['_time', 'imeisv'] + feature_columns

In [None]:
def group_consecutive_numbers(numbers):
    """Split a sequence of integers into runs of consecutive values.

    A value extends the current run only when it is exactly one greater than
    the value before it; otherwise it starts a new run. The input order is
    preserved and the input is not sorted first.

    Parameters
    ----------
    numbers : iterable of int
        Values to group, e.g. ``[1, 2, 3, 7, 8, 10]``.

    Returns
    -------
    list of list of int
        The runs in input order, e.g. ``[[1, 2, 3], [7, 8], [10]]``.
        An empty input yields an empty list (it used to raise ``IndexError``).
    """
    grouped = []
    current_group = []
    for number in numbers:
        if current_group and number == current_group[-1] + 1:
            current_group.append(number)
        else:
            # Close the previous run (if any) before starting a new one.
            if current_group:
                grouped.append(current_group)
            current_group = [number]

    # Flush the last run; nothing to flush when the input was empty.
    if current_group:
        grouped.append(current_group)

    return grouped

In [None]:
merged_ds_expanded.reset_index(inplace = True)

In [None]:
nan_indices = list(merged_ds_expanded[merged_ds_expanded['ul_bitrate'].isna()].index)
consecutive_nan_groups = group_consecutive_numbers(nan_indices)

for indices_group in consecutive_nan_groups:
    group_len = len(indices_group)
    target_indices = [ind - group_len for ind in indices_group]
    merged_ds_expanded.loc[indices_group,'ul_bitrate'] = merged_ds_expanded.loc[target_indices,'ul_bitrate'].values

In [None]:
merged_ds_expanded.to_csv(os.path.join(data_folder, "amari_ue_data_final_v5.csv"), index = False)