In [1]:
pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from pandas import json_normalize
import json

Converting the Data from the JSON Files into a Dataframe

In [3]:
# to flatten all nested levels, takes in data and outputs to flattened_data list
def flatten(data, flattened_data):
    if isinstance(data, dict):
        for key, value in data.items():
            flatten( (key, value), flattened_data)
    # value is a nested dictionary
    elif isinstance(data[1], dict):
        for key, value in data[1].items():
            flatten( (key, value), flattened_data)
    # value is a nested list
    elif isinstance(data[1], list):
        for item in data[1]:
            flatten(item, flattened_data)
    # flattened key and value
    else:
        length = len(flattened_data)
        key = data[0]
        value = data[1]
        if len(flattened_data[length - 1]) == 0:
            flattened_data[length - 1] = dict()
        flattened_data[length - 1][key] = value

In [4]:
# opens the file for reading
with open('clean1.json', 'r') as file:
    clean1_data = json.load(file)

clean1_flattened_data = []

for item in clean1_data:
    clean1_flattened_data.append([])
    flatten(data = item, flattened_data = clean1_flattened_data)

clean1_df = pd.DataFrame(clean1_flattened_data)
clean1_df.head(30)
clean1_df.shape

# add label 
clean1_df['interference'] = [False] * len(clean1_df)

In [5]:
# repeat with jammer data

with open('jammer1.json', 'r') as file:
    jammer1_data = json.load(file)

jammer1_flattened_data = []

for item in jammer1_data:
    jammer1_flattened_data.append([])
    flatten(data = item, flattened_data = jammer1_flattened_data)

jammer1_df = pd.DataFrame(jammer1_flattened_data)
jammer1_df.head(30)
jammer1_df.shape

jammer1_df['interference'] = [True] * len(jammer1_df)

In [6]:
# join clean and jammer data

combined_df = pd.merge(clean1_df, jammer1_df, how = 'outer')
combined_df.shape

(2283, 36)

In [7]:
combined_df.head(30)

Unnamed: 0,type,timestamp,carrier_id,cell_id,event_name,pci,cell_identity,sib9_home_enb_name,nof_rach,ue_rnti,...,qci,dl_total_bytes,ul_total_bytes,dl_latency,ul_latency,dl_buffered_bytes,ul_buffered_bytes,mme_ue_s1ap_id,enb_ue_s1ap_id,interference
0,event,1698340000.0,0,1.0,sector_start,1.0,1.0,,,,...,,,,,,,,,,True
1,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True
2,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True
3,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True
4,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True
5,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True
6,event,1698340000.0,0,1.0,s1_context_create,,,,,,...,,,,,,,,5.0,1.0,True
7,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True
8,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True
9,event,1698340000.0,0,1.0,rrc_log,,,,,,...,,,,,,,,,,True


Data Preperation: Removing Features with No Predictive Value and Handling NaN Values

In [8]:
# data handling
import numpy as np

# making 'cell_identity' numerical

combined_df['cell_identity'] = pd.to_numeric(combined_df['cell_identity'], errors='coerce')

# finds columns with missing values
nan_count = np.sum(combined_df.isnull(), axis = 0)
nan_detected = nan_count > 0
is_int_or_float = (combined_df.dtypes == 'int64') | (combined_df.dtypes == 'float64')

display(nan_detected[nan_detected & is_int_or_float])

to_impute = combined_df[nan_detected[nan_detected & is_int_or_float].index]

# replaces missing numerical values with mean
for col in to_impute:
    mean = combined_df[col].mean()
    combined_df[col].fillna(value = mean, inplace = True)

cell_id              True
pci                  True
cell_identity        True
nof_rach             True
ue_rnti              True
dl_cqi               True
dl_mcs               True
dl_bitrate           True
dl_bler              True
ul_snr               True
ul_mcs               True
ul_bitrate           True
ul_bler              True
ul_phr               True
ul_bsr               True
rnti                 True
asn1_length          True
asn1_type            True
additional           True
bearer_id            True
qci                  True
dl_total_bytes       True
ul_total_bytes       True
dl_latency           True
ul_latency           True
dl_buffered_bytes    True
ul_buffered_bytes    True
mme_ue_s1ap_id       True
enb_ue_s1ap_id       True
dtype: bool

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[col].fillna(value = mean, inplace = True)


In [9]:
# columns with missing non-numerical values

nan_count = np.sum(combined_df.isnull(), axis = 0)
display(nan_count[nan_count > 0])

event_name            2110
sib9_home_enb_name    2279
asn1_message          2120
dtype: int64

In [10]:
# dropping columns with no predictive value
combined_df.drop(columns = ['asn1_message', 'sib9_home_enb_name'], inplace = True)

# filling NaN values of event_name
combined_df['event_name'].fillna(value = 'no_event', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['event_name'].fillna(value = 'no_event', inplace = True)


In [11]:
# confirming there are no more missing values
nan_count = np.sum(combined_df.isnull(), axis = 0)
display(nan_count[nan_count > 0])

Series([], dtype: int64)

Running K-Means Clustering Model