In [43]:
import pandas as pd
import json

In [44]:
# to flatten all nested levels, takes in data and outputs to flattened_data list
def flatten(data, flattened_data):
    if isinstance(data, dict):
        for key, value in data.items():
            flatten( (key, value), flattened_data)
    # value is a nested dictionary
    elif isinstance(data[1], dict):
        for key, value in data[1].items():
            flatten( (key, value), flattened_data)
    # value is a nested list
    elif isinstance(data[1], list):
        for item in data[1]:
            flatten(item, flattened_data)
    # flattened key and value
    else:
        length = len(flattened_data)
        key = data[0]
        value = data[1]
        if len(flattened_data[length - 1]) == 0:
            flattened_data[length - 1] = dict()
        flattened_data[length - 1][key] = value

In [45]:
# processes all json files into a dataframe

import os
directory_path = 'data'
combined_df = pd.DataFrame()
files = os.listdir(directory_path)

for file_name in files:
    file_path = os.path.join(directory_path, file_name)
    with open(file_path, 'r') as file:
        file_data = json.load(file)
        file_data_flattened = []

        for item in file_data:
            file_data_flattened.append([])
            flatten(data = item, flattened_data = file_data_flattened)

        file_df = pd.DataFrame(file_data_flattened) 

        if file_name.startswith('clean'):
            file_df['interference'] = [False] * len(file_df)
        else:
            file_df['interference'] = [True] * len(file_df)
        
        if combined_df.shape[0] == 0:
            combined_df = file_df
        else:
            combined_df = pd.merge(combined_df, file_df, how = 'outer')

In [46]:
# display combined_df

display(combined_df.shape)
display(combined_df.head(30))

(11935, 36)

Unnamed: 0,type,timestamp,carrier_id,cell_id,event_name,pci,cell_identity,sib9_home_enb_name,nof_rach,rnti,...,enb_ue_s1ap_id,bearer_id,qci,dl_total_bytes,ul_total_bytes,dl_latency,ul_latency,dl_buffered_bytes,ul_buffered_bytes,interference
0,event,1698340000.0,0,1.0,sector_start,1.0,1.0,,,,...,,,,,,,,,,True
1,event,1698340000.0,0,1.0,rrc_log,,,,,73.0,...,,,,,,,,,,True
2,event,1698340000.0,0,1.0,rrc_log,,,,,73.0,...,,,,,,,,,,True
3,event,1698340000.0,0,1.0,rrc_log,,,,,73.0,...,,,,,,,,,,True
4,event,1698340000.0,0,1.0,rrc_log,,,,,73.0,...,,,,,,,,,,True
5,event,1698340000.0,0,1.0,rrc_log,,,,,73.0,...,,,,,,,,,,True
6,event,1698340000.0,0,1.0,s1_context_create,,,,,73.0,...,1.0,,,,,,,,,True
7,event,1698340000.0,0,1.0,rrc_log,,,,,76.0,...,,,,,,,,,,True
8,event,1698340000.0,0,1.0,rrc_log,,,,,76.0,...,,,,,,,,,,True
9,event,1698340000.0,0,1.0,rrc_log,,,,,76.0,...,,,,,,,,,,True


In [47]:
# data handling
import numpy as np

# making 'cell_identity' numerical

combined_df['cell_identity'] = pd.to_numeric(combined_df['cell_identity'], errors='coerce')

# finds numerical columns with missing values
nan_count = np.sum(combined_df.isnull(), axis = 0)
nan_detected = nan_count > 0
is_int_or_float = (combined_df.dtypes == 'int64') | (combined_df.dtypes == 'float64')

display(nan_detected[nan_detected & is_int_or_float])

to_impute = combined_df[nan_detected[nan_detected & is_int_or_float].index]

# replaces missing numerical values with mean
for col in to_impute:
    mean = combined_df[col].mean()
    combined_df[col].fillna(value = mean, inplace = True)

# columns with missing non-numerical values

nan_count = np.sum(combined_df.isnull(), axis = 0)
display(nan_count[nan_count > 0])

# dropping non-numerical columns with no predictive value
combined_df.drop(columns = ['asn1_message', 'sib9_home_enb_name'], inplace = True)

# filling NaN values of event_name
combined_df['event_name'].fillna(value = 'no_event', inplace = True)

# confirming there are no more missing values
nan_count = np.sum(combined_df.isnull(), axis = 0)
display(nan_count[nan_count > 0])

cell_id              True
pci                  True
cell_identity        True
nof_rach             True
rnti                 True
asn1_length          True
asn1_type            True
additional           True
ue_rnti              True
dl_cqi               True
dl_mcs               True
dl_bitrate           True
dl_bler              True
ul_snr               True
ul_mcs               True
ul_bitrate           True
ul_bler              True
ul_phr               True
ul_bsr               True
mme_ue_s1ap_id       True
enb_ue_s1ap_id       True
bearer_id            True
qci                  True
dl_total_bytes       True
ul_total_bytes       True
dl_latency           True
ul_latency           True
dl_buffered_bytes    True
ul_buffered_bytes    True
dtype: bool

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[col].fillna(value = mean, inplace = True)


event_name            10700
sib9_home_enb_name    11915
asn1_message          10768
dtype: int64

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['event_name'].fillna(value = 'no_event', inplace = True)


Series([], dtype: int64)

In [48]:
# one-hot encoding on non-numerical columns
 
cols_with_strings = []
for col in combined_df.columns:
    if combined_df[col].dtype == 'object':
        cols_with_strings.append(col)
        print(col, combined_df[col].nunique())

for col in cols_with_strings:
    df_column = pd.get_dummies(combined_df[col])
    combined_df = pd.concat([combined_df, df_column], axis = 1)
    combined_df.drop(col, axis = 1, inplace = True)

type 3
event_name 8


In [49]:
# imports for decision-tree model

from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [50]:
# method to train a Decision Tree classifier
def train_test_DT(X_train, X_test, y_train, y_test, depth = 8, leaf=1, crit='entropy'):
    
    model = DecisionTreeClassifier(max_depth = depth, min_samples_leaf = leaf, criterion = crit)
    model.fit(X_train, y_train)
    class_label_predictions = model.predict(X_test)
    acc_score = accuracy_score(y_test, class_label_predictions)

    importances = model.feature_importances_
    # Create a DataFrame for better visualization
    importance_df = pd.DataFrame({
        'Feature': list(combined_df.drop(columns = 'interference').columns),
        'Importance': importances
    })

    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Display the feature importances
    display(importance_df)
    return acc_score

In [51]:
# split testing and training data

y = combined_df['interference']
X = combined_df.drop(columns = 'interference')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [52]:
print(train_test_DT(X_train, X_test, y_train, y_test))

Unnamed: 0,Feature,Importance
10,ue_rnti,0.776041
0,timestamp,0.16442
5,nof_rach,0.024762
6,rnti,0.024163
31,event,0.004213
40,sector_start,0.002795
41,sector_stop,0.002403
8,asn1_type,0.001202
25,dl_total_bytes,0.0
26,ul_total_bytes,0.0


0.998984513835999
