In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [12]:
"""Load Cell based dataset"""
# Read the cell-level counters CSV into a DataFrame.
df_cell = pd.read_csv(filepath_or_buffer='../Datasets/enb_counters.csv')

In [13]:
# Preview the first five rows of the freshly loaded frame.
df_cell.head(n=5)

Unnamed: 0,timestamp,instance_id,cell_X_dl_bitrate,cell_X_dl_err,cell_X_dl_gbr_use_avg,cell_X_dl_gbr_use_max,cell_X_dl_gbr_use_min,cell_X_dl_retx,cell_X_dl_sched_users_avg,cell_X_dl_sched_users_max,...,rf_tx_cpu_time,rf_tx_sample_rate,cell_id,cell_X_erab_count_avg,cell_X_erab_count_max,cell_X_erab_count_min,msg_ng_error_indication,msg_ng_initial_context_setup_failure,msg_xn_ng_ran_node_configuration_update_acknowledge_recv,msg_xn_ng_ran_node_configuration_update_sent
0,2024-01-24 14:00:00+00:00,65a5700c,96029.0,0.0,0.0,0.0,0.0,42.0,0.034,2.0,...,5.0,61.440203,1.0,,,,,,,
1,2024-01-24 14:00:00+00:00,65b119a2,,,,,,,,,...,,,,,,,,,,
2,2024-01-24 14:00:05+00:00,65a5700c,95975.0,0.0,0.0,0.0,0.0,52.0,0.034,2.0,...,5.0,61.439763,1.0,,,,,,,
3,2024-01-24 14:00:05+00:00,65b119a2,,,,,,,,,...,,,,,,,,,,
4,2024-01-24 14:00:10+00:00,65a5700c,93107.0,0.0,0.0,0.0,0.0,40.0,0.031,2.0,...,5.0,61.440072,1.0,,,,,,,


In [14]:
# Drop every column that holds exactly one distinct non-null value
# (nunique() does not count NaN by default).
unique_counts = df_cell.nunique()
singular_columns = unique_counts.index[unique_counts.eq(1)]
df_cell = df_cell.drop(columns=singular_columns)

In [15]:
# Preview the frame after the single-valued columns were removed.
df_cell.head(n=5)

Unnamed: 0,timestamp,instance_id,cell_X_dl_bitrate,cell_X_dl_err,cell_X_dl_retx,cell_X_dl_sched_users_avg,cell_X_dl_sched_users_max,cell_X_dl_tx,cell_X_dl_use_avg,cell_X_dl_use_max,...,rf_samples_tx2_max,rf_samples_tx2_rms,rf_tx_count,rf_tx_cpu_time,rf_tx_sample_rate,cell_id,cell_X_erab_count_avg,cell_X_erab_count_max,cell_X_erab_count_min,msg_ng_error_indication
0,2024-01-24 14:00:00+00:00,65a5700c,96029.0,0.0,42.0,0.034,2.0,238.0,0.008,1.0,...,-4.576581,-36.216774,316416000.0,5.0,61.440203,1.0,,,,
1,2024-01-24 14:00:00+00:00,65b119a2,,,,,,,,,...,,,,,,,,,,
2,2024-01-24 14:00:05+00:00,65a5700c,95975.0,0.0,52.0,0.034,2.0,229.0,0.007,0.654,...,-5.630717,-36.618362,318197760.0,5.0,61.439763,1.0,,,,
3,2024-01-24 14:00:05+00:00,65b119a2,,,,,,,,,...,,,,,,,,,,
4,2024-01-24 14:00:10+00:00,65a5700c,93107.0,0.0,40.0,0.031,2.0,219.0,0.007,0.579,...,-6.586207,-36.659668,316108800.0,5.0,61.440072,1.0,,,,


In [16]:
# Remove instance id since we have cell_id to combine datasets.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column has already been removed.
drop_col = 'instance_id'
df_cell = df_cell.drop(columns=drop_col, errors='ignore')

In [17]:
# Parse the timestamp column into pandas datetime values, then preview.
parsed_timestamps = pd.to_datetime(df_cell['timestamp'])
df_cell['timestamp'] = parsed_timestamps
df_cell.head()

Unnamed: 0,timestamp,cell_X_dl_bitrate,cell_X_dl_err,cell_X_dl_retx,cell_X_dl_sched_users_avg,cell_X_dl_sched_users_max,cell_X_dl_tx,cell_X_dl_use_avg,cell_X_dl_use_max,cell_X_drb_count_avg,...,rf_samples_tx2_max,rf_samples_tx2_rms,rf_tx_count,rf_tx_cpu_time,rf_tx_sample_rate,cell_id,cell_X_erab_count_avg,cell_X_erab_count_max,cell_X_erab_count_min,msg_ng_error_indication
0,2024-01-24 14:00:00+00:00,96029.0,0.0,42.0,0.034,2.0,238.0,0.008,1.0,13.305,...,-4.576581,-36.216774,316416000.0,5.0,61.440203,1.0,,,,
1,2024-01-24 14:00:00+00:00,,,,,,,,,,...,,,,,,,,,,
2,2024-01-24 14:00:05+00:00,95975.0,0.0,52.0,0.034,2.0,229.0,0.007,0.654,13.0,...,-5.630717,-36.618362,318197760.0,5.0,61.439763,1.0,,,,
3,2024-01-24 14:00:05+00:00,,,,,,,,,,...,,,,,,,,,,
4,2024-01-24 14:00:10+00:00,93107.0,0.0,40.0,0.031,2.0,219.0,0.007,0.579,13.731,...,-6.586207,-36.659668,316108800.0,5.0,61.440072,1.0,,,,


In [18]:
# Discard rows in which every column after the first one is missing,
# then renumber the index from zero.
print("Number of instances before: ",len(df_cell))
value_columns = df_cell.columns[1:]
df_cell = df_cell.dropna(how='all', subset=value_columns).reset_index(drop=True)
print("Number of instances after: ",len(df_cell))

Number of instances before:  58218
Number of instances after:  37432


In [19]:
columns = df_cell.columns

# Report the number of missing values found in each column.
nan_counts = df_cell.isna().sum()
for column, count in zip(nan_counts.index, nan_counts.tolist()):
    print(f"Column '{column}': {count} NaN values")

Column 'timestamp': 0 NaN values
Column 'cell_X_dl_bitrate': 0 NaN values
Column 'cell_X_dl_err': 18610 NaN values
Column 'cell_X_dl_retx': 0 NaN values
Column 'cell_X_dl_sched_users_avg': 0 NaN values
Column 'cell_X_dl_sched_users_max': 0 NaN values
Column 'cell_X_dl_tx': 0 NaN values
Column 'cell_X_dl_use_avg': 18610 NaN values
Column 'cell_X_dl_use_max': 18610 NaN values
Column 'cell_X_drb_count_avg': 18610 NaN values
Column 'cell_X_drb_count_max': 18610 NaN values
Column 'cell_X_drb_count_min': 18610 NaN values
Column 'cell_X_ue_active_count_avg': 0 NaN values
Column 'cell_X_ue_active_count_max': 0 NaN values
Column 'cell_X_ue_active_count_min': 0 NaN values
Column 'cell_X_ue_count_avg': 0 NaN values
Column 'cell_X_ue_count_max': 0 NaN values
Column 'cell_X_ue_count_min': 0 NaN values
Column 'cell_X_ul_bitrate': 0 NaN values
Column 'cell_X_ul_err': 18610 NaN values
Column 'cell_X_ul_retx': 0 NaN values
Column 'cell_X_ul_sched_users_avg': 0 NaN values
Column 'cell_X_ul_sched_users_m

In [21]:
# Collect the rows that are missing a value in any column after the first one
# and write them to a CSV file so they can be inspected separately.
has_missing = df_cell.iloc[:, 1:].isnull().any(axis='columns')
nan_rows = df_cell.loc[has_missing]
nan_rows.to_csv('nan_rows_cell.csv', index=False)

In [None]:
# Columns to remove due to NaN values.
features_to_drop = [
    'cell_X_dl_err',
    'cell_X_dl_use_avg',
    'cell_X_dl_use_max',
    'cell_X_drb_count_avg',
    'cell_X_drb_count_max',
    'cell_X_drb_count_min',
    'cell_X_ul_err',
    'cell_X_ul_use_avg',
    'cell_X_ul_use_max',
    'msg_ng_paging',
    'msg_ng_path_switch_request',
    'msg_ng_path_switch_request_acknowledge',
    'msg_ng_pdu_session_resource_notify',
    'msg_xn_handover_request_acknowledge_recv',
    'msg_xn_handover_request_acknowledge_sent',
    'msg_xn_handover_request_recv',
    'msg_xn_handover_request_sent',
    'msg_xn_ng_ran_node_configuration_update_acknowledge_sent',
    'msg_xn_ng_ran_node_configuration_update_recv',
    'msg_xn_sn_status_transfer_recv',
    'msg_xn_sn_status_transfer_sent',
    'msg_xn_ue_context_release_recv',
    'msg_xn_ue_context_release_sent',
    'rf_samples_tx2_count',
    'rf_samples_tx2_max',
    'rf_samples_tx2_rms',
    'cell_X_erab_count_avg',
    'cell_X_erab_count_max',
    'cell_X_erab_count_min',
    'msg_ng_error_indication',
]

# Remove the columns. errors='ignore' keeps this cell re-runnable: without it,
# a second execution raises KeyError because the columns are already gone.
# NOTE(review): a misspelled name in the list is now skipped silently; any NaN
# it leaves behind will surface in the later KMeans fit instead.
df_cell = df_cell.drop(columns=features_to_drop, errors='ignore')

In [None]:
# Preview the frame after the NaN-heavy columns were removed.
df_cell.head(n=5)

In [22]:
# Build the feature matrix for the three selection approaches used in this
# notebook:
#   - variance thresholding
#   - correlation-based feature selection
#   - feature selection using clustering

# timestamp and cell_id are left out of the feature matrix
non_feature_columns = ['timestamp', 'cell_id']
data = df_cell.drop(columns=non_feature_columns)

# Standardise the features; the result is a NumPy array, not a DataFrame
scaler = StandardScaler()
scaler.fit(data)
df_scaled = scaler.transform(data)

In [None]:
# Variance thresholding

# Features whose variance does not exceed this value are removed.
threshold = 0.9 # up to 0.9 no features were removed. 
selector = VarianceThreshold(threshold)

# NOTE(review): df_scaled comes from StandardScaler, so non-constant features
# should all have variance close to 1 -- this threshold may not be able to
# discriminate between them. Confirm that filtering scaled data is intended.
df_reduced = selector.fit_transform(df_scaled)

# Rebuild a DataFrame, taking the surviving column names from the selector's
# own support mask.
kept_columns = data.columns[selector.get_support()]
df_reduced = pd.DataFrame(df_reduced, columns=kept_columns)

In [None]:
# Preview the variance-filtered features.
df_reduced.head(n=5)

In [24]:
# Correlation based feature selection

# Absolute pairwise correlation between the unscaled features
correlation_matrix = data.corr().abs()

# Pairs correlated above this value are treated as redundant
threshold = 0.8

# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation) is ignored
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper = correlation_matrix.where(upper_mask)

# A column is dropped when it is too strongly correlated with any column
# that precedes it
exceeds_threshold = (upper > threshold).any(axis=0)
to_drop = upper.columns[exceeds_threshold.to_numpy()].tolist()

# Report the result
print("Columns to Drop:")
print(len(to_drop))
print(to_drop)

Columns to Drop:
68
['cell_X_dl_retx', 'cell_X_dl_sched_users_avg', 'cell_X_dl_tx', 'cell_X_dl_use_avg', 'cell_X_drb_count_max', 'cell_X_drb_count_min', 'cell_X_ue_active_count_avg', 'cell_X_ue_active_count_max', 'cell_X_ue_active_count_min', 'cell_X_ue_count_avg', 'cell_X_ue_count_max', 'cell_X_ue_count_min', 'cell_X_ul_bitrate', 'cell_X_ul_retx', 'cell_X_ul_sched_users_avg', 'cell_X_ul_sched_users_max', 'cell_X_ul_tx', 'cell_X_ul_use_avg', 'cpu', 'msg_ng_downlink_nas_transport', 'msg_ng_initial_context_setup_response', 'msg_ng_initial_ue_message', 'msg_ng_path_switch_request', 'msg_ng_path_switch_request_acknowledge', 'msg_ng_pdu_session_resource_notify', 'msg_ng_pdu_session_resource_release_command', 'msg_ng_pdu_session_resource_release_response', 'msg_ng_pdu_session_resource_setup_request', 'msg_ng_pdu_session_resource_setup_response', 'msg_ng_setup_request', 'msg_ng_setup_response', 'msg_ng_ue_context_release_command', 'msg_ng_ue_context_release_complete', 'msg_ng_ue_context_relea

In [25]:
# Keep only the features that survived the correlation filter and remember
# their names for the export step.
df_reduced_corr = data.drop(columns = to_drop)
features_correlation = list(df_reduced_corr.columns)

# The preview must be the last expression of the cell to be rendered; placed
# mid-cell its result was silently discarded.
df_reduced_corr.head()

In [None]:
# Feature selection using clustering

# Apply K-means clustering.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it unset makes the fitted centres (and therefore the
# selected features) depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(df_scaled)

# Calculate the silhouette score to evaluate clustering performance
silhouette_avg = silhouette_score(df_scaled, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Determine feature importance.
# Heuristic: the mean absolute coordinate of the cluster centres along each
# (standardised) feature -- a larger value means the centres lie further from
# the overall mean on that feature.
feature_importance = np.abs(kmeans.cluster_centers_).mean(axis=0)
feature_importance_df = pd.DataFrame({'Feature': data.columns, 'Importance': feature_importance})
print(feature_importance_df)

# Select features with importance above a certain threshold
threshold = 0.5 # * feature_importance.max()
important_features = feature_importance_df[feature_importance_df['Importance'] >= threshold]['Feature']

# Reduce the DataFrame to important features (values stay unscaled)
df_reduced_clustering = data[important_features]

In [None]:
# Use the notebook's rich display for a short preview instead of print(),
# which renders the frame as plain text.
df_reduced_clustering.head()

In [None]:
# Visualise the clusters and the instances assigned to each one.
# Reuse the labels of the already-fitted model: calling fit_predict here would
# run K-means a second time on the same data.
clusters = kmeans.labels_

# Add the cluster labels to a copy so the feature matrix itself stays untouched
df = data.copy()
df['Cluster'] = clusters

# Scatter plot of the clusters over two of the features; the axes are labelled
# with the real column names rather than generic placeholders.
x_feature = 'cell_X_dl_retx'
y_feature = 'cell_X_ul_bitrate'
plt.figure(figsize=(10, 5))
sns.scatterplot(x=x_feature, y=y_feature, hue='Cluster', data=df, palette='viridis', s=100)
plt.title('Scatter Plot of Clusters')
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()

In [None]:
# Count the instances assigned to each cluster and show the totals as bars.
cluster_counts = df['Cluster'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=cluster_counts.index, y=cluster_counts.to_numpy(), palette='viridis', ax=ax)
ax.set_title('Number of Instances in Each Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Apply the correlation filter to the clustering-selected features to discard
# even more of them.

# Absolute pairwise correlation between the remaining features
correlation_matrix = df_reduced_clustering.corr().abs()

# Pairs correlated above this value are treated as redundant
threshold = 0.8

# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation) is ignored
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper = correlation_matrix.where(upper_mask)

# A column is dropped when it is too strongly correlated with any column
# that precedes it
exceeds_threshold = (upper > threshold).any(axis=0)
to_drop = upper.columns[exceeds_threshold.to_numpy()].tolist()

# Report the result
print("Columns to Drop:")
print(to_drop)

In [None]:
# Features kept by clustering, minus the highly correlated ones in to_drop.
df_clustering_correlation = df_reduced_clustering.drop(to_drop, axis='columns')

In [None]:
# Preview the features kept after both clustering and correlation filtering.
df_clustering_correlation.head(n=5)

In [None]:
# Columns to export: the selected features followed by timestamp and cell_id.
final_columns = df_clustering_correlation.columns.tolist()
print(final_columns)
final_columns += ['timestamp','cell_id']
print(final_columns)

In [None]:
# Pull the export columns out of the cleaned (unscaled) cell frame.
final_df_clustering_correlation = df_cell.loc[:, final_columns]

In [None]:
# Preview the frame that is about to be exported.
final_df_clustering_correlation.head(n=5)

In [None]:
# Write the clustering + correlation selection to CSV without the row index.
final_df_clustering_correlation.to_csv(
    path_or_buf='cell_data_clustering_correlation.csv',
    index=False,
)

In [26]:
# Add the identifier columns to the correlation-selected feature names.
# features_correlation is built in an earlier cell, so an unconditional
# extend() would append the names again on every re-run of this cell and
# produce duplicated columns in the export; append only what is missing.
for id_column in ('timestamp', 'cell_id'):
    if id_column not in features_correlation:
        features_correlation.append(id_column)

# Export the selected columns from the cleaned (unscaled) cell frame.
final_df__correlation = df_cell[features_correlation]
final_df__correlation.to_csv('cell_data_correlation.csv',index=False)