In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
"""Load Cell based dataset"""
# Read the raw counters CSV; the path is relative to the notebook's working directory.
df_cell = pd.read_csv('../Datasets/enb_counters.csv')

In [3]:
# Plain-text dump of the freshly loaded frame (pandas abbreviates the row listing with "...").
print(df_cell)

                       timestamp instance_id  cell_X_dl_bitrate  \
0      2024-01-24 14:00:00+00:00    65a5700c            96029.0   
1      2024-01-24 14:00:00+00:00    65b119a2                NaN   
2      2024-01-24 14:00:05+00:00    65a5700c            95975.0   
3      2024-01-24 14:00:05+00:00    65b119a2                NaN   
4      2024-01-24 14:00:10+00:00    65a5700c            93107.0   
...                          ...         ...                ...   
58213  2024-01-25 16:59:50+00:00    65a5700c                NaN   
58214  2024-01-25 16:59:50+00:00    65b119a8            13583.0   
58215  2024-01-25 16:59:55+00:00    65b119a2            28921.0   
58216  2024-01-25 16:59:55+00:00    65a5700c                NaN   
58217  2024-01-25 16:59:55+00:00    65b119a8            12561.0   

       cell_X_dl_err  cell_X_dl_gbr_use_avg  cell_X_dl_gbr_use_max  \
0                0.0                    0.0                    0.0   
1                NaN                    NaN            

In [None]:
# Drop constant columns: a column with exactly one distinct value carries no information.
# nunique() skips NaN by default, so a column holding one value plus NaNs is also
# dropped, while an all-NaN column (0 distinct values) is kept by this step.
distinct_counts = df_cell.nunique()
singular_columns = distinct_counts[distinct_counts == 1].index
df_cell = df_cell.drop(singular_columns, axis='columns')

In [None]:
# Inspect the frame again now that the single-valued columns are gone.
print(df_cell)

In [None]:
# instance_id is not needed: cell_id is the key used to combine the datasets
drop_col = 'instance_id'
df_cell = df_cell.drop(drop_col, axis='columns')

In [None]:
# Parse the timestamp column (read from CSV as object/str) into a datetime dtype
parsed_timestamps = pd.to_datetime(df_cell['timestamp'])
df_cell = df_cell.assign(timestamp=parsed_timestamps)
df_cell.head()

In [None]:
# Discard rows that are empty everywhere except the first column.
# NOTE(review): the filter is positional (iloc[:, 1:]) - it assumes the first column
# is the one that is always populated; confirm the column order upstream.
print("Number of instances before: ",len(df_cell))
has_any_value = df_cell.iloc[:, 1:].notna().any(axis=1)
df_cell = df_cell.loc[has_any_value].reset_index(drop=True)
print("Number of instances after: ",len(df_cell))

In [None]:
# Per-column missing-value report, used to decide which cell features to remove
columns = df_cell.columns

# Count the NaN entries of every column, then list them one per line
nan_counts = df_cell.isna().sum()
for column, count in zip(nan_counts.index, nan_counts):
    print(f"Column '{column}': {count} NaN values")

In [None]:
# Columns discarded because of their NaN values (see the per-column NaN report)
features_to_drop = [
    'cell_X_dl_err',
    'cell_X_dl_use_avg',
    'cell_X_dl_use_max',
    'cell_X_drb_count_avg',
    'cell_X_drb_count_max',
    'cell_X_drb_count_min',
    'cell_X_ul_err',
    'cell_X_ul_use_avg',
    'cell_X_ul_use_max',
    'msg_ng_paging',
    'msg_ng_path_switch_request',
    'msg_ng_path_switch_request_acknowledge',
    'msg_ng_pdu_session_resource_notify',
    'msg_xn_handover_request_acknowledge_recv',
    'msg_xn_handover_request_acknowledge_sent',
    'msg_xn_handover_request_recv',
    'msg_xn_handover_request_sent',
    'msg_xn_ng_ran_node_configuration_update_acknowledge_sent',
    'msg_xn_ng_ran_node_configuration_update_recv',
    'msg_xn_sn_status_transfer_recv',
    'msg_xn_sn_status_transfer_sent',
    'msg_xn_ue_context_release_recv',
    'msg_xn_ue_context_release_sent',
    'rf_samples_tx2_count',
    'rf_samples_tx2_max',
    'rf_samples_tx2_rms',
    'cell_X_erab_count_avg',
    'cell_X_erab_count_max',
    'cell_X_erab_count_min',
    'msg_ng_error_indication',
]

# Drop them all at once; raises KeyError if any listed column is absent
df_cell = df_cell.drop(features_to_drop, axis='columns')

In [None]:
# Preview the first rows after the NaN-heavy columns were removed.
df_cell.head()

In [None]:
# Feature analysis plan - the following cells apply, in turn:
#   - variance thresholding
#   - correlation-based feature selection
#   - feature selection using clustering

# Build the feature matrix: timestamp and cell_id are excluded from it
data = df_cell.drop(['timestamp', 'cell_id'], axis='columns')

# Standardise the features: fit the scaler on the data, then apply it to the same data
scaler = StandardScaler().fit(data)
df_scaled = scaler.transform(data)

In [None]:
# Variance thresholding
#
# NOTE: df_scaled comes out of StandardScaler, so every non-constant feature has
# (approximately) unit variance. Any threshold below 1 therefore keeps all such
# features - which is why nothing was removed up to 0.9. To make this step
# discriminative it would have to run on unscaled (or min-max scaled) data.

# Features whose variance is not above this value are removed.
threshold = 0.9 # up to 0.9 no features were removed.
selector = VarianceThreshold(threshold)

# Fit the selector and keep only the features that pass the threshold
reduced_values = selector.fit_transform(df_scaled)

# Ask the selector itself which columns survived instead of re-deriving its mask by
# hand; this keeps the column names aligned with the transformed array by construction.
kept_columns = data.columns[selector.get_support()]

# Convert the result back to a DataFrame with the original feature names
df_reduced = pd.DataFrame(reduced_values, columns=kept_columns)

In [None]:
# Preview the features that survived variance thresholding (values are standardised).
df_reduced.head()

In [None]:
# Correlation based feature selection

# Absolute pairwise correlations, computed on the unscaled feature frame
correlation_matrix = data.corr().abs()

# Pairs correlated more strongly than this are considered redundant
threshold = 0.8

# Keep only the strict upper triangle so every pair is looked at once
# (the diagonal and the lower triangle become NaN)
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper = correlation_matrix.where(upper_mask)

# A column is dropped when it exceeds the threshold against any earlier column
exceeds = (upper > threshold).any(axis=0)
to_drop = exceeds[exceeds].index.tolist()

# Report the columns selected for removal
print("Columns to Drop:")
print(to_drop)

In [None]:
# Remove the highly correlated columns and remember the surviving feature names
df_reduced_corr = data.drop(columns=to_drop)
features_correlation = list(df_reduced_corr.columns)

# Preview goes last: only the final expression of a cell is displayed, so a
# head() call in the middle of the cell would be silently discarded.
df_reduced_corr.head()

In [None]:
# Feature selection using clustering

# Fit a 2-cluster K-means model on the standardised features (fixed seed)
kmeans = KMeans(n_clusters=2, random_state=42).fit(df_scaled)

# Silhouette score as a quality check of the clustering
silhouette_avg = silhouette_score(df_scaled, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Heuristic feature importance: the mean absolute coordinate of the cluster
# centres along each feature axis
centre_magnitudes = np.abs(kmeans.cluster_centers_)
feature_importance = centre_magnitudes.mean(axis=0)
feature_importance_df = pd.DataFrame({'Feature': data.columns, 'Importance': feature_importance})
print(feature_importance_df)

# Keep the features whose importance reaches the cut-off
threshold = 0.5 # * feature_importance.max()
is_important = feature_importance_df['Importance'] >= threshold
important_features = feature_importance_df.loc[is_important, 'Feature']

# Restrict the (unscaled) feature frame to the important features
df_reduced_clustering = data[important_features]

In [None]:
# Plain-text dump of the features retained by the clustering-based selection.
print(df_reduced_clustering)

In [None]:
# Visualise the clusters and the instances assigned to each of them.
# Reuse the labels of the already fitted model: calling fit_predict again would
# re-run K-means on the same data with the same seed only to get the same labels.
clusters = kmeans.labels_

# Add the cluster labels to a copy so the feature frame itself stays untouched
df = data.copy()
df['Cluster'] = clusters

# The two features shown on the axes
x_feature, y_feature = 'cell_X_dl_retx', 'cell_X_ul_bitrate'

# Scatter plot of the clusters
plt.figure(figsize=(10, 5))
sns.scatterplot(x=x_feature, y=y_feature, hue='Cluster', data=df, palette='viridis', s=100)
plt.title('Scatter Plot of Clusters')
# Label the axes with the plotted columns rather than generic placeholders
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()

In [None]:
# How many instances fall into each cluster, ordered by cluster label
cluster_counts = df['Cluster'].value_counts().sort_index()

# Bar plot of the cluster sizes, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=cluster_counts.index, y=cluster_counts.values, palette='viridis', ax=ax)
ax.set_title('Number of Instances in Each Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Second pass: correlation among the clustering-selected features, to discard even more

# Absolute pairwise correlations of the remaining features
correlation_matrix = df_reduced_clustering.corr().abs()

# Pairs correlated more strongly than this are considered redundant
threshold = 0.8

# Strict upper triangle only, so each pair is inspected once
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper = correlation_matrix.where(upper_mask)

# A column is dropped when it exceeds the threshold against any earlier column
exceeds = (upper > threshold).any(axis=0)
to_drop = exceeds[exceeds].index.tolist()

# Report the columns selected for removal
print("Columns to Drop:")
print(to_drop)

In [None]:
# Features kept by clustering, minus the ones flagged as highly correlated
df_clustering_correlation = df_reduced_clustering.drop(to_drop, axis='columns')

In [None]:
# Preview the feature set left after clustering-based and correlation-based selection.
df_clustering_correlation.head()

In [None]:
# Columns to export for the clustering + correlation selection
final_columns = df_clustering_correlation.columns.tolist()
print(final_columns)

# Re-attach timestamp and cell_id, which were left out of the feature matrix
final_columns += ['timestamp', 'cell_id']
print(final_columns)

In [None]:
# Take the selected columns (plus timestamp and cell_id) from the cleaned cell frame
final_df_clustering_correlation = df_cell.loc[:, final_columns]

In [None]:
# Preview the frame that is about to be exported.
final_df_clustering_correlation.head()

In [None]:
# Write the clustering + correlation selection to CSV in the working directory, without the index
final_df_clustering_correlation.to_csv(
    'cell_data_clustering_correlation.csv',
    index=False,
)

In [None]:
# Export the correlation-only selection together with timestamp and cell_id.
# features_correlation was built in an earlier cell, so append the key columns only
# when they are missing: a blind extend() would add them again on every re-run of
# this cell and produce duplicated columns in the exported CSV.
for key_column in ('timestamp', 'cell_id'):
    if key_column not in features_correlation:
        features_correlation.append(key_column)

final_df__correlation = df_cell[features_correlation]
final_df__correlation.to_csv('cell_data_correlation.csv',index=False)