# **Import Libraries**

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, silhouette_samples, silhouette_score, adjusted_rand_score, accuracy_score, precision_score, recall_score, f1_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from pandas.plotting import parallel_coordinates
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import Rectangle, Circle, Ellipse, Polygon, Wedge, Arrow

import numpy as np
import pandas as pd
import random

In [2]:
# Mount Google Drive at /content/drive (requires the google.colab runtime);
# every file path used later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Load Dataset**

In [54]:
# Load the training dataset; read_csv is pointed at the .zip archive directly.
data = pd.read_csv('/content/drive/MyDrive/Dataset/train_dataset.zip')
# Preview the first rows to check the columns and the 'slice Type' label.
data.head()

Unnamed: 0,LTE/5g Category,Time,Packet Loss Rate,Packet delay,IoT,LTE/5G,GBR,Non-GBR,AR/VR/Gaming,Healthcare,Industry 4.0,IoT Devices,Public Safety,Smart City & Home,Smart Transportation,Smartphone,slice Type
0,14,0,1e-06,10,1,0,0,1,0,0,0,0,1,0,0,0,3
1,18,20,0.001,100,0,1,1,0,1,0,0,0,0,0,0,0,1
2,17,14,1e-06,300,0,1,0,1,0,0,0,0,0,0,0,1,1
3,3,17,0.01,100,0,1,0,1,0,0,0,0,0,0,0,1,1
4,9,4,0.01,50,1,0,0,1,0,0,0,0,0,1,0,0,2


In [55]:
# Keep an independent copy of the raw frame: a plain assignment would only
# create a second name for the same object, so in-place changes to `data`
# would silently change the "backup" as well.
data_backup = data.copy()
data_backup

Unnamed: 0,LTE/5g Category,Time,Packet Loss Rate,Packet delay,IoT,LTE/5G,GBR,Non-GBR,AR/VR/Gaming,Healthcare,Industry 4.0,IoT Devices,Public Safety,Smart City & Home,Smart Transportation,Smartphone,slice Type
0,14,0,0.000001,10,1,0,0,1,0,0,0,0,1,0,0,0,3
1,18,20,0.001000,100,0,1,1,0,1,0,0,0,0,0,0,0,1
2,17,14,0.000001,300,0,1,0,1,0,0,0,0,0,0,0,1,1
3,3,17,0.010000,100,0,1,0,1,0,0,0,0,0,0,0,1,1
4,9,4,0.010000,50,1,0,0,1,0,0,0,0,0,1,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31578,14,23,0.010000,100,0,1,0,1,0,0,0,0,0,0,0,1,1
31579,12,7,0.010000,50,1,0,0,1,0,0,0,0,0,1,0,0,2
31580,18,5,0.010000,300,1,0,1,0,0,0,0,1,0,0,0,0,2
31581,8,1,0.000001,10,1,0,0,1,0,1,0,0,0,0,0,0,3


In [56]:
# Read the CSV file containing the random seeds.
# Each row describes one run and provides the seeds read later by the main
# loop from the "Under-sampling", "Data-split" and "Modeling" columns.
seed_df = pd.read_csv("/content/drive/MyDrive/Dataset/random_seed_data.csv")
seed_df.head()

Unnamed: 0,Iteration No.,Under-sampling,Data-split,Modeling
0,1,43,40,43
1,2,43,42,42
2,3,43,41,38


# **Loop**

## **label_identification**

In [57]:
def _undersample_majority(data, seed, label_col='slice Type'):
  """Drop random rows of the most frequent class until it matches the rarest class.

  Only the majority class is reduced; every other class is left untouched.
  A private RandomState is used so the global NumPy RNG is not modified, while
  the selected rows stay identical to seeding the global RNG with `seed`.
  """
  class_counts = data[label_col].value_counts()
  majority_class = class_counts.idxmax()
  num_to_remove = class_counts.max() - class_counts.min()

  majority_indices = data[data[label_col] == majority_class].index
  rng = np.random.RandomState(seed)
  remove_indices = rng.choice(majority_indices, size=num_to_remove, replace=False)

  return data.drop(remove_indices).reset_index(drop=True)


def _match_centroids_to_ground_truth(centroids, gt_centroids):
  """Reorder `centroids` so that row k is the centroid assigned to ground-truth row k.

  The assignment is one-to-one and minimises the summed Euclidean distance, so
  two model centroids can never be mapped onto the same ground-truth class.
  """
  from scipy.optimize import linear_sum_assignment

  # distances[i, j] = distance between model centroid i and ground-truth centroid j
  distances = np.linalg.norm(centroids[:, None, :] - gt_centroids[None, :, :], axis=2)
  centroid_idx, gt_idx = linear_sum_assignment(distances)

  # Place each centroid at the position of the ground-truth class it was matched to.
  mapped = np.empty_like(centroids)
  mapped[gt_idx] = centroids[centroid_idx]
  return mapped


def label_identification(data, a, b, c):
  """Cluster the slice data with KMeans and label the clusters via class centroids.

  Pipeline: under-sample the majority class, embed the selected features in 2-D
  with t-SNE, compute one "ground truth" centroid per slice type, fit KMeans on
  the training split, reorder the KMeans centroids so that cluster k corresponds
  to the k-th slice type, and predict labels for the held-out split.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the six feature columns listed below and a 'slice Type' column.
  a : int
      Seed for the random under-sampling of the majority class.
  b : int
      Seed (`random_state`) for the train/test split.
  c : int
      Seed (`random_state`) for KMeans.

  Returns
  -------
  actual_label : pandas.Series
      True slice types of the test split (index reset, named 'actual_label').
  predicted_label : pandas.Series
      Predicted slice types of the test split (named 'predicted_label').

  Notes
  -----
  The t-SNE embedding and the ground-truth centroids are computed on all
  samples before the split, i.e. they also see the rows that end up in the
  test set.
  """
  print(a, b, c)
  print("\n")

  #################################### Under-sampling ####################################
  balanced = _undersample_majority(data, a)

  #################################### Define X and y ####################################
  features = ['Packet Loss Rate', 'Packet delay', 'IoT', 'LTE/5G', 'GBR', 'Non-GBR']
  y = balanced['slice Type']

  #################################### Apply t-SNE to X ####################################
  tsne = TSNE(n_components=2, random_state=42)  # Reduce to 2 dimensions
  X_tsne = tsne.fit_transform(balanced[features])
  X = pd.DataFrame({'Feature 1': X_tsne[:, 0], 'Feature 2': X_tsne[:, 1]})

  #################################### Ground Truth Calculation ####################################
  # One centroid (mean of the 2-D embedded points) per slice type, in ascending label order.
  slice_labels = np.sort(y.unique())
  GT_centroids = np.vstack(
      [X_tsne[(y == label).to_numpy()].mean(axis=0) for label in slice_labels]
  )
  print("\n Ground Truth Centroids: \n", GT_centroids)

  #################################### Datasplit ####################################
  X_train, X_test, _, y_test = train_test_split(X, y, test_size=0.1, random_state=b)

  #################################### Model Training ####################################
  kmeans_model = KMeans(n_clusters=len(slice_labels), random_state=c)
  kmeans_model.fit(X_train)
  kmeans_model_centroids = kmeans_model.cluster_centers_

  #################################### Align clusters with slice types ####################################
  # KMeans numbers its clusters arbitrarily; reorder the centroids so that
  # cluster index k means "slice_labels[k]".
  kmeans_model_centroids_mapped = _match_centroids_to_ground_truth(
      kmeans_model_centroids, GT_centroids
  )
  print("\n Before Mapping: \n", kmeans_model_centroids)
  kmeans_model.cluster_centers_ = kmeans_model_centroids_mapped
  print("\n After Mapping: \n", kmeans_model.cluster_centers_)

  #################################### Model Testing ####################################
  # With the reordered centroids, predict() returns positions into slice_labels.
  y_pred = kmeans_model.predict(X_test)

  actual_label = y_test.reset_index(drop=True).rename('actual_label')
  predicted_label = pd.Series(slice_labels[y_pred], name='predicted_label')

  return actual_label, predicted_label

## **performance_score_inbuilt_libraries**

In [58]:
def performance_score_inbuilt_libraries(actual_label, predicted_label):
  """Score predictions with the scikit-learn metric functions.

  Returns a tuple ``(accuracy, precision, recall, f1score)``; precision,
  recall and F1 are computed with ``average='weighted'``.
  """
  accuracy = accuracy_score(actual_label, predicted_label)

  # The three remaining metrics share the same call signature and averaging mode.
  precision, recall, f1score = [
      scorer(actual_label, predicted_label, average='weighted')
      for scorer in (precision_score, recall_score, f1_score)
  ]

  return accuracy, precision, recall, f1score

## **performance_score_custom_method**

In [59]:
def performance_score_custom_method(actual_label, predicted_label):
  """Compute accuracy, precision, recall and F1 from one-vs-rest confusion counts.

  For every class value that occurs in either input, each sample is counted as
  a true positive, false positive, false negative or true negative for that
  class. The counts are then summed over all classes before the ratios are
  formed.

  Parameters
  ----------
  actual_label, predicted_label : array-like
      True and predicted class labels, one entry per sample. Any label values
      are accepted; they do not have to be the contiguous integers 1..k.

  Returns
  -------
  tuple of float
      ``(accuracy, precision, recall, f1score)``. A ratio whose denominator is
      zero (e.g. empty input, or no correct prediction for F1) is reported as 0.0.

  Raises
  ------
  ValueError
      If the two inputs do not have the same shape.
  """
  actual = np.asarray(actual_label)
  predicted = np.asarray(predicted_label)
  if actual.shape != predicted.shape:
    raise ValueError("actual_label and predicted_label must have the same shape")

  # Use the real label values (from both inputs) instead of assuming 1..k,
  # and compute them once rather than once per sample.
  classes = np.union1d(actual, predicted)

  TP = FP = FN = TN = 0
  for cls in classes:
    is_actual = actual == cls
    is_predicted = predicted == cls
    TP += int(np.count_nonzero(is_actual & is_predicted))
    FN += int(np.count_nonzero(is_actual & ~is_predicted))
    FP += int(np.count_nonzero(~is_actual & is_predicted))
    TN += int(np.count_nonzero(~is_actual & ~is_predicted))

  def _ratio(numerator, denominator):
    # Avoid ZeroDivisionError; an undefined ratio is reported as 0.0.
    return numerator / denominator if denominator else 0.0

  # Overall
  accuracy = _ratio(TP + TN, TP + TN + FP + FN)
  precision = _ratio(TP, TP + FP)
  recall = _ratio(TP, TP + FN)
  f1score = 2 * _ratio(precision * recall, precision + recall)

  return accuracy, precision, recall, f1score

# **Main**

In [60]:
# Work on a real copy: with a plain assignment both names refer to the same
# DataFrame, so the metric columns written into seed_df_updated by the main
# loop would also appear in seed_df.
seed_df_updated = seed_df.copy()
seed_df_updated

Unnamed: 0,Iteration No.,Under-sampling,Data-split,Modeling
0,1,43,40,43
1,2,43,42,42
2,3,43,41,38


In [61]:
# Run the whole pipeline once per row of seeds and record the resulting scores
for i in range(len(seed_df)):
    # Seeds for under-sampling (a), data split (b) and KMeans (c)
    a, b, c = (
        seed_df.at[i, seed_column]
        for seed_column in ("Under-sampling", "Data-split", "Modeling")
    )
    print("Iteration ", i, "\n")
    actual_label, predicted_label = label_identification(data, a, b, c)

    # Score the predictions with the scikit-learn based helper
    scores = performance_score_inbuilt_libraries(actual_label, predicted_label)
    accuracy, precision, recall, f1score = scores
    for caption, score in zip(("Accuracy: ", "Precision: ", "Recall: ", "F1-score: "), scores):
        print(caption, score)

    # Store the four scores in the row that belongs to this iteration
    for score_column, score in zip(('Accuracy', 'Precision', 'Recall', 'F1-score'), scores):
        seed_df_updated.at[i, score_column] = score

Iteration  0 

43 40 43



 Ground Truth Centroids: 
 [[  51.848503    97.29763  ]
 [  65.6263     -98.73888  ]
 [-116.47587      1.5351417]]

 Before Mapping: 
 [[  76.11909   125.30534 ]
 [  64.93585  -108.468704]
 [-119.17418    16.932602]]

 After Mapping: 
 [[  76.11909   125.30534 ]
 [  64.93585  -108.468704]
 [-119.17418    16.932602]]




Accuracy:  0.9328223624887286
Precision:  0.9389572059043638
Recall:  0.9328223624887286
F1-score:  0.9305831277110229
Iteration  1 

43 42 42



 Ground Truth Centroids: 
 [[  51.848503    97.29763  ]
 [  65.6263     -98.73888  ]
 [-116.47587      1.5351417]]





 Before Mapping: 
 [[  65.23732  -108.06334 ]
 [  75.53792   125.905914]
 [-119.1928     16.997768]]

 After Mapping: 
 [[  75.53792   125.905914]
 [  65.23732  -108.06334 ]
 [-119.1928     16.997768]]
Accuracy:  0.9274120829576195
Precision:  0.9345150459959775
Recall:  0.9274120829576195
F1-score:  0.9247947585496109
Iteration  2 

43 41 38



 Ground Truth Centroids: 
 [[  51.848503    97.29763  ]
 [  65.6263     -98.73888  ]
 [-116.47587      1.5351417]]

 Before Mapping: 
 [[-105.50133   -34.450703]
 [ 117.43093   -62.135845]
 [  15.313333  153.0702  ]]

 After Mapping: 
 [[  15.313333  153.0702  ]
 [ 117.43093   -62.135845]
 [-105.50133   -34.450703]]




Accuracy:  0.8250676284941388
Precision:  0.8446654910490649
Recall:  0.8250676284941388
F1-score:  0.8232565931390177


In [62]:
# Show the seeds together with the Accuracy / Precision / Recall / F1-score
# columns filled in by the loop above.
seed_df_updated

Unnamed: 0,Iteration No.,Under-sampling,Data-split,Modeling,Accuracy,Precision,Recall,F1-score
0,1,43,40,43,0.932822,0.938957,0.932822,0.930583
1,2,43,42,42,0.927412,0.934515,0.927412,0.924795
2,3,43,41,38,0.825068,0.844665,0.825068,0.823257


In [63]:
# Save the updated DataFrame to CSV
# index=False: the DataFrame's row index is not written as an extra column.
seed_df_updated.to_csv('/content/drive/MyDrive/Dataset/random_seed_results.csv', index=False)

In [64]:
# Read the CSV file containing the random seed results
# (the file written by the previous cell) and preview its first rows.
seed_results_df = pd.read_csv("/content/drive/MyDrive/Dataset/random_seed_results.csv")
seed_results_df.head()

Unnamed: 0,Iteration No.,Under-sampling,Data-split,Modeling,Accuracy,Precision,Recall,F1-score
0,1,43,40,43,0.932822,0.938957,0.932822,0.930583
1,2,43,42,42,0.927412,0.934515,0.927412,0.924795
2,3,43,41,38,0.825068,0.844665,0.825068,0.823257


# **SYNC NOTEBOOKS**

In [65]:
# Copy this notebook into the repository's Notebooks/ folder on Drive.
%cp '/content/drive/MyDrive/Colab Notebooks/Exp9 - Slice_Identification_UnSupML_undersampling_tsne_loop.ipynb' '/content/drive/MyDrive/Network_Slicing_with_ML_repo/Network-Slicing-with-ML/Notebooks/'
# Copy the seed definitions and the per-seed results into the repository's Results/ folder.
%cp '/content/drive/MyDrive/Dataset/random_seed_data.csv' '/content/drive/MyDrive/Network_Slicing_with_ML_repo/Network-Slicing-with-ML/Results/'
%cp '/content/drive/MyDrive/Dataset/random_seed_results.csv' '/content/drive/MyDrive/Network_Slicing_with_ML_repo/Network-Slicing-with-ML/Results/'