# Phase 3: Microservice Identification (Grouping by Similar Services)

In [None]:
# Run configuration. These values are interpolated into the input and output
# file paths used by the cells of this notebook.
version = "v_imen" # All options: v_imen, v_team
system = "pos" # All options: jforum, cargotracker, petclinic, pos
model_type = "albert" # All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create service graph

In [None]:
import pandas as pd
from scipy import spatial
from utils import load_data_from_csv

In [None]:
# Load the three inputs this notebook works from.
best_community_detection_algorithm = 'Louvain' # Change this

# Class-to-service assignment (columns used later: 'class_name', 'service')
communities_df = pd.read_csv(
    f"generated_data/community/{version}_{system}_{best_community_detection_algorithm}_communities.csv"
)

# Class-level graph (columns used later: 'class1', 'class2', 'static_distance')
class_graph_df = pd.read_csv(
    f"generated_data/graph/class/{version}_{system}_class_graph.csv"
)

# Per-class embeddings for the selected model
class_names, class_labels, class_embeddings = load_data_from_csv(
    f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv"
)

# Index the embeddings by class name so they can be looked up directly
class_embeddings_dict = {
    name: vector for name, vector in zip(class_names, class_embeddings)
}

In [None]:
# 1. Calculate service embeddings

# Keep only the classes that actually have an embedding
valid_communities_df = communities_df[communities_df['class_name'].isin(class_embeddings_dict.keys())]

# A service embedding is the mean of the embeddings of the classes it contains
service_to_embedding = valid_communities_df.groupby('service')['class_name'].apply(
    lambda x: sum(class_embeddings_dict[class_name] for class_name in x) / len(x)
).to_dict()

# 2. Calculate service similarities
# NOTE(review): 1 - cosine distance is the cosine *similarity* (higher means
# more alike), yet it is stored below in a column named 'semantic_distance'.
# Confirm that downstream consumers expect a similarity here.
service_similarities = {
    s1: {
        s2: 1 - spatial.distance.cosine(embedding1, embedding2)
        for s2, embedding2 in service_to_embedding.items() if s1 != s2
    }
    for s1, embedding1 in service_to_embedding.items()
}

# 3. Create dictionaries to store processed distances using merges
# Attach the service of class1 (suffix _1) and of class2 (suffix _2) to every class edge
merged_df = class_graph_df.merge(
    communities_df, left_on='class1', right_on='class_name', how='inner'
).merge(
    communities_df, left_on='class2', right_on='class_name', how='inner', suffixes=('_1', '_2')
)

# Keep only edges whose endpoints belong to different services and sum their
# static distances per ordered (service_1, service_2) pair
static_dict = merged_df.loc[merged_df['service_1'] != merged_df['service_2']].groupby(['service_1', 'service_2'])['static_distance'].sum().to_dict()

# A service whose classes all lack embeddings has no similarity entry; fall
# back to 0 here so that no None/NaN value ends up in the service graph
semantic_dict = {
    (s1, s2): service_similarities.get(s1, {}).get(s2, 0)
    for s1, s2 in static_dict
}

# 4. Normalize static distances
max_static_distance = max(static_dict.values(), default=0)
# Dividing by 1 when the maximum is 0 avoids a ZeroDivisionError
normalized_static_dict = {
    k: v / (max_static_distance or 1) for k, v in static_dict.items()
}

# 5. Create the service_graph_df DataFrame
service_graph_data = [
    [s1, s2, normalized_static_dict[(s1, s2)], semantic_dict[(s1, s2)]]
    for s1, s2 in normalized_static_dict
]

service_graph_df = pd.DataFrame(service_graph_data, columns=['service1', 'service2', 'static_distance', 'semantic_distance'])

# 6. Save service_graph_df to CSV
service_graph_df.to_csv(f"generated_data/graph/service/{version}_{system}_service_graph.csv", index=False)

service_graph_df.head()  # Display the first few rows of the dataframe

## 1.2 Cluster services

In [None]:
import networkx as nx
import numpy as np
import math
from matplotlib import pyplot as plt
from sklearn.manifold import MDS
import skfuzzy as fuzz


In [None]:
def edge_weight(semantic, static, alpha=0.5, beta=0.5):
    """Blend the static and semantic measures of an edge into a single weight.

    Args:
        semantic: Semantic measure of the edge; scaled by ``beta``.
        static: Static measure of the edge; scaled by ``alpha``.
        alpha: Multiplier applied to ``static``.
        beta: Multiplier applied to ``semantic``.

    Returns:
        ``alpha * static + beta * semantic``.
    """
    static_term = alpha * static
    semantic_term = beta * semantic
    return static_term + semantic_term

# Build the undirected service graph one edge at a time; every edge carries
# the blended static/semantic weight.
# NOTE(review): the graph is undirected, so if service_graph_df holds both an
# (a, b) row and a (b, a) row, the later row overwrites the weight of the
# earlier one - confirm this is intended.
services_graph = nx.Graph()
for _, edge in service_graph_df.iterrows():
    services_graph.add_edge(
        edge['service1'],
        edge['service2'],
        weight=edge_weight(edge['semantic_distance'], edge['static_distance']),
    )

In [None]:
# All-pairs shortest path lengths over the weighted service graph. The node
# order is passed explicitly so that row/column i of the matrix is the i-th
# entry of services_graph.nodes.
# NOTE(review): pairs of services with no connecting path get an infinite
# distance - verify the graph is connected before feeding this matrix to MDS.
shortest_distances = nx.floyd_warshall_numpy(
    services_graph,
    nodelist=list(services_graph.nodes),
    weight='weight',
)
print(shortest_distances)

In [None]:
# Project the services into 2D so that pairwise Euclidean distances
# approximate the precomputed shortest-path distances
# NOTE(review): neither MDS nor cmeans is seeded here, so results can differ
# between runs; pass random_state= / seed= if reproducibility is required.
embedding = MDS(n_components=2, dissimilarity='precomputed')
transformed_data = embedding.fit_transform(shortest_distances)

# Sweep the number of clusters and record the fuzzy partition coefficient of
# each run. Asking for more clusters than there are services is meaningless
# and only wastes time, so the sweep is capped at the number of samples.
fpcs = []
max_clusters = min(99, transformed_data.shape[0])
cluster_range = range(1, max_clusters + 1)  # Adjust as needed
for c_value in cluster_range:
    # cmeans expects features in rows and samples in columns, hence the
    # transpose; only the last return value (the FPC) is needed here
    *_, fpc = fuzz.cmeans(
        transformed_data.T,
        c=c_value,
        m=2,
        error=0.005,
        maxiter=1000,
    )
    fpcs.append(fpc)

# Plot the FPC values to determine the 'elbow'
plt.figure()
plt.plot(cluster_range, fpcs)
plt.title('Fuzzy Partition Coefficient (FPC) for different cluster numbers')
plt.xlabel('Number of clusters')
plt.ylabel('FPC')
plt.grid(True)
plt.show()

def detect_elbow(y_values):
    """Return the index of the 'elbow' of a curve.

    The curve is the sequence of points ``(i, y_values[i])``. The elbow is the
    point with the largest perpendicular distance to the straight line that
    joins the first and the last point.

    Args:
        y_values: Sequence of numeric y coordinates, one per x position
            ``0, 1, 2, ...``.

    Returns:
        The 0-based position in ``y_values`` of the elbow, as an ``int``.
        With fewer than three points no elbow can exist and ``0`` is returned.

    Raises:
        ValueError: If ``y_values`` is empty.
    """
    y = np.asarray(y_values, dtype=float)
    n_points = y.size
    if n_points == 0:
        raise ValueError("detect_elbow requires at least one value")
    if n_points < 3:
        # One or two points always lie on the reference line; returning early
        # also avoids normalising a zero-length line vector
        return 0

    # Coordinates of all the points, one (x, y) row per value
    all_coords = np.column_stack((np.arange(n_points), y))

    # Unit vector along the line from the first point to the last point
    first_point = all_coords[0]
    line_vector = all_coords[-1] - first_point
    line_vector_norm = line_vector / np.sqrt(np.sum(line_vector ** 2))

    # Split each point's offset from the first point into the part parallel
    # to the line and the remainder, which is orthogonal to it
    vec_from_first = all_coords - first_point
    scalar_prod = np.sum(vec_from_first * line_vector_norm, axis=1)
    vec_from_first_parallel = np.outer(scalar_prod, line_vector_norm)
    vec_to_line = vec_from_first - vec_from_first_parallel

    # The length of the orthogonal part is the distance to the line
    dist_to_line = np.sqrt(np.sum(vec_to_line ** 2, axis=1))

    # Index of the point farthest from the line
    return int(np.argmax(dist_to_line))

# detect_elbow returns a position in fpcs, not a cluster count. The sweep in
# cluster_range starts at 1 cluster, so the position has to be mapped back
# through cluster_range (using the raw index would be off by one and could
# even request 0 clusters).
optimal_clusters = cluster_range[detect_elbow(fpcs)]
print(f"Optimal number of clusters: {optimal_clusters}")

# Final fuzzy c-means run on the 2D MDS coordinates with the selected number
# of clusters
cntr, u, u0, d, jm, p, fpc = fuzz.cmeans(
    transformed_data.T,
    c=optimal_clusters,
    m=2,
    error=0.005,
    maxiter=1000,
)
# Harden the fuzzy partition: each sample gets the cluster in which its
# membership is highest
labels = np.argmax(u, axis=0)

In [None]:
import os

# Visualization: one scatter series per cluster in the 2D MDS space
plt.figure(figsize=(10, 8))
for cluster_num in np.unique(labels):
    cluster_points = transformed_data[labels == cluster_num]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Microservice {cluster_num + 1}')

plt.title('Fuzzy C-Means Clustering')
plt.xlabel('MDS Dimension 1')
plt.ylabel('MDS Dimension 2')
plt.legend()
plt.show()

# Save to microservices.txt
# Create the output directory first so that a fresh checkout does not fail,
# and pin the encoding so the file is identical on every platform
os.makedirs("./results", exist_ok=True)
with open(f"./results/{version}_{system}_microservices.txt", "w", encoding="utf-8") as file:
    for cluster_num in np.unique(labels):
        file.write(f"Microservice {cluster_num + 1}:\n")
        # labels[idx] is matched with the idx-th graph node; this assumes the
        # clustered points are in services_graph.nodes order - TODO confirm
        cluster_services = [service_name for idx, service_name in enumerate(services_graph.nodes) if labels[idx] == cluster_num]

        for service in cluster_services:
            file.write(f"  - Service: {service}\n")

            # Retrieve and write related classes for the service
            related_classes = communities_df[communities_df['service'] == service]['class_name'].tolist()
            for related_class in related_classes:
                file.write(f"    - Class: {related_class}\n")

        file.write("\n")