# Phase 3: Microservice Identification (Grouping by Similar Services)

In [None]:
# Version tag used in the generated_data / results file names.
# All options: v_imen, v_team
version = "v_imen"

# System tag used in the generated_data / results file names.
# All options: jforum, cargotracker, petclinic, pos
system = "pos"

# Embedding model tag used to pick the embeddings CSV.
# All options: ft_codebert, word2vec, albert, codebert, roberta, bert
model_type = "codebert"

# Community detection algorithm tag used to pick the communities CSV.
# All options: Louvain, EdMot, Infomap, LabelPropagation, FastGreedy, GirvanNewman
best_community_detection_algorithm = "Louvain"

## 1.1 Create service graph

In [None]:
import pandas as pd
import numpy as np
from scipy import spatial
from utils import save_microservices_to_file

In [None]:
# Read the community assignment, class graph and class embedding CSV files
# selected by the configuration variables.
communities_df = pd.read_csv(
    f"generated_data/community/{version}_{system}_{best_community_detection_algorithm}_communities.csv"
)
class_graph_df = pd.read_csv(
    f"generated_data/graph/class/{version}_{system}_class_graph.csv"
)
embeddings_df = pd.read_csv(
    f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv"
)

# The first column identifies the class; only the text before the first ';'
# is kept as the class name. Every remaining column is an embedding value.
name_parts = embeddings_df.iloc[:, 0].str.split(';', expand=True)
class_names = name_parts[0]
embeddings = embeddings_df.iloc[:, 1:].values

# Map each class name to its embedding row.
class_embeddings_dict = {name: vector for name, vector in zip(class_names, embeddings)}

def compute_static_distances_for_service_pairs(class_graph, communities):
    """Sum class-level static distances per ordered pair of distinct services.

    Args:
        class_graph: DataFrame with 'class1', 'class2' and 'static_distance' columns.
        communities: DataFrame with 'class_name' and 'service' columns.

    Returns:
        dict mapping (service_1, service_2) to the summed 'static_distance' of
        every class edge whose endpoints belong to those two different services.
        Edges whose endpoints are missing from ``communities`` are dropped by
        the inner joins.
    """
    # Attach the service of the edge's first endpoint ...
    with_first = class_graph.merge(communities, left_on='class1', right_on='class_name')
    # ... then of the second one; clashing columns receive the _1 / _2 suffixes.
    with_both = with_first.merge(
        communities, left_on='class2', right_on='class_name', suffixes=('_1', '_2')
    )

    # Only edges that cross a service boundary contribute.
    crosses_boundary = with_both['service_1'].ne(with_both['service_2'])
    totals = (
        with_both.loc[crosses_boundary]
        .groupby(['service_1', 'service_2'])['static_distance']
        .sum()
    )
    return totals.to_dict()

def compute_service_embeddings(embeddings_dict, communities):
    """Average the class embeddings of every service.

    Args:
        embeddings_dict: mapping of class name to embedding vector.
        communities: DataFrame with 'service' and 'class_name' columns.

    Returns:
        dict mapping each service to the element-wise mean of the embeddings of
        its classes. Classes without an embedding are ignored, and a service
        with no embedded class at all is left out of the result.
    """
    averaged = {}
    classes_by_service = communities.groupby('service')['class_name']
    for service_name, members in classes_by_service:
        vectors = []
        for member in members:
            if member in embeddings_dict:
                vectors.append(embeddings_dict[member])
        if not vectors:
            continue
        averaged[service_name] = np.mean(vectors, axis=0)
    return averaged

def compute_semantic_distances_for_service_pairs(embeddings_dict, communities):
    """Compute a semantic score for every ordered pair of distinct services.

    The score stored for a pair is ``1 - scipy cosine distance`` of the two
    service embeddings, i.e. their cosine similarity.

    Args:
        embeddings_dict: mapping of class name to embedding vector.
        communities: DataFrame with 'service' and 'class_name' columns.

    Returns:
        dict mapping (service_a, service_b) to the score; both orderings of a
        pair are present.
    """
    vectors = compute_service_embeddings(embeddings_dict, communities)
    names = list(vectors)

    scores = {}
    for row, source in enumerate(names):
        for col, target in enumerate(names):
            if row == col:
                continue  # no self pairs
            cosine_distance = spatial.distance.cosine(vectors[source], vectors[target])
            scores[(source, target)] = 1 - cosine_distance

    return scores

def normalize_data(data):
    """Min-max scale the values of a mapping into the range [0, 1].

    Args:
        data: mapping of key to numeric value.

    Returns:
        A new dict with the same keys. Values are ``(v - min) / (max - min)``.
        When every value is identical (zero range) all values become 0.
        An empty mapping yields an empty dict instead of raising from
        ``min()`` / ``max()``.
    """
    if not data:
        return {}

    values = data.values()
    min_val, max_val = min(values), max(values)
    range_val = max_val - min_val

    # A zero range would divide by zero; collapse everything to 0 instead.
    if not range_val:
        return {k: 0 for k in data}
    return {k: (v - min_val) / range_val for k, v in data.items()}

# Aggregate the class-level data into per-service-pair values.
static_distances = compute_static_distances_for_service_pairs(class_graph_df, communities_df)
semantic_distances = compute_semantic_distances_for_service_pairs(class_embeddings_dict, communities_df)

# Rescale both measures to [0, 1] so they can be combined later.
normalized_static_distances = normalize_data(static_distances)
normalized_semantic_distances = normalize_data(semantic_distances)

# One row per service pair that has a static relation; a pair without a
# semantic value falls back to 0.
service_graph_data = []
for pair in static_distances:
    first_service, second_service = pair
    service_graph_data.append([
        first_service,
        second_service,
        normalized_static_distances.get(pair, 0),
        normalized_semantic_distances.get(pair, 0),
    ])
service_graph_df = pd.DataFrame(
    service_graph_data,
    columns=['service1', 'service2', 'static_distance', 'semantic_distance'],
)

# Persist the service graph as CSV.
filename = f"generated_data/graph/service/{version}_{system}_service_graph.csv"
service_graph_df.to_csv(filename, index=False)

## 1.2 Cluster services

In [None]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
import skfuzzy as fuzz
import math
from collections import defaultdict

In [None]:
def compute_edge_weight(semantic, static, alpha=0.5):
    """Blend a static and a semantic value into one edge weight.

    Args:
        semantic: semantic value, expected in [0, 1].
        static: static value, expected in [0, 1].
        alpha: weight given to ``static``; ``semantic`` receives ``1 - alpha``.

    Returns:
        ``alpha * static + (1 - alpha) * semantic``.

    Raises:
        ValueError: if ``semantic`` or ``static`` lies outside [0, 1].
    """
    inputs_in_range = (0 <= semantic <= 1) and (0 <= static <= 1)
    if not inputs_in_range:
        raise ValueError("Both 'semantic' and 'static' values should be between 0 and 1.")

    beta = 1 - alpha
    static_part = alpha * static
    semantic_part = beta * semantic
    return static_part + semantic_part

In [None]:
# Build the undirected service graph; each edge weight blends the semantic
# and static values of the pair.
services_graph = nx.Graph()
for _, row in service_graph_df.iterrows():
    # adjust alpha (weight of static distance) in compute_edge_weight if needed
    edge_weight = compute_edge_weight(row['semantic_distance'], row['static_distance'])
    services_graph.add_edge(row['service1'], row['service2'], weight=edge_weight)

# Drop edges whose weight is exactly 0 (their endpoints stay in the graph).
zero_weight_edges = [
    (u, v) for u, v, weight in services_graph.edges(data='weight') if weight == 0
]
services_graph.remove_edges_from(zero_weight_edges)

# Show every remaining edge with its weight.
print("Edge weights:")
for u, v, weight in services_graph.edges(data='weight'):
    print(f"{u} - {v}: {weight}")

# All-pairs weighted shortest path lengths.
shortest_distances = nx.floyd_warshall_numpy(services_graph, weight='weight')

# Unreachable pairs come back as inf; substitute a large finite value (1e9).
unreachable = shortest_distances == np.inf
shortest_distances[unreachable] = 1e9

print("Shortest distances matrix:")
print(shortest_distances)

### 1.2.1 Fuzzy C-means with scikit-fuzzy

In [None]:
# Tunable constants (change if needed)

# MDS settings: output dimensionality and the kind of input matrix.
N_COMPONENTS = 2
DISSIMILARITY = "precomputed"

# Fuzzy C-means settings passed to fuzz.cmeans (m, error, maxiter).
FUZZINESS = 2
ERROR_THRESHOLD = 0.005
MAX_ITERATIONS = 1000

# Candidate cluster counts tried when searching for the optimum.
CLUSTER_RANGE = range(1, 100)

In [None]:
def compute_2D_embedding(distances):
    """Project a distance matrix to N_COMPONENTS dimensions with MDS.

    Args:
        distances: dissimilarity matrix, interpreted according to DISSIMILARITY.

    Returns:
        The coordinates produced by ``MDS.fit_transform``.
    """
    mds_model = MDS(dissimilarity=DISSIMILARITY, n_components=N_COMPONENTS)
    coordinates = mds_model.fit_transform(distances)
    return coordinates

def determine_optimal_clusters(data):
    """Pick a cluster count by finding the elbow of the FPC curve.

    Fuzzy C-means is run once for every candidate in CLUSTER_RANGE, the fuzzy
    partition coefficient (FPC) of each run is plotted, and the elbow of that
    curve is located with ``detect_elbow``.

    Args:
        data: array whose transpose is handed to ``fuzz.cmeans``.

    Returns:
        The candidate cluster count (an element of CLUSTER_RANGE) at the elbow.
        ``detect_elbow`` yields a 0-based position in the FPC list, so it is
        mapped back to the corresponding cluster count rather than being
        returned directly (which would be off by one and could yield 0).
    """
    candidate_counts = list(CLUSTER_RANGE)

    fpc_values = []
    for c_value in candidate_counts:
        # Only the last returned item (the FPC) is needed here.
        *_, fpc = fuzz.cmeans(
            data.T,
            c=c_value,
            m=FUZZINESS,
            error=ERROR_THRESHOLD,
            maxiter=MAX_ITERATIONS
        )
        fpc_values.append(fpc)

    plt.figure()
    plt.plot(candidate_counts, fpc_values)
    plt.title('Fuzzy Partition Coefficient (FPC) for different cluster numbers')
    plt.xlabel('Number of clusters')
    plt.ylabel('FPC')
    plt.grid(True)
    plt.show()

    # Translate the elbow position into the cluster count it was computed for.
    elbow_index = detect_elbow(fpc_values)
    return candidate_counts[elbow_index]

def detect_elbow(y_values):
    """Locate the 'elbow' of a curve given by its y-values.

    The points ``(i, y_values[i])`` are compared against the straight line
    joining the first and the last point; the elbow is the point farthest
    from that line.

    Args:
        y_values: sequence of y-values; x is the 0-based position.

    Returns:
        The 0-based index of the point with the largest distance to the line.
    """
    count = len(y_values)
    points = np.column_stack((np.arange(count), y_values))

    # Unit vector along the chord from the first to the last point.
    origin = points[0]
    chord = points[-1] - origin
    chord_unit = chord / np.sqrt((chord ** 2).sum())

    # Split each offset from the origin into its component along the chord
    # and the remainder, which is perpendicular to it.
    offsets = points - origin
    projections = (offsets * chord_unit).sum(axis=1)
    along_chord = np.outer(projections, chord_unit)
    perpendicular = offsets - along_chord

    # Length of the perpendicular part = distance to the chord.
    distances = np.sqrt((perpendicular ** 2).sum(axis=1))
    return np.argmax(distances)

def fuzzy_cmeans_clustering(data, optimal_clusters):
    """Run fuzzy C-means and return a hard label per sample.

    Args:
        data: array whose transpose is handed to ``fuzz.cmeans``.
        optimal_clusters: number of clusters to fit.

    Returns:
        For every column of the membership matrix, the index of the cluster
        with the highest membership.
    """
    cmeans_output = fuzz.cmeans(
        data.T,
        c=optimal_clusters,
        m=FUZZINESS,
        error=ERROR_THRESHOLD,
        maxiter=MAX_ITERATIONS
    )
    # Second returned item is the membership matrix.
    membership = cmeans_output[1]
    hard_labels = np.argmax(membership, axis=0)
    return hard_labels

def visualize_clusters(data, labels):
    """Scatter-plot the points, one colour/legend entry per cluster label.

    Args:
        data: array whose first two columns are used as x and y coordinates.
        labels: array of cluster labels aligned with the rows of ``data``.
    """
    plt.figure(figsize=(10, 8))

    for label in np.unique(labels):
        in_cluster = labels == label
        members = data[in_cluster]
        xs, ys = members[:, 0], members[:, 1]
        # Legend numbering is the label shifted by one.
        plt.scatter(xs, ys, label=f'Microservice {label + 1}')

    plt.title('Fuzzy C-Means Clustering')
    plt.xlabel('MDS Dimension 1')
    plt.ylabel('MDS Dimension 2')
    plt.legend()
    plt.show()

In [None]:
# Choose the cluster count and cluster directly on the shortest-distance matrix.
optimal_clusters = determine_optimal_clusters(shortest_distances)
print('Optimal number of clusters: ', optimal_clusters)
labels = fuzzy_cmeans_clustering(shortest_distances, optimal_clusters)

# The 2D MDS projection is used for plotting only.
embedding_2d = compute_2D_embedding(shortest_distances)
visualize_clusters(embedding_2d, labels)

# Save results
results_path = f"results/{version}_{system}_microservices.txt"
save_microservices_to_file(labels, services_graph, communities_df, results_path)

### 1.2.2 Custom fuzzy C-means

In [None]:
def save_clusters_to_file(clusters, communities_df, filename):
    """Write the clusters, their services and the services' classes to a text file.

    Args:
        clusters: mapping of cluster center to the list of member services.
        communities_df: DataFrame with 'service' and 'class_name' columns.
        filename: path of the file to (over)write.
    """
    with open(filename, "w") as out:
        for number, center in enumerate(clusters, start=1):
            out.write(f"Microservice {number} centered at {center}:\n")
            for service in clusters[center]:
                out.write(f"  - Service: {service}\n")
                # Classes assigned to this service in the communities table.
                is_service = communities_df['service'] == service
                owned_classes = communities_df.loc[is_service, 'class_name'].tolist()
                out.writelines(f"    - Class: {name}\n" for name in owned_classes)
            # Blank line between microservices.
            out.write("\n")

In [None]:
class FuzzyCMeans:
    """Fuzzy-membership clustering of services around pre-selected centers.

    Centers are the services whose name starts with a given prefix. Every other
    service joins each center whose fuzzy membership exceeds a threshold, then
    redundant single-member clusters are dropped and strongly overlapping
    clusters are merged.
    """

    def __init__(self, m, membership_threshold, merge_threshold):
        """Store the clustering parameters.

        Args:
            m: fuzziness coefficient; must be greater than 1.
            membership_threshold: a service joins a cluster when its membership
                is strictly greater than this value.
            merge_threshold: two clusters are merged when their overlap ratio
                is strictly greater than this value.

        Raises:
            ValueError: if ``m`` is not greater than 1 (the exponent
                ``2 / (m - 1)`` is undefined for m == 1).
        """
        if m <= 1:
            raise ValueError("Fuzziness coefficient 'm' must be greater than 1.")
        self.m = m  # Fuzziness coefficient
        self.threshold = membership_threshold  # Membership threshold
        self.merge_threshold = merge_threshold  # Threshold for merging clusters based on overlap
        self.coef = 2 / (m - 1)  # Exponent used in membership calculation
        # Center name -> index in the services list; filled by cluster_services.
        self.center_type_indices = {}

    def calculate_membership(self, service_idx, center_idx, distances):
        """Calculate fuzzy membership of a service to a center.

        Args:
            service_idx: row index of the service in ``distances``.
            center_idx: column index of the target center in ``distances``.
            distances: 2D indexable of pairwise distances.

        Returns:
            ``1 / sum((d_target / d_k) ** coef)`` over all known centers k, or
            ``inf`` when the service is at distance 0 from the target center or
            when the sum is 0 (e.g. no centers are registered).
        """
        distance_to_center = distances[service_idx][center_idx]
        if distance_to_center == 0:
            return float('inf')

        membership_sum = 0
        for type_center_idx in self.center_type_indices.values():
            # A zero distance (e.g. the service is itself that center) is
            # replaced by 1e9 to avoid dividing by zero; the resulting term is
            # negligible.
            distance_to_type_center = distances[service_idx][type_center_idx] or 1e9
            membership_sum += math.pow(distance_to_center / distance_to_type_center, self.coef)

        return 1 / membership_sum if membership_sum else float('inf')

    def cluster_services(self, services_list, distances, service_type):
        """Cluster services around the centers whose name starts with ``service_type``.

        Args:
            services_list: service names, ordered like the rows/columns of ``distances``.
            distances: 2D indexable of pairwise distances.
            service_type: name prefix that identifies the center services.

        Returns:
            dict mapping each remaining center to the list of its members
            (the center itself comes first before any merge).
        """
        # Identify service indices of the given type
        self.center_type_indices = {
            service: idx for idx, service in enumerate(services_list) if service.startswith(service_type)
        }

        # Initial clustering
        clusters = {}
        for type_service, type_index in self.center_type_indices.items():
            cluster_members = [type_service]  # Initialize cluster with the center itself
            for idx, service in enumerate(services_list):
                if service == type_service:
                    continue
                membership_value = self.calculate_membership(idx, type_index, distances)
                if membership_value > self.threshold:
                    cluster_members.append(service)
            clusters[type_service] = cluster_members

        # Remove clusters containing only the center service if that service belongs to another cluster
        clusters = self.remove_single_service_clusters(clusters)

        # Merge clusters with overlapping services
        clusters = self.merge_clusters_based_on_overlap(clusters)

        return clusters

    def remove_single_service_clusters(self, clusters):
        """Drop clusters holding only their center when that center is a member elsewhere.

        The ``clusters`` dict is modified in place and also returned.
        """
        # Find centers that are members of other clusters
        centers_in_other_clusters = set()
        for center, members in clusters.items():
            for member in members:
                if member != center and member in self.center_type_indices:
                    centers_in_other_clusters.add(member)

        # Remove clusters with only the center service if the center belongs to another cluster
        clusters_to_remove = [center for center, members in clusters.items()
                              if len(members) == 1 and center in centers_in_other_clusters]
        for center in clusters_to_remove:
            del clusters[center]

        return clusters

    def calculate_overlap(self, cluster_a, cluster_b):
        """Return the Jaccard ratio |A ∩ B| / |A ∪ B| of two clusters (0 if both are empty)."""
        intersection = set(cluster_a).intersection(cluster_b)
        union = set(cluster_a).union(cluster_b)
        overlap_ratio = len(intersection) / len(union) if union else 0
        return overlap_ratio

    def merge_clusters_based_on_overlap(self, clusters):
        """Merge clusters based on the overlap between their services.

        Overlaps are evaluated once, on the clusters as passed in. Each
        qualifying pair is then merged into the earlier cluster, provided both
        clusters still exist at that point. The ``clusters`` dict is modified
        in place and also returned.
        """
        clusters_to_merge = []
        keys = list(clusters.keys())

        # Find pairs of clusters to merge
        for i, key_i in enumerate(keys):
            for key_j in keys[i + 1:]:
                if self.calculate_overlap(clusters[key_i], clusters[key_j]) > self.merge_threshold:
                    clusters_to_merge.append((key_i, key_j))

        # Merge the clusters
        for key_i, key_j in clusters_to_merge:
            if key_i in clusters and key_j in clusters:
                # Append cluster j's services to cluster i, dropping duplicates
                # while keeping first-seen order so results are reproducible
                # across runs (a plain set would give an arbitrary order).
                combined = clusters[key_i] + clusters[key_j]
                clusters[key_i] = list(dict.fromkeys(combined))
                del clusters[key_j]

        return clusters

# Configure the custom clusterer.
fuzzy_c_means = FuzzyCMeans(
    m=2,
    membership_threshold=0.2,
    merge_threshold=0.15,
)

# Node order of the graph; passed together with the shortest-distance matrix.
service_nodes = list(services_graph)

# Cluster around the services whose name starts with the given prefix
# (change service_type to use another kind of service as centers).
clusters = fuzzy_c_means.cluster_services(
    service_nodes,
    shortest_distances,
    service_type="Application",
)

# Show each cluster with its members.
for center in clusters:
    print(f"Cluster centered at {center}: {clusters[center]}")

# Save results
save_clusters_to_file(clusters, communities_df, f"results/{version}_{system}_microservices.txt")