# Phase 3: Microservice Identification (Grouping by Similar Services)

In [None]:
# Run configuration. These three values are interpolated into the input and
# output file paths used by the cells below.
version = "v_imen" # All options: v_imen, v_team
system = "pos" # All options: jforum, cargotracker, petclinic, pos
model_type = "albert" # All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create service graph

In [None]:
import pandas as pd
from scipy import spatial
from utils import load_data_from_csv

In [None]:
# Read the inputs for this phase from CSV files selected by `version`,
# `system` and `model_type`.
# The algorithm name only selects which communities file is loaded.
best_community_detection_algorithm = 'Louvain' # Change this
# Later cells read the 'class_name' and 'service' columns of communities_df.
communities_df = pd.read_csv(f"generated_data/community/{version}_{system}_{best_community_detection_algorithm}_communities.csv")
# Later cells read the 'class1', 'class2' and 'static_distance' columns of class_graph_df.
class_graph_df = pd.read_csv(f"generated_data/graph/class/{version}_{system}_class_graph.csv")
# NOTE(review): load_data_from_csv comes from the project's utils module;
# presumably it returns parallel sequences of names, labels and embedding
# vectors -- confirm against utils. class_labels is not used in this section.
class_names, class_labels, class_embeddings = load_data_from_csv(f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv")

# Data Structuring: map each class name to its embedding for lookup by name.
class_embeddings_dict = dict(zip(class_names, class_embeddings))

In [None]:
def filter_valid_classes(communities, embeddings_dict):
    """Keep only the rows whose 'class_name' is a key of the embeddings dictionary.

    Args:
        communities: DataFrame with a 'class_name' column.
        embeddings_dict: Mapping keyed by class name.

    Returns:
        The rows of ``communities`` whose class name has an embedding, with
        the original index and columns preserved.
    """
    known_classes = list(embeddings_dict)
    has_embedding = communities['class_name'].isin(known_classes)
    return communities.loc[has_embedding]

def compute_service_embeddings(valid_communities, embeddings_dict):
    """Average the class embeddings of every service.

    Args:
        valid_communities: DataFrame with 'service' and 'class_name' columns;
            every class name must be a key of ``embeddings_dict``.
        embeddings_dict: Mapping from class name to its embedding.

    Returns:
        Dict mapping each service to the element-wise mean of the embeddings
        of the classes assigned to it.
    """
    names_by_service = valid_communities.groupby('service')['class_name']
    service_embeddings = {}
    for service, member_names in names_by_service:
        total = sum(embeddings_dict[class_name] for class_name in member_names)
        service_embeddings[service] = total / len(member_names)
    return service_embeddings

def compute_service_similarities(service_embeddings):
    """Compute the cosine similarity between every ordered pair of distinct services.

    Args:
        service_embeddings: Mapping from service to its embedding vector.

    Returns:
        Nested dict ``{s1: {s2: similarity}}`` where the similarity is
        ``1 - cosine distance``; a service never appears in its own inner dict.
    """
    similarities = {}
    for source, source_vector in service_embeddings.items():
        row = {}
        for target, target_vector in service_embeddings.items():
            if source == target:
                continue
            row[target] = 1 - spatial.distance.cosine(source_vector, target_vector)
        similarities[source] = row
    return similarities

def process_distances(class_graph, communities):
    """Sum class-level static distances into service-level totals.

    Each class-graph edge is tagged with the service of both of its endpoint
    classes via two inner merges; edges whose endpoints belong to the same
    service are discarded and the rest are summed per service pair.

    Args:
        class_graph: DataFrame with 'class1', 'class2' and 'static_distance'
            columns.
        communities: DataFrame with 'class_name' and 'service' columns.

    Returns:
        Dict mapping ``(service_1, service_2)`` to the summed static distance
        of all edges going from a class in service_1 to a class in service_2.
    """
    # Attach the service of the first endpoint.
    with_first_service = class_graph.merge(
        communities, how='inner', left_on='class1', right_on='class_name'
    )
    # Attach the service of the second endpoint; the clashing community
    # columns receive the '_1' / '_2' suffixes.
    with_both_services = with_first_service.merge(
        communities,
        how='inner',
        left_on='class2',
        right_on='class_name',
        suffixes=('_1', '_2'),
    )

    crosses_services = with_both_services['service_1'] != with_both_services['service_2']
    cross_service_edges = with_both_services.loc[crosses_services]
    totals = cross_service_edges.groupby(['service_1', 'service_2'])['static_distance'].sum()
    return totals.to_dict()

def normalize_static_distances(distances):
    """Scale static distances by their maximum value.

    Args:
        distances: Dict mapping a key (here a service pair) to a numeric
            distance.

    Returns:
        A new dict with the same keys and each value divided by the largest
        value. An empty input yields an empty dict. When the largest value is
        0 there is nothing to scale by, so every value is returned as 0.0
        instead of dividing by zero.
    """
    max_distance = max(distances.values(), default=0)
    if max_distance == 0:
        # Empty input or all-zero distances: avoid a ZeroDivisionError / NaN.
        return {pair: 0.0 for pair in distances}
    return {pair: value / max_distance for pair, value in distances.items()}

def create_service_graph_dataframe(static_distances, semantic_similarities):
    """Build the service graph edge list as a DataFrame.

    Args:
        static_distances: Dict mapping ``(service1, service2)`` to a static
            distance; its keys define the rows of the result.
        semantic_similarities: Dict mapping ``(service1, service2)`` to a
            semantic value. A pair that is missing, or present with the value
            ``None``, is written as 0.

    Returns:
        DataFrame with columns 'service1', 'service2', 'static_distance' and
        'semantic_distance', one row per key of ``static_distances``.
    """
    # NOTE(review): the 'semantic_distance' column is filled with whatever
    # `semantic_similarities` holds; the caller in this file passes cosine
    # *similarities*. Confirm the column name matches the intended meaning.
    rows = []
    for (s1, s2), static_value in static_distances.items():
        semantic_value = semantic_similarities.get((s1, s2))
        if semantic_value is None:
            # dict.get's default only covers absent keys; a key stored with
            # None would otherwise become NaN in the frame.
            semantic_value = 0
        rows.append([s1, s2, static_value, semantic_value])
    return pd.DataFrame(rows, columns=['service1', 'service2', 'static_distance', 'semantic_distance'])

In [None]:
# Usage
valid_communities_df = filter_valid_classes(communities_df, class_embeddings_dict)
service_to_embedding = compute_service_embeddings(valid_communities_df, class_embeddings_dict)
service_similarities = compute_service_similarities(service_to_embedding)
static_distances = process_distances(class_graph_df, communities_df)
semantic_similarities = {(s1, s2): service_similarities.get(s1, {}).get(s2) for s1, s2 in static_distances.keys()}
normalized_static_distances = normalize_static_distances(static_distances)
service_graph_df = create_service_graph_dataframe(normalized_static_distances, semantic_similarities)

# Save to CSV
filename = f"generated_data/graph/service/{version}_{system}_service_graph.csv"
service_graph_df.to_csv(filename, index=False)

## 1.2 Cluster services

In [None]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
import skfuzzy as fuzz


In [None]:
# Constants (change if needed)
N_COMPONENTS = 2
DISSIMILARITY = 'precomputed'
FUZZINESS = 2
ERROR_THRESHOLD = 0.005
MAX_ITERATIONS = 1000
CLUSTER_RANGE = range(1, 100)

In [None]:
def compute_edge_weight(semantic, static, alpha=0.5, beta=0.5):
    """Combine the static and semantic values of an edge into a single weight.

    Args:
        semantic: Semantic value of the edge (scaled by ``beta``).
        static: Static value of the edge (scaled by ``alpha``).
        alpha: Weight of the static term.
        beta: Weight of the semantic term.

    Returns:
        ``alpha * static + beta * semantic``.
    """
    static_term = alpha * static
    semantic_term = beta * semantic
    return static_term + semantic_term

def compute_2D_embedding(distances):
    """Project a distance matrix to low-dimensional coordinates with MDS.

    The number of output dimensions and the dissimilarity mode come from the
    module-level ``N_COMPONENTS`` and ``DISSIMILARITY`` constants.

    Args:
        distances: Distance matrix handed to ``MDS.fit_transform``.

    Returns:
        The coordinates returned by ``MDS.fit_transform``.
    """
    mds_model = MDS(dissimilarity=DISSIMILARITY, n_components=N_COMPONENTS)
    coordinates = mds_model.fit_transform(distances)
    return coordinates

def determine_optimal_clusters(data):
    """Pick the number of clusters at the elbow of the FPC curve.

    Runs Fuzzy C-Means once for every candidate in ``CLUSTER_RANGE``, records
    the fuzzy partition coefficient (FPC) of each run, plots the curve and
    returns the candidate at the elbow.

    Args:
        data: Array with one row per sample; it is transposed before being
            passed to ``fuzz.cmeans``.

    Returns:
        The cluster count (an element of ``CLUSTER_RANGE``) at the elbow.
    """
    fpc_values = []
    for c_value in CLUSTER_RANGE:
        # fuzz.cmeans returns a 7-tuple; only the last element (FPC) is kept.
        *_, fpc = fuzz.cmeans(
            data.T,
            c=c_value,
            m=FUZZINESS,
            error=ERROR_THRESHOLD,
            maxiter=MAX_ITERATIONS,
        )
        fpc_values.append(fpc)

    plt.figure()
    plt.plot(CLUSTER_RANGE, fpc_values)
    plt.title('Fuzzy Partition Coefficient (FPC) for different cluster numbers')
    plt.xlabel('Number of clusters')
    plt.ylabel('FPC')
    plt.grid(True)
    plt.show()

    # detect_elbow returns a 0-based position in fpc_values, not a cluster
    # count. Map it back through CLUSTER_RANGE so the result is never off by
    # one (or 0 clusters when the elbow is the first point).
    elbow_index = int(detect_elbow(fpc_values))
    return CLUSTER_RANGE[elbow_index]

def detect_elbow(y_values):
    """Find the position of the 'elbow' in a sequence of y-values.

    Each value is treated as the point ``(index, value)``. The elbow is the
    point lying farthest from the straight line through the first and the
    last point.

    Args:
        y_values: Sequence of numeric values.

    Returns:
        The 0-based index of the elbow point within ``y_values``.
    """
    count = len(y_values)
    points = np.vstack((range(count), y_values)).T

    # Unit vector along the chord joining the first and the last point.
    origin = points[0]
    chord = points[-1] - points[0]
    unit_chord = chord / np.sqrt(np.sum(chord**2))

    # Split each point's offset from the origin into the part along the chord
    # and the remainder perpendicular to it.
    offsets = points - origin
    along_chord = np.sum(offsets * unit_chord, axis=1)
    projections = np.outer(along_chord, unit_chord)
    perpendicular = offsets - projections

    # The elbow is where the perpendicular remainder is longest.
    distances = np.sqrt(np.sum(perpendicular ** 2, axis=1))
    return np.argmax(distances)

def fuzzy_cmeans_clustering(data, optimal_clusters):
    """Assign every sample to its strongest Fuzzy C-Means cluster.

    Args:
        data: Array with one row per sample; it is transposed before being
            passed to ``fuzz.cmeans``.
        optimal_clusters: Number of clusters to fit.

    Returns:
        Array holding, for every sample, the index of the cluster with the
        highest membership value.
    """
    cmeans_result = fuzz.cmeans(
        data.T,
        c=optimal_clusters,
        m=FUZZINESS,
        error=ERROR_THRESHOLD,
        maxiter=MAX_ITERATIONS,
    )
    # The second element of the result is the membership matrix.
    membership = cmeans_result[1]
    hard_labels = np.argmax(membership, axis=0)
    return hard_labels

def visualize_clusters(data, labels):
    """Draw a scatter plot of the samples, one series per cluster.

    Args:
        data: Array of sample coordinates; columns 0 and 1 are plotted.
        labels: Array with the cluster index of every row of ``data``.
    """
    plt.figure(figsize=(10, 8))

    for cluster_num in np.unique(labels):
        in_cluster = labels == cluster_num
        members = data[in_cluster]
        x_coords = members[:, 0]
        y_coords = members[:, 1]
        # Clusters are shown with 1-based numbering in the legend.
        plt.scatter(x_coords, y_coords, label=f'Microservice {cluster_num + 1}')

    plt.title('Fuzzy C-Means Clustering')
    plt.xlabel('MDS Dimension 1')
    plt.ylabel('MDS Dimension 2')
    plt.legend()
    plt.show()

def save_to_file(labels, services_graph, communities_df, filename):
    """Write the clustered services and their classes to a text file.

    For every distinct label a ``Microservice <label + 1>`` section is
    written, listing the services assigned to it and, under each service, the
    classes that ``communities_df`` maps to that service.

    Args:
        labels: Array of cluster indices; position ``i`` belongs to the i-th
            node of ``services_graph.nodes``.
        services_graph: Object whose ``nodes`` attribute iterates the service
            names in the same order as ``labels``.
        communities_df: DataFrame with 'service' and 'class_name' columns.
        filename: Path of the output file. Missing parent directories are
            created, and the file is written as UTF-8.
    """
    # Local import keeps this block self-contained.
    from pathlib import Path

    output_path = Path(filename)
    # Create the target directory so a fresh checkout does not fail with
    # FileNotFoundError.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    service_names = list(services_graph.nodes)
    # An explicit encoding keeps the output independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as file:
        for cluster_num in np.unique(labels):
            file.write(f"Microservice {cluster_num + 1}:\n")
            cluster_services = [
                service_name
                for idx, service_name in enumerate(service_names)
                if labels[idx] == cluster_num
            ]

            for service in cluster_services:
                file.write(f"  - Service: {service}\n")
                related_classes = communities_df.loc[
                    communities_df['service'] == service, 'class_name'
                ].tolist()
                for related_class in related_classes:
                    file.write(f"    - Class: {related_class}\n")
            file.write("\n")

In [None]:
# Usage
services_graph = nx.Graph([
    (row['service1'], row['service2'], {"weight": compute_edge_weight(row['semantic_distance'], row['static_distance'])})
    for _, row in service_graph_df.iterrows()
])

shortest_distances = nx.floyd_warshall_numpy(services_graph, weight='weight')
transformed_data = compute_2D_embedding(shortest_distances)
optimal_clusters = determine_optimal_clusters(transformed_data)
print('Optimal number of clusters: ', optimal_clusters)
labels = fuzzy_cmeans_clustering(transformed_data, optimal_clusters)
visualize_clusters(transformed_data, labels)

# Save results
save_to_file(labels, services_graph, communities_df, f"./results/{version}_{system}_microservices.txt")