# Phase 3: Microservice Identification (Grouping by Similar Services)

In [None]:
# Run configuration. These three values are interpolated into the
# generated_data/... file names read and written by the cells below.

# Dataset variant -- one of: v_imen, v_team
version = "v_imen"

# System under analysis -- one of: jforum, cargotracker, petclinic, pos
system = "pos"

# Embedding model -- one of: ft_codebert, word2vec, albert, codebert, roberta, bert
model_type = "albert"

## 1.1 Create service graph

In [None]:
import pandas as pd
from scipy import spatial
from utils import load_data_from_csv

In [None]:
# Community detection algorithm whose output is loaded below; change this to
# pick the result of a different algorithm.
best_community_detection_algorithm = 'EdMot'

# Class -> service assignment produced by the community detection step.
communities_df = pd.read_csv(
    f"generated_data/community/{version}_{system}_{best_community_detection_algorithm}_communities.csv"
)

# Class-level graph (edges between pairs of classes).
class_graph_df = pd.read_csv(
    f"generated_data/graph/class/{version}_{system}_class_graph.csv"
)

# Class names, labels and embedding vectors for the selected model.
class_names, class_labels, class_embeddings = load_data_from_csv(
    f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv"
)

# Index the embeddings by class name for direct lookup.
class_embeddings_dict = {
    name: embedding for name, embedding in zip(class_names, class_embeddings)
}

In [None]:
# 1. Service embeddings

# Only classes that actually have an embedding can contribute to a service.
valid_communities_df = communities_df[
    communities_df['class_name'].isin(list(class_embeddings_dict))
]

# A service embedding is the mean of the embeddings of its member classes.
# NOTE(review): assumes the embeddings support `+` and `/` element-wise
# (e.g. numpy arrays) -- confirm against load_data_from_csv.
service_to_embedding = (
    valid_communities_df
    .groupby('service')['class_name']
    .apply(
        lambda members: sum(class_embeddings_dict[member] for member in members) / len(members)
    )
    .to_dict()
)

# 2. Service similarities
# For every ordered pair of distinct services store 1 - cosine distance,
# i.e. the cosine similarity of the two service embeddings.
service_similarities = {
    source: {
        target: 1 - spatial.distance.cosine(source_vec, target_vec)
        for target, target_vec in service_to_embedding.items()
        if target != source
    }
    for source, source_vec in service_to_embedding.items()
}

# 3. Attach the owning service to both endpoints of every class-graph edge.
# The first merge resolves the service of `class1`, the second the service of
# `class2`; the suffixes rename the two copies of the community columns to
# `*_1` (class1 side) and `*_2` (class2 side). Inner joins drop edges whose
# endpoint has no community assignment.
# NOTE(review): assumes class_graph_df has no `class_name`/`service` columns
# of its own -- confirm against the class graph CSV.
merged_df = class_graph_df.merge(
    communities_df, left_on='class1', right_on='class_name', how='inner'
).merge(
    communities_df, left_on='class2', right_on='class_name', how='inner', suffixes=('_1', '_2')
)

# Keep only edges that cross a service boundary (different services on each
# side) and sum their static distances per ordered (service_1, service_2) pair.
static_dict = merged_df.loc[merged_df['service_1'] != merged_df['service_2']].groupby(['service_1', 'service_2'])['static_distance'].sum().to_dict()

# Semantic value for the same pairs. A service without any embedded class is
# absent from service_similarities; default to 0 in that case instead of
# storing None, which would otherwise end up as NaN in the service graph.
semantic_dict = {(s1, s2): service_similarities.get(s1, {}).get(s2, 0) for s1, s2 in static_dict}

# 4. Normalize static distances by the largest accumulated value.
# max_static_distance is 0 both when there are no service pairs and when
# every accumulated distance is 0; `or 1` keeps the division defined in the
# latter case (all values are then left at 0 instead of raising).
max_static_distance = max(static_dict.values()) if static_dict else 0
normalized_static_dict = {k: v / (max_static_distance or 1) for k, v in static_dict.items()}

# 5. Assemble the service graph: one row per ordered service pair holding
# the normalized static distance and the semantic value (0 when the pair has
# no entry in semantic_dict).
# Note: the `semantic_distance` column is filled from the 1 - cosine-distance
# values computed above.
service_graph_data = [
    [source, target, static_value, semantic_dict.get((source, target), 0)]
    for (source, target), static_value in normalized_static_dict.items()
]

service_graph_df = pd.DataFrame(
    service_graph_data,
    columns=['service1', 'service2', 'static_distance', 'semantic_distance'],
)

# 6. Persist the service graph for later steps.
service_graph_df.to_csv(
    f"generated_data/graph/service/{version}_{system}_service_graph.csv",
    index=False,
)

service_graph_df.head()  # Preview the first rows in the notebook output

## 1.2 Cluster services

In [None]:
import networkx as nx
import numpy as np
import math

In [None]:
def compute_edge_weight(semantic_distance, static_distance):
    """
    Combine a semantic and a static distance into a single edge weight.

    The weight is 0.1 * static_distance + 100 * (1 / semantic_distance).
    A semantic distance of exactly 0 contributes nothing to the weight
    (this also avoids dividing by zero).
    """
    if semantic_distance == 0:
        inverse_semantic = 0
    else:
        inverse_semantic = 1 / semantic_distance
    return 0.1 * static_distance + 100 * inverse_semantic


def build_adjacency_matrix(graph, df):
    """
    Build a directed distance matrix for the nodes of the service graph.

    Rows and columns follow the order of `graph.nodes`. Every cell starts at
    infinity; for each row of `df` the cell [service1, service2] is set to
    100 / static_distance.

    Returns the matrix together with the node list that defines its ordering.
    """
    nodes = list(graph.nodes)
    size = len(nodes)
    index_of = dict(zip(nodes, range(size)))

    # Start with "no connection" everywhere.
    matrix = np.full((size, size), np.inf)

    rows = df['service1'].map(index_of).values
    cols = df['service2'].map(index_of).values

    # A static distance of 0 is swapped for infinity before dividing, so the
    # stored value for such a pair is 100 / inf == 0.0.
    # NOTE(review): that makes zero-static pairs the *closest* possible
    # entries -- confirm this is intended and not meant to be "unreachable".
    static_dists = df['static_distance'].replace(0, np.inf).values
    matrix[rows, cols] = 100 / static_dists

    return matrix, nodes


def compute_shortest_paths(matrix):
    """
    Determine shortest paths for all node pairs using the Floyd Warshall algorithm.

    `matrix` is a square array where matrix[i, j] is the direct distance from
    node i to node j (np.inf when there is no edge). The input is not
    modified; a new array of the same shape is returned.

    The diagonal is taken from the input as-is (it is not reset to 0), so a
    diagonal entry of the result is the length of the shortest cycle through
    that node, or np.inf if there is none.
    """
    distances = matrix.copy()
    num_nodes = distances.shape[0]
    for k in range(num_nodes):
        # Candidate route i -> k -> j. Column k holds every d[i, k] and row k
        # holds every d[k, j]; broadcasting a column vector against a row
        # vector yields the full (i, j) grid of d[i, k] + d[k, j]. Using
        # column k for both terms would only be correct for symmetric input.
        via_k = distances[:, k, np.newaxis] + distances[np.newaxis, k, :]
        distances = np.minimum(distances, via_k)
    return distances


def incorporate_semantic_distances(shortest_distances, nodes, df):
    """
    Integrate semantic distances into the shortest distances matrix.

    For every ordered node pair (i, j) that has a row in `df`
    (service1 == nodes[i], service2 == nodes[j]), twice its
    `semantic_distance` is added to shortest_distances[i][j]. Pairs without
    a row, or whose value is missing (None / NaN), are left unchanged. When
    a pair occurs more than once in `df`, the first occurrence is used.

    `shortest_distances` is updated in place and also returned.
    """
    # Build the (service1, service2) -> semantic_distance lookup once instead
    # of running a DataFrame query per node pair. Besides being much cheaper,
    # this does not depend on interpolating service names into a query
    # string, so names containing quotes are handled correctly.
    semantic_lookup = {}
    for source, target, value in zip(df['service1'], df['service2'], df['semantic_distance']):
        # setdefault keeps the first value seen for a duplicated pair.
        semantic_lookup.setdefault((source, target), value)

    for i, service_i in enumerate(nodes):
        for j, service_j in enumerate(nodes):
            distance = semantic_lookup.get((service_i, service_j))
            if distance is None or np.isnan(distance):
                continue
            shortest_distances[i][j] += distance * 2
    return shortest_distances


def fuzzy_weight(service_idx, center_idx, distances, center_indices):
    """
    Compute the fuzzy weight of a service relative to a center node.

    The weight is 1 / (sum over all centers c of d(service, center) /
    d(service, c)) ** exponent, where a zero d(service, c) is treated as
    infinity (so that term contributes 0). If the service is at distance 0
    from `center_idx` itself, infinity is returned.
    """
    # Exponent 2 / (m - 1) with m fixed at 3, i.e. 1.0.
    # NOTE(review): presumably the fuzzy c-means fuzzifier -- confirm.
    exponent = 2 / (3 - 1)

    row = distances[service_idx]
    to_center = row[center_idx]

    if to_center == 0:
        return float('inf')

    ratio_total = 0
    for idx in center_indices:
        to_other = row[idx]
        if to_other == 0:
            # Avoid dividing by zero; the ratio for this center becomes 0.
            to_other = float('inf')
        ratio_total += to_center / to_other

    return 1 / math.pow(ratio_total, exponent)


def map_centers_to_services(graph, distances, center_prefix="Application"):
    """
    Associate each center node with the services that belong to it.

    Center nodes are the graph nodes whose name starts with `center_prefix`.
    A service is attached to a center when its fuzzy weight towards that
    center, multiplied by 100, exceeds 9. A service may therefore appear
    under several centers, or under none.

    Returns a dict mapping each center name to the list of attached service
    names, in graph node order.
    """
    node_names = list(graph.nodes)

    # Position of every center node inside the distance matrix.
    centers = {}
    for idx, name in enumerate(node_names):
        if name.startswith(center_prefix):
            centers[name] = idx
    center_indices = list(centers.values())

    mapping = {}
    for center_name, center_idx in centers.items():
        attached = []
        for service_idx, service_name in enumerate(node_names):
            if service_name == center_name:
                continue  # a center is never listed under itself
            weight = fuzzy_weight(service_idx, center_idx, distances, center_indices)
            if weight * 100 > 9:
                attached.append(service_name)
        mapping[center_name] = attached

    return mapping


def save_center_service_mapping_to_file(center_service_mapping, communities_df, filename="./results/microservices.txt"):
    """
    Save the center_service_mapping to a file, mapping back services to their corresponding classes.

    For every center a "Center Node" line is written, followed by one
    "Service" line per attached service and one "Class" line for every row
    of `communities_df` whose `service` column equals that service. Each
    center section ends with a blank line.

    The parent directory of `filename` is created if it does not exist, and
    the file is written as UTF-8 so the output does not depend on the
    platform's default encoding. An existing file is overwritten.
    """
    import os  # local import: the surrounding notebook does not import os

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # The default target lives in ./results, which may not exist yet.
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        for center_name, services in center_service_mapping.items():
            file.write(f"Center Node: {center_name}\n")

            for service in services:
                file.write(f"  - Service: {service}\n")

                # Retrieve and write related classes for the service
                related_classes = communities_df[communities_df['service'] == service]['class_name'].tolist()
                for related_class in related_classes:
                    file.write(f"    - Class: {related_class}\n")

            file.write("\n")

In [None]:
# Build the service graph: one edge per row of service_graph_df, carrying the
# combined static/semantic weight as its "weight" attribute.
services_graph = nx.Graph()
services_graph.add_edges_from(
    (source, target, {"weight": compute_edge_weight(semantic, static)})
    for source, target, static, semantic in zip(
        service_graph_df['service1'],
        service_graph_df['service2'],
        service_graph_df['static_distance'],
        service_graph_df['semantic_distance'],
    )
)

# Clustering pipeline: distance matrix -> all-pairs shortest paths ->
# add semantic distances -> assign services to center nodes -> write result.
adjacency_matrix, node_names = build_adjacency_matrix(services_graph, service_graph_df)
shortest_paths = compute_shortest_paths(adjacency_matrix)
shortest_paths = incorporate_semantic_distances(shortest_paths, node_names, service_graph_df)
center_service_mapping = map_centers_to_services(services_graph, shortest_paths)
save_center_service_mapping_to_file(center_service_mapping, communities_df)