# Phase 3: Microservice Identification (Grouping by Similar Services)

In [None]:
# Experiment configuration. These three values are interpolated into the
# generated_data/ file names that the cells below read and write.
version = "v_imen" # Dataset variant. All options: v_imen, v_team
system = "pos" # System under analysis. All options: jforum, cargotracker, petclinic, pos
model_type = "albert" # Embedding model whose CSV is loaded. All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create service graph

In [None]:
import pandas as pd
from scipy import spatial
from utils import load_data_from_csv

In [None]:
# Community-detection run whose output is grouped into services.
best_community_detection_algorithm = 'EdMot'  # Change this

# Input locations, all derived from the notebook configuration.
communities_csv = f"generated_data/community/{version}_{system}_{best_community_detection_algorithm}_communities.csv"
class_graph_csv = f"generated_data/graph/class/{version}_{system}_class_graph.csv"
embeddings_csv = f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv"

# Load the class -> service assignment, the class-level graph and the class embeddings.
communities_df = pd.read_csv(communities_csv)
class_graph_df = pd.read_csv(class_graph_csv)
class_names, class_labels, class_embeddings = load_data_from_csv(embeddings_csv)

# Index the embeddings by class name for direct lookup.
class_embeddings_dict = {name: vector for name, vector in zip(class_names, class_embeddings)}

In [None]:
# 1. Calculate service embeddings

# Keep only the classes that actually have an embedding, so the lookup below cannot fail.
valid_communities_df = communities_df[communities_df['class_name'].isin(class_embeddings_dict.keys())]

# A service embedding is the mean of the embeddings of its member classes.
service_to_embedding = valid_communities_df.groupby('service')['class_name'].apply(
    lambda x: sum(class_embeddings_dict[class_name] for class_name in x) / len(x)
).to_dict()

# 2. Calculate service similarities
# NOTE: the value is a cosine *similarity* (1 - cosine distance), even though it is
# later stored in a column called 'semantic_distance'.
service_similarities = {
    s1: {
        s2: 1 - spatial.distance.cosine(embedding1, embedding2)
        for s2, embedding2 in service_to_embedding.items() if s1 != s2
    }
    for s1, embedding1 in service_to_embedding.items()
}

# 3. Create dictionaries to store processed distances using merges
# Attach the service of each endpoint class; the suffixes yield 'service_1' / 'service_2'.
merged_df = class_graph_df.merge(
    communities_df, left_on='class1', right_on='class_name', how='inner'
).merge(
    communities_df, left_on='class2', right_on='class_name', how='inner', suffixes=('_1', '_2')
)

# Keep only edges that cross a service boundary and sum their static values per
# ordered (service_1, service_2) pair.
cross_service_edges = merged_df.loc[merged_df['service_1'] != merged_df['service_2']]
static_dict = cross_service_edges.groupby(['service_1', 'service_2'])['static_distance'].sum().to_dict()

# A service whose classes all lack embeddings has no similarity entry. Default to 0
# here: a key holding None would defeat any later `.get(..., 0)` and end up as NaN
# in the DataFrame and the CSV.
semantic_dict = {
    (s1, s2): service_similarities.get(s1, {}).get(s2, 0)
    for s1, s2 in static_dict
}

# 4. Normalize static distances
max_static_distance = max(static_dict.values()) if static_dict else 0
if max_static_distance:
    normalized_static_dict = {k: v / max_static_distance for k, v in static_dict.items()}
else:
    # Empty input or an all-zero maximum: nothing to scale, and dividing would raise
    # ZeroDivisionError.
    normalized_static_dict = dict(static_dict)

# 5. Create the service_graph_df DataFrame
service_graph_data = [
    [s1, s2, normalized_static, semantic_dict[(s1, s2)]]
    for (s1, s2), normalized_static in normalized_static_dict.items()
]

service_graph_df = pd.DataFrame(service_graph_data, columns=['service1', 'service2', 'static_distance', 'semantic_distance'])

# 6. Save service_graph_df to CSV
service_graph_df.to_csv(f"generated_data/graph/service/{version}_{system}_service_graph.csv", index=False)

service_graph_df.head()  # Display the first few rows of the dataframe

## 1.2 Cluster services

In [None]:
import networkx as nx
import numpy as np
import math

In [None]:
def edge_weight(semantic, static):
    """Blend the semantic and static values of a service pair into one edge weight.

    The result is ``0.1 * static + 100 * (1 / semantic)``. A ``semantic`` value of
    exactly 0 contributes nothing instead of triggering a division by zero.
    """
    if semantic == 0:
        inverse_semantic = 0
    else:
        inverse_semantic = 1 / semantic
    return 0.1 * static + 100 * inverse_semantic

# Build one weighted edge per row of the service graph table, then create the
# (undirected) graph from that edge list.
weighted_edges = []
for _, edge_row in service_graph_df.iterrows():
    combined_weight = edge_weight(edge_row['semantic_distance'], edge_row['static_distance'])
    weighted_edges.append((edge_row['service1'], edge_row['service2'], {"weight": combined_weight}))

services_graph = nx.Graph(weighted_edges)

In [None]:
# Adjacency matrix
def create_adjacency_matrix(graph, df):
    """Build a directed adjacency matrix for the services in ``graph``.

    Args:
        graph: Object exposing ``nodes``; their iteration order fixes the
            row/column order of the matrix.
        df: DataFrame with 'service1', 'service2' and 'static_distance' columns.

    Returns:
        Tuple ``(matrix, nodes)``: ``matrix[i, j]`` is ``100 / static_distance``
        for each (service1, service2) row of ``df`` and ``inf`` for every pair
        without a row; ``nodes`` is the list of nodes in matrix order.
    """
    ordered_nodes = list(graph.nodes)
    position_of = dict(zip(ordered_nodes, range(len(ordered_nodes))))
    size = len(ordered_nodes)

    # Pairs that never appear in df stay at inf (no direct edge).
    adjacency = np.full((size, size), np.inf)

    row_positions = df['service1'].map(position_of).values
    col_positions = df['service2'].map(position_of).values

    # Zeros are swapped for inf so the division cannot hit 0.
    # NOTE(review): this makes a zero static value produce a matrix entry of 0
    # (100 / inf), not inf - confirm that is intended.
    safe_static = df['static_distance'].replace(0, np.inf).values

    # Larger static values become smaller matrix entries.
    adjacency[row_positions, col_positions] = 100 / safe_static

    return adjacency, ordered_nodes

In [None]:
# Floyd Warshall Algorithm
def floyd_warshall(adj_matrix):
    """Compute all-pairs shortest path lengths with a vectorized Floyd-Warshall.

    Args:
        adj_matrix: Square matrix where entry ``[i, j]`` is the length of the
            directed edge i -> j and ``inf`` marks a missing edge.

    Returns:
        A new array of the same shape holding the shortest i -> j path lengths.
        The input is not modified. The diagonal is not forced to 0: entry
        ``[i, i]`` keeps its input value unless a shorter cycle through ``i``
        exists.
    """
    distance = np.array(adj_matrix)  # work on a copy, leave the caller's matrix intact

    for k in range(distance.shape[0]):
        # Relax every pair through intermediate node k: d[i, k] + d[k, j].
        # Column k is broadcast down the rows and *row* k across the columns;
        # using column k for both terms would compute d[i, k] + d[j, k], which
        # is wrong for a directed matrix.
        via_k = distance[:, k][:, np.newaxis] + distance[k, :][np.newaxis, :]
        distance = np.minimum(distance, via_k)

    return distance

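# Alternative: networkx's floyd_warshall_numpy computes all-pairs shortest paths directly
# from the graph, so no adjacency matrix is needed. It is not a drop-in replacement, though:
# it would run on the undirected services_graph using its 'weight' edge attribute (from
# edge_weight), whereas create_adjacency_matrix builds a directed matrix of 100 / static_distance.
# shortest_distances = nx.floyd_warshall_numpy(services_graph, weight='weight')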

In [None]:
# Create adjacency matrix
adj_matrix, nodes_list = create_adjacency_matrix(services_graph, service_graph_df)
print(adj_matrix)

# Calculate shortest paths using Floyd Warshall
shortest_distances = floyd_warshall(adj_matrix)
print(shortest_distances)

# Update with semantic component
# Index the semantic value of every (service1, service2) row once, instead of
# filtering the whole DataFrame twice for each pair of nodes. setdefault keeps
# the first row for a pair, matching a `.iloc[0]` lookup.
semantic_by_pair = {}
for source, target, semantic in zip(
    service_graph_df['service1'],
    service_graph_df['service2'],
    service_graph_df['semantic_distance'],
):
    semantic_by_pair.setdefault((source, target), semantic)

# Add twice the direct-edge semantic value to each shortest distance; pairs
# without a row in service_graph_df contribute 0.
for i, service_i in enumerate(nodes_list):
    for j, service_j in enumerate(nodes_list):
        distance = semantic_by_pair.get((service_i, service_j), 0)
        shortest_distances[i][j] += distance * 2

In [None]:
def calculate_fuzzy_weight(service_idx, center_idx, distances, centers, m=3):
    """Calculate the fuzzy membership weight of a service with respect to a center.

    Uses the fuzzy c-means membership formula

        1 / sum_c (d(service, center) / d(service, c)) ** (2 / (m - 1))

    where ``c`` ranges over every index in ``centers``.

    Args:
        service_idx: Row index of the service in ``distances``.
        center_idx: Column index of the center the weight is computed for.
        distances: 2-D indexable (``distances[i][j]``) of pairwise distances.
        centers: Mapping whose values are the indices of all center nodes.
        m: Fuzzifier, must be greater than 1. The default of 3 gives an
            exponent of 1, i.e. plain distance ratios.

    Returns:
        The membership weight, or ``inf`` when the service is at distance 0
        from the center.

    Raises:
        ValueError: If ``m`` is not greater than 1.
        ZeroDivisionError: If the ratio sum is 0 (for example when ``centers``
            is empty).
    """
    if m <= 1:
        raise ValueError("m must be greater than 1")
    coef = 2 / (m - 1)

    service_to_center_distance = distances[service_idx][center_idx]

    # A service sitting exactly on the center belongs to it without limit.
    if service_to_center_distance == 0:
        return float('inf')

    # The exponent applies to each ratio separately; raising the whole sum to
    # the power only coincides with this when the exponent is 1 (m == 3).
    normalized_distance_sum = sum(
        math.pow(
            # A zero distance to another center is treated as inf so that
            # center contributes 0 instead of dividing by zero.
            service_to_center_distance
            / (distances[service_idx][center_index] if distances[service_idx][center_index] != 0 else float('inf')),
            coef,
        )
        for center_index in centers.values()
    )

    return 1 / normalized_distance_sum

# Center nodes are the graph nodes whose name starts with this prefix.
center_type_prefix = "Application"  # Change this prefix as needed to adapt for different types of nodes

# Map each center name to its position in the graph's node order.
centers = {}
for node_position, node_name in enumerate(services_graph.nodes):
    if node_name.startswith(center_type_prefix):
        centers[node_name] = node_position

# For every center, collect the other services whose fuzzy weight, scaled by 100,
# is above 9.
center_to_services = {}

for center_name, center_idx in centers.items():
    related_services = []
    for service_idx, service_name in enumerate(services_graph.nodes):
        if service_name == center_name:
            continue
        fuzzy_weight = calculate_fuzzy_weight(service_idx, center_idx, shortest_distances, centers)
        if fuzzy_weight * 100 > 9:
            related_services.append(service_name)

    center_to_services[center_name] = related_services

print(center_to_services)