# Phase 3: Microservice Identification (Grouping by Similar Services)

In [497]:
version = "v_imen" # All options: v_imen, v_team
system = "pos" # All options: jforum, cargotracker, petclinic, pos
model_type = "albert" # All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create service graph

In [498]:
import pandas as pd
from scipy import spatial
from utils import load_data_from_csv

In [499]:
# Read CSV to DataFrame
best_community_detection_algorithm = 'EdMot' # Change this
communities_df = pd.read_csv(f"generated_data/community/{version}_{system}_{best_community_detection_algorithm}_communities.csv")
class_graph_df = pd.read_csv(f"generated_data/graph/class/{version}_{system}_class_graph.csv")
class_names, class_labels, class_embeddings = load_data_from_csv(f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv")

# Data Structuring
class_embeddings_dict = dict(zip(class_names, class_embeddings))

In [504]:
# Function to calculate service embedding
def calculate_service_embedding(service_classes):
    return sum(class_embeddings_dict[class_name] for class_name in service_classes) / len(service_classes)

# Calculate service embeddings
service_to_embedding = communities_df.groupby('service')['class_name'].apply(lambda x: calculate_service_embedding(x)).to_dict()

# Calculate service similarities
service_similarities = {
    s1: {
        s2: 1 - spatial.distance.cosine(service_to_embedding[s1], service_to_embedding[s2])
        for s2 in service_to_embedding if s1 != s2
    }
    for s1 in service_to_embedding
}

# Create dictionaries to store processed distances
semantic_dict = {}
static_dict = {}
for row in class_graph_df.itertuples(index=False):
    mrs1 = communities_df.loc[communities_df['class_name'] == row.class1]
    mrs2 = communities_df.loc[communities_df['class_name'] == row.class2]

    if mrs1.empty or mrs2.empty or mrs1['service'].values[0] == mrs2['service'].values[0]:
       continue
    else:
        service1 = mrs1['service'].values[0]
        service2 = mrs2['service'].values[0]

    static_dict[(service1, service2)] = float(static_dict.get((service1, service2), 0)) + row.static_distance
    semantic_dict[(service1, service2)] = service_similarities.get(service1, {}).get(service2)


# Rest of your code remains the same
service_graph_data = []

for s1, s2 in static_dict.keys():

    semantic_distance = semantic_dict.get((s1, s2), 0)
    static_distance = static_dict.get((s1, s2), 0)
    service_graph_data.append([s1, s2, static_distance, semantic_distance])


# Create service_graph_df DataFrame
service_graph_df = pd.DataFrame(service_graph_data, columns=['service1', 'service2', 'static_distance', 'semantic_distance'])
# Output service_graph_df
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Save service_graph_df to CSV (we may need to add the community detection algo in the filename)
service_graph_df.to_csv(f"generated_data/graph/service/{version}_{system}_service_graph.csv", index=False)

## 1.2 Cluster services

In [505]:
import networkx as nx
import pandas as pd
import numpy as np

# Create a graph and add edges
GServices = nx.Graph()
for d in service_graph_data:
    GServices.add_edge(d[0], d[1], weight=0.5 * d[2] +0.5 * d[3])

node_list = list(GServices.nodes())

# Create an empty adjacency matrix filled with zeros
num_nodes = len(node_list)
adjacency_matrix = np.zeros((num_nodes, num_nodes))

# Populate the adjacency matrix with edge weights
for i in range(num_nodes):
    for j in range(num_nodes):
        if i == j:
            # Diagonal elements (self-loops) can be set to zero or any other appropriate value
            adjacency_matrix[i][j] = 0
        else:
            # Check if there is an edge between the nodes
            if GServices.has_edge(node_list[i], node_list[j]):
                # Get the weight of the edge
                edge_data = GServices[node_list[i]][node_list[j]]
                edge_weight = edge_data['weight']
                adjacency_matrix[i][j] = edge_weight

# Create a DataFrame for the adjacency matrix with service names
adjacency_df = pd.DataFrame(adjacency_matrix, index=node_list, columns=node_list)

# Save the DataFrame to a CSV file
csv_filename = f"generated_data/graph/service/{version}_{system}_adjacency_matrix.csv"
adjacency_df.to_csv(csv_filename)

print(f"Adjacency matrix saved to {csv_filename}")
import numpy as np
import skfuzzy as fuzz

# Define a range of cluster numbers to consider
num_clusters_range = range(1, 20)  # Adjust the range as needed

# Initialize empty lists to store the results
fcm_scores = []
fpc_scores = []

# Calculate Fuzzy C-Means scores and FPC scores for different cluster numbers
for num_clusters in num_clusters_range:
    # Specify the fuzziness coefficient (e.g., m=2.0)
    m = 3.0
    fcm = fuzz.cmeans(adjacency_matrix, num_clusters, m, error=0.005, maxiter=1000)
    fcm_scores.append(fcm[3])  # Appending the mean squared error (MSE) to scores

    # Calculate FPC score using the formula FPC = (Tr(B) / W) / (1 - Tr(W) / W)
    B = np.linalg.norm(fcm[0], axis=0)
    W = np.mean(fcm[3])
    fpc = (np.sum(B) / W) / (1 - W)
    fpc_scores.append(fpc)

print("fpc_score for each number of MS")
print(fpc_scores)
# Find the number of clusters with the highest FPC score


optimal_num_clusters = num_clusters_range[np.argmax(fpc_scores)]

print("Optimal number of MS")
print(optimal_num_clusters)

# Apply Fuzzy C-Means clustering with the optimal number of clusters
fcm = fuzz.cmeans(adjacency_matrix, optimal_num_clusters, m, error=0.005, maxiter=1000)

# Get cluster memberships for each Application Service
membership_degrees = fcm[0]

# Calculate the threshold dynamically based on the mean membership degree
mean_membership_degrees = np.mean(membership_degrees, axis=1)
threshold = np.mean(mean_membership_degrees)
print(mean_membership_degrees)
print("threshold" + str(threshold))
# Create a dictionary to map Application Services to their clusters
service_clusters = {}
for i, service in enumerate(GServices.nodes()):
    for cluster in range(optimal_num_clusters):
        if membership_degrees[cluster][i] > threshold:
            print(membership_degrees[cluster])
            if cluster not in service_clusters:
                service_clusters[cluster] = []
            service_clusters[cluster].append(service)

output_file_path = f"generated_data/graph/service/{version}_{system}_microservices.txt"

with open(output_file_path, 'w') as file:
    for ms, s in service_clusters.items():
        file.write(f"Microservice {ms + 1}:\n")
        for service in s:
            file.write(f"  - {service}\n")

print("Microservices assignments saved to", output_file_path)

Adjacency matrix saved to generated_data/graph/service/v_imen_pos_adjacency_matrix.csv
fpc_score for each number of MS
[42.298693284804735, 59.819384563889315, 73.26348052414795, 84.59738590008996, 94.58274448698809, 103.61021329267504, 111.91181315406274, 119.63876654884865, 126.89607850048722, 133.76021176476098, 140.2888828153357, 146.52696838256043, 152.51008087047975, 158.2672109732212, 163.82213375581463, 169.19476722012817, 174.40194092610616, 179.45814190280282, 184.3757277839843]
Optimal number of MS
19
[0.45839317 0.45836772 0.45839474 0.45836605 0.45838205 0.45838782
 0.45838614 0.45838175 0.45836227 0.45837596 0.45838546 0.45839336
 0.45838497 0.45838066 0.45838543 0.45839485 0.45838156 0.45837883
 0.45839226]
threshold0.458382896819415
[0.46530034 0.4585631  0.46426587 0.46333525 0.46322661 0.46569693
 0.46265784 0.45400271 0.46309868 0.4501855  0.46582086 0.45964083
 0.46179108 0.41451146 0.453455   0.46509621 0.46501272 0.46026264
 0.46273142 0.45555348 0.45896479 0.4637