# Phase 3: Microservice Identification (Grouping by Similar Services)

In [96]:
# Notebook-wide settings. Their values are interpolated into the
# generated_data/... file names that this notebook reads and writes.
version = "v_imen" # All options: v_imen, v_team
system = "pos" # All options: jforum, cargotracker, petclinic, pos
model_type = "albert" # All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create service graph

In [97]:
import pandas as pd
from scipy import spatial
from utils import load_data_from_csv

In [98]:
# Community detection algorithm whose communities file is loaded below (change this to switch)
best_community_detection_algorithm = 'EdMot'

# Input file locations, derived from the notebook-wide settings
communities_csv = f"generated_data/community/{version}_{system}_{best_community_detection_algorithm}_communities.csv"
class_graph_csv = f"generated_data/graph/class/{version}_{system}_class_graph.csv"
embeddings_csv = f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv"

# Load the communities, the class graph and the class embeddings
communities_df = pd.read_csv(communities_csv)
class_graph_df = pd.read_csv(class_graph_csv)
class_names, class_labels, class_embeddings = load_data_from_csv(embeddings_csv)

# Index the embeddings by class name for direct lookup
class_embeddings_dict = {name: embedding for name, embedding in zip(class_names, class_embeddings)}

In [99]:
# Function to calculate service embedding
def calculate_service_embedding(service_classes, embeddings=None):
    """Return the mean embedding of the classes that make up one service.

    Args:
        service_classes: Iterable of class names belonging to the service
            (a list, a pandas Series, a generator, ...).
        embeddings: Optional mapping from class name to embedding. When
            omitted, the notebook-level ``class_embeddings_dict`` is used.
            Embeddings only need to support ``+`` and division by an int
            (presumably NumPy arrays here -- confirm against
            ``load_data_from_csv``).

    Returns:
        The sum of the class embeddings divided by the number of classes.

    Raises:
        KeyError: If a class name has no entry in the mapping.
        ZeroDivisionError: If ``service_classes`` is empty.
    """
    if embeddings is None:
        embeddings = class_embeddings_dict
    # Materialise once so one-shot iterables can be both summed and counted.
    members = list(service_classes)
    return sum(embeddings[class_name] for class_name in members) / len(members)

# Mean embedding per service: group the class names by service and average each group
service_to_embedding = (
    communities_df
    .groupby('service')['class_name']
    .apply(calculate_service_embedding)
    .to_dict()
)


# Map every class to the service (community) it was assigned to. Built once so
# that each edge below costs two dict lookups instead of two scans of
# communities_df. When a class appears in several rows, its first row wins.
class_to_service = (
    communities_df
    .dropna(subset=['class_name'])
    .drop_duplicates(subset='class_name', keep='first')
    .set_index('class_name')['service']
    .to_dict()
)

# Collect one [service1, service2, static_distance, semantic_distance] entry
# per class-graph edge whose endpoints belong to two different services.
service_graph_data = []
for row in class_graph_df.itertuples(index=False):
    # Skip edges where either endpoint has no community assignment.
    if row.class1 not in class_to_service or row.class2 not in class_to_service:
        continue
    service1 = class_to_service[row.class1]
    service2 = class_to_service[row.class2]

    # Edges between classes of the same service are not service-graph edges.
    if service1 == service2:
        continue

    static_distance = row.static_distance
    semantic_distance = row.semantic_distance
    # pandas reads missing CSV cells as NaN, never None, so an
    # `is not None` test would let them through; pd.notna rejects both.
    if pd.notna(semantic_distance):
        service_graph_data.append([service1, service2, static_distance, semantic_distance])


# Assemble the collected edges into a DataFrame, one row per cross-service edge
edge_columns = ['service1', 'service2', 'static_distance', 'semantic_distance']
service_graph_df = pd.DataFrame(service_graph_data, columns=edge_columns)

# Lift pandas' display limits so DataFrames are shown with all rows and columns
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Persist the service graph as CSV (we may need to add the community detection algo in the filename)
service_graph_csv = f"generated_data/graph/service/{version}_{system}_service_graph.csv"
service_graph_df.to_csv(service_graph_csv, index=False)

## 1.2 Cluster services