# Phase 2: Type-based Service Identification (Grouping by Same Type Classes)

In [None]:
# Run configuration: these three values are passed to the loading and saving helpers in the cells below.
version = "v_imen" # Labelling version. All options: v_imen, v_team
system = "pos" # System under analysis. All options: jforum, cargotracker, petclinic, pos
# The embedding model has no influence here, since only the static distances are used
model_type = "codebert" # All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create class graph

In [None]:
from utils import print_communities, save_communities_to_csv, load_class_data, load_call_graph, merge_dataframes
from distances import compute_semantic_distances_for_class_pairs
from visualization import visualize_class_distance_heatmap
from normalization import filter_and_normalize_distances
from community_tuning import fine_tune_all_services

In [None]:
# Main execution: build the class graph (static + semantic distances),
# plot both distance columns, and persist the merged graph as CSV.
from pathlib import Path

# Only the label and embedding dictionaries are needed; the first return value is discarded.
_, class_labels_dict, class_embeddings_dict = load_class_data(system, version, model_type)
static_df = load_call_graph(system)

if system == 'cargotracker':  # temporary fix
    # Replace 'org.eclipse' with 'net.java' in both class1 and class2 columns
    # (literal substring replacement, not a regular expression).
    for column in ('class1', 'class2'):
        static_df[column] = static_df[column].str.replace('org.eclipse', 'net.java', regex=False)

semantic_df = compute_semantic_distances_for_class_pairs(class_embeddings_dict)

# Both distance tables go through the same filter/normalize step before being merged.
static_df = filter_and_normalize_distances(static_df, class_labels_dict)
semantic_df = filter_and_normalize_distances(semantic_df, class_labels_dict)
class_graph = merge_dataframes(static_df, semantic_df)

# Visualizations
visualize_class_distance_heatmap(class_graph, 'static_distance', "Static Distances Between Classes")
visualize_class_distance_heatmap(class_graph, 'semantic_distance', "Semantic Distances Between Classes")

# Save to CSV
filename = f"./generated_data/graph/class/{version}_{system}_class_graph.csv"
# DataFrame.to_csv does not create missing parent directories; create them first
# so a fresh checkout does not fail with OSError after all the work above.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
class_graph.to_csv(filename, index=False)

## 1.2 Community detection

In [None]:
import networkx as nx
from community_detection import CommunityDetection
from constants import COMMUNITY_DETECTION_ALGORITHMS

In [None]:
# Main execution
# Build the graph from the class pairs whose static distance is non-zero,
# keeping the static distance as an edge attribute.
nonzero_static_pairs = class_graph[class_graph['static_distance'] != 0]
G = nx.from_pandas_edgelist(
    nonzero_static_pairs,
    source='class1',
    target='class2',
    edge_attr=['static_distance'],
)

# Set optimize_hyperparameters_flag=True if you wish to optimize the parameters of the clustering algorithms.
cd = CommunityDetection(G, class_labels_dict, optimize_hyperparameters_flag=False)

# Fine-tuning clusters using static distance (OR other distances):
# one (class1, class2, distance) tuple per row of the full class graph.
distances = list(zip(
    class_graph['class1'],
    class_graph['class2'],
    class_graph['static_distance'],
))

for algorithm in COMMUNITY_DETECTION_ALGORITHMS:  # OR use those you need
    print(f"Running {algorithm} algorithm...")

    # Detect communities separately for each class type, in this fixed order.
    communities = {
        label: cd.detect_communities(label, algorithm)
        for label in ('Application', 'Entity', 'Utility')
    }

    # Fine-tune every type's services against the static distances.
    fine_tuned_communities = {}
    for label, detected_services in communities.items():
        fine_tuned_communities[label] = fine_tune_all_services(detected_services, distances)

    # Print the communities
    for label_type, services in fine_tuned_communities.items():
        print_communities(label_type, services)

    # Save fine-tuned communities to CSV
    save_communities_to_csv(fine_tuned_communities, version, system, algorithm)

# 2. Generate Measures (F-Measure, Precision, Recall)

In [None]:
# Generate clustering measures for the listed models.
# NOTE(review): this import brings in `generate_services_clustering_results`, but the call at the
# bottom of the cell uses `generate_microservices_clustering_results_by_model`, which is not imported
# anywhere in the visible notebook. In a fresh kernel the call raises NameError. Confirm which
# name `results_helpers` actually exports and make the import and the call agree.
from results_helpers import generate_services_clustering_results

# Model identifiers passed as the first argument of the call below.
models = ['custom_cmeans', 'cmeans', 'hierarchical']

# Threshold passed as the last argument of the call below.
matching_threshold = 0.8

# NOTE(review): `best_community_detection_algorithm` is not assigned in any visible cell; it must be
# set before this line runs (presumably to one of COMMUNITY_DETECTION_ALGORITHMS — TODO confirm).
generate_microservices_clustering_results_by_model(models, version, system, best_community_detection_algorithm, matching_threshold)