# Phase 2: Type-based Service Identification (Grouping by Same Type Classes)

In [None]:
# Notebook configuration. These three values are interpolated into the paths of
# the files that the cells below read and write (embeddings, call graph, class
# graph, communities); `system` is also passed to load_class_code_from_directory.
version = "v_imen" # All options: v_imen, v_team
system = "pos" # All options: jforum, cargotracker, petclinic, pos
model_type = "albert" # All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create class graph

In [None]:
import pandas as pd
from scipy import spatial
import seaborn as sns
import matplotlib.pyplot as plt
import re
from utils import write_call_graph_to_csv, load_class_code_from_directory, load_data_from_csv

In [None]:
# Load the code of every class of the selected system (helper from utils).
# Presumably a {class name: source text} mapping -- the dependency extraction
# below iterates it with .items() and runs regexes on the values; confirm in utils.
class_code_dict = load_class_code_from_directory(system)
# Load three parallel sequences (names, labels, embeddings) from the embeddings
# CSV that matches the chosen version / system / model.
class_names, class_labels, class_embeddings = load_data_from_csv(f"generated_data/embedding/{version}_{system}_{model_type}_embeddings.csv")

# Create 2 dicts: one for class labels and one for class embeddings using class names as keys
# (both are zipped from the same class_names, so they share the same key set).
class_labels_dict = dict(zip(class_names, class_labels))
class_embeddings_dict = dict(zip(class_names, class_embeddings))

### 1.1.1 Generate call graph

In [None]:
# TODO: replace with Imen's dependency-generation method
# Counts the number of textual references from each class to every other class
def extract_dependencies_from_class_code(class_code_dict):
    """Count textual references between classes.

    For every ordered pair (source, target) of distinct classes, count how many
    times the simple name of ``target`` (the part after the last ``.`` of its
    key) appears in the code of ``source`` followed by optional whitespace and
    either ``.`` or ``(``.

    Args:
        class_code_dict: mapping of class name -> class source code (str).

    Returns:
        A tuple ``(all_classes, dependencies)`` where ``all_classes`` is the set
        of keys of ``class_code_dict`` and ``dependencies`` maps
        ``(source, target)`` to a strictly positive reference count. Pairs with
        no reference are absent.
    """
    all_classes = set(class_code_dict.keys())

    # Build one compiled pattern per target, once, instead of per (source, target) pair.
    # - re.escape: the name is literal text; characters such as '$' must not be
    #   interpreted as regex syntax.
    # - (?<!\w): the name must not be the tail of a longer identifier, so
    #   "User" is not counted inside "SuperUser.".
    # - \s*[.\(]: the name must be followed by '.' or '(' so that "User" is not
    #   counted inside "UserDetails".
    patterns = {
        target_class: re.compile(
            r'(?<!\w){}\s*[.\(]'.format(re.escape(target_class.split('.')[-1]))
        )
        for target_class in all_classes
    }

    dependencies = {}
    for class_name, content in class_code_dict.items():
        for target_class, pattern in patterns.items():
            if class_name == target_class:
                continue
            count = len(pattern.findall(content))
            if count > 0:
                # Each (source, target) pair is visited exactly once, so a plain
                # assignment is enough.
                dependencies[(class_name, target_class)] = count

    return all_classes, dependencies

def compute_distance_matrix(all_classes, dependencies):
    """Build a symmetric class-by-class matrix of dependency counts.

    Args:
        all_classes: iterable of class names; used for both rows and columns.
        dependencies: mapping of ``(source, target)`` -> count.

    Returns:
        A nested dict ``matrix[a][b]`` holding the sum of the counts recorded in
        both directions between ``a`` and ``b`` (0 when none), and 0 on the
        diagonal.
    """
    def pair_weight(source, target):
        # A class has no link to itself, whatever `dependencies` contains.
        if source == target:
            return 0
        forward = dependencies.get((source, target), 0)
        backward = dependencies.get((target, source), 0)
        return forward + backward

    return {
        row: {column: pair_weight(row, column) for column in all_classes}
        for row in all_classes
    }

In [None]:
# Count references between classes, fold them into a symmetric matrix
# (both directions summed) and hand the matrix to the utils helper that
# writes the call graph for this version/system.
all_classes, dependencies = extract_dependencies_from_class_code(class_code_dict)
distance_matrix = compute_distance_matrix(all_classes, dependencies)
write_call_graph_to_csv(distance_matrix, version, system)

### 1.1.2 Compute static distance

In [None]:
# Call-graph CSV for the selected version/system.
# Presumably the file written by write_call_graph_to_csv above -- confirm the path in utils.
file_path = f"./generated_data/graph/call/{version}_{system}_call_graph.csv"

# Read the dependency graph CSV file into a DataFrame
# (the file's own header row is skipped and replaced by the column names below).
dependency_graph_df = pd.read_csv(
    file_path,
    delimiter=',',
    header=None,
    skiprows=1,  # Skip the first row which contains the header
    names=['class1', 'class2', 'static_distance']
)

# Optional: Normalize structural distances (min-max scaling to [0, 1])
static_distance_min = dependency_graph_df['static_distance'].min()
static_distance_max = dependency_graph_df['static_distance'].max()

# NOTE(review): when every value is identical the denominator is 0 and the
# column becomes NaN; the merge cell later replaces NaN with 0.
dependency_graph_df['static_distance'] = (dependency_graph_df['static_distance'] - static_distance_min) / \
                                  (static_distance_max - static_distance_min)

# Create the static_df with the 'static_distance' column
static_df = dependency_graph_df[['class1', 'class2', 'static_distance']]

### 1.1.3 Compute semantic distance

In [None]:
# Compute pairwise semantic scores for every ordered pair of classes
# (both directions and each class paired with itself are included).
semantic_distances = []
for class_name1 in class_embeddings_dict:
    for class_name2 in class_embeddings_dict:
        # spatial.distance.cosine returns the cosine *distance*, so 1 - cosine(...)
        # is the cosine similarity: despite the variable name, a higher value
        # means the two embeddings are more alike.
        distance = 1 - spatial.distance.cosine(class_embeddings_dict[class_name1], class_embeddings_dict[class_name2])
        semantic_distances.append([class_name1, class_name2, distance])

semantic_df = pd.DataFrame(semantic_distances, columns=['class1', 'class2', 'semantic_distance'])

# Filter rows where one of the class names is not in class_labels keys
# (keeps only rows whose two classes both have a label).
class_label_keys = set(class_labels_dict.keys())
semantic_df = semantic_df[(semantic_df['class1'].isin(class_label_keys)) & (semantic_df['class2'].isin(class_label_keys))]

# Normalize semantic distances (min-max scaling to [0, 1])
semantic_df['semantic_distance'] = (semantic_df['semantic_distance'] - semantic_df['semantic_distance'].min()) / \
                                   (semantic_df['semantic_distance'].max() - semantic_df['semantic_distance'].min())

### 1.1.4 Visualize distances

In [None]:
# Merge structural and semantic dataframes based on class1 and class2 columns
# (outer join: a pair present in only one of the two frames is kept).
class_graph = static_df.merge(semantic_df, on=['class1', 'class2'], how='outer')

# Fill NA values (if any) for both static_distance and semantic_distance with zeros
class_graph.fillna({'static_distance': 0, 'semantic_distance': 0}, inplace=True)

# Visualize the static distances as a class1 x class2 heatmap
static_pivot_table = class_graph.pivot(index='class1', columns='class2', values='static_distance')

plt.figure(figsize=(10, 8))
sns.heatmap(static_pivot_table, cmap='coolwarm', cbar_kws={'label': 'Static Distance'})
plt.title("Static Distances Between Classes")
plt.show()

# Visualize the semantic distances as a class1 x class2 heatmap
semantic_pivot_table = class_graph.pivot(index='class1', columns='class2', values='semantic_distance')

plt.figure(figsize=(10, 8))
sns.heatmap(semantic_pivot_table, cmap='coolwarm', cbar_kws={'label': 'Semantic Distance'})
plt.title("Semantic Distances Between Classes")
plt.show()

# Save the full connection graph to a CSV file
# (columns: class1, class2, static_distance, semantic_distance; no index column).
class_graph.to_csv(f"./generated_data/graph/class/{version}_{system}_class_graph.csv", index=False)


## 1.2 Community detection

In [None]:
import networkx as nx
from cdlib import algorithms
from karateclub import EdMot

In [None]:
def print_communities(type, communities):
    """Print each community as a numbered, bulleted list.

    Args:
        type: label placed in front of "Service Community" in each heading.
        communities: iterable of communities, each an iterable of class names.
    """
    divider = "=" * 40
    for number, members in enumerate(communities, start=1):
        # Heading, one bullet per member, then a divider line.
        report = [f"{type} Service Community {number}:"]
        report.extend(f"  - {member}" for member in members)
        report.append(divider)
        print("\n".join(report))
    # Blank lines after the last community.
    print("\n")

In [None]:
def get_subgraph_reindexed(graph, class_labels_dict, type_label):
    """Extract the subgraph of one class type and relabel its nodes 0..n-1.

    Args:
        graph: NetworkX graph whose nodes are class names.
        class_labels_dict: mapping of class name -> type label.
        type_label: only classes carrying exactly this label are kept.

    Returns:
        A tuple ``(subgraph_reindexed, inverse_mapping)``: the relabelled
        subgraph and a dict mapping each integer node id back to its class name.
    """
    selected = [name for name, label in class_labels_dict.items() if label == type_label]
    subgraph = graph.subgraph(selected)

    # Number the nodes in the subgraph's own iteration order; the forward
    # mapping used for relabelling is the inverse of that numbering.
    inverse_mapping = dict(enumerate(subgraph.nodes()))
    mapping = {node: index for index, node in inverse_mapping.items()}

    return nx.relabel_nodes(subgraph, mapping), inverse_mapping

In [None]:
def perform_community_detection(graph, class_labels_dict, type_label, algorithm, level=1, resolution=0.5):
    """Detect communities among the classes of one type.

    The classes labelled ``type_label`` are extracted from ``graph`` with
    integer node ids, the chosen algorithm is run on that subgraph, and the
    resulting communities are translated back to class names.

    Args:
        graph: NetworkX graph whose nodes are class names.
        class_labels_dict: mapping of class name -> type label.
        type_label: label of the classes to cluster.
        algorithm: one of 'Louvain', 'Infomap', 'LabelPropagation',
            'GirvanNewman', 'FastGreedy' or 'EdMot'.
        level: forwarded to Girvan-Newman only.
        resolution: forwarded to Louvain only.

    Returns:
        A tuple ``(communities, communities_reindexed)``: the communities as
        lists of class names, and the same communities as lists of the integer
        node ids used in the reindexed subgraph.

    Raises:
        ValueError: if ``algorithm`` is not supported and the subgraph has at
            least 4 nodes (smaller subgraphs never consult ``algorithm``).
    """
    subgraph_reindexed, inverse_mapping = get_subgraph_reindexed(graph, class_labels_dict, type_label)

    # Lazy dispatch table: only the selected entry is ever executed, so unused
    # parameters (e.g. level for Louvain) are never read.
    ALGORITHMS = {
        'Louvain': lambda: algorithms.louvain(subgraph_reindexed, weight='weight', resolution=resolution).communities,
        'Infomap': lambda: algorithms.infomap(subgraph_reindexed).communities,
        'LabelPropagation': lambda: algorithms.label_propagation(subgraph_reindexed).communities,
        'GirvanNewman': lambda: algorithms.girvan_newman(subgraph_reindexed, level=level).communities,
        'FastGreedy': lambda: algorithms.greedy_modularity(subgraph_reindexed).communities
    }

    if len(subgraph_reindexed.nodes()) < 4:
        # Subgraphs with fewer than 4 nodes skip the requested algorithm and use
        # 3-clique communities instead.
        # NOTE(review): nodes that belong to no 3-clique presumably end up in no
        # community at all -- confirm this is intended.
        communities_reindexed = [list(community) for community in nx.community.k_clique_communities(subgraph_reindexed, 3)]
    elif algorithm == 'EdMot':
        edmot = EdMot()
        edmot.fit(subgraph_reindexed)
        memberships = edmot.get_memberships()
        unique_communities = set(memberships.values())
        # Group node ids by the community id EdMot assigned to them.
        communities_reindexed = [list({node for node, community_id in memberships.items() if community_id == c}) for c in unique_communities]
    elif algorithm in ALGORITHMS:
        communities_reindexed = ALGORITHMS[algorithm]()
    else:
        # Fail loudly: previously this printed a message and then crashed with
        # an unrelated TypeError while iterating over None.
        supported = ', '.join([*ALGORITHMS.keys(), 'EdMot'])
        raise ValueError(f"The algorithm '{algorithm}' is not supported. Supported algorithms are: {supported}.")

    # Translate integer node ids back to class names.
    communities = [[inverse_mapping[node] for node in community] for community in communities_reindexed]

    return communities, communities_reindexed

In [None]:
# Create a NetworkX graph from the class graph: one undirected edge per row of
# class_graph, weighted by the chosen column.
G = nx.Graph()
for index, row in class_graph.iterrows():
    # Alternative weights: row['semantic_distance'] (computed above) or
    # row['combined_distance'] (NOTE(review): no such column is created in the cells above).
    G.add_edge(row['class1'], row['class2'], weight=row['static_distance'])

# Specify the algorithm to use (e.g. 'Louvain', 'EdMot', ...)
algorithm = 'Louvain'  # Change this

# Perform community detection for Application Services (type label 0) using the specified algorithm
application_communities, _ = perform_community_detection(G, class_labels_dict, 0, algorithm)
print_communities('Application', application_communities)

# Perform community detection for Entity Services (type label 2) using the specified algorithm
entity_communities, _ = perform_community_detection(G, class_labels_dict, 2, algorithm)
print_communities('Entity', entity_communities)

# Perform community detection for Utility Services (type label 1) using the specified algorithm
utility_communities, _ = perform_community_detection(G, class_labels_dict, 1, algorithm)
print_communities('Utility', utility_communities)

### 1.2.1 Optimize parameters (optional)

In [None]:
import numpy as np

In [None]:
def _total_modularity_over_types(graph, class_labels_dict, algorithm, **detection_params):
    """Sum, over type labels 0, 1 and 2, the modularity of the detected communities."""
    total_modularity = 0
    for type_label in [0, 1, 2]:
        subgraph_reindexed, _ = get_subgraph_reindexed(graph, class_labels_dict, type_label)
        _, communities_reindexed = perform_community_detection(
            graph, class_labels_dict, type_label, algorithm, **detection_params
        )
        try:
            modularity_value = nx.community.modularity(subgraph_reindexed, communities_reindexed)
        except (ZeroDivisionError, nx.NetworkXError):
            # Modularity is undefined for this type label: either the subgraph
            # carries no edge weight (division by zero) or the communities do
            # not form a partition of its nodes (NotAPartition, a
            # NetworkXError). Skip the label instead of aborting the search.
            continue
        total_modularity += modularity_value
    return total_modularity


# Optimize hyperparameters based on modularity metric
def search_for_best_params_based_on_modularity(graph, class_labels_dict, algorithm):
    """Grid-search the parameter of a community detection algorithm.

    For each candidate value, communities are detected separately for type
    labels 0, 1 and 2 and their modularities are summed; the candidate with the
    highest sum wins (the first one on ties).

    Args:
        graph: NetworkX graph whose nodes are class names.
        class_labels_dict: mapping of class name -> type label.
        algorithm: 'Louvain' (searches ``resolution`` over
            ``np.arange(0.1, 2.0, 0.2)``) or 'GirvanNewman' (searches ``level``
            over ``range(1, 10)``).

    Returns:
        ``{'resolution': best}`` or ``{'level': best}``; an empty dict for any
        other algorithm.
    """
    if algorithm == 'Louvain':
        param_name, candidates = 'resolution', np.arange(0.1, 2.0, 0.2)  # example resolution values
    elif algorithm == 'GirvanNewman':
        param_name, candidates = 'level', range(1, 10)  # example level values
    else:
        return {}

    best_params = {}
    # -inf rather than -1: a sum of three modularities is not bounded below by -1,
    # so any computed total must be able to replace the initial value.
    total_best_modularity = float('-inf')

    for candidate in candidates:
        total_modularity = _total_modularity_over_types(
            graph, class_labels_dict, algorithm, **{param_name: candidate}
        )
        if total_modularity > total_best_modularity:  # Check total modularity across all type labels
            total_best_modularity = total_modularity
            best_params[param_name] = candidate

    return best_params

In [None]:
# Search the Louvain parameter that maximises the summed modularity on G and
# display it. The result is only printed; no later cell in this notebook reads it.
best_params = search_for_best_params_based_on_modularity(G, class_labels_dict, 'Louvain')
print(best_params)

## 1.3 Fine-tune clusters

In [None]:
def fine_tune_cluster(service, services, distance_map):
    """Try to merge a singleton cluster into its most strongly linked cluster.

    Every other cluster is scored with the sum of the ``distance_map`` values
    between ``service`` and the members of that cluster. If exactly one cluster
    has the highest, strictly positive score, ``service`` is appended to it and
    the singleton cluster ``[service]`` is dropped.

    Args:
        service: the class name that forms a singleton cluster.
        services: list of clusters, each a list of class names. The winning
            cluster is extended in place.
        distance_map: mapping of class name -> {other class name: value}; a
            higher value means a stronger link.

    Returns:
        The list of clusters: a new list without ``[service]`` when a merge
        happened, otherwise ``services`` itself, unchanged.
    """
    own_cluster = [service]
    scores = [0] * len(services)

    for other_service, distance in distance_map.get(service, {}).items():
        for index, cluster in enumerate(services):
            # The singleton's own cluster is not a merge candidate: scoring it
            # would let a self-entry in the map make the class "join itself".
            if cluster != own_cluster and other_service in cluster:
                scores[index] += distance

    if not scores:
        # No cluster at all: nothing to merge into.
        return services

    max_score = max(scores)
    # Merge only on an unambiguous, strictly positive winner.
    if max_score > 0 and scores.count(max_score) == 1:
        services[scores.index(max_score)].append(service)
        services = [cluster for cluster in services if cluster != own_cluster]

    return services

def fine_tune_all_services(services_list, distances):
    """Try to merge every singleton cluster into a neighbouring cluster.

    Args:
        services_list: list of clusters, each a list of class names. Clusters
            that receive a singleton are extended in place by
            ``fine_tune_cluster``.
        distances: iterable of ``(class1, class2, value)`` triples.

    Returns:
        The list of clusters after all merges.
    """
    # Index the triples as {class1: {class2: value}}. Entries must be added to
    # the inner dict, not replace it, otherwise each class would only keep the
    # last neighbour listed for it.
    distance_map = {}
    for source, target, distance in distances:
        distance_map.setdefault(source, {})[target] = distance

    # Iterate over a snapshot of the initial clusters: services_list is rebound
    # to a new list whenever a singleton is merged away.
    for cluster in list(services_list):
        # Exactly one member: an empty cluster has nothing to merge.
        if len(cluster) == 1:
            services_list = fine_tune_cluster(cluster[0], services_list, distance_map)
    return services_list

In [None]:
# Create a list of (class1, class2, value) triples to be used for fine-tuning.
# Alternative value columns: row['semantic_distance'] or row['combined_distance'].
distances = [(row['class1'], row['class2'], row['static_distance']) for index, row in class_graph.iterrows()]

# Fine-tune the communities using specified distance
# (fine_tune_cluster appends to clusters in place, so the *_communities lists
# passed in here may be modified as well).
fine_tuned_application_communities = fine_tune_all_services(application_communities, distances)
fine_tuned_entity_communities = fine_tune_all_services(entity_communities, distances)
fine_tuned_utility_communities = fine_tune_all_services(utility_communities, distances)

print_communities('Application', fine_tuned_application_communities)
print_communities('Entity', fine_tuned_entity_communities)
print_communities('Utility', fine_tuned_utility_communities)

# Save fine-tuned communities in a single CSV file for later use.
# One row per class: its name and "<Type> Service <n>", where n is the 1-based
# position of its community within that type.
with open(f'generated_data/community/{version}_{system}_{algorithm}_communities.csv', 'w') as f:
    f.write('class_name,service\n')
    for i, service in enumerate(fine_tuned_application_communities):
        for class_name in service:
            f.write(f'{class_name},Application Service {i + 1}\n')
    for i, service in enumerate(fine_tuned_entity_communities):
        for class_name in service:
            f.write(f'{class_name},Entity Service {i + 1}\n')
    for i, service in enumerate(fine_tuned_utility_communities):
        for class_name in service:
            f.write(f'{class_name},Utility Service {i + 1}\n')