# Phase 2: Type-based Service Identification (Grouping by Same Type Classes)

In [None]:
# Notebook configuration. These values are passed to the loader helpers and
# interpolated into the CSV file names used by the cells below.
version = "v_imen" # All options: v_imen, v_team
system = "pos" # All options: jforum, cargotracker, petclinic, pos
model_type = "albert" # All options: ft_codebert, word2vec, albert, codebert, roberta, bert

## 1.1 Create connection graph

In [None]:
import pandas as pd
from scipy import spatial
import seaborn as sns
import matplotlib.pyplot as plt
import re
from utils import write_distance_to_csv, load_class_code_from_directory, associate_classes_to_types, load_data_from_csv

In [None]:
# Load the three inputs used by the rest of the notebook.
# class_code: iterated with .items() as (class name, source text) pairs further down.
class_code = load_class_code_from_directory(system)
# class_labels: only its .keys() are used below, to filter classes.
# presumably maps class name -> type label; verify against utils.associate_classes_to_types
class_labels = associate_classes_to_types(version, system)
# class_embeddings: iterated and indexed by class name below.
# assumes a mapping of class name -> embedding vector - TODO confirm against utils.load_data_from_csv
class_embeddings = load_data_from_csv(f"csv_files/{version}_{system}_{model_type}_embeddings.csv")

### 1.1.1 Generate dependency graph

In [None]:
def extract_dependencies_from_class_code(class_code_dict):
    """Count textual references from each class's source to every other class.

    A reference to a target class is an occurrence of its simple name (the
    part after the last '.') followed by optional whitespace and then either
    a period or an opening parenthesis, e.g. ``Foo.bar()`` or ``Foo(``.

    Args:
        class_code_dict: mapping of class name -> source code text.

    Returns:
        A tuple ``(all_classes, dependencies)`` where ``all_classes`` is the
        set of class names and ``dependencies`` maps the directed pair
        ``(source_class, target_class)`` to the number of references found in
        the source class's code. Pairs with no references are omitted.
    """
    all_classes = set(class_code_dict.keys())

    # Compile one pattern per target class up front instead of rebuilding it
    # for every (source, target) pair.
    # - re.escape keeps any regex metacharacter in a name literal.
    # - (?<!\w) requires that the name is not the tail of a longer identifier,
    #   so "User" is not counted inside "SuperUser." or "SuperUser(".
    # - The trailing [.\(] already prevents "User" matching "UserDetails".
    reference_patterns = {
        target_class: re.compile(
            r'(?<!\w){}\s*[.\(]'.format(re.escape(target_class.split('.')[-1]))
        )
        for target_class in all_classes
    }

    dependencies = {}
    for class_name, content in class_code_dict.items():
        for target_class, pattern in reference_patterns.items():
            if class_name == target_class:
                continue
            count = len(pattern.findall(content))
            if count > 0:
                # Each (source, target) pair is visited exactly once.
                dependencies[(class_name, target_class)] = count

    return all_classes, dependencies

def compute_distance_matrix(all_classes, dependencies):
    """Build a symmetric class-by-class matrix of dependency counts.

    Args:
        all_classes: iterable of class names used for both rows and columns.
        dependencies: mapping of directed pair ``(source, target)`` -> count.

    Returns:
        A nested dict ``matrix[class1][class2]`` holding the sum of the counts
        in both directions between the two classes, and 0 on the diagonal.
    """
    def pair_weight(source, target):
        # A class has no entry against itself, whatever `dependencies` holds.
        if source == target:
            return 0
        forward = dependencies.get((source, target), 0)
        backward = dependencies.get((target, source), 0)
        return forward + backward

    return {
        row_class: {
            col_class: pair_weight(row_class, col_class)
            for col_class in all_classes
        }
        for row_class in all_classes
    }

In [None]:
# Count directed references between classes, then fold them into a symmetric
# matrix (both directions summed, 0 on the diagonal).
all_classes, dependencies = extract_dependencies_from_class_code(class_code)
distance_matrix = compute_distance_matrix(all_classes, dependencies)
# Dumps the whole nested dict to the cell output.
print(distance_matrix)
# presumably writes the matrix to the dependency-graph CSV read in the next section;
# verify against utils.write_distance_to_csv
write_distance_to_csv(distance_matrix, version, system)

### 1.1.2 Compute static (structural) distance

In [None]:
# Read the dependency edge list: ';'-separated, no header row, three columns.
# presumably the file produced by write_distance_to_csv in the previous cell - confirm
file_name = f"./csv_files/{version}_{system}_dependency_graph.csv"
data = pd.read_csv(file_name, delimiter=';', header=None, names=['class1', 'class2', 'structural_distance'])

# Keep only rows where both classes are in class_embeddings.
# .copy() makes structural_df an independent frame, so assigning to one of its
# columns (e.g. the optional normalization below) does not hit pandas'
# chained-assignment warning on a slice of `data`.
structural_df = data[data['class1'].isin(class_embeddings) & data['class2'].isin(class_embeddings)].copy()

# Optional: min-max normalize the structural values. Left disabled; uncomment to enable.
# structural_df['structural_distance'] = (structural_df['structural_distance'] - structural_df['structural_distance'].min()) / \
#                                        (structural_df['structural_distance'].max() - structural_df['structural_distance'].min())

### 1.1.3 Compute semantic distance

In [None]:
# Compute a pairwise semantic score for every ordered pair of embedded classes
# (including each class with itself).
# NOTE: 1 - cosine distance is the cosine *similarity*. The column is still
# called 'semantic_distance' because the connection-graph cell reads it by that name.
semantic_distances = []
for class_name1 in class_embeddings:
    for class_name2 in class_embeddings:
        distance = 1 - spatial.distance.cosine(class_embeddings[class_name1], class_embeddings[class_name2])
        semantic_distances.append([class_name1, class_name2, distance])

semantic_df = pd.DataFrame(semantic_distances, columns=['class1', 'class2', 'semantic_distance'])

# Keep only rows where both class names are keys of class_labels.
# .copy() detaches the result from the unfiltered frame so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
class_label_keys = set(class_labels.keys())
semantic_df = semantic_df[(semantic_df['class1'].isin(class_label_keys)) & (semantic_df['class2'].isin(class_label_keys))].copy()

# Min-max normalize the scores to [0, 1].
semantic_min = semantic_df['semantic_distance'].min()
semantic_range = semantic_df['semantic_distance'].max() - semantic_min
if semantic_range > 0:
    semantic_df['semantic_distance'] = (semantic_df['semantic_distance'] - semantic_min) / semantic_range
else:
    # All scores are equal (or there are no rows): dividing would give NaN.
    # Use 0, which is also what the connection-graph cell substitutes for NaN.
    semantic_df['semantic_distance'] = 0.0

### 1.1.4 Create connection (call) graph

In [None]:
# Outer-join the structural and semantic tables on the class pair, so a pair
# present in only one table is kept; its missing value becomes 0.
full_connection_graph = structural_df.merge(
    semantic_df,
    how='outer',
    on=['class1', 'class2'],
).fillna(0)

# Reshape to a class1 x class2 grid of semantic values for plotting.
pivot_table = full_connection_graph.pivot(
    index='class1',
    columns='class2',
    values='semantic_distance',
)

# Draw the grid as a heatmap.
colorbar_options = {'label': 'Semantic Distance'}
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_table, cmap='coolwarm', cbar_kws=colorbar_options)
plt.title("Semantic Distances Between Classes")
plt.show()