## Graph Neural Networks with Word2Vec Feature Embeddings

In [39]:
import os
import re
import pandas as pd
import numpy as np
import random

from sklearn.metrics import precision_score, recall_score, f1_score,classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

import ast
from collections import defaultdict
from typing import Callable, Union
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:.3f}'.format

In [40]:
logs_dir = 'NaiveBayes_Metrics'
os.makedirs(logs_dir, exist_ok=True)

# Dataset


In [41]:
df_ant = pd.read_csv('Processed_data\\ant_data.csv')
dep_ant = pd.read_csv('Processed_data\\ant_dependencies.csv')
df_ant=df_ant[~df_ant.Module.isna()]
dep_ant = dep_ant[dep_ant['Source'].isin(df_ant['Entity']) & dep_ant['Target'].isin(df_ant['Entity'])]


df_jab = pd.read_csv('Processed_data\\jabref_data.csv')
dep_jab = pd.read_csv('Processed_data\\jabref_dependencies.csv')

df_team = pd.read_csv('Processed_data\\teammates_data.csv')
dep_team = pd.read_csv('Processed_data\\teammates_dependencies.csv')

df_sh = pd.read_csv('Processed_data\\sweetHome3D_data.csv')
dep_sh = pd.read_csv('Processed_data\\sweetHome3D_dependencies.csv')

df_argo = pd.read_csv('Processed_data\\argouml_data.csv')
dep_argo = pd.read_csv('Processed_data\\argouml_dependencies.csv')

df_prom = pd.read_csv('Processed_data\\prom_data.csv')
dep_prom = pd.read_csv('Processed_data\\prom_dependencies.csv')

df_lucene = pd.read_csv('Processed_data\\lucene_data.csv')
dep_lucene = pd.read_csv('Processed_data\\lucene_dependencies.csv')

df_common = pd.read_csv('Processed_data\\cimaging_data.csv')
dep_common = pd.read_csv('Processed_data\\cimaging_dependencies.csv')

datasets = {
    "ant": df_ant,
    "jabref": df_jab,
    "teammates": df_team,
    "sweetHome3D": df_sh,
    "argouml": df_argo,
    "prom": df_prom,
    "lucene": df_lucene,
    "common": df_common
}

dependencies = {
    "ant": dep_ant,
    "jabref": dep_jab,
    "teammates": dep_team,
    "sweetHome3D": dep_sh,
    "argouml": dep_argo,
    "prom": dep_prom,
    'lucene':dep_lucene,
    "common": dep_common
}

## 1 Generating Dataset

In [42]:
class DataNB():
    def __init__(self, df,df_dep):
        self.df = df
        self.df_dep =df_dep
        self.x = None
        self.Y = None
        self.n_classes = None
        self.label_encoder = None
        self.centrality_data = {
            "degree": np.array(df['Degree_Centrality'].values, dtype=float),
            "closeness": np.array(df['Closeness_Centrality'].values, dtype=float),
        }
        self.process_data()

    def process_data(self):
        """
        Processes the input data to create node features, labels, edge indices, and attributes.
        """
        self._preprocess_dataframes()
        self._generate_embeddings()

    def _preprocess_dataframes(self):
        """
        Preprocesses df and df_dep by mapping Source_File and Target_File from df.
        """
        self.df.fillna('', inplace=True)
        valid_ents = set(self.df['Entity'])
        self.df_dep = self.df_dep[self.df_dep['Source'].isin(valid_ents) & self.df_dep['Target'].isin(valid_ents)]
        entity_to_file = self.df.set_index('Entity')['File'].to_dict()

        # Map Source_File and Target_File in df_dep
        self.df_dep['Source_File'] = self.df_dep['Source'].map(entity_to_file)
        self.df_dep['Target_File'] = self.df_dep['Target'].map(entity_to_file)

    def _generate_embeddings(self):
        """
        Generates embeddings for CDA and Code columns, ensuring a consistent feature space for all rows.
        """
        self.df['CDA'] = self.df.apply(lambda row: self._generate_cda_text(row), axis=1).str.lower().str.split().str.join(' ')
        self.df['Code'] = self.df['Code'].apply(lambda code: self._clean_code_snippet(code) if isinstance(code, str) else '')
        corpus = self.df['CDA'].fillna('') + ' ' + self.df['Code'].fillna('')
        count_vectorizer = CountVectorizer()
        self.X = count_vectorizer.fit_transform(corpus).toarray()

        # Encode labels
        self.label_encoder = LabelEncoder()
        self.Y = self.label_encoder.fit_transform(self.df['Module'])
        self.num_classes = len(np.unique(self.Y))


    def _generate_cda_text(self, row):
        """
        Creates CDA text for a given entity by combining dependency text from df_dep
        where the entity appears as Source or Target.
        """
        entity = row['Entity']

        # Find matches where Entity is the Source
        source_matches = self.df_dep[self.df_dep['Source'] == entity]
        source_texts = [
            f"{match['Source_File']} {match['Dependency_Type']} {match['Target_File']}"
            for _, match in source_matches.iterrows()
            if pd.notna(match['Source_File']) and pd.notna(match['Target_File'])
        ]

        # Find matches where Entity is the Target
        target_matches = self.df_dep[self.df_dep['Target'] == entity]
        target_texts = [
            f"{match['Source_File']} {match['Dependency_Type']} {match['Target_File']}"
            for _, match in target_matches.iterrows()
            if pd.notna(match['Source_File']) and pd.notna(match['Target_File'])
        ]

        combined_texts = source_texts + target_texts
        return ' '.join(combined_texts)

    def _clean_code_snippet(self, code):
        """
        Cleans code snippets by removing comments, unnecessary characters, and normalizing whitespace.
        """
        code = re.sub(r'/\*.*?\*/|//.*', '', code, flags=re.DOTALL)
        code = code.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        code = re.sub(r'[^a-zA-Z0-9\s.]', '', code) # Remove non-alphanumeric characters except periods
        return code.strip()


    def generate_split(self, centrality_type='closeness', q_threshold=0.75, split_ratio=0.1):
        if centrality_type not in self.centrality_data:
            raise ValueError(f"Invalid centrality type '{centrality_type}'. Choose from {list(self.centrality_data.keys())}.")

        centrality_values = self.centrality_data[centrality_type]
        labels = self.Y

        all_indices = np.arange(len(labels))

        threshold_value = np.quantile(centrality_values, q_threshold)
        high_centrality_indices = all_indices[centrality_values >= threshold_value]
        num_train_samples = max(1, int(split_ratio * len(all_indices)))
        train_indices = np.random.choice(high_centrality_indices, min(len(high_centrality_indices), num_train_samples), replace=False).tolist()
        for cls in np.unique(labels):
            class_indices = all_indices[labels == cls]
            if not any(labels[train_indices] == cls):  
                random_entity = np.random.choice(class_indices)
                train_indices.append(random_entity)

        if len(train_indices) < num_train_samples:
            remaining_candidates = list(set(all_indices) - set(train_indices))
            additional_indices = np.random.choice(remaining_candidates, num_train_samples - len(train_indices), replace=False).tolist()
            train_indices.extend(additional_indices)
        elif len(train_indices) > num_train_samples:
            train_indices = train_indices[:num_train_samples]

        test_indices = np.array(list(set(all_indices) - set(train_indices)))
        np.random.shuffle(train_indices)
        np.random.shuffle(test_indices)

        return train_indices, test_indices


In [43]:
dataset = {}
for name, df, in datasets.items():
    dataset[name] = DataNB(df,dependencies[name])

## Iterative Learnin

In [55]:
def iterative_learning(X, Y, initial_mapping_indices, orphans_indices,
                       lambda_t=None,
                       test_mapped_entities=True,
                       verbose=True):
    mapped_entities = initial_mapping_indices
    orphans = orphans_indices
    predicted_labels = {}
    updated_entities = set()
    iteration = 0

    model = MultinomialNB()

    metrics_history = []  # To store metrics for each iteration

    while True:
        # Train the model on currently mapped entities
        X_train = X[mapped_entities]
        Y_train = Y[mapped_entities]
        model.fit(X_train, Y_train)

        # Predict labels and confidence scores for orphans
        X_orphans = X[orphans]
        probs = model.predict_proba(X_orphans)
        confidence_scores = np.max(probs, axis=1)
        predicted_classes = model.classes_[np.argmax(probs, axis=1)]

        # Determine the confidence threshold
        if lambda_t is not None:
            mean_confidence = confidence_scores.mean()
            std_confidence = confidence_scores.std()
            threshold = lambda_t if isinstance(lambda_t, float) else lambda_t(iteration)
            confidence_threshold = mean_confidence + std_confidence * threshold
            confidence_threshold = min(confidence_threshold, 0.99)
        else:
            confidence_threshold = 0.9


        # Identify high-confidence and low-confidence orphans
        high_conf_mask = confidence_scores >= confidence_threshold
        high_conf_indices = orphans[high_conf_mask]
        high_conf_labels = predicted_classes[high_conf_mask]
        low_conf_indices = orphans[~high_conf_mask]

        # Update mappings and labels for high-confidence orphans
        newly_mapped_count = 0
        for idx, label in zip(high_conf_indices, high_conf_labels):
            if idx not in predicted_labels:  # Check if it's a newly mapped entity
                predicted_labels[idx] = label
                newly_mapped_count += 1

        mapped_entities = np.concatenate([mapped_entities, high_conf_indices])
        orphans = low_conf_indices

        # Reevaluate predictions for mapped entities if enabled
        if test_mapped_entities and len(predicted_labels) > 0:
            X_mapped = X[list(predicted_labels.keys())]
            probs_mapped = model.predict_proba(X_mapped)
            confidence_scores_mapped = np.max(probs_mapped, axis=1)
            new_predictions = model.classes_[np.argmax(probs_mapped, axis=1)]

            for idx, score, new_label in zip(predicted_labels.keys(), confidence_scores_mapped, new_predictions):
                if score >= 0.9 and predicted_labels[idx] != new_label:
                    predicted_labels[idx] = new_label
                    updated_entities.add(idx)

        # Verbose iteration details
        if verbose and predicted_labels:
            mapped_entity_indices = list(predicted_labels.keys())
            mapped_predicted_labels = list(predicted_labels.values())
            true_labels = Y[mapped_entity_indices]

            # Calculate metrics
            current_f1_micro = f1_score(true_labels, mapped_predicted_labels, average='micro')
            current_f1_macro = f1_score(true_labels, mapped_predicted_labels, average='macro')
            current_precision_micro = precision_score(true_labels, mapped_predicted_labels, average='micro')
            current_precision_macro = precision_score(true_labels, mapped_predicted_labels, average='macro')
            current_recall_micro = recall_score(true_labels, mapped_predicted_labels, average='micro')
            current_recall_macro = recall_score(true_labels, mapped_predicted_labels, average='macro')

            # Store iteration metrics
            metrics_history.append({
                "iteration": iteration + 1,
                "initial_set_size": len(initial_mapping_indices),  # Include initial set size
                "f1_micro": current_f1_micro,
                "f1_macro": current_f1_macro,
                "precision_micro": current_precision_micro,
                "precision_macro": current_precision_macro,
                "recall_micro": current_recall_micro,
                "recall_macro": current_recall_macro,
                "mapped_ratio": (len(mapped_entities) + len(initial_mapping_indices)) / len(X),
                "remaining_orphans_ratio": len(orphans) / len(X),
            })

            print(f"Iteration {iteration + 1} - F1 Micro: {current_f1_micro:.3f}, "
                  f"F1 Macro: {current_f1_macro:.3f},"
                  f"Confidence Threshold: {confidence_threshold:.2f},"
                  f" Mapped Entities: {len(mapped_entities)}, "
                  f"Remaining Orphans: {len(orphans)}")

        # Stopping condition
        if len(high_conf_indices) == 0 or len(orphans) == 0:
            if verbose:
                print("No new entities mapped or no orphans left. Stopping iterations.")
            break

        iteration += 1

    # Final metrics calculation
    final_f1_macro, final_f1_micro = 0.0, 0.0
    final_precision_macro, final_precision_micro = 0.0, 0.0
    final_recall_macro, final_recall_micro = 0.0, 0.0

    if predicted_labels:
        mapped_entity_indices = list(predicted_labels.keys())
        mapped_predicted_labels = list(predicted_labels.values())
        true_labels = Y[mapped_entity_indices]

        final_f1_macro = f1_score(true_labels, mapped_predicted_labels, average='macro')
        final_f1_micro = f1_score(true_labels, mapped_predicted_labels, average='micro')
        final_precision_macro = precision_score(true_labels, mapped_predicted_labels, average='macro')
        final_precision_micro = precision_score(true_labels, mapped_predicted_labels, average='micro')
        final_recall_macro = recall_score(true_labels, mapped_predicted_labels, average='macro')
        final_recall_micro = recall_score(true_labels, mapped_predicted_labels, average='micro')

    # Return final metrics and history
    return {
        "initial_set_size": len(initial_mapping_indices),
        "final_mapped_size": len(mapped_entities),
        "final_unmapped_size": len(orphans),
        "final_f1_macro": final_f1_macro,
        "final_f1_micro": final_f1_micro,
        "final_precision_macro": final_precision_macro,
        "final_precision_micro": final_precision_micro,
        "final_recall_macro": final_recall_macro,
        "final_recall_micro": final_recall_micro,
        "metrics_history": metrics_history,
    }


### Iterative Learning with Tobias Strategy on Ant Data (𝓗=0.82)

### Tobias's Iterative Mapping Strategy

Tobias's approach, as outlined in the referenced study, relies on **iterative attraction-based mapping**. The core idea is to iteratively map unmapped entities (orphans) to architecture modules using various attraction functions. The process is governed by the following components:

---

#### 1. Initial Set and Orphans
- The **initial set** is generated by **removing a random subset** of known mappings from the system's ground truth mapping.
- Each module is ensured representation by at least one entity, ensuring the initial mapped set is never smaller than the number of modules.
- **Orphans** are the entities excluded from the initial set and considered candidates for automatic mapping. Some attraction functions (e.g., CountAttract) filter orphans based on dependency ratios to avoid unstable mappings early in the process.


#### 2. Attraction Functions
- Each function computes the association strength (attraction) of an orphan to each module.
- Examples:
  - `CountAttract`: Utilizes dependencies between entities.
  - `IRAttract`: Employs vector space models based on textual content.
  - `NBAttract`: Uses Naive Bayes to compute classification probabilities as attraction values.
- In `NBAttract`, features are derived from textual representations of entities. Classifier probabilities determine attraction, and higher probabilities indicate stronger association.


#### 3. Thresholding and Mapping Logic
- Orphans are mapped based on their **highest attraction score**. 
- If an orphan's attraction to a module exceeds a threshold (e.g., 90% for `NBAttract`), it is automatically mapped to that module.
- **Contextual Dependency Analysis (CDA)**:
  - Optionally applied for additional relevance.
  - Re-generates the orphan's text representation as if it were mapped to a candidate module, recalculating attraction scores. This is computationally intensive.


#### 4. Iterative Refinement
- The system iteratively updates the mapping until no new entities can be mapped (i.e., the size of the mapped set stops growing).
- Each iteration involves recalculating attraction values for orphans and re-evaluating their mapping to the updated module set.

---

As we can see that maping with Tobias Approach we hve some unmapped entties given that initial set size was 5% of the data.


### Tobiasstrategy wthout my quantile threshld logic

In [35]:
dataset.keys()

dict_keys(['ant', 'jabref', 'teammates', 'sweetHome3D', 'argouml', 'prom', 'lucene', 'common'])

## Training on Common-Imaging data

In [45]:
data =  dataset['common']
initial_mapping, orphans = data.generate_split(q_threshold=0.3, split_ratio=0.05)
#print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.2 if t < 3 else 1,
                            )

Iteration 1 - F1 Micro: 0.540, F1 Macro: 0.229,Confidence Threshold: 0.99, Mapped Entities: 279, Remaining Orphans: 50
Iteration 2 - F1 Micro: 0.964, F1 Macro: 0.809,Confidence Threshold: 0.95, Mapped Entities: 322, Remaining Orphans: 7
Iteration 3 - F1 Micro: 0.984, F1 Macro: 0.870,Confidence Threshold: 0.47, Mapped Entities: 323, Remaining Orphans: 6
Iteration 4 - F1 Micro: 0.984, F1 Macro: 0.869,Confidence Threshold: 0.42, Mapped Entities: 325, Remaining Orphans: 4
Iteration 5 - F1 Micro: 0.984, F1 Macro: 0.869,Confidence Threshold: 0.34, Mapped Entities: 326, Remaining Orphans: 3
Iteration 6 - F1 Micro: 0.981, F1 Macro: 0.865,Confidence Threshold: 0.29, Mapped Entities: 327, Remaining Orphans: 2
Iteration 7 - F1 Micro: 0.978, F1 Macro: 0.863,Confidence Threshold: 0.19, Mapped Entities: 328, Remaining Orphans: 1
Iteration 8 - F1 Micro: 0.974, F1 Macro: 0.859,Confidence Threshold: 0.18, Mapped Entities: 329, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping ite

In [46]:
data =  dataset['common']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.05

successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t=lambda t: 0.2 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")

metrics_df = pd.DataFrame(all_metrics)
output_path = os.path.join(logs_dir, 'common_imaging.csv')
metrics_df.to_csv(output_path, index=False)

Attempting Run 1: Initial Mapping Size 16
Iteration 1 - F1 Micro: 0.690, F1 Macro: 0.325,Confidence Threshold: 0.99, Mapped Entities: 287, Remaining Orphans: 42
Iteration 2 - F1 Micro: 0.984, F1 Macro: 0.869,Confidence Threshold: 0.93, Mapped Entities: 320, Remaining Orphans: 9
Iteration 3 - F1 Micro: 0.984, F1 Macro: 0.862,Confidence Threshold: 0.50, Mapped Entities: 322, Remaining Orphans: 7
Iteration 4 - F1 Micro: 0.987, F1 Macro: 0.873,Confidence Threshold: 0.43, Mapped Entities: 323, Remaining Orphans: 6
Iteration 5 - F1 Micro: 0.984, F1 Macro: 0.855,Confidence Threshold: 0.42, Mapped Entities: 325, Remaining Orphans: 4
Iteration 6 - F1 Micro: 0.984, F1 Macro: 0.855,Confidence Threshold: 0.34, Mapped Entities: 326, Remaining Orphans: 3
Iteration 7 - F1 Micro: 0.981, F1 Macro: 0.852,Confidence Threshold: 0.29, Mapped Entities: 327, Remaining Orphans: 2
Iteration 8 - F1 Micro: 0.978, F1 Macro: 0.849,Confidence Threshold: 0.19, Mapped Entities: 328, Remaining Orphans: 1
Iteration 9 -

## Training on ArgoUML data

In [13]:
data =  dataset['argouml']
initial_mapping, orphans = data.generate_split(q_threshold=0.3, split_ratio=0.02)
#print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.2 if t < 3 else 1,
                            )

Iteration 1 - F1 Micro: 0.577, F1 Macro: 0.234,Confidence Threshold: 0.99, Mapped Entities: 683, Remaining Orphans: 84
Iteration 2 - F1 Micro: 0.954, F1 Macro: 0.810,Confidence Threshold: 0.94, Mapped Entities: 745, Remaining Orphans: 22
Iteration 3 - F1 Micro: 0.961, F1 Macro: 0.850,Confidence Threshold: 0.81, Mapped Entities: 758, Remaining Orphans: 9
Iteration 4 - F1 Micro: 0.964, F1 Macro: 0.858,Confidence Threshold: 0.72, Mapped Entities: 759, Remaining Orphans: 8
Iteration 5 - F1 Micro: 0.961, F1 Macro: 0.856,Confidence Threshold: 0.57, Mapped Entities: 761, Remaining Orphans: 6
Iteration 6 - F1 Micro: 0.960, F1 Macro: 0.851,Confidence Threshold: 0.48, Mapped Entities: 763, Remaining Orphans: 4
Iteration 7 - F1 Micro: 0.960, F1 Macro: 0.851,Confidence Threshold: 0.36, Mapped Entities: 763, Remaining Orphans: 4
No new entities mapped or no orphans left. Stopping iterations.


In [47]:
data =  dataset['argouml']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.03

successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t=lambda t: 0.2 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")

metrics_df = pd.DataFrame(all_metrics)
output_path = os.path.join(logs_dir, 'argouml.csv')
metrics_df.to_csv(output_path, index=False)

Attempting Run 1: Initial Mapping Size 23
Iteration 1 - F1 Micro: 0.769, F1 Macro: 0.300,Confidence Threshold: 0.99, Mapped Entities: 654, Remaining Orphans: 113
Iteration 2 - F1 Micro: 0.956, F1 Macro: 0.814,Confidence Threshold: 0.97, Mapped Entities: 749, Remaining Orphans: 18
Iteration 3 - F1 Micro: 0.960, F1 Macro: 0.852,Confidence Threshold: 0.73, Mapped Entities: 757, Remaining Orphans: 10
Iteration 4 - F1 Micro: 0.963, F1 Macro: 0.860,Confidence Threshold: 0.73, Mapped Entities: 758, Remaining Orphans: 9
Iteration 5 - F1 Micro: 0.959, F1 Macro: 0.855,Confidence Threshold: 0.60, Mapped Entities: 761, Remaining Orphans: 6
Iteration 6 - F1 Micro: 0.959, F1 Macro: 0.854,Confidence Threshold: 0.48, Mapped Entities: 763, Remaining Orphans: 4
Iteration 7 - F1 Micro: 0.959, F1 Macro: 0.854,Confidence Threshold: 0.36, Mapped Entities: 763, Remaining Orphans: 4
No new entities mapped or no orphans left. Stopping iterations.
Run 1 completed successfully.
Attempting Run 2: Initial Mapping 

## Training on Jabref

In [52]:
data =  dataset['jabref']
initial_mapping, orphans = data.generate_split(q_threshold=0.3, split_ratio=0.03)
print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.2 if t < 3 else 1,
                            )

Initial Mapping Size: 30
Iteration 1 - F1 Micro: 0.851, F1 Macro: 0.364,Confidence Threshold: 0.98, Mapped Entities: 972, Remaining Orphans: 43
Iteration 2 - F1 Micro: 0.929, F1 Macro: 0.704,Confidence Threshold: 0.98, Mapped Entities: 1008, Remaining Orphans: 7
Iteration 3 - F1 Micro: 0.929, F1 Macro: 0.698,Confidence Threshold: 0.88, Mapped Entities: 1012, Remaining Orphans: 3
Iteration 4 - F1 Micro: 0.929, F1 Macro: 0.692,Confidence Threshold: 0.72, Mapped Entities: 1013, Remaining Orphans: 2
Iteration 5 - F1 Micro: 0.928, F1 Macro: 0.691,Confidence Threshold: 0.69, Mapped Entities: 1014, Remaining Orphans: 1
Iteration 6 - F1 Micro: 0.929, F1 Macro: 0.692,Confidence Threshold: 0.67, Mapped Entities: 1015, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.


In [56]:
data =  dataset['jabref']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.025
successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t=lambda t: 0.2 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")

metrics_df = pd.DataFrame(all_metrics)
metrics_df.to_csv(os.path.join(logs_dir, 'jabref.csv'),index=False)

Attempting Run 1: Initial Mapping Size 25
Iteration 1 - F1 Micro: 0.863, F1 Macro: 0.347,Confidence Threshold: 0.99, Mapped Entities: 968, Remaining Orphans: 47
Iteration 2 - F1 Micro: 0.928, F1 Macro: 0.703,Confidence Threshold: 0.99, Mapped Entities: 1011, Remaining Orphans: 4
Iteration 3 - F1 Micro: 0.926, F1 Macro: 0.697,Confidence Threshold: 0.83, Mapped Entities: 1013, Remaining Orphans: 2
Iteration 4 - F1 Micro: 0.926, F1 Macro: 0.697,Confidence Threshold: 0.67, Mapped Entities: 1014, Remaining Orphans: 1
Iteration 5 - F1 Micro: 0.925, F1 Macro: 0.697,Confidence Threshold: 0.56, Mapped Entities: 1015, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.
Run 1 completed successfully.
Attempting Run 2: Initial Mapping Size 25
Iteration 1 - F1 Micro: 0.812, F1 Macro: 0.377,Confidence Threshold: 0.99, Mapped Entities: 945, Remaining Orphans: 70
Iteration 2 - F1 Micro: 0.920, F1 Macro: 0.695,Confidence Threshold: 0.98, Mapped Entities: 1001, Remaining 

## Training on Ant Data

In [62]:
data =  dataset['ant']
initial_mapping, orphans = data.generate_split(q_threshold=0.3, split_ratio=0.02)
print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.2 if t < 3 else 1,
                            )

Initial Mapping Size: 18
Iteration 1 - F1 Micro Score: 0.494, Confidence Threshold: 0.982, Mapped Entities: 378, Remaining Orphans: 90
Iteration 2 - F1 Micro Score: 0.912, Confidence Threshold: 0.969, Mapped Entities: 451, Remaining Orphans: 17
Iteration 3 - F1 Micro Score: 0.943, Confidence Threshold: 0.792, Mapped Entities: 460, Remaining Orphans: 8
Iteration 4 - F1 Micro Score: 0.948, Confidence Threshold: 0.660, Mapped Entities: 462, Remaining Orphans: 6
Iteration 5 - F1 Micro Score: 0.948, Confidence Threshold: 0.527, Mapped Entities: 464, Remaining Orphans: 4
Iteration 6 - F1 Micro Score: 0.951, Confidence Threshold: 0.364, Mapped Entities: 465, Remaining Orphans: 3
Iteration 7 - F1 Micro Score: 0.951, Confidence Threshold: 0.343, Mapped Entities: 465, Remaining Orphans: 3
No new entities mapped or no orphans left. Stopping iterations.


In [57]:
data =  dataset['ant']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.04

successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t=lambda t: 0.2 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")
 
metrics_df = pd.DataFrame(all_metrics)
metrics_df.to_csv(os.path.join(logs_dir, 'ant.csv'),index=False)

Attempting Run 1: Initial Mapping Size 18
Iteration 1 - F1 Micro: 0.411, F1 Macro: 0.205,Confidence Threshold: 0.99, Mapped Entities: 422, Remaining Orphans: 46
Iteration 2 - F1 Micro: 0.959, F1 Macro: 0.925,Confidence Threshold: 0.96, Mapped Entities: 460, Remaining Orphans: 8
Iteration 3 - F1 Micro: 0.955, F1 Macro: 0.916,Confidence Threshold: 0.59, Mapped Entities: 464, Remaining Orphans: 4
Iteration 4 - F1 Micro: 0.960, F1 Macro: 0.925,Confidence Threshold: 0.36, Mapped Entities: 465, Remaining Orphans: 3
Iteration 5 - F1 Micro: 0.960, F1 Macro: 0.925,Confidence Threshold: 0.34, Mapped Entities: 465, Remaining Orphans: 3
No new entities mapped or no orphans left. Stopping iterations.
Run 1 completed successfully.
Attempting Run 2: Initial Mapping Size 18
Iteration 1 - F1 Micro: 0.391, F1 Macro: 0.246,Confidence Threshold: 0.99, Mapped Entities: 417, Remaining Orphans: 51
Iteration 2 - F1 Micro: 0.920, F1 Macro: 0.837,Confidence Threshold: 0.94, Mapped Entities: 457, Remaining Orpha

## Training on Teammates Data

In [79]:
data =  dataset['teammates']
initial_mapping, orphans = data.generate_split(q_threshold=0.3, split_ratio=0.02)
print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.1 if t < 3 else 1,
                            )

Initial Mapping Size: 25
Iteration 1 - F1 Micro Score: 0.590, Confidence Threshold: 0.997, Mapped Entities: 737, Remaining Orphans: 42
Iteration 2 - F1 Micro Score: 0.972, Confidence Threshold: 0.965, Mapped Entities: 770, Remaining Orphans: 9
Iteration 3 - F1 Micro Score: 0.972, Confidence Threshold: 0.877, Mapped Entities: 777, Remaining Orphans: 2
Iteration 4 - F1 Micro Score: 0.973, Confidence Threshold: 0.644, Mapped Entities: 778, Remaining Orphans: 1
Iteration 5 - F1 Micro Score: 0.973, Confidence Threshold: 0.599, Mapped Entities: 779, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.


In [58]:
data =  dataset['teammates']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.03
successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t=lambda t: 0.1 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")

metrics_df = pd.DataFrame(all_metrics)
metrics_df.to_csv(os.path.join(logs_dir, 'teammates.csv'),index=False)

Attempting Run 1: Initial Mapping Size 23
Iteration 1 - F1 Micro: 0.360, F1 Macro: 0.104,Confidence Threshold: 0.99, Mapped Entities: 753, Remaining Orphans: 26
Iteration 2 - F1 Micro: 0.973, F1 Macro: 0.877,Confidence Threshold: 0.97, Mapped Entities: 774, Remaining Orphans: 5
Iteration 3 - F1 Micro: 0.972, F1 Macro: 0.881,Confidence Threshold: 0.83, Mapped Entities: 777, Remaining Orphans: 2
Iteration 4 - F1 Micro: 0.974, F1 Macro: 0.882,Confidence Threshold: 0.65, Mapped Entities: 778, Remaining Orphans: 1
Iteration 5 - F1 Micro: 0.972, F1 Macro: 0.872,Confidence Threshold: 0.63, Mapped Entities: 779, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.
Run 1 completed successfully.
Attempting Run 2: Initial Mapping Size 23
Iteration 1 - F1 Micro: 0.743, F1 Macro: 0.245,Confidence Threshold: 0.99, Mapped Entities: 752, Remaining Orphans: 27
Iteration 2 - F1 Micro: 0.972, F1 Macro: 0.883,Confidence Threshold: 0.96, Mapped Entities: 772, Remaining Orpha

### Training on Lucene Data

In [85]:
data =  dataset['lucene']
initial_mapping, orphans = data.generate_split(q_threshold=0.3, split_ratio=0.03)
print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.2 if t < 3 else 1,
                            )

Initial Mapping Size: 13
Iteration 1 - F1 Micro Score: 0.532, Confidence Threshold: 0.998, Mapped Entities: 453, Remaining Orphans: 57
Iteration 2 - F1 Micro Score: 0.971, Confidence Threshold: 0.970, Mapped Entities: 499, Remaining Orphans: 11
Iteration 3 - F1 Micro Score: 0.963, Confidence Threshold: 0.758, Mapped Entities: 505, Remaining Orphans: 5
Iteration 4 - F1 Micro Score: 0.968, Confidence Threshold: 0.533, Mapped Entities: 506, Remaining Orphans: 4
Iteration 5 - F1 Micro Score: 0.966, Confidence Threshold: 0.491, Mapped Entities: 507, Remaining Orphans: 3
Iteration 6 - F1 Micro Score: 0.964, Confidence Threshold: 0.434, Mapped Entities: 508, Remaining Orphans: 2
Iteration 7 - F1 Micro Score: 0.964, Confidence Threshold: 0.358, Mapped Entities: 510, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.


In [59]:
data =  dataset['lucene']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.03

successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t=lambda t: 0.2 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")

metrics_df = pd.DataFrame(all_metrics)
metrics_df.to_csv(os.path.join(logs_dir, 'lucene.csv'),index=False)

Attempting Run 1: Initial Mapping Size 15
Iteration 1 - F1 Micro: 0.840, F1 Macro: 0.419,Confidence Threshold: 0.99, Mapped Entities: 464, Remaining Orphans: 46
Iteration 2 - F1 Micro: 0.973, F1 Macro: 0.974,Confidence Threshold: 0.96, Mapped Entities: 501, Remaining Orphans: 9
Iteration 3 - F1 Micro: 0.965, F1 Macro: 0.946,Confidence Threshold: 0.71, Mapped Entities: 505, Remaining Orphans: 5
Iteration 4 - F1 Micro: 0.967, F1 Macro: 0.960,Confidence Threshold: 0.53, Mapped Entities: 506, Remaining Orphans: 4
Iteration 5 - F1 Micro: 0.965, F1 Macro: 0.955,Confidence Threshold: 0.49, Mapped Entities: 507, Remaining Orphans: 3
Iteration 6 - F1 Micro: 0.963, F1 Macro: 0.949,Confidence Threshold: 0.43, Mapped Entities: 508, Remaining Orphans: 2
Iteration 7 - F1 Micro: 0.964, F1 Macro: 0.949,Confidence Threshold: 0.36, Mapped Entities: 510, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.
Run 1 completed successfully.
Attempting Run 2: Initial Mapping Siz

## Training on SweetHome-3D Data

In [94]:
data =  dataset['sweetHome3D']
initial_mapping, orphans = data.generate_split(q_threshold=0.3, split_ratio=0.05)
print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.2 if t < 3 else 1,
                            )

Initial Mapping Size: 11
Iteration 1 - F1 Micro Score: 0.647, Confidence Threshold: 0.996, Mapped Entities: 147, Remaining Orphans: 20
Iteration 2 - F1 Micro Score: 0.974, Confidence Threshold: 1.000, Mapped Entities: 165, Remaining Orphans: 2
Iteration 3 - F1 Micro Score: 0.974, Confidence Threshold: 0.984, Mapped Entities: 166, Remaining Orphans: 1
Iteration 4 - F1 Micro Score: 0.981, Confidence Threshold: 0.959, Mapped Entities: 167, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.


In [60]:
data =  dataset['sweetHome3D']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.05

successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t= None, #lambda t: 0.2 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")

metrics_df = pd.DataFrame(all_metrics)
metrics_df.to_csv(os.path.join(logs_dir, 'sweetHome3D_static.csv'),index=False)

Attempting Run 1: Initial Mapping Size 8
Iteration 1 - F1 Micro: 0.425, F1 Macro: 0.130,Confidence Threshold: 0.90, Mapped Entities: 161, Remaining Orphans: 6
Iteration 2 - F1 Micro: 0.962, F1 Macro: 0.906,Confidence Threshold: 0.90, Mapped Entities: 167, Remaining Orphans: 0
No new entities mapped or no orphans left. Stopping iterations.
Run did not progress beyond the initial iteration. Retrying...
Attempting Run 1: Initial Mapping Size 8
Iteration 1 - F1 Micro: 0.703, F1 Macro: 0.374,Confidence Threshold: 0.90, Mapped Entities: 153, Remaining Orphans: 14
Iteration 2 - F1 Micro: 0.968, F1 Macro: 0.919,Confidence Threshold: 0.90, Mapped Entities: 165, Remaining Orphans: 2
Iteration 3 - F1 Micro: 0.981, F1 Macro: 0.947,Confidence Threshold: 0.90, Mapped Entities: 166, Remaining Orphans: 1
Iteration 4 - F1 Micro: 0.981, F1 Macro: 0.947,Confidence Threshold: 0.90, Mapped Entities: 166, Remaining Orphans: 1
No new entities mapped or no orphans left. Stopping iterations.
Run 1 completed su

## Training on ProM Data

In [59]:
data =  dataset['prom']
initial_mapping, orphans = data.generate_split(q_threshold=0.6, split_ratio=0.05)
print(f'Initial Mapping Size: {len(initial_mapping)}')

metrics = iterative_learning(X = data.X,
                            Y = data.Y,
                            initial_mapping_indices = initial_mapping,
                            orphans_indices = orphans,
                            lambda_t= lambda t: 0.3 if t < 3 else 1,
                            )

Initial Mapping Size: 8
Iteration 1 - F1 Micro: 0.516, F1 Macro: 0.181,Confidence Threshold: 0.99, Mapped Entities: 225, Remaining Orphans: 36
Iteration 2 - F1 Micro: 0.937, F1 Macro: 0.775,Confidence Threshold: 0.89, Mapped Entities: 246, Remaining Orphans: 15
Iteration 3 - F1 Micro: 0.938, F1 Macro: 0.776,Confidence Threshold: 0.61, Mapped Entities: 249, Remaining Orphans: 12
Iteration 4 - F1 Micro: 0.934, F1 Macro: 0.773,Confidence Threshold: 0.56, Mapped Entities: 251, Remaining Orphans: 10
Iteration 5 - F1 Micro: 0.934, F1 Macro: 0.773,Confidence Threshold: 0.52, Mapped Entities: 252, Remaining Orphans: 9
Iteration 6 - F1 Micro: 0.934, F1 Macro: 0.773,Confidence Threshold: 0.51, Mapped Entities: 252, Remaining Orphans: 9
No new entities mapped or no orphans left. Stopping iterations.


In [61]:
data =  dataset['prom']
num_runs = 500
q_threshold = 0.3
all_metrics = []
split_ratio = 0.05

successful_runs = 0  
while successful_runs < num_runs:
    try:
        initial_mapping, orphans = data.generate_split(q_threshold=q_threshold, split_ratio=split_ratio)
        print(f'Attempting Run {successful_runs + 1}: Initial Mapping Size {len(initial_mapping)}')

        metrics = iterative_learning(
            X=data.X,
            Y=data.Y,
            initial_mapping_indices=initial_mapping,
            orphans_indices=orphans,
            lambda_t=None,  #lambda t: 0.2 if t < 3 else 1,
            test_mapped_entities=True,
        )
        if len(metrics["metrics_history"]) > 2:
            metrics["q_threshold"] = q_threshold
            metrics["split_ratio"] = split_ratio
            all_metrics.append(metrics)
            successful_runs += 1 
            print(f"Run {successful_runs} completed successfully.")
        else:
            print("Run did not progress beyond the initial iteration. Retrying...")

    except Exception as e:
        print(f"Run failed with error: {e}. Retrying...")

metrics_df = pd.DataFrame(all_metrics)
metrics_df.to_csv(os.path.join(logs_dir, 'prom_static.csv'),index=False)

Attempting Run 1: Initial Mapping Size 13
Iteration 1 - F1 Micro: 0.928, F1 Macro: 0.701,Confidence Threshold: 0.90, Mapped Entities: 234, Remaining Orphans: 27
Iteration 2 - F1 Micro: 0.927, F1 Macro: 0.759,Confidence Threshold: 0.90, Mapped Entities: 245, Remaining Orphans: 16
Iteration 3 - F1 Micro: 0.927, F1 Macro: 0.763,Confidence Threshold: 0.90, Mapped Entities: 246, Remaining Orphans: 15
Iteration 4 - F1 Micro: 0.927, F1 Macro: 0.763,Confidence Threshold: 0.90, Mapped Entities: 246, Remaining Orphans: 15
No new entities mapped or no orphans left. Stopping iterations.
Run 1 completed successfully.
Attempting Run 2: Initial Mapping Size 13
Iteration 1 - F1 Micro: 0.902, F1 Macro: 0.678,Confidence Threshold: 0.90, Mapped Entities: 238, Remaining Orphans: 23
Iteration 2 - F1 Micro: 0.940, F1 Macro: 0.778,Confidence Threshold: 0.90, Mapped Entities: 246, Remaining Orphans: 15
Iteration 3 - F1 Micro: 0.936, F1 Macro: 0.774,Confidence Threshold: 0.90, Mapped Entities: 247, Remaining O

### Explanation of Our Iterative Learning Approach

Our iterative learning approach builds on the principles of **dynamic thresholding** and iterative entity mapping. It is designed to map all entities in a dataset, aiming for complete coverage while dynamically adjusting the confidence threshold to balance precision and recall.

---

#### **1. Initial Mapping**
- We start with a small subset of labeled data, selected using centrality-based heuristics to ensure a structurally significant representation. In some of my experiments I niotices that uit does not put a huge impact it just makes model' performance consistent to a certain degree avoiding outliers in the evaluation metrics given that if we randomly chosen data point that might be too densce in its graph representation or too isolated from all nodes.
- This subset serves as the initial training data for the model.


#### **2. Model Training**
- A Naive Bayes classifier is trained on the initial set to learn the mappings from entities to classes.
- The model generates confidence scores (probabilities) for the mapping of each entity to the possible classes.



#### **3. Confidence Thresholding**
- Unlike static thresholds, our approach dynamically adjusts the threshold based on:
  - The mean confidence score of predictions.
  - The standard deviation of confidence scores, scaled by a hyperparameter `λ` (lambda).
- This allows the system to adapt the threshold iteratively to ensure that high-confidence mappings are prioritized.



#### **4. Iterative Entity Mapping**
- In each iteration:
  - Entities (orphans) with confidence scores exceeding the threshold are mapped to the corresponding classes.
  - Mapped entities are added to the training set for the next iteration.
  - The remaining entities are reevaluated in subsequent iterations until no more entities can be confidently mapped.


#### **5. Re-evaluation of Mapped Entities**
- For entities mapped in previous iterations, the model reassesses their predictions.
- If a better class prediction is made with confidence >90%, the mapping is updated, ensuring incremental refinement.



#### **6. Stopping Condition**
- The process stops when:
  - No new entities are mapped in an iteration.
  - All entities have been mapped.

#### **7. Advantages and Challenges**
- **Advantages**:
  - Dynamically adjusts to dataset-specific characteristics.
  - Ensures that all entities are eventually mapped, leaving no unmapped ones.

- **Challenges**:
  - Tends to map all entities, even low-confidence ones in later iterations, which can hurt metrics like macro-average F1-score.
  - Works better with neural network models, where confidence scores span a broader range, than with Naive Bayes models, where confidence scores are often high for most predictions.


### Iteratively Increasing lambda threshold.

In our iterative learning approach, the lambda threshold dynamically adjusts during each iteration to align with the model's confidence scores and standard deviations. This ensures that the model can adapt to the data's characteristics and make informed mapping decisions. As the lambda value increases iteratively, the model becomes stricter in its mapping, leaving isolated or low-confidence nodes unmapped. This prevents overgeneralization and ensures robustness in the learned mappings. Intentionally leaving some isolated nodes unmapped aligns with the idea that certain nodes, which lack sufficient context or connection to the graph structure, should not be forced into a mapping. Tobias's approach works effectively with Naive Bayes due to its deterministic confidence score outputs, but such a strategy might not generalize well to neural networks, where confidence distributions can vary significantly.

For reference, dynamic thresholding techniques like ours are discussed in [Semi-Supervised Learning with Dynamic Thresholding](https://arxiv.org/abs/2109.00650?utm_source=chatgpt.com), which highlights the benefits of adjusting thresholds based on the data's characteristics.

The idea of iteratively increasing lambda_t is inspired by reinforcement learning, where epsilon is decreased over time to balance exploration and exploitation. Here, we reverse that logic by gradually increasing the lambda threshold. This strategy leverages the insight that the model's confidence score distribution narrows as iterations progress, ensuring a more cautious and refined decision-making process.


---

The following code block represents wehen we mapped the adata using our dynamic thresholding it shows the simialr metrices but tries to map all the entiites in the dataset that we can also control if we are making our  lambda_t threshold increasing with all the iterations. so our lambda_t threshold could be a float or a lambda function which grown with each iteration. Because when we run the model and come forward in the iteration process its highly like that the mean of the confidence scores and its std are dropping. so we can iteratively increase this lambda threshold.

## Key Differences in Our Approach

#### Dependence on Initial Mapping
- Tobias's strategy heavily relies on the random selection of the initial mapping, leading to variability in results. A poor initial set undermines performance.
- **Our approach** uses **centrality-based heuristics** to select structurally important nodes for the initial set, ensuring robustness and reducing reliance on randomness. Which still is a random process but I have seen the impact of this measure in making the model leave some nodes unmapped as for increased q_thresjhold model does nottry to over generalize itself.

#### Thresholding
- Tobias employs a **fixed 90% threshold** for `NBAttract`, which is dataset-agnostic and rigid.
- **Our approach** adopts an **adaptive thresholding mechanism**, considering the mean and standard deviation of confidence scores for each iteration. This ensures flexibility across datasets.



#### CDA Implementation
- Tobias's CDA implementation uses a reclassification method for orphans, increasing computational overhead. In their apporach they make sentences like Module $A$ implemnts Module $B$ if in the intial set entity $a$ belonging to module $A$ implements entity $b$ and attach these sentences to the training data and tries to predcit the new wntityes and also theese sentences when a prediction is made.

- **Our approach** simplifies representation generation, we simply created CDA sentences for all the entities based on their depenndecy on another enttity and do not replace the entity with its module name in eachteration. For Graph neural networks I have observed that these CDA sentences does not make any difference as syntactic dependecies and structural information is learned from the graph architecture. So having Cda or not having them has no impact on the model.
---
