# Evaluating model generations in terms of semantic leakage

## Installations and imports necessary to run the code

In [None]:
# ! pip3 install evaluate
# ! pip install bert_score
# ! pip install -U ipywidgets
# ! pip install openpyxl
# ! pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from evaluate import load
import numpy as np
from tqdm.notebook import tqdm as tqdm

## Loading backbone models to calculate semantic similarity for Leak-Rate

In [9]:
bertscore = load("bertscore")
st_minilm = SentenceTransformer("all-MiniLM-L6-v2")



## Functions to calculate Leak-Rate

In [49]:
def get_bert_score(concept, generation):
    """
    Compute the BERTScore similarity between a concept and a generation.

    Args:
        concept (str): The reference concept.
        generation (str): The generated text to compare against the concept.

    Returns:
        float: The similarity score (F1) between the concept and the generation.
    """
    result = bertscore.compute(predictions=[concept], references=[generation], model_type="distilbert-base-uncased")
    similarity = result["f1"][0]
    return similarity


def get_st_minilm_score(concept, generation):
    """
    Compute the similarity between a concept and a generation using SBERT MiniLM embeddings.

    Args:
        concept (str): The reference concept.
        generation (str): The generated text to compare against the concept.

    Returns:
        float: The similarity score between the concept and the generation.
    """
    embedding_concept = st_minilm.encode(concept)
    embedding_generation = st_minilm.encode(generation)
    similarity = st_minilm.similarity(embedding_concept, embedding_generation)
    similarity = float(similarity[0][0])
    return similarity


def get_leak_rate_for_pair(concept, control_generation, test_generation, score="bertscore"):
    """
    Determine the leak rate for a single pair of control and test generations compared to a concept.

    Args:
        concept (str): The reference concept.
        control_generation (str): The control generation.
        test_generation (str): The test generation.
        score (str, optional): The scoring method to use ('bertscore' or 'sb_minilm'). Defaults to 'bertscore'.

    Returns:
        float: The leak rate for the pair, where 1 indicates leakage, 0.5 indicates neutrality, and 0 indicates no leakage.
    """
    if score == "bertscore":
        score_control = round(get_bert_score(concept, control_generation), 3)
        score_test = round(get_bert_score(concept, test_generation), 3)
    elif score == "sb_minilm":
        score_control = round(get_st_minilm_score(concept, control_generation), 3)
        score_test = round(get_st_minilm_score(concept, test_generation), 3)

    if score_test > score_control:
        leak_rate = 1
    elif score_test == score_control:
        leak_rate = 0.5
    else:
        leak_rate = 0

    return leak_rate


def get_leak_rate_for_df(df, n_generations=1, score="bertscore", cat=None):
    """
    Calculate the average leak rate for a dataframe containing concepts and generations.

    Args:
        df (pd.DataFrame): The dataframe containing concepts, control generations, and test generations.
        n_generations (int, optional): The number of generations to evaluate for each concept. Defaults to 1.
        score (str, optional): The scoring method to use ('bertscore' or 'sb_minilm'). Defaults to 'bertscore'.
        cat (str, optional): A specific category to filter the rows by. If None, all rows are considered. Defaults to None.

    Returns:
        float: The average leak rate across all evaluated pairs in the dataframe.
    """
    bert_scores = []

    for _, row in df.iterrows():

        if cat and row["category"] != cat:
            continue

        concept = row["concept"]
        n_control_row = row["n_control"]

        if n_control_row is None or n_control_row == "none" or (isinstance(n_control_row, float) and np.isnan(n_control_row)):
            continue

        control_row = df.iloc[int(n_control_row)]

        for n_gen in range(n_generations):
            test_generation = row[f"generation_{n_gen+1}"]
            control_generation = control_row[f"generation_{n_gen+1}"]
            leak_rate_for_pair = get_leak_rate_for_pair(concept, control_generation, test_generation, score=score)
            bert_scores.append(leak_rate_for_pair)

    leak_rate = np.mean(bert_scores)
    return leak_rate


def get_leak_rates_for_all_model_sizes(filenames, n_generations=1):
    """
    Calculate leak rates for multiple model sizes using BERTScore and SBERT MiniLM.

    Args:
        filenames (list of str): List of file paths to the data files.
        n_generations (int, optional): Number of generations to evaluate. Defaults to 1.

    Returns:
        dict: Dictionary containing leak rates for each file and scoring method.
    """
    scores = {}

    for filename in filenames:
        scores[filename] = {}
        df = pd.read_excel(filename)
        bs = get_leak_rate_for_df(
            df,
            n_generations=n_generations,
            score="bertscore"
        )
        sb_minilm = get_leak_rate_for_df(
            df,
            n_generations=n_generations,
            score="sb_minilm"
        )
        scores[filename]['bertscore'] = bs
        scores[filename]['sb_mililm'] = sb_minilm

    return scores

def get_leak_rates_for_separate_categories(filenames, n_generations=1):
    """
    Calculate leak rates for separate categories within data files using BERTScore and SBERT MiniLM.

    Args:
        filenames (list of str): List of file paths to the data files.
        n_generations (int, optional): Number of generations to evaluate. Defaults to 1.

    Returns:
        dict: Dictionary containing leak rates for each file, category, and scoring method.
    """
    scores_for_cat = {}

    for filename in tqdm(filenames, desc="Processing files"):
        scores_for_cat[filename] = {}
        df = pd.read_excel(filename)

        categories = df['category'].dropna().unique()
        bertscore_scores = {}
        sb_minilm_scores = {}

        for cat in categories:
            bs_cat = get_leak_rate_for_df(
                df,
                n_generations=n_generations,
                score="bertscore",
                cat=cat
            )
            sb_minilm_cat = get_leak_rate_for_df(
                df,
                n_generations=n_generations,
                score="sb_minilm",
                cat=cat
            )
            bertscore_scores[str(cat)] = bs_cat
            sb_minilm_scores[str(cat)] = sb_minilm_cat

        scores_for_cat[filename]['bertscore'] = bertscore_scores
        scores_for_cat[filename]['sb_mililm'] = sb_minilm_scores

    return scores_for_cat

## Leak-rate for original dataset

In [52]:
filenames_initial_prompts = [
    "initial_prompts/initial_prompts_qwen_05_instr.xlsx",
    "initial_prompts/initial_prompts_qwen_15_instr.xlsx",
    "initial_prompts/initial_prompts_qwen_3_instr.xlsx",
    "initial_prompts/initial_prompts_qwen_7_instr.xlsx",
]

scores = get_leak_rates_for_all_model_sizes(filenames_initial_prompts)

for filename, score_info in scores.items():
    print(f"{filename}:")

    for metric, value in score_info.items():
        print(f"{metric} = {round(value * 100, 2)}")

initial_prompts/initial_prompts_qwen_05_instr.xlsx:
bertscore = 69.27
sb_mililm = 77.52
initial_prompts/initial_prompts_qwen_15_instr.xlsx:
bertscore = 75.23
sb_mililm = 79.36
initial_prompts/initial_prompts_qwen_3_instr.xlsx:
bertscore = 83.03
sb_mililm = 73.85
initial_prompts/initial_prompts_qwen_7_instr.xlsx:
bertscore = 71.1
sb_mililm = 78.44


## Leak-rate for color-related dataset

In [54]:
filenames_color_related_prompts = [
    "color_prompts/color_extended/color_prompts_qwen_05_instr_ext.xlsx",
    "color_prompts/color_extended/color_prompts_qwen_15_instr_ext.xlsx",
    "color_prompts/color_extended/color_prompts_qwen_3_instr_ext.xlsx",
    "color_prompts/color_extended/color_prompts_qwen_7_instr_ext.xlsx"
]

scores_col = get_leak_rates_for_all_model_sizes(filenames_color_related_prompts, n_generations=5)

for filename, score_info in scores.items():
    print(f"{filename}:")

    for metric, value in score_info.items():
        print(f"{metric} = {round(value * 100, 2)}")

initial_prompts/initial_prompts_qwen_05_instr.xlsx:
bertscore = 69.27
sb_mililm = 77.52
initial_prompts/initial_prompts_qwen_15_instr.xlsx:
bertscore = 75.23
sb_mililm = 79.36
initial_prompts/initial_prompts_qwen_3_instr.xlsx:
bertscore = 83.03
sb_mililm = 73.85
initial_prompts/initial_prompts_qwen_7_instr.xlsx:
bertscore = 71.1
sb_mililm = 78.44


## Leak_rate for color-related dataset by category

In [55]:
scores_for_cat = get_leak_rates_for_separate_categories(filenames_color_related_prompts, n_generations=5)

for filename, score_info in scores_for_cat.items():
    print(f"{filename}:")

    for metric, cat_info in score_info.items():
        for cat, value in cat_info.items():
            if value:
                print(f"cat {cat}: {metric} = {round(value * 100, 2)}")

Processing files:   0%|          | 0/4 [00:00<?, ?it/s]

color_prompts/color_extended/color_prompts_qwen_05_instr_ext.xlsx:
cat 1: bertscore = 65.03
cat 2: bertscore = 56.33
cat 3: bertscore = 71.22
cat 1: sb_mililm = 70.61
cat 2: sb_mililm = 51.85
cat 3: sb_mililm = 70.61
color_prompts/color_extended/color_prompts_qwen_15_instr_ext.xlsx:
cat 1: bertscore = 73.18
cat 2: bertscore = 70.36
cat 3: bertscore = 66.33
cat 1: sb_mililm = 72.79
cat 2: sb_mililm = 66.36
cat 3: sb_mililm = 61.02
color_prompts/color_extended/color_prompts_qwen_3_instr_ext.xlsx:
cat 1: bertscore = 70.45
cat 2: bertscore = 84.36
cat 3: bertscore = 57.76
cat 1: sb_mililm = 73.76
cat 2: sb_mililm = 86.91
cat 3: sb_mililm = 72.45
color_prompts/color_extended/color_prompts_qwen_7_instr_ext.xlsx:
cat 1: bertscore = 88.45
cat 2: bertscore = 58.36
cat 3: bertscore = 60.82
cat 1: sb_mililm = 70.3
cat 2: sb_mililm = 62.48
cat 3: sb_mililm = 64.9
