In [1]:
# 1. Clone your GitHub repository
!git clone https://github.com/rafidreezwan/nlp-robustness-study.git

# 2. Navigate into your project directory
%cd nlp-robustness-study

# 3. Install all the required Python libraries (sentencepiece is new)
!pip install -q datasets transformers torch sentencepiece nltk pandas tqdm sentence-transformers

# 4. Download the necessary NLTK data
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Download punkt_tab as suggested by the error

print("\n✅✅✅ Setup complete! You are ready to run the new experiment. ✅✅✅")

fatal: destination path 'nlp-robustness-study' already exists and is not an empty directory.
/content/nlp-robustness-study
[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



✅✅✅ Setup complete! You are ready to run the new experiment. ✅✅✅


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face access token from Colab's secret store. The secret
# must already be saved under the name 'HF_TOKEN'.
HF_TOKEN = userdata.get('HF_TOKEN')

# Log in to the Hugging Face Hub with that token
login(token=HF_TOKEN)

# NOTE(review): this message is printed unconditionally once login() returns;
# presumably login() raises on a bad token — confirm before relying on it.
print("✅ Successfully logged into Hugging Face!")

✅ Successfully logged into Hugging Face!


In [3]:
# ==============================================================================
# FINAL SCRIPT: GENERATING DIPLOMATIC SENTENCES WITH THE OPENAI API
# ==============================================================================

# --- Part 1: Imports and Setup ---
# First, we need to install the OpenAI library
!pip install -q openai

import pandas as pd
import nltk
from tqdm import tqdm
from datasets import load_dataset
from google.colab import userdata
from openai import OpenAI
import time
import re

def generate_diplomatic_with_openai(num_root_sentences=200,
                                    output_file='diplomatic_sentences_openai.csv',
                                    model_name="gpt-3.5-turbo",
                                    max_retries=3,
                                    retry_delay_seconds=5):
    """Generate polite ("diplomatic") rewrites of IMDB sentences with the OpenAI API.

    Sentences of 11-39 words are extracted from label-0 (negative) reviews of
    the IMDB training split, each one is rewritten by a chat model using a
    few-shot prompt, and the (root, rewrite) pairs are written to a CSV file.

    Args:
        num_root_sentences: Number of root sentences to extract and rewrite.
        output_file: Path of the CSV file to write.
        model_name: OpenAI chat model used for the rewrites.
        max_retries: Attempts per sentence before it is recorded as failed.
        retry_delay_seconds: Pause between attempts after an API error.

    Returns:
        None. The CSV has the columns 'root_sentence', 'perturbed_sentence'
        and 'method'. A sentence whose attempts all fail keeps its row, with
        'perturbed_sentence' set to "GENERATION_FAILED". If the API client
        cannot be configured, an error is printed and nothing is written.
    """

    # --- Part 2: Configuration ---
    NUM_ROOT_SENTENCES = num_root_sentences
    OUTPUT_FILE = output_file
    FAILED_MARKER = "GENERATION_FAILED"

    # Configure the OpenAI API client with your secret key
    try:
        OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
        client = OpenAI(api_key=OPENAI_API_KEY)
    except Exception as e:
        print("---!!! ERROR: Could not configure the OpenAI API. !!!---")
        print("Please make sure you have created an API key and saved it as the secret 'OPENAI_API_KEY'.")
        # Show the underlying cause instead of silently discarding it.
        print(f"Details: {type(e).__name__}: {e}")
        return

    print("="*50)
    print("🚀 STARTING DIPLOMATIC GENERATION WITH OPENAI API 🚀")
    print(f"Number of root sentences: {NUM_ROOT_SENTENCES}")
    print("="*50)

    # --- Part 3: Load and Prepare Data ---
    print("\n[PHASE 1/2] Loading and preparing original sentences...")
    imdb_dataset = load_dataset("imdb", split='train')
    negative_reviews = imdb_dataset.filter(lambda example: example['label'] == 0)
    # Look at up to three reviews per requested sentence, but never index past
    # the end of the filtered split.
    sample_size = min(NUM_ROOT_SENTENCES * 3, len(negative_reviews))
    source_reviews = negative_reviews.select(range(sample_size))
    root_sentences = []
    for review in source_reviews:
        if len(root_sentences) >= NUM_ROOT_SENTENCES:
            break
        for sentence in nltk.sent_tokenize(review['text']):
            # Strip HTML tags such as <br /> left in the raw review text.
            cleaned_sentence = re.sub(r'<.*?>', ' ', sentence).strip()
            if 10 < len(cleaned_sentence.split()) < 40:
                root_sentences.append(cleaned_sentence)
                if len(root_sentences) >= NUM_ROOT_SENTENCES:
                    break
    print(f"Successfully extracted and cleaned {len(root_sentences)} root sentences.")

    # --- Part 4: Generate Diplomatic Sentences via OpenAI API ---
    print("\n[PHASE 2/2] Generating Diplomatic Sentences with Few-Shot Prompt...")

    # The system prompt is identical for every sentence, so it is built once.
    # Its text (including the leading spaces on each line) is kept exactly as
    # it was sent before.
    system_prompt = """
        You are an expert movie critic. Your task is to rewrite a negative criticism in a polite, professional, and indirect way. Follow the examples below precisely.

        -- Example 1 --
        Criticism: "The plot was boring and predictable."
        Rewritten Criticism: "The narrative followed a very traditional structure, which made some of the twists foreseeable."

        -- Example 2 --
        Criticism: "The acting was terrible."
        Rewritten Criticism: "While the actors were committed, their performances didn't always land with the intended emotional impact."
        """

    diplomatic_sentences = []
    for sentence in tqdm(root_sentences, desc="Generating Diplomatic Text"):
        user_prompt = f'Criticism: "{sentence}"\nRewritten Criticism:'

        # Start from the failure marker so that exactly one entry is recorded
        # per root sentence, whatever happens in the retry loop.
        rewritten_text = FAILED_MARKER
        for attempt in range(1, max_retries + 1):
            try:
                chat_completion = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    model=model_name,
                )
                rewritten_text = chat_completion.choices[0].message.content.strip()
                break  # Success
            except Exception as e:
                if attempt < max_retries:
                    print(f"\n---! API Error for sentence: '{sentence}' ({type(e).__name__}: {e}). Retrying ({attempt}/{max_retries})... !---")
                    time.sleep(retry_delay_seconds)
                else:
                    print(f"\n---!!! API Error after {max_retries} retries ({type(e).__name__}: {e}). Skipping. !!!---")
        diplomatic_sentences.append(rewritten_text)

        # No artificial delay needed for paid keys with high rate limits. This will be fast.

    # --- Part 5: Assemble and Save ---
    final_data = [
        {
            'root_sentence': root,
            'perturbed_sentence': rewrite,
            'method': 'diplomatic-attack'
        }
        for root, rewrite in zip(root_sentences, diplomatic_sentences)
    ]

    # Explicit columns keep the CSV header intact even when no rows were produced.
    df_diplomatic = pd.DataFrame(final_data, columns=['root_sentence', 'perturbed_sentence', 'method'])
    df_diplomatic.to_csv(OUTPUT_FILE, index=False)

    failed_count = diplomatic_sentences.count(FAILED_MARKER)
    if failed_count:
        print(f"\n---! {failed_count} sentence(s) could not be rewritten and are marked '{FAILED_MARKER}'. !---")
    print(f"\n✅ High-quality diplomatic sentences saved to '{OUTPUT_FILE}'")
    print("Generation complete!")

# Call the function in a new cell
# generate_diplomatic_with_openai()

[0m

In [4]:
# Run the generation pipeline: calls the OpenAI API once per root sentence and writes the CSV.
generate_diplomatic_with_openai()

🚀 STARTING DIPLOMATIC GENERATION WITH OPENAI API 🚀
Number of root sentences: 200

[PHASE 1/2] Loading and preparing original sentences...


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Successfully extracted and cleaned 200 root sentences.

[PHASE 2/2] Generating Diplomatic Sentences with Few-Shot Prompt...


Generating Diplomatic Text: 100%|██████████| 200/200 [02:46<00:00,  1.20it/s]


✅ High-quality diplomatic sentences saved to 'diplomatic_sentences_openai.csv'
Generation complete!





In [5]:
# ==============================================================================
# FINAL STEP: COMBINE AND EVALUATE
# ==============================================================================
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
import os

def combine_and_evaluate_final():
    """
    Combine the paraphrase and OpenAI-generated diplomatic datasets and run the
    final retrieval evaluation.

    Each perturbed sentence is embedded with the evaluation model and matched,
    by cosine similarity, against the embeddings of all unique root sentences.
    A test case counts as correct when the best match is its own root sentence.

    Rows that cannot be evaluated fairly are removed before anything is saved
    or scored: rows with a missing root or perturbed sentence, and rows whose
    perturbed sentence is the 'GENERATION_FAILED' placeholder. Scoring those
    placeholders as real attacks would distort the per-method accuracy.

    Writes the combined dataset and the per-row results to CSV and prints the
    overall and per-method accuracy. Returns None; it returns early, after
    printing an error, when an input file is missing or no test cases remain.
    """

    # --- Configuration ---
    PARAPHRASE_DATA_FILE = 'final_experiment_dataset_cleaned.csv' # The file with your good paraphrase data
    DIPLOMATIC_DATA_FILE = 'diplomatic_sentences_openai.csv'      # The file you just created with OpenAI

    FINAL_RESULTS_FILE = 'final_hotcake_results.csv'
    FINAL_COMBINED_DATASET_FILE = 'final_hotcake_dataset.csv'
    EVALUATION_MODEL = 'all-MiniLM-L6-v2'
    # Placeholder written by generate_diplomatic_with_openai() when every API attempt failed.
    FAILED_GENERATION_MARKER = 'GENERATION_FAILED'

    print("="*50)
    print("🚀 STARTING FINAL COMBINATION AND EVALUATION 🚀")
    print("="*50)

    # --- Load and Combine Datasets ---
    print("\n[PHASE 1/2] Loading and combining datasets...")

    if not os.path.exists(PARAPHRASE_DATA_FILE) or not os.path.exists(DIPLOMATIC_DATA_FILE):
        print(f"---!!! ERROR: Make sure both '{PARAPHRASE_DATA_FILE}' and '{DIPLOMATIC_DATA_FILE}' exist! !!!---")
        return

    df_previous = pd.read_csv(PARAPHRASE_DATA_FILE)
    df_paraphrasing = df_previous[df_previous['method'] == 'paraphrasing'].copy()

    df_diplomatic = pd.read_csv(DIPLOMATIC_DATA_FILE)

    df_combined = pd.concat([df_paraphrasing, df_diplomatic], ignore_index=True)

    # Remove rows that are not real test cases (see docstring).
    unusable = (
        df_combined['root_sentence'].isna()
        | df_combined['perturbed_sentence'].isna()
        | (df_combined['perturbed_sentence'].astype(str).str.strip() == FAILED_GENERATION_MARKER)
    )
    if unusable.any():
        print(f"---! Dropping {int(unusable.sum())} row(s) with missing text or failed generation. !---")
        df_combined = df_combined[~unusable].reset_index(drop=True)

    df_combined.to_csv(FINAL_COMBINED_DATASET_FILE, index=False)

    root_sentences = df_combined['root_sentence'].unique().tolist()

    print(f"Successfully combined datasets. Total test cases: {len(df_combined)}")

    if df_combined.empty:
        # Nothing to score; also avoids dividing by zero below.
        print("---!!! ERROR: No test cases left to evaluate. !!!---")
        return

    # --- Run Final Evaluation ---
    print("\n[PHASE 2/2] Evaluating model performance...")
    model = SentenceTransformer(EVALUATION_MODEL)
    # The candidate "answers" are embedded once and reused for every query.
    corpus_embeddings = model.encode(root_sentences, convert_to_tensor=True, show_progress_bar=True)

    correct_predictions = 0
    results_list = []

    for _, row in tqdm(df_combined.iterrows(), total=df_combined.shape[0], desc="Evaluating"):
        true_root, query, method = row['root_sentence'], row['perturbed_sentence'], row['method']
        query_embedding = model.encode(query, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        predicted_root = root_sentences[torch.argmax(cos_scores).item()]

        is_correct = (predicted_root == true_root)
        if is_correct:
            correct_predictions += 1

        results_list.append({
            'root_sentence': true_root, 'perturbed_sentence': query, 'method': method,
            'predicted_root': predicted_root, 'is_correct': is_correct
        })

    # --- Display and Save Final Results ---
    df_results = pd.DataFrame(results_list)
    df_results.to_csv(FINAL_RESULTS_FILE, index=False)

    overall_accuracy = (correct_predictions / len(df_results)) * 100
    accuracy_by_method = df_results.groupby('method')['is_correct'].mean() * 100

    print("\n" + "="*50)
    print("🎉 FINAL 'HOTCAKE' EXPERIMENT COMPLETE 🎉")
    print("="*50)
    print(f"\nModel Tested: {EVALUATION_MODEL}")
    print(f"Total Perturbed Sentences: {len(df_results)}")
    print(f"Correctly Identified: {correct_predictions}")
    print(f"Overall Accuracy: {overall_accuracy:.2f}%")
    print("\n--- Accuracy by Method ---")
    print(accuracy_by_method.to_string(float_format="%.2f%%"))
    print("="*50)
    print(f"\nDetailed results saved to '{FINAL_RESULTS_FILE}'")
    print("This is your true 'hotcake' result!")

In [6]:
# Combine the paraphrase and diplomatic datasets, then score them with the evaluation model.
combine_and_evaluate_final()

🚀 STARTING FINAL COMBINATION AND EVALUATION 🚀

[PHASE 1/2] Loading and combining datasets...
Successfully combined datasets. Total test cases: 400

[PHASE 2/2] Evaluating model performance...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 400/400 [00:03<00:00, 132.35it/s]


🎉 FINAL 'HOTCAKE' EXPERIMENT COMPLETE 🎉

Model Tested: all-MiniLM-L6-v2
Total Perturbed Sentences: 400
Correctly Identified: 338
Overall Accuracy: 84.50%

--- Accuracy by Method ---
method
diplomatic-attack   69.50%
paraphrasing        99.50%

Detailed results saved to 'final_hotcake_results.csv'
This is your true 'hotcake' result!





In [7]:
# --- RUN THIS CELL TO SAVE YOUR FINAL RESULTS ---



# Configure Git with your name and email

!git config --global user.name "rafidreezwan"

!git config --global user.email "rafidreezwan@gmail.com"


# Store your GitHub username and token

username = "rafidreezwan"

token = "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"


# Add, commit, and push the new result files

!git add final_hotcake_results.csv diplomatic_sentences_openai.csv

!git commit -m "feat: Add final results for hotcake experiment"


# Push to your repository using the token for authentication

!git push https://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@github.com/rafidreezwan/nlp-robustness-study.git

print("\n✅✅✅ Final results and dataset successfully saved to GitHub! ✅✅✅")

[main 86cd6f6] feat: Add final results for hotcake experiment
 2 files changed, 428 insertions(+), 227 deletions(-)
 create mode 100644 diplomatic_sentences_openai.csv
Enumerating objects: 6, done.
Counting objects: 100% (6/6), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 17.21 KiB | 2.87 MiB/s, done.
Total 4 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 2 local objects.[K
To https://github.com/rafidreezwan/nlp-robustness-study.git
   dd47d21..86cd6f6  main -> main

✅✅✅ Final results and dataset successfully saved to GitHub! ✅✅✅


In [8]:
# ==============================================================================
# FINAL ANALYSIS AND REPORTING SCRIPT
# ==============================================================================
import pandas as pd
import os

# Final analysis: reads the per-row evaluation results and writes two text
# reports — a quantitative summary and a file of qualitative examples.

# --- Configuration ---
RESULTS_FILE = 'final_hotcake_results.csv'
SUMMARY_OUTPUT_FILE = 'results_summary.txt'
EXAMPLES_OUTPUT_FILE = 'qualitative_examples.txt'

print(f"--- Starting Final Analysis on '{RESULTS_FILE}' ---")

if not os.path.exists(RESULTS_FILE):
    print(f"---!!! ERROR: Results file not found. Please ensure '{RESULTS_FILE}' is in your directory. !!!---")
else:
    df = pd.read_csv(RESULTS_FILE)

    # --- 1. Overall Metrics ---
    total_tests = len(df)
    total_success = df['is_correct'].sum()
    total_failure = total_tests - total_success
    # An empty results file would otherwise divide by zero.
    overall_accuracy = (total_success / total_tests) * 100 if total_tests else 0.0

    # --- 2. Per-Method Metrics ---
    accuracy_by_method = df.groupby('method')['is_correct'].mean() * 100
    success_by_method = df[df['is_correct'] == True].groupby('method').size()
    failure_by_method = df[df['is_correct'] == False].groupby('method').size()
    # Actual number of test cases per method; the report used to assume 200.
    total_by_method = df.groupby('method').size()

    # --- 3. Qualitative Evidence ---
    failed_cases = df[df['is_correct'] == False]
    successful_paraphrases = df[(df['is_correct'] == True) & (df['method'] == 'paraphrasing')].head(3)
    successful_diplomatic = df[(df['is_correct'] == True) & (df['method'] == 'diplomatic-attack')].head(3)

    # --- 4. Save Quantitative Summary to File ---
    # UTF-8 is requested explicitly so the reports do not depend on the platform default.
    with open(SUMMARY_OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write("==================================================\n")
        f.write("      QUANTITATIVE RESULTS SUMMARY\n")
        f.write("==================================================\n\n")
        f.write("--- Overall Performance ---\n")
        f.write(f"Total Sentences Tested: {total_tests}\n")
        f.write(f"Correctly Identified:   {total_success}\n")
        f.write(f"Failed to Identify:     {total_failure}\n")
        f.write(f"Overall Accuracy:       {overall_accuracy:.2f}%\n\n")
        f.write("--- Performance by Method ---\n")
        for method, accuracy in accuracy_by_method.items():
            method_total = total_by_method.get(method, 0)
            f.write(f"\nMethod: {method}\n")
            f.write(f"  - Accuracy: {accuracy:.2f}%\n")
            f.write(f"  - Successes: {success_by_method.get(method, 0)} / {method_total}\n")
            f.write(f"  - Failures:  {failure_by_method.get(method, 0)} / {method_total}\n")

    print(f"✅ Quantitative summary saved to '{SUMMARY_OUTPUT_FILE}'")

    # --- 5. Save Qualitative Examples to File ---
    with open(EXAMPLES_OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write("==================================================\n")
        f.write("      QUALITATIVE EVIDENCE EXAMPLES\n")
        f.write("==================================================\n\n")
        f.write("--- ALL FAILED CASES ---\n")
        for index, row in failed_cases.iterrows():
            f.write(f"\nMethod: {row['method']}\n")
            f.write(f"  - Root Sentence:      {row['root_sentence']}\n")
            f.write(f"  - Perturbed Sentence: {row['perturbed_sentence']}\n")
            f.write(f"  - Model's Incorrect Guess: {row['predicted_root']}\n")

        f.write("\n\n--- EXAMPLES OF SUCCESSFUL PARAPHRASES ---\n")
        for index, row in successful_paraphrases.iterrows():
            f.write(f"\nRoot Sentence:      {row['root_sentence']}\n")
            f.write(f"Perturbed Sentence: {row['perturbed_sentence']}\n")

        f.write("\n\n--- EXAMPLES OF SUCCESSFUL DIPLOMATIC ATTACKS ---\n")
        for index, row in successful_diplomatic.iterrows():
            f.write(f"\nRoot Sentence:      {row['root_sentence']}\n")
            f.write(f"Perturbed Sentence: {row['perturbed_sentence']}\n")

    print(f"✅ Qualitative examples saved to '{EXAMPLES_OUTPUT_FILE}'")
    print("\n--- Analysis Complete. You can now save everything to GitHub. ---")

--- Starting Final Analysis on 'final_hotcake_results.csv' ---
✅ Quantitative summary saved to 'results_summary.txt'
✅ Qualitative examples saved to 'qualitative_examples.txt'

--- Analysis Complete. You can now save everything to GitHub. ---


In [10]:
# ==============================================================================
# FINAL SCRIPT: MULTI-MODEL EVALUATION AND COMPARISON
# ==============================================================================

# --- Part 1: Imports ---
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
import os

def run_multi_model_evaluation():
    """
    Score several SentenceTransformer models on the combined dataset and
    report them side by side.

    For each model, every perturbed sentence is embedded and matched by cosine
    similarity against the embeddings of all unique root sentences; a case is
    correct when the closest root is its own. The overall accuracy and the
    accuracy for the 'paraphrasing' and 'diplomatic-attack' methods are printed
    as a leaderboard and saved (as plain numbers) to a CSV file.

    Returns None. If the dataset file is missing, an error is printed and the
    function returns without evaluating anything.
    """
    # --- Configuration ---
    dataset_path = 'final_hotcake_dataset.csv'                 # dataset produced by the combine step
    leaderboard_path = 'multi_model_comparison_results.csv'    # where the comparison table is saved
    candidate_models = (
        'all-MiniLM-L6-v2',         # Small, fast baseline
        'all-mpnet-base-v2',        # High-performance, medium size
        'BAAI/bge-large-en-v1.5',   # Large, state-of-the-art
    )
    accuracy_columns = ['overall_accuracy', 'paraphrasing_accuracy', 'diplomatic_accuracy']
    rule = "=" * 50

    print(rule)
    print("🚀 STARTING MULTI-MODEL EVALUATION 🚀")
    print(rule)

    # --- Load data (bail out early when the file is absent) ---
    if not os.path.exists(dataset_path):
        print(f"---!!! ERROR: Dataset file not found: '{dataset_path}' !!!---")
        print("Please ensure your final dataset is present before running.")
        return

    df_combined = pd.read_csv(dataset_path)
    root_sentences = df_combined['root_sentence'].unique().tolist()
    print(f"Successfully loaded {len(df_combined)} test cases from '{dataset_path}'.")

    # --- Evaluate each model in turn ---
    all_model_results = []
    for model_name in candidate_models:
        print(f"\n--- Testing Model: {model_name} ---")

        encoder = SentenceTransformer(model_name)

        # The root sentences act as the answer key and are embedded once per model.
        answer_key = encoder.encode(root_sentences, convert_to_tensor=True, show_progress_bar=True)

        progress = tqdm(
            df_combined.iterrows(),
            total=df_combined.shape[0],
            desc=f"Evaluating {model_name.split('/')[-1]}",
        )
        scored_rows = []
        for _, row in progress:
            expected_root = row['root_sentence']
            perturbed = row['perturbed_sentence']
            similarities = util.cos_sim(
                encoder.encode(perturbed, convert_to_tensor=True), answer_key
            )[0]
            best_match = root_sentences[torch.argmax(similarities).item()]

            record = dict(row)
            record['is_correct'] = (best_match == expected_root)
            scored_rows.append(record)

        # Summarise this model's performance.
        df_model_results = pd.DataFrame(scored_rows)
        per_method = df_model_results.groupby('method')['is_correct'].mean() * 100
        all_model_results.append({
            'model_name': model_name,
            'overall_accuracy': df_model_results['is_correct'].mean() * 100,
            'paraphrasing_accuracy': per_method.get('paraphrasing', 0),
            'diplomatic_accuracy': per_method.get('diplomatic-attack', 0)
        })

        # Release the model before the next one is loaded.
        del encoder
        torch.cuda.empty_cache()

    # --- Final report ---
    # A formatted copy is used for display; the numeric table is what gets saved.
    leaderboard = pd.DataFrame(all_model_results)
    printable = leaderboard.copy()
    for column in accuracy_columns:
        printable[column] = printable[column].map(lambda value: '{:.2f}%'.format(value))

    print("\n" + rule)
    print("🎉 MULTI-MODEL EVALUATION COMPLETE 🎉")
    print(rule)
    print("\n--- FINAL MODEL LEADERBOARD ---")
    print(printable.to_string(index=False))
    print(rule)

    # Saved without the '%' sign for easy data processing.
    leaderboard.to_csv(leaderboard_path, index=False)
    print(f"\n✅ Final comparison table saved to '{leaderboard_path}'")


# Call the function in a new cell to run it
# run_multi_model_evaluation()

In [11]:
# Evaluate every model listed in MODELS_TO_TEST and print/save the leaderboard.
run_multi_model_evaluation()

🚀 STARTING MULTI-MODEL EVALUATION 🚀
Successfully loaded 400 test cases from 'final_hotcake_dataset.csv'.

--- Testing Model: all-MiniLM-L6-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-MiniLM-L6-v2: 100%|██████████| 400/400 [00:02<00:00, 159.91it/s]



--- Testing Model: all-mpnet-base-v2 ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-mpnet-base-v2: 100%|██████████| 400/400 [00:04<00:00, 81.35it/s]



--- Testing Model: BAAI/bge-large-en-v1.5 ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating bge-large-en-v1.5: 100%|██████████| 400/400 [00:08<00:00, 48.02it/s]


🎉 MULTI-MODEL EVALUATION COMPLETE 🎉

--- FINAL MODEL LEADERBOARD ---
            model_name overall_accuracy paraphrasing_accuracy diplomatic_accuracy
      all-MiniLM-L6-v2           84.50%                99.50%              69.50%
     all-mpnet-base-v2           85.25%               100.00%              70.50%
BAAI/bge-large-en-v1.5           82.25%               100.00%              64.50%

✅ Final comparison table saved to 'multi_model_comparison_results.csv'





In [13]:
# ==============================================================================
# FINAL SCRIPT: DETAILED PER-MODEL REPORTING
# ==============================================================================

# --- Part 1: Imports ---
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
import os

def run_detailed_multi_model_evaluation():
    """
    Evaluate several SentenceTransformer models on the combined dataset and
    save a detailed report for each of them.

    For every model, each perturbed sentence is embedded and matched by cosine
    similarity against the embeddings of all unique root sentences. Per model,
    two files are written: a CSV with one row per test case (including the
    predicted root and whether it was correct) and a text report listing all
    failures plus a few successes. A summary table across models is saved to
    CSV and printed as a leaderboard.

    Returns None. If the dataset file is missing or contains no rows, an error
    is printed and the function returns without evaluating anything.
    """

    # --- Part 2: Configuration ---
    DATASET_FILE = 'final_hotcake_dataset.csv'
    FINAL_COMPARISON_FILE = 'multi_model_comparison_results.csv'

    MODELS_TO_TEST = [
        'all-MiniLM-L6-v2',
        'all-mpnet-base-v2',
        'BAAI/bge-large-en-v1.5'
    ]

    print("="*50)
    print("🚀 STARTING DETAILED MULTI-MODEL EVALUATION 🚀")
    print("="*50)

    # --- Part 3: Load Data ---
    if not os.path.exists(DATASET_FILE):
        print(f"---!!! ERROR: Dataset file not found: '{DATASET_FILE}' !!!---")
        return

    df_combined = pd.read_csv(DATASET_FILE)
    if df_combined.empty:
        # Without rows there is no 'is_correct' column to aggregate later on.
        print(f"---!!! ERROR: Dataset file '{DATASET_FILE}' contains no test cases. !!!---")
        return

    root_sentences = df_combined['root_sentence'].unique().tolist()
    print(f"Successfully loaded {len(df_combined)} test cases from '{DATASET_FILE}'.")

    # --- Part 4: Main Evaluation Loop ---
    all_model_results_summary = []

    for model_name in MODELS_TO_TEST:
        print(f"\n--- Testing Model: {model_name} ---")

        model = SentenceTransformer(model_name)
        # Root sentences are embedded once per model and reused for every query.
        corpus_embeddings = model.encode(root_sentences, convert_to_tensor=True, show_progress_bar=True)

        results_list = []
        for _, row in tqdm(df_combined.iterrows(), total=df_combined.shape[0], desc=f"Evaluating {model_name.split('/')[-1]}"):
            true_root, query = row['root_sentence'], row['perturbed_sentence']
            query_embedding = model.encode(query, convert_to_tensor=True)
            cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
            predicted_root = root_sentences[torch.argmax(cos_scores).item()]

            is_correct = (predicted_root == true_root)
            results_list.append({**row, 'predicted_root': predicted_root, 'is_correct': is_correct})

        # Create a DataFrame with the detailed results for THIS model
        df_model_results = pd.DataFrame(results_list)

        # --- Save detailed per-model files ---
        # Model ids may contain '/', which is not valid inside a file name.
        safe_model_name = model_name.replace('/', '_')

        # 1. Save the detailed CSV report
        detailed_csv_filename = f"detailed_results_{safe_model_name}.csv"
        df_model_results.to_csv(detailed_csv_filename, index=False)
        print(f"  -> Saved detailed CSV report to '{detailed_csv_filename}'")

        # 2. Save the qualitative TXT report
        qualitative_txt_filename = f"qualitative_report_{safe_model_name}.txt"
        _write_qualitative_report(df_model_results, model_name, qualitative_txt_filename)
        print(f"  -> Saved qualitative text report to '{qualitative_txt_filename}'")

        # --- Aggregate results for the final summary table ---
        overall_accuracy = df_model_results['is_correct'].mean() * 100
        accuracy_by_method = df_model_results.groupby('method')['is_correct'].mean() * 100

        all_model_results_summary.append({
            'model_name': model_name,
            'overall_accuracy': overall_accuracy,
            'paraphrasing_accuracy': accuracy_by_method.get('paraphrasing', 0),
            'diplomatic_accuracy': accuracy_by_method.get('diplomatic-attack', 0)
        })

        # Release the model before the next one is loaded.
        del model
        torch.cuda.empty_cache()

    # --- Part 5: Final Summary Report ---
    df_comparison = pd.DataFrame(all_model_results_summary)

    print("\n" + "="*50)
    print("🎉 MULTI-MODEL EVALUATION COMPLETE 🎉")
    print("="*50)

    # Save the final summary table to a CSV (numeric values, before formatting)
    df_comparison.to_csv(FINAL_COMPARISON_FILE, index=False)
    print(f"\n✅ Final summary table saved to '{FINAL_COMPARISON_FILE}'")

    # Format for clean printing
    for col in ['overall_accuracy', 'paraphrasing_accuracy', 'diplomatic_accuracy']:
        df_comparison[col] = df_comparison[col].map('{:.2f}%'.format)

    print("\n--- FINAL MODEL LEADERBOARD ---")
    print(df_comparison.to_string(index=False))
    print("="*50)


def _write_qualitative_report(df_model_results, model_name, report_path, num_success_examples=5):
    """Write all failed cases and the first few successful ones for one model to a text file.

    Args:
        df_model_results: Per-row results with the columns 'method',
            'root_sentence', 'perturbed_sentence', 'predicted_root' and
            'is_correct'.
        model_name: Model identifier shown in the report header.
        report_path: Path of the text file to write.
        num_success_examples: Number of successful cases to include.
    """
    is_correct = df_model_results['is_correct'].astype(bool)
    failed_cases = df_model_results[~is_correct]
    successful_cases = df_model_results[is_correct].head(num_success_examples)

    # UTF-8 is requested explicitly so that sentences with non-ASCII characters
    # can be written regardless of the platform's default encoding.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("==================================================\n")
        f.write(f"      QUALITATIVE REPORT FOR: {model_name}\n")
        f.write("==================================================\n\n")

        f.write(f"--- ALL {len(failed_cases)} FAILED CASES ---\n")
        if failed_cases.empty:
            f.write("No failures found.\n")
        else:
            for _, row in failed_cases.iterrows():
                f.write(f"\nMethod: {row['method']}\n")
                f.write(f"  - Root Sentence:      {row['root_sentence']}\n")
                f.write(f"  - Perturbed Sentence: {row['perturbed_sentence']}\n")
                f.write(f"  - Model's Incorrect Guess: {row['predicted_root']}\n")

        f.write("\n\n--- EXAMPLES OF SUCCESSES ---\n")
        for _, row in successful_cases.iterrows():
            f.write(f"\nMethod: {row['method']}\n")
            f.write(f"  - Root Sentence:      {row['root_sentence']}\n")
            f.write(f"  - Perturbed Sentence: {row['perturbed_sentence']}\n")

In [14]:
# Run the per-model evaluation; writes one CSV and one text report per model plus a summary table.
run_detailed_multi_model_evaluation()

🚀 STARTING DETAILED MULTI-MODEL EVALUATION 🚀
Successfully loaded 400 test cases from 'final_hotcake_dataset.csv'.

--- Testing Model: all-MiniLM-L6-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-MiniLM-L6-v2: 100%|██████████| 400/400 [00:02<00:00, 146.70it/s]


  -> Saved detailed CSV report to 'detailed_results_all-MiniLM-L6-v2.csv'
  -> Saved qualitative text report to 'qualitative_report_all-MiniLM-L6-v2.txt'

--- Testing Model: all-mpnet-base-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-mpnet-base-v2: 100%|██████████| 400/400 [00:05<00:00, 75.54it/s]


  -> Saved detailed CSV report to 'detailed_results_all-mpnet-base-v2.csv'
  -> Saved qualitative text report to 'qualitative_report_all-mpnet-base-v2.txt'

--- Testing Model: BAAI/bge-large-en-v1.5 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating bge-large-en-v1.5: 100%|██████████| 400/400 [00:07<00:00, 50.20it/s]

  -> Saved detailed CSV report to 'detailed_results_BAAI_bge-large-en-v1.5.csv'
  -> Saved qualitative text report to 'qualitative_report_BAAI_bge-large-en-v1.5.txt'

🎉 MULTI-MODEL EVALUATION COMPLETE 🎉

✅ Final summary table saved to 'multi_model_comparison_results.csv'

--- FINAL MODEL LEADERBOARD ---
            model_name overall_accuracy paraphrasing_accuracy diplomatic_accuracy
      all-MiniLM-L6-v2           84.50%                99.50%              69.50%
     all-mpnet-base-v2           85.25%               100.00%              70.50%
BAAI/bge-large-en-v1.5           82.25%               100.00%              64.50%





In [16]:
# ==============================================================================
# FINAL SCRIPT: EXTENDED MULTI-MODEL EVALUATION (5 MODELS)
# ==============================================================================

# --- Part 1: Imports ---
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
import os

def run_extended_multi_model_evaluation():
    """
    Evaluate five SentenceTransformer models on the perturbation dataset.

    For each model, every distinct root sentence in the dataset is embedded
    once. Each perturbed sentence is then embedded and matched to the root
    sentence with the highest cosine similarity; a test case counts as correct
    when the matched root equals the row's own 'root_sentence'.

    Side effects:
        - Writes 'detailed_results_<model>.csv' for each model (the original
          row plus 'predicted_root' and 'is_correct'); '/' in a model name is
          replaced by '_' in the filename.
        - Writes 'multi_model_comparison_results_5_models.csv' containing the
          numeric accuracy percentages per model.
        - Prints progress messages and a formatted leaderboard.

    Returns:
        None. If the dataset file is missing, empty, lacks a required column,
        or contains no rows, an error message is printed and the function
        returns before any model is loaded.

    Note:
        A later cell in this notebook defines a function with the same name;
        once that cell has run, it replaces this definition.
    """

    # --- Part 2: Configuration ---
    DATASET_FILE = 'final_hotcake_dataset.csv'
    FINAL_COMPARISON_FILE = 'multi_model_comparison_results_5_models.csv'
    # Columns read by the evaluation loop and by the per-method aggregation.
    REQUIRED_COLUMNS = ('root_sentence', 'perturbed_sentence', 'method')
    ACCURACY_COLUMNS = ['overall_accuracy', 'paraphrasing_accuracy', 'diplomatic_accuracy']

    # THE EXTENDED LIST OF MODELS TO TEST
    MODELS_TO_TEST = [
        'all-MiniLM-L6-v2',                 # Small, fast baseline
        'all-mpnet-base-v2',                # High-performance, medium size
        'BAAI/bge-large-en-v1.5',           # Large, state-of-the-art BGE
        'intfloat/e5-large-v2',             # New addition: E5 family
        'paraphrase-multilingual-mpnet-base-v2' # New addition: Multilingual
    ]

    print("="*50)
    print("🚀 STARTING EXTENDED MULTI-MODEL EVALUATION (5 MODELS) 🚀")
    print("="*50)

    # --- Part 3: Load Data ---
    if not os.path.exists(DATASET_FILE):
        print(f"---!!! ERROR: Dataset file not found: '{DATASET_FILE}' !!!---")
        return

    try:
        df_combined = pd.read_csv(DATASET_FILE)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row for pandas to parse.
        print(f"---!!! ERROR: Dataset file is empty: '{DATASET_FILE}' !!!---")
        return

    # Validate the schema up front so a malformed file fails with a clear
    # message instead of a KeyError after a model has already been downloaded.
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df_combined.columns]
    if missing_columns:
        print(f"---!!! ERROR: Dataset is missing required column(s): {missing_columns} !!!---")
        return

    # With zero rows there is no corpus to search and no 'is_correct' column
    # would ever be produced, so stop here.
    if df_combined.empty:
        print(f"---!!! ERROR: Dataset file contains no test cases: '{DATASET_FILE}' !!!---")
        return

    # The retrieval corpus is the set of distinct root sentences.
    root_sentences = df_combined['root_sentence'].unique().tolist()
    print(f"Successfully loaded {len(df_combined)} test cases from '{DATASET_FILE}'.")

    # --- Part 4: Main Evaluation Loop ---
    all_model_results_summary = []

    for model_name in MODELS_TO_TEST:
        print(f"\n--- Testing Model: {model_name} ---")

        model = SentenceTransformer(model_name)
        corpus_embeddings = model.encode(root_sentences, convert_to_tensor=True, show_progress_bar=True)

        results_list = []
        for _, row in tqdm(df_combined.iterrows(), total=df_combined.shape[0], desc=f"Evaluating {model_name.split('/')[-1]}"):
            true_root, query = row['root_sentence'], row['perturbed_sentence']
            query_embedding = model.encode(query, convert_to_tensor=True)
            # cos_sim returns a (1, n_roots) matrix for a single query; take its only row.
            cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
            predicted_root = root_sentences[torch.argmax(cos_scores).item()]

            is_correct = (predicted_root == true_root)
            results_list.append({**row, 'predicted_root': predicted_root, 'is_correct': is_correct})

        df_model_results = pd.DataFrame(results_list)

        # Save detailed per-model files ('/' is not valid inside a filename).
        safe_model_name = model_name.replace('/', '_')
        detailed_csv_filename = f"detailed_results_{safe_model_name}.csv"
        df_model_results.to_csv(detailed_csv_filename, index=False)
        print(f"  -> Saved detailed CSV report to '{detailed_csv_filename}'")

        # Aggregate results for the final summary table
        overall_accuracy = df_model_results['is_correct'].mean() * 100
        accuracy_by_method = df_model_results.groupby('method')['is_correct'].mean() * 100

        all_model_results_summary.append({
            'model_name': model_name,
            'overall_accuracy': overall_accuracy,
            # A method absent from the dataset is reported as 0.
            'paraphrasing_accuracy': accuracy_by_method.get('paraphrasing', 0),
            'diplomatic_accuracy': accuracy_by_method.get('diplomatic-attack', 0)
        })

        # Drop the model reference before loading the next one.
        del model
        torch.cuda.empty_cache()

    # --- Part 5: Final Report ---
    df_comparison = pd.DataFrame(all_model_results_summary)

    print("\n" + "="*50)
    print("🎉 EXTENDED MULTI-MODEL EVALUATION COMPLETE 🎉")
    print("="*50)

    # Persist the numeric values first; the columns are turned into formatted
    # strings only afterwards, for display.
    df_comparison.to_csv(FINAL_COMPARISON_FILE, index=False)
    print(f"\n✅ Final comparison table saved to '{FINAL_COMPARISON_FILE}'")

    for col in ACCURACY_COLUMNS:
        df_comparison[col] = df_comparison[col].map('{:.2f}%'.format)

    print("\n--- FINAL MODEL LEADERBOARD (5 MODELS) ---")
    print(df_comparison.to_string(index=False))
    print("="*50)

In [17]:
# Execute the five-model evaluation defined in the previous cell.
run_extended_multi_model_evaluation()

🚀 STARTING EXTENDED MULTI-MODEL EVALUATION (5 MODELS) 🚀
Successfully loaded 400 test cases from 'final_hotcake_dataset.csv'.

--- Testing Model: all-MiniLM-L6-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-MiniLM-L6-v2: 100%|██████████| 400/400 [00:02<00:00, 159.51it/s]


  -> Saved detailed CSV report to 'detailed_results_all-MiniLM-L6-v2.csv'

--- Testing Model: all-mpnet-base-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-mpnet-base-v2: 100%|██████████| 400/400 [00:05<00:00, 70.20it/s]


  -> Saved detailed CSV report to 'detailed_results_all-mpnet-base-v2.csv'

--- Testing Model: BAAI/bge-large-en-v1.5 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating bge-large-en-v1.5: 100%|██████████| 400/400 [00:08<00:00, 49.48it/s]


  -> Saved detailed CSV report to 'detailed_results_BAAI_bge-large-en-v1.5.csv'

--- Testing Model: intfloat/e5-large-v2 ---


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating e5-large-v2: 100%|██████████| 400/400 [00:08<00:00, 47.86it/s]


  -> Saved detailed CSV report to 'detailed_results_intfloat_e5-large-v2.csv'

--- Testing Model: paraphrase-multilingual-mpnet-base-v2 ---


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating paraphrase-multilingual-mpnet-base-v2: 100%|██████████| 400/400 [00:05<00:00, 78.77it/s]


  -> Saved detailed CSV report to 'detailed_results_paraphrase-multilingual-mpnet-base-v2.csv'

🎉 EXTENDED MULTI-MODEL EVALUATION COMPLETE 🎉

✅ Final comparison table saved to 'multi_model_comparison_results_5_models.csv'

--- FINAL MODEL LEADERBOARD (5 MODELS) ---
                           model_name overall_accuracy paraphrasing_accuracy diplomatic_accuracy
                     all-MiniLM-L6-v2           84.50%                99.50%              69.50%
                    all-mpnet-base-v2           85.25%               100.00%              70.50%
               BAAI/bge-large-en-v1.5           82.25%               100.00%              64.50%
                 intfloat/e5-large-v2           86.50%               100.00%              73.00%
paraphrase-multilingual-mpnet-base-v2           82.50%                99.50%              65.50%


In [18]:
# ==============================================================================
# FINAL SCRIPT: EXTENDED MULTI-MODEL EVALUATION (WITH ALL REPORTING)
# ==============================================================================

import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
import os

def run_extended_multi_model_evaluation():
    """
    Evaluate five SentenceTransformer models and write all per-model reports.

    For each model, every distinct root sentence in the dataset is embedded
    once. Each perturbed sentence is then embedded and matched to the root
    sentence with the highest cosine similarity; a test case counts as correct
    when the matched root equals the row's own 'root_sentence'.

    Side effects:
        - Writes 'detailed_results_<model>.csv' for each model (the original
          row plus 'predicted_root' and 'is_correct').
        - Writes 'qualitative_report_<model>.txt' for each model, listing every
          failed case and the first three successful ones (UTF-8 encoded).
        - Writes 'multi_model_comparison_results_5_models.csv' containing the
          numeric accuracy percentages per model.
        - Prints progress messages and a formatted leaderboard.
        '/' in a model name is replaced by '_' in the per-model filenames.

    Returns:
        None. If the dataset file is missing, empty, lacks a required column,
        or contains no rows, an error message is printed and the function
        returns before any model is loaded.
    """
    DATASET_FILE = 'final_hotcake_dataset.csv'
    FINAL_COMPARISON_FILE = 'multi_model_comparison_results_5_models.csv'
    # Columns read by the evaluation loop, the reports and the aggregation.
    REQUIRED_COLUMNS = ('root_sentence', 'perturbed_sentence', 'method')
    MODELS_TO_TEST = [
        'all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'BAAI/bge-large-en-v1.5',
        'intfloat/e5-large-v2', 'paraphrase-multilingual-mpnet-base-v2'
    ]

    print("="*50)
    print("🚀 STARTING EXTENDED MULTI-MODEL EVALUATION (5 MODELS) 🚀")
    print("="*50)

    if not os.path.exists(DATASET_FILE):
        print(f"---!!! ERROR: Dataset file not found: '{DATASET_FILE}' !!!---")
        return

    try:
        df_combined = pd.read_csv(DATASET_FILE)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row for pandas to parse.
        print(f"---!!! ERROR: Dataset file is empty: '{DATASET_FILE}' !!!---")
        return

    # Validate the schema up front so a malformed file fails with a clear
    # message instead of a KeyError after a model has already been downloaded.
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df_combined.columns]
    if missing_columns:
        print(f"---!!! ERROR: Dataset is missing required column(s): {missing_columns} !!!---")
        return

    # With zero rows there is no corpus to search and no 'is_correct' column
    # would ever be produced, so stop here.
    if df_combined.empty:
        print(f"---!!! ERROR: Dataset file contains no test cases: '{DATASET_FILE}' !!!---")
        return

    # The retrieval corpus is the set of distinct root sentences.
    root_sentences = df_combined['root_sentence'].unique().tolist()
    print(f"Successfully loaded {len(df_combined)} test cases.")

    all_model_results_summary = []
    for model_name in MODELS_TO_TEST:
        print(f"\n--- Testing Model: {model_name} ---")
        model = SentenceTransformer(model_name)
        corpus_embeddings = model.encode(root_sentences, convert_to_tensor=True, show_progress_bar=True)

        results_list = []
        for _, row in tqdm(df_combined.iterrows(), total=df_combined.shape[0], desc=f"Evaluating {model_name.split('/')[-1]}"):
            true_root, query = row['root_sentence'], row['perturbed_sentence']
            query_embedding = model.encode(query, convert_to_tensor=True)
            # cos_sim returns a (1, n_roots) matrix for a single query; take its only row.
            cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
            predicted_root = root_sentences[torch.argmax(cos_scores).item()]
            is_correct = (predicted_root == true_root)
            results_list.append({**row, 'predicted_root': predicted_root, 'is_correct': is_correct})

        df_model_results = pd.DataFrame(results_list)
        # '/' is not valid inside a filename.
        safe_model_name = model_name.replace('/', '_')

        # Save detailed CSV
        detailed_csv_filename = f"detailed_results_{safe_model_name}.csv"
        df_model_results.to_csv(detailed_csv_filename, index=False)
        print(f"  -> Saved detailed CSV report to '{detailed_csv_filename}'")

        # Save qualitative TXT report
        qualitative_txt_filename = f"qualitative_report_{safe_model_name}.txt"
        correct_mask = df_model_results['is_correct'].astype(bool)
        failed_cases = df_model_results[~correct_mask]
        success_examples = df_model_results[correct_mask].head(3)
        # Explicit UTF-8: the sentences may contain non-ASCII characters, and
        # the platform default encoding is not guaranteed to handle them.
        with open(qualitative_txt_filename, 'w', encoding='utf-8') as f:
            f.write(f"QUALITATIVE REPORT FOR: {model_name}\n{'='*50}\n\n")
            f.write(f"--- ALL {len(failed_cases)} FAILED CASES ---\n")
            for _, row in failed_cases.iterrows():
                f.write(f"\nMethod: {row['method']}\n  - Root:      {row['root_sentence']}\n  - Perturbed: {row['perturbed_sentence']}\n  - Guessed:   {row['predicted_root']}\n")
            f.write("\n\n--- EXAMPLES OF SUCCESSES ---\n")
            for _, row in success_examples.iterrows():
                f.write(f"\nMethod: {row['method']}\n  - Root:      {row['root_sentence']}\n  - Perturbed: {row['perturbed_sentence']}\n")
        print(f"  -> Saved qualitative text report to '{qualitative_txt_filename}'")

        # Aggregate results for summary
        overall_accuracy = df_model_results['is_correct'].mean() * 100
        accuracy_by_method = df_model_results.groupby('method')['is_correct'].mean() * 100
        all_model_results_summary.append({
            'model_name': model_name, 'overall_accuracy': overall_accuracy,
            # A method absent from the dataset is reported as 0.
            'paraphrasing_accuracy': accuracy_by_method.get('paraphrasing', 0),
            'diplomatic_accuracy': accuracy_by_method.get('diplomatic-attack', 0)
        })
        # Drop the model reference before loading the next one.
        del model
        torch.cuda.empty_cache()

    # Final Summary Report
    df_comparison = pd.DataFrame(all_model_results_summary)
    print("\n" + "="*50 + "\n🎉 EXTENDED MULTI-MODEL EVALUATION COMPLETE 🎉\n" + "="*50)
    # Persist the numeric values before the columns are formatted as strings.
    df_comparison.to_csv(FINAL_COMPARISON_FILE, index=False)
    print(f"\n✅ Final comparison table saved to '{FINAL_COMPARISON_FILE}'")

    # Format for printing: every column after 'model_name' is an accuracy percentage.
    for col in df_comparison.columns[1:]:
        df_comparison[col] = df_comparison[col].map('{:.2f}%'.format)
    print("\n--- FINAL MODEL LEADERBOARD (5 MODELS) ---\n" + df_comparison.to_string(index=False) + "\n" + "="*50)

In [19]:
# Execute the redefined evaluation from the previous cell; this version also
# writes a qualitative text report for each model.
run_extended_multi_model_evaluation()


🚀 STARTING EXTENDED MULTI-MODEL EVALUATION (5 MODELS) 🚀
Successfully loaded 400 test cases.

--- Testing Model: all-MiniLM-L6-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-MiniLM-L6-v2: 100%|██████████| 400/400 [00:02<00:00, 156.64it/s]


  -> Saved detailed CSV report to 'detailed_results_all-MiniLM-L6-v2.csv'
  -> Saved qualitative text report to 'qualitative_report_all-MiniLM-L6-v2.txt'

--- Testing Model: all-mpnet-base-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating all-mpnet-base-v2: 100%|██████████| 400/400 [00:05<00:00, 71.48it/s]


  -> Saved detailed CSV report to 'detailed_results_all-mpnet-base-v2.csv'
  -> Saved qualitative text report to 'qualitative_report_all-mpnet-base-v2.txt'

--- Testing Model: BAAI/bge-large-en-v1.5 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating bge-large-en-v1.5: 100%|██████████| 400/400 [00:08<00:00, 47.12it/s]


  -> Saved detailed CSV report to 'detailed_results_BAAI_bge-large-en-v1.5.csv'
  -> Saved qualitative text report to 'qualitative_report_BAAI_bge-large-en-v1.5.txt'

--- Testing Model: intfloat/e5-large-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating e5-large-v2: 100%|██████████| 400/400 [00:07<00:00, 50.98it/s]


  -> Saved detailed CSV report to 'detailed_results_intfloat_e5-large-v2.csv'
  -> Saved qualitative text report to 'qualitative_report_intfloat_e5-large-v2.txt'

--- Testing Model: paraphrase-multilingual-mpnet-base-v2 ---


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating paraphrase-multilingual-mpnet-base-v2: 100%|██████████| 400/400 [00:04<00:00, 91.19it/s]


  -> Saved detailed CSV report to 'detailed_results_paraphrase-multilingual-mpnet-base-v2.csv'
  -> Saved qualitative text report to 'qualitative_report_paraphrase-multilingual-mpnet-base-v2.txt'

🎉 EXTENDED MULTI-MODEL EVALUATION COMPLETE 🎉

✅ Final comparison table saved to 'multi_model_comparison_results_5_models.csv'

--- FINAL MODEL LEADERBOARD (5 MODELS) ---
                           model_name overall_accuracy paraphrasing_accuracy diplomatic_accuracy
                     all-MiniLM-L6-v2           84.50%                99.50%              69.50%
                    all-mpnet-base-v2           85.25%               100.00%              70.50%
               BAAI/bge-large-en-v1.5           82.25%               100.00%              64.50%
                 intfloat/e5-large-v2           86.50%               100.00%              73.00%
paraphrase-multilingual-mpnet-base-v2           82.50%                99.50%              65.50%
