In [16]:
import re
import os
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
from evaluate import load # For SacreBLEU

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


The following code plots the different evaluation metrics against max token length for the three datasets.

In [20]:
import os
import re
import matplotlib.pyplot as plt
import numpy as np

def extract_metrics(log_file):
    """
    Extracts the BLEU score, token-set F1 score, max token length and dataset
    name for a single evaluation log.

    The two scores are read from the log contents: lines containing
    "eval_bleu_score:" or "eval_token_set_f1:" (if a marker occurs several
    times, the last occurrence wins). The max token length and the dataset name
    are parsed from the file name only (not from its directory), which must end
    in "<N>maxtoken.log" and contain "_<dataset>_500samples_".

    Args:
        log_file: Path to the log file.

    Returns:
        A tuple (max_token, bleu_score, f1_score, dataset_name), or None if any
        of the four values could not be found.
    """
    bleu_score = None
    f1_score = None

    with open(log_file, 'r') as f:
        for line in f:
            if "eval_bleu_score:" in line:
                bleu_score = float(line.split("eval_bleu_score:")[1].strip())
            elif "eval_token_set_f1:" in line:
                f1_score = float(line.split("eval_token_set_f1:")[1].strip())

    # Parse the file name once, outside the line loop, and only its base name so
    # that underscores or "maxtoken" in a parent directory cannot interfere.
    file_name = os.path.basename(log_file)

    token_match = re.search(r"(?:^|_)(\d+)maxtoken\.log$", file_name)
    max_token = int(token_match.group(1)) if token_match else None

    dataset_match = re.search(r"_(.*?)_500samples_", file_name)
    dataset_name = dataset_match.group(1) if dataset_match else None

    if any(value is None for value in (max_token, bleu_score, f1_score, dataset_name)):
        return None
    return max_token, bleu_score, f1_score, dataset_name

def plot_metrics_vs_max_token(all_data, output_dir):
    """
    Draws one figure per metric (BLEU, then F1) showing mean +/- standard
    deviation against max token length, with one line per dataset, and saves
    each figure as a PNG in output_dir.

    Args:
        all_data: A dictionary where keys are dataset names and values are dictionaries
                  of max token lengths to lists of (BLEU score, F1 score) tuples.
        output_dir: The directory to save the plots to.
    """
    # The position in the list doubles as the index into each (BLEU, F1) tuple.
    for metric_index, metric_name in enumerate(["BLEU Score", "F1 Score"]):
        plt.figure(figsize=(10, 6))

        for dataset_name, data in all_data.items():
            max_tokens = sorted(data.keys())

            means = []
            stds = []
            for max_token in max_tokens:
                values = [scores[metric_index] for scores in data[max_token]]
                means.append(np.mean(values))
                stds.append(np.std(values))

            plt.errorbar(max_tokens, means, yerr=stds, fmt='o-', capsize=5, label=dataset_name)

        plt.xlabel("Max Token Length")
        plt.ylabel(metric_name)
        plt.title(f"{metric_name} vs. Max Token Length (Multiple Datasets)")
        plt.grid(True)
        plt.legend()

        # File name is derived from the metric label, e.g. "bleu_score_vs_...".
        output_filename = f"{metric_name.lower().replace(' ', '_')}_vs_max_token_all_datasets.png"
        output_path = os.path.join(output_dir, output_filename)
        plt.savefig(output_path)
        print(f"Plot saved to: {output_path}")
        plt.close()

def main(
    log_dir="/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/",
    output_dir="/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/",
):
    """
    Collects metrics from every ".log" file in log_dir and plots them.

    Args:
        log_dir: Directory that is scanned (non-recursively) for ".log" files.
                 Defaults to the original hard-coded location.
        output_dir: Directory the plots are written to; created if missing.
                    Defaults to the original hard-coded location.
    """
    print(f"Log directory: {log_dir}")
    print(f"Output directory: {output_dir}")

    # dataset name -> max token length -> list of (BLEU score, F1 score) tuples
    all_data = {}

    for filename in os.listdir(log_dir):
        if not filename.endswith(".log"):
            continue

        result = extract_metrics(os.path.join(log_dir, filename))
        if not result:
            # Logs without a complete set of metrics are ignored.
            continue

        max_token, bleu_score, f1_score, dataset_name = result
        per_dataset = all_data.setdefault(dataset_name, {})
        per_dataset.setdefault(max_token, []).append((bleu_score, f1_score))

    if all_data:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        plot_metrics_vs_max_token(all_data, output_dir)
    else:
        print("No valid data found in log files.")

if __name__ == "__main__":
    main()

Log directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/
Output directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/
Plot saved to: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/bleu_score_vs_max_token_all_datasets.png
Plot saved to: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/f1_score_vs_max_token_all_datasets.png


In [None]:

def calculate_token_f1(pred, label):
    """
    Computes F1 over the sets of unique, lower-cased, whitespace-separated
    tokens of two strings.

    Args:
        pred: The predicted string.
        label: The true label string.

    Returns:
        1.0 when neither string has any token, 0.0 when exactly one of them
        has no tokens or they share no token, otherwise the harmonic mean of
        set precision and set recall.
    """
    predicted = set(pred.lower().split())
    expected = set(label.lower().split())

    # Two empty token sets count as a perfect match; a single empty one cannot match.
    if not (predicted or expected):
        return 1.0
    if not (predicted and expected):
        return 0.0

    shared = len(predicted & expected)
    precision = shared / len(predicted)
    recall = shared / len(expected)

    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total

def extract_metrics(log_file_path):
    """
    Computes exact match, token F1, SBERT similarity and SacreBLEU for the
    predictions and labels dumped in one log file.

    The log is scanned for a "Contents of decoded_preds:" section and a
    "Contents of decoded_labels:" section, each made of "Index <i>: <text>"
    lines. Predictions and labels are paired by their index; every index that
    occurs in both sections is scored, in ascending index order. The max token
    length and dataset name are parsed from the file name only (not from its
    directory), which must end in "<N>maxtoken.log" and contain
    "_<dataset>_500samples_".

    Args:
        log_file_path: Path to the log file.

    Returns:
        A dictionary with the keys "max_token", "dataset_name", "exact_match",
        "token_f1", "sbert_score" and "bleu", or None if the max token length
        or the dataset name cannot be parsed from the file name. The four
        metric entries are lists with one value per scored pair; "bleu" repeats
        the single score computed over all pairs so that it has the same length
        as the other lists.
    """

    def clean_line(line):
        """Removes 'Index X:' prefix and extra spacing from a line."""
        line = re.sub(r"^Index \d+: ", "", line)
        line = line.strip()
        line = re.sub(r"\s+", " ", line)
        line = re.sub(r"\s*/\s*", "/", line)
        return line

    entry_pattern = re.compile(r"^Index (\d+): (.*)")
    preds = {}
    labels = {}
    current = None  # dict of the section being read, None before the first header

    with open(log_file_path, 'r') as log_file:
        for line in log_file:
            if "Contents of decoded_preds:" in line:
                current = preds
                continue
            if "Contents of decoded_labels:" in line:
                current = labels
                continue
            if current is None:
                continue
            match = entry_pattern.match(line)
            if match:
                current[int(match.group(1))] = match.group(2)

    if len(preds) != len(labels):
        print(f"Warning: Number of predictions ({len(preds)}) and labels ({len(labels)}) differ in {log_file_path}.")

    # Validate the file name before any expensive work: a log whose name cannot
    # be parsed is discarded anyway, so there is no point in scoring it.
    file_name = os.path.basename(log_file_path)
    token_match = re.search(r"(?:^|_)(\d+)maxtoken\.log$", file_name)
    dataset_match = re.search(r"_(.*?)_500samples_", file_name)
    if token_match is None or dataset_match is None:
        return None
    max_token = int(token_match.group(1))
    dataset_name = dataset_match.group(1)

    # Pair by index instead of by position so that gaps or a non-zero first
    # index do not silently drop or misalign samples.
    shared_indices = sorted(set(preds) & set(labels))
    cleaned_preds = [clean_line(preds[i]) for i in shared_indices]
    cleaned_labels = [clean_line(labels[i]) for i in shared_indices]

    exact_matches = []
    token_f1s = []
    sbert_scores = []
    bleu_scores = []

    if shared_indices:
        # Loading the Sentence-BERT model and the SacreBLEU metric is costly, so
        # they are created on first use and reused by later calls.
        scorers = getattr(extract_metrics, "_scorers", None)
        if scorers is None:
            scorers = (SentenceTransformer('all-mpnet-base-v2'), load("sacrebleu"))
            extract_metrics._scorers = scorers
        model, bleu = scorers

        for cleaned_pred, cleaned_label in zip(cleaned_preds, cleaned_labels):
            # Exact Match
            exact_matches.append(1.0 if cleaned_pred == cleaned_label else 0.0)

            # Token F1
            token_f1s.append(calculate_token_f1(cleaned_pred, cleaned_label))

            # Sentence-BERT
            pred_embedding = model.encode(cleaned_pred, convert_to_tensor=True)
            label_embedding = model.encode(cleaned_label, convert_to_tensor=True)
            cosine_sim = util.pytorch_cos_sim(pred_embedding, label_embedding)
            sbert_scores.append(cosine_sim.item())

        # SacreBLEU: one score over all pairs, each prediction having one reference.
        bleu_results = bleu.compute(
            predictions=cleaned_preds,
            references=[[cleaned_label] for cleaned_label in cleaned_labels],
        )
        # Repeat the score once per scored pair so all metric lists match in length.
        bleu_scores = [bleu_results["score"]] * len(shared_indices)

    return {
        "max_token": max_token,
        "dataset_name": dataset_name,
        "exact_match": exact_matches,
        "token_f1": token_f1s,
        "sbert_score": sbert_scores,
        "bleu": bleu_scores,
    }

def plot_metrics_vs_max_token(all_data, output_dir):
    """
    Draws one figure per metric (exact match, token F1, SBERT score, BLEU)
    showing mean +/- standard deviation against max token length, with one
    line per dataset, and saves each figure as a PNG in output_dir.

    Args:
        all_data: A dictionary where keys are dataset names and values are dictionaries
                  of max token lengths to dictionaries of metric name to list of values.
        output_dir: The directory to save the plots to.
    """
    # (key in the data dictionaries, human-readable axis/title label)
    metric_labels = [
        ("exact_match", "Exact Match"),
        ("token_f1", "Token F1"),
        ("sbert_score", "SBERT Score"),
        ("bleu", "BLEU"),
    ]

    for metric_name, metric_label in metric_labels:
        plt.figure(figsize=(10, 6))

        for dataset_name, data in all_data.items():
            max_tokens = sorted(data.keys())

            means = []
            stds = []
            for max_token in max_tokens:
                values = data[max_token][metric_name]
                means.append(np.mean(values))
                stds.append(np.std(values))

            plt.errorbar(max_tokens, means, yerr=stds, fmt='o-', capsize=5, label=dataset_name)

        plt.xlabel("Max Token Length")
        plt.ylabel(metric_label)
        plt.title(f"{metric_label} vs. Max Token Length")
        plt.grid(True)
        plt.legend()

        output_path = os.path.join(output_dir, f"{metric_name}_vs_max_token_all_datasets.png")
        plt.savefig(output_path)
        print(f"{metric_label} plot saved to: {output_path}")
        plt.close()

# --- Configuration ---
log_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/"
output_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/"
# ---------------------

print(f"Log directory: {log_dir}")
print(f"Output directory: {output_dir}")

# Nested mapping: dataset name -> max token length -> metric name -> list of values.
all_data = {}

for filename in os.listdir(log_dir):
    if not filename.endswith(".log"):
        continue

    filepath = os.path.join(log_dir, filename)
    result = extract_metrics(filepath)
    if not result:
        # extract_metrics returned nothing usable for this file; ignore it.
        continue

    max_token = result["max_token"]
    dataset_name = result["dataset_name"]

    per_dataset = all_data.setdefault(dataset_name, {})
    per_token = per_dataset.setdefault(
        max_token,
        {"exact_match": [], "token_f1": [], "sbert_score": [], "bleu": []},
    )
    # Several logs may share a dataset / max-token pair, so values are pooled.
    for metric_key in ("exact_match", "token_f1", "sbert_score", "bleu"):
        per_token[metric_key].extend(result[metric_key])

if not all_data:
    print("No valid data found in log files.")
else:
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    plot_metrics_vs_max_token(all_data, output_dir)

Log directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/
Output directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/


In [None]:

def calculate_token_f1(pred, label):
    """
    Computes F1 over the sets of unique, lower-cased, whitespace-separated
    tokens of two strings.

    Args:
        pred: The predicted string.
        label: The true label string.

    Returns:
        1.0 when neither string has any token, 0.0 when exactly one of them
        has no tokens or they share no token, otherwise the harmonic mean of
        set precision and set recall.
    """
    predicted = set(pred.lower().split())
    expected = set(label.lower().split())

    # Two empty token sets count as a perfect match; a single empty one cannot match.
    if not (predicted or expected):
        return 1.0
    if not (predicted and expected):
        return 0.0

    shared = len(predicted & expected)
    precision = shared / len(predicted)
    recall = shared / len(expected)

    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total

def extract_metrics(log_file_path):
    """
    Computes exact match, token F1, SBERT similarity and SacreBLEU for the
    predictions and labels dumped in one log file.

    The log is scanned for a "Contents of decoded_preds:" section and a
    "Contents of decoded_labels:" section, each made of "Index <i>: <text>"
    lines. Predictions and labels are paired by their index; every index that
    occurs in both sections is scored, in ascending index order. The max token
    length and dataset name are parsed from the file name only (not from its
    directory), which must end in "<N>maxtoken.log" and contain
    "_<dataset>_500samples_".

    Args:
        log_file_path: Path to the log file.

    Returns:
        A dictionary with the keys "max_token", "dataset_name", "exact_match",
        "token_f1", "sbert_score" and "bleu", or None if the max token length
        or the dataset name cannot be parsed from the file name. The four
        metric entries are lists with one value per scored pair; "bleu" repeats
        the single score computed over all pairs so that it has the same length
        as the other lists.
    """

    def clean_line(line):
        """Removes 'Index X:' prefix and extra spacing from a line."""
        line = re.sub(r"^Index \d+: ", "", line)
        line = line.strip()
        line = re.sub(r"\s+", " ", line)
        line = re.sub(r"\s*/\s*", "/", line)
        return line

    entry_pattern = re.compile(r"^Index (\d+): (.*)")
    preds = {}
    labels = {}
    current = None  # dict of the section being read, None before the first header

    with open(log_file_path, 'r') as log_file:
        for line in log_file:
            if "Contents of decoded_preds:" in line:
                current = preds
                continue
            if "Contents of decoded_labels:" in line:
                current = labels
                continue
            if current is None:
                continue
            match = entry_pattern.match(line)
            if match:
                current[int(match.group(1))] = match.group(2)

    if len(preds) != len(labels):
        print(f"Warning: Number of predictions ({len(preds)}) and labels ({len(labels)}) differ in {log_file_path}.")

    # Validate the file name before any expensive work: a log whose name cannot
    # be parsed is discarded anyway, so there is no point in scoring it.
    file_name = os.path.basename(log_file_path)
    token_match = re.search(r"(?:^|_)(\d+)maxtoken\.log$", file_name)
    dataset_match = re.search(r"_(.*?)_500samples_", file_name)
    if token_match is None or dataset_match is None:
        return None
    max_token = int(token_match.group(1))
    dataset_name = dataset_match.group(1)

    # Pair by index instead of by position so that gaps or a non-zero first
    # index do not silently drop or misalign samples.
    shared_indices = sorted(set(preds) & set(labels))
    cleaned_preds = [clean_line(preds[i]) for i in shared_indices]
    cleaned_labels = [clean_line(labels[i]) for i in shared_indices]

    exact_matches = []
    token_f1s = []
    sbert_scores = []
    bleu_scores = []

    if shared_indices:
        # Loading the Sentence-BERT model and the SacreBLEU metric is costly, so
        # they are created on first use and reused by later calls.
        scorers = getattr(extract_metrics, "_scorers", None)
        if scorers is None:
            scorers = (SentenceTransformer('all-mpnet-base-v2'), load("sacrebleu"))
            extract_metrics._scorers = scorers
        model, bleu = scorers

        for cleaned_pred, cleaned_label in zip(cleaned_preds, cleaned_labels):
            # Exact Match
            exact_matches.append(1.0 if cleaned_pred == cleaned_label else 0.0)

            # Token F1
            token_f1s.append(calculate_token_f1(cleaned_pred, cleaned_label))

            # Sentence-BERT
            pred_embedding = model.encode(cleaned_pred, convert_to_tensor=True)
            label_embedding = model.encode(cleaned_label, convert_to_tensor=True)
            cosine_sim = util.pytorch_cos_sim(pred_embedding, label_embedding)
            sbert_scores.append(cosine_sim.item())

        # SacreBLEU: one score over all pairs, each prediction having one reference.
        bleu_results = bleu.compute(
            predictions=cleaned_preds,
            references=[[cleaned_label] for cleaned_label in cleaned_labels],
        )
        # Repeat the score once per scored pair so all metric lists match in length.
        bleu_scores = [bleu_results["score"]] * len(shared_indices)

    return {
        "max_token": max_token,
        "dataset_name": dataset_name,
        "exact_match": exact_matches,
        "token_f1": token_f1s,
        "sbert_score": sbert_scores,
        "bleu": bleu_scores,
    }

def plot_metrics_vs_max_token(all_data, output_dir):
    """
    Draws one figure per metric (exact match, token F1, SBERT score, BLEU)
    showing mean +/- standard deviation against max token length, with one
    line per dataset, and saves each figure as a PNG in output_dir.

    Args:
        all_data: A dictionary where keys are dataset names and values are dictionaries
                  of max token lengths to dictionaries of metric name to list of values.
        output_dir: The directory to save the plots to.
    """
    # (key in the data dictionaries, human-readable axis/title label)
    metric_labels = [
        ("exact_match", "Exact Match"),
        ("token_f1", "Token F1"),
        ("sbert_score", "SBERT Score"),
        ("bleu", "BLEU"),
    ]

    for metric_name, metric_label in metric_labels:
        plt.figure(figsize=(10, 6))

        for dataset_name, data in all_data.items():
            max_tokens = sorted(data.keys())

            means = []
            stds = []
            for max_token in max_tokens:
                values = data[max_token][metric_name]
                means.append(np.mean(values))
                stds.append(np.std(values))

            plt.errorbar(max_tokens, means, yerr=stds, fmt='o-', capsize=5, label=dataset_name)

        plt.xlabel("Max Token Length")
        plt.ylabel(metric_label)
        plt.title(f"{metric_label} vs. Max Token Length")
        plt.grid(True)
        plt.legend()

        output_path = os.path.join(output_dir, f"{metric_name}_vs_max_token_all_datasets.png")
        plt.savefig(output_path)
        print(f"{metric_label} plot saved to: {output_path}")
        plt.close()

# --- Configuration ---
log_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/"
output_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/"
# ---------------------

print(f"Log directory: {log_dir}")
print(f"Output directory: {output_dir}")

# Nested mapping: dataset name -> max token length -> metric name -> list of values.
all_data = {}

for filename in os.listdir(log_dir):
    if not filename.endswith(".log"):
        continue

    filepath = os.path.join(log_dir, filename)
    result = extract_metrics(filepath)
    if not result:
        # extract_metrics returned nothing usable for this file; ignore it.
        continue

    max_token = result["max_token"]
    dataset_name = result["dataset_name"]

    per_dataset = all_data.setdefault(dataset_name, {})
    per_token = per_dataset.setdefault(
        max_token,
        {"exact_match": [], "token_f1": [], "sbert_score": [], "bleu": []},
    )
    # Several logs may share a dataset / max-token pair, so values are pooled.
    for metric_key in ("exact_match", "token_f1", "sbert_score", "bleu"):
        per_token[metric_key].extend(result[metric_key])

if not all_data:
    print("No valid data found in log files.")
else:
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    plot_metrics_vs_max_token(all_data, output_dir)

Log directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/
Output directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/


In [None]:

def calculate_token_f1(pred, label):
    """
    Computes F1 over the sets of unique, lower-cased, whitespace-separated
    tokens of two strings.

    Args:
        pred: The predicted string.
        label: The true label string.

    Returns:
        1.0 when neither string has any token, 0.0 when exactly one of them
        has no tokens or they share no token, otherwise the harmonic mean of
        set precision and set recall.
    """
    predicted = set(pred.lower().split())
    expected = set(label.lower().split())

    # Two empty token sets count as a perfect match; a single empty one cannot match.
    if not (predicted or expected):
        return 1.0
    if not (predicted and expected):
        return 0.0

    shared = len(predicted & expected)
    precision = shared / len(predicted)
    recall = shared / len(expected)

    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total

def extract_metrics(log_file_path):
    """
    Computes exact match, token F1, SBERT similarity and SacreBLEU for the
    predictions and labels dumped in one log file.

    The log is scanned for a "Contents of decoded_preds:" section and a
    "Contents of decoded_labels:" section, each made of "Index <i>: <text>"
    lines. Predictions and labels are paired by their index; every index that
    occurs in both sections is scored, in ascending index order. The max token
    length and dataset name are parsed from the file name only (not from its
    directory), which must end in "<N>maxtoken.log" and contain
    "_<dataset>_500samples_".

    Args:
        log_file_path: Path to the log file.

    Returns:
        A dictionary with the keys "max_token", "dataset_name", "exact_match",
        "token_f1", "sbert_score" and "bleu", or None if the max token length
        or the dataset name cannot be parsed from the file name. The four
        metric entries are lists with one value per scored pair; "bleu" repeats
        the single score computed over all pairs so that it has the same length
        as the other lists.
    """

    def clean_line(line):
        """Removes 'Index X:' prefix and extra spacing from a line."""
        line = re.sub(r"^Index \d+: ", "", line)
        line = line.strip()
        line = re.sub(r"\s+", " ", line)
        line = re.sub(r"\s*/\s*", "/", line)
        return line

    entry_pattern = re.compile(r"^Index (\d+): (.*)")
    preds = {}
    labels = {}
    current = None  # dict of the section being read, None before the first header

    with open(log_file_path, 'r') as log_file:
        for line in log_file:
            if "Contents of decoded_preds:" in line:
                current = preds
                continue
            if "Contents of decoded_labels:" in line:
                current = labels
                continue
            if current is None:
                continue
            match = entry_pattern.match(line)
            if match:
                current[int(match.group(1))] = match.group(2)

    if len(preds) != len(labels):
        print(f"Warning: Number of predictions ({len(preds)}) and labels ({len(labels)}) differ in {log_file_path}.")

    # Validate the file name before any expensive work: a log whose name cannot
    # be parsed is discarded anyway, so there is no point in scoring it.
    file_name = os.path.basename(log_file_path)
    token_match = re.search(r"(?:^|_)(\d+)maxtoken\.log$", file_name)
    dataset_match = re.search(r"_(.*?)_500samples_", file_name)
    if token_match is None or dataset_match is None:
        return None
    max_token = int(token_match.group(1))
    dataset_name = dataset_match.group(1)

    # Pair by index instead of by position so that gaps or a non-zero first
    # index do not silently drop or misalign samples.
    shared_indices = sorted(set(preds) & set(labels))
    cleaned_preds = [clean_line(preds[i]) for i in shared_indices]
    cleaned_labels = [clean_line(labels[i]) for i in shared_indices]

    exact_matches = []
    token_f1s = []
    sbert_scores = []
    bleu_scores = []

    if shared_indices:
        # Loading the Sentence-BERT model and the SacreBLEU metric is costly, so
        # they are created on first use and reused by later calls.
        scorers = getattr(extract_metrics, "_scorers", None)
        if scorers is None:
            scorers = (SentenceTransformer('all-mpnet-base-v2'), load("sacrebleu"))
            extract_metrics._scorers = scorers
        model, bleu = scorers

        for cleaned_pred, cleaned_label in zip(cleaned_preds, cleaned_labels):
            # Exact Match
            exact_matches.append(1.0 if cleaned_pred == cleaned_label else 0.0)

            # Token F1
            token_f1s.append(calculate_token_f1(cleaned_pred, cleaned_label))

            # Sentence-BERT
            pred_embedding = model.encode(cleaned_pred, convert_to_tensor=True)
            label_embedding = model.encode(cleaned_label, convert_to_tensor=True)
            cosine_sim = util.pytorch_cos_sim(pred_embedding, label_embedding)
            sbert_scores.append(cosine_sim.item())

        # SacreBLEU: one score over all pairs, each prediction having one reference.
        bleu_results = bleu.compute(
            predictions=cleaned_preds,
            references=[[cleaned_label] for cleaned_label in cleaned_labels],
        )
        # Repeat the score once per scored pair so all metric lists match in length.
        bleu_scores = [bleu_results["score"]] * len(shared_indices)

    return {
        "max_token": max_token,
        "dataset_name": dataset_name,
        "exact_match": exact_matches,
        "token_f1": token_f1s,
        "sbert_score": sbert_scores,
        "bleu": bleu_scores,
    }

def plot_metrics_vs_max_token(all_data, output_dir):
    """
    Draws one figure per metric (exact match, token F1, SBERT score, BLEU)
    showing mean +/- standard deviation against max token length, with one
    line per dataset, and saves each figure as a PNG in output_dir.

    Args:
        all_data: A dictionary where keys are dataset names and values are dictionaries
                  of max token lengths to dictionaries of metric name to list of values.
        output_dir: The directory to save the plots to.
    """
    # (key in the data dictionaries, human-readable axis/title label)
    metric_labels = [
        ("exact_match", "Exact Match"),
        ("token_f1", "Token F1"),
        ("sbert_score", "SBERT Score"),
        ("bleu", "BLEU"),
    ]

    for metric_name, metric_label in metric_labels:
        plt.figure(figsize=(10, 6))

        for dataset_name, data in all_data.items():
            max_tokens = sorted(data.keys())

            means = []
            stds = []
            for max_token in max_tokens:
                values = data[max_token][metric_name]
                means.append(np.mean(values))
                stds.append(np.std(values))

            plt.errorbar(max_tokens, means, yerr=stds, fmt='o-', capsize=5, label=dataset_name)

        plt.xlabel("Max Token Length")
        plt.ylabel(metric_label)
        plt.title(f"{metric_label} vs. Max Token Length")
        plt.grid(True)
        plt.legend()

        output_path = os.path.join(output_dir, f"{metric_name}_vs_max_token_all_datasets.png")
        plt.savefig(output_path)
        print(f"{metric_label} plot saved to: {output_path}")
        plt.close()

# --- Configuration ---
log_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/"
output_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/"
# ---------------------

print(f"Log directory: {log_dir}")
print(f"Output directory: {output_dir}")

# Nested mapping: dataset name -> max token length -> metric name -> list of values.
all_data = {}

for filename in os.listdir(log_dir):
    if not filename.endswith(".log"):
        continue

    filepath = os.path.join(log_dir, filename)
    result = extract_metrics(filepath)
    if not result:
        # extract_metrics returned nothing usable for this file; ignore it.
        continue

    max_token = result["max_token"]
    dataset_name = result["dataset_name"]

    per_dataset = all_data.setdefault(dataset_name, {})
    per_token = per_dataset.setdefault(
        max_token,
        {"exact_match": [], "token_f1": [], "sbert_score": [], "bleu": []},
    )
    # Several logs may share a dataset / max-token pair, so values are pooled.
    for metric_key in ("exact_match", "token_f1", "sbert_score", "bleu"):
        per_token[metric_key].extend(result[metric_key])

if not all_data:
    print("No valid data found in log files.")
else:
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    plot_metrics_vs_max_token(all_data, output_dir)

Log directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/
Output directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/


In [None]:

def calculate_token_f1(pred, label):
    """
    Computes F1 over the sets of unique, lower-cased, whitespace-separated
    tokens of two strings.

    Args:
        pred: The predicted string.
        label: The true label string.

    Returns:
        1.0 when neither string has any token, 0.0 when exactly one of them
        has no tokens or they share no token, otherwise the harmonic mean of
        set precision and set recall.
    """
    predicted = set(pred.lower().split())
    expected = set(label.lower().split())

    # Two empty token sets count as a perfect match; a single empty one cannot match.
    if not (predicted or expected):
        return 1.0
    if not (predicted and expected):
        return 0.0

    shared = len(predicted & expected)
    precision = shared / len(predicted)
    recall = shared / len(expected)

    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total

def extract_metrics(log_file_path):
    """
    Computes exact match, token F1, SBERT similarity and SacreBLEU for the
    predictions and labels dumped in one log file.

    The log is scanned for a "Contents of decoded_preds:" section and a
    "Contents of decoded_labels:" section, each made of "Index <i>: <text>"
    lines. Predictions and labels are paired by their index; every index that
    occurs in both sections is scored, in ascending index order. The max token
    length and dataset name are parsed from the file name only (not from its
    directory), which must end in "<N>maxtoken.log" and contain
    "_<dataset>_500samples_".

    Args:
        log_file_path: Path to the log file.

    Returns:
        A dictionary with the keys "max_token", "dataset_name", "exact_match",
        "token_f1", "sbert_score" and "bleu", or None if the max token length
        or the dataset name cannot be parsed from the file name. The four
        metric entries are lists with one value per scored pair; "bleu" repeats
        the single score computed over all pairs so that it has the same length
        as the other lists.
    """

    def clean_line(line):
        """Removes 'Index X:' prefix and extra spacing from a line."""
        line = re.sub(r"^Index \d+: ", "", line)
        line = line.strip()
        line = re.sub(r"\s+", " ", line)
        line = re.sub(r"\s*/\s*", "/", line)
        return line

    entry_pattern = re.compile(r"^Index (\d+): (.*)")
    preds = {}
    labels = {}
    current = None  # dict of the section being read, None before the first header

    with open(log_file_path, 'r') as log_file:
        for line in log_file:
            if "Contents of decoded_preds:" in line:
                current = preds
                continue
            if "Contents of decoded_labels:" in line:
                current = labels
                continue
            if current is None:
                continue
            match = entry_pattern.match(line)
            if match:
                current[int(match.group(1))] = match.group(2)

    if len(preds) != len(labels):
        print(f"Warning: Number of predictions ({len(preds)}) and labels ({len(labels)}) differ in {log_file_path}.")

    # Validate the file name before any expensive work: a log whose name cannot
    # be parsed is discarded anyway, so there is no point in scoring it.
    file_name = os.path.basename(log_file_path)
    token_match = re.search(r"(?:^|_)(\d+)maxtoken\.log$", file_name)
    dataset_match = re.search(r"_(.*?)_500samples_", file_name)
    if token_match is None or dataset_match is None:
        return None
    max_token = int(token_match.group(1))
    dataset_name = dataset_match.group(1)

    # Pair by index instead of by position so that gaps or a non-zero first
    # index do not silently drop or misalign samples.
    shared_indices = sorted(set(preds) & set(labels))
    cleaned_preds = [clean_line(preds[i]) for i in shared_indices]
    cleaned_labels = [clean_line(labels[i]) for i in shared_indices]

    exact_matches = []
    token_f1s = []
    sbert_scores = []
    bleu_scores = []

    if shared_indices:
        # Loading the Sentence-BERT model and the SacreBLEU metric is costly, so
        # they are created on first use and reused by later calls.
        scorers = getattr(extract_metrics, "_scorers", None)
        if scorers is None:
            scorers = (SentenceTransformer('all-mpnet-base-v2'), load("sacrebleu"))
            extract_metrics._scorers = scorers
        model, bleu = scorers

        for cleaned_pred, cleaned_label in zip(cleaned_preds, cleaned_labels):
            # Exact Match
            exact_matches.append(1.0 if cleaned_pred == cleaned_label else 0.0)

            # Token F1
            token_f1s.append(calculate_token_f1(cleaned_pred, cleaned_label))

            # Sentence-BERT
            pred_embedding = model.encode(cleaned_pred, convert_to_tensor=True)
            label_embedding = model.encode(cleaned_label, convert_to_tensor=True)
            cosine_sim = util.pytorch_cos_sim(pred_embedding, label_embedding)
            sbert_scores.append(cosine_sim.item())

        # SacreBLEU: one score over all pairs, each prediction having one reference.
        bleu_results = bleu.compute(
            predictions=cleaned_preds,
            references=[[cleaned_label] for cleaned_label in cleaned_labels],
        )
        # Repeat the score once per scored pair so all metric lists match in length.
        bleu_scores = [bleu_results["score"]] * len(shared_indices)

    return {
        "max_token": max_token,
        "dataset_name": dataset_name,
        "exact_match": exact_matches,
        "token_f1": token_f1s,
        "sbert_score": sbert_scores,
        "bleu": bleu_scores,
    }

def plot_metrics_vs_max_token(all_data, output_dir):
    """
    Saves one error-bar figure per metric, with every dataset drawn on the same axes.

    For each metric the mean (np.mean) of the pooled scores is plotted against
    the max token length, with the standard deviation (np.std) as error bars.

    Args:
        all_data: A dictionary mapping dataset name -> max token length ->
                  metric name -> list of scores.
        output_dir: The directory the PNG files are written to. It must
                    already exist; this function does not create it.
    """
    # (key in the per-length dictionaries, human-readable label) for each figure.
    metric_labels = (
        ("exact_match", "Exact Match"),
        ("token_f1", "Token F1"),
        ("sbert_score", "SBERT Score"),
        ("bleu", "BLEU"),
    )

    for key, label in metric_labels:
        plt.figure(figsize=(10, 6))

        # One line per dataset, x values in ascending token-length order.
        for dataset, per_length in all_data.items():
            lengths = sorted(per_length)
            averages = []
            spreads = []
            for length in lengths:
                scores = per_length[length][key]
                averages.append(np.mean(scores))
                spreads.append(np.std(scores))

            plt.errorbar(lengths, averages, yerr=spreads, fmt='o-', capsize=5, label=dataset)

        plt.title(f"{label} vs. Max Token Length")
        plt.xlabel("Max Token Length")
        plt.ylabel(label)
        plt.grid(True)
        plt.legend()

        target = os.path.join(output_dir, f"{key}_vs_max_token_all_datasets.png")
        plt.savefig(target)
        print(f"{label} plot saved to: {target}")
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close()

# --- Configuration ---
log_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/"
output_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/"
# ---------------------

print(f"Log directory: {log_dir}")
print(f"Output directory: {output_dir}")

# dataset name -> max token length -> metric name -> pooled per-sample scores
all_data = {}

# os.listdir() returns entries in arbitrary order. Sorting makes the order in
# which datasets are first seen (and therefore the legend order and line
# colours in the plots) the same on every run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith(".log"):
        continue

    filepath = os.path.join(log_dir, filename)
    result = extract_metrics(filepath)
    if not result:
        # File name did not encode a dataset / max token length; skip it.
        continue

    max_token = result["max_token"]
    dataset_name = result["dataset_name"]

    # Create the nested entry on first sight, then pool this file's scores into it.
    bucket = all_data.setdefault(dataset_name, {}).setdefault(
        max_token,
        {"exact_match": [], "token_f1": [], "sbert_score": [], "bleu": []},
    )
    for metric_key in ("exact_match", "token_f1", "sbert_score", "bleu"):
        bucket[metric_key].extend(result[metric_key])

if all_data:
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    plot_metrics_vs_max_token(all_data, output_dir)
else:
    print("No valid data found in log files.")

Log directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/
Output directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/


In [None]:

def calculate_token_f1(pred, label):
    """
    Token-set F1 between a predicted string and its reference.

    Both strings are lower-cased and split on whitespace. The comparison is
    made on sets of tokens, so repeated tokens count once and order is ignored.

    Args:
        pred: The predicted string.
        label: The true label string.

    Returns:
        The F1 score in [0.0, 1.0]. Two empty strings score 1.0; if only one
        of them is empty the score is 0.0.
    """
    predicted = set(pred.lower().split())
    expected = set(label.lower().split())

    # Nothing on either side is treated as a perfect match.
    if not (predicted or expected):
        return 1.0

    # Exactly one side is empty: there is nothing to overlap with.
    if not (predicted and expected):
        return 0.0

    overlap = len(predicted & expected)
    precision = overlap / len(predicted)
    recall = overlap / len(expected)

    denominator = precision + recall
    if denominator == 0:
        return 0.0

    return 2 * (precision * recall) / denominator

def extract_metrics(log_file_path):
    """
    Scores the predictions stored in a log file against their labels.

    The log is scanned for a "Contents of decoded_preds:" section and a
    "Contents of decoded_labels:" section, each made of "Index N: text" lines.
    Every index present in both sections is scored with exact match, token-set
    F1 and Sentence-BERT cosine similarity; SacreBLEU is computed once over all
    scored pairs (corpus level).

    The max token length and the dataset name are parsed from the file name
    (not the directory part of the path), which must look like
    "..._<dataset>_500samples_<N>maxtoken.log".

    Args:
        log_file_path: Path to the log file.

    Returns:
        A dictionary with "max_token", "dataset_name" and one list per metric
        ("exact_match", "token_f1", "sbert_score", "bleu"), or None when the
        max token length or the dataset name cannot be parsed from the file
        name. The "bleu" list repeats the single corpus-level score once per
        scored pair so that it has the same length as the other metric lists.
    """
    index_line = re.compile(r"^Index (\d+): (.*)")

    def clean_line(line):
        """Removes 'Index X:' prefix and extra spacing from a line."""
        line = re.sub(r"^Index \d+: ", "", line)
        line = line.strip()
        line = re.sub(r"\s+", " ", line)
        line = re.sub(r"\s*/\s*", "/", line)
        return line

    preds = {}
    labels = {}
    current = None  # dictionary of the section being read, if any

    with open(log_file_path, 'r') as log_file:
        for line in log_file:
            if "Contents of decoded_preds:" in line:
                current = preds
                continue
            if "Contents of decoded_labels:" in line:
                current = labels
                continue
            if current is None:
                continue

            match = index_line.match(line)
            if match:
                current[int(match.group(1))] = match.group(2)

    if len(preds) != len(labels):
        print(f"Warning: Number of predictions ({len(preds)}) and labels ({len(labels)}) differ in {log_file_path}.")

    # Parse the file name before doing any expensive work. Only the base name
    # is inspected so underscores in directory names cannot leak into the
    # result, and a malformed "...maxtoken.log" suffix yields None instead of
    # a ValueError from int().
    file_name = os.path.basename(log_file_path)
    token_match = re.search(r"(?:^|_)(\d+)maxtoken\.log$", file_name)
    dataset_match = re.search(r"_(.*?)_500samples_", file_name)
    if token_match is None or dataset_match is None:
        return None

    max_token = int(token_match.group(1))
    dataset_name = dataset_match.group(1)

    # Score every index that has both a prediction and a label, regardless of
    # whether the indices start at 0 or contain gaps.
    shared_indices = sorted(preds.keys() & labels.keys())

    # Initialize Sentence-BERT model and SacreBLEU
    model = SentenceTransformer('all-mpnet-base-v2')
    bleu = load("sacrebleu")

    exact_matches = []
    token_f1s = []
    sbert_scores = []

    for i in shared_indices:
        cleaned_pred = clean_line(preds[i])
        cleaned_label = clean_line(labels[i])

        # Exact Match
        exact_matches.append(1.0 if cleaned_pred == cleaned_label else 0.0)

        # Token F1
        token_f1s.append(calculate_token_f1(cleaned_pred, cleaned_label))

        # Sentence-BERT
        pred_embedding = model.encode(cleaned_pred, convert_to_tensor=True)
        label_embedding = model.encode(cleaned_label, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(pred_embedding, label_embedding)
        sbert_scores.append(cosine_sim.item())

        # SacreBLEU (accumulated here, computed once after the loop)
        bleu.add_batch(predictions=[cleaned_pred], references=[[cleaned_label]])

    # compute() is only valid once at least one pair has been added. The
    # corpus score is repeated so the list lines up with the per-pair metrics.
    if shared_indices:
        bleu_scores = [bleu.compute()["score"]] * len(shared_indices)
    else:
        bleu_scores = []

    return {
        "max_token": max_token,
        "dataset_name": dataset_name,
        "exact_match": exact_matches,
        "token_f1": token_f1s,
        "sbert_score": sbert_scores,
        "bleu": bleu_scores,
    }

def plot_metrics_vs_max_token(all_data, output_dir):
    """
    Saves one error-bar figure per metric, with every dataset drawn on the same axes.

    For each metric the mean (np.mean) of the pooled scores is plotted against
    the max token length, with the standard deviation (np.std) as error bars.

    Args:
        all_data: A dictionary mapping dataset name -> max token length ->
                  metric name -> list of scores.
        output_dir: The directory the PNG files are written to. It must
                    already exist; this function does not create it.
    """
    # (key in the per-length dictionaries, human-readable label) for each figure.
    metric_labels = (
        ("exact_match", "Exact Match"),
        ("token_f1", "Token F1"),
        ("sbert_score", "SBERT Score"),
        ("bleu", "BLEU"),
    )

    for key, label in metric_labels:
        plt.figure(figsize=(10, 6))

        # One line per dataset, x values in ascending token-length order.
        for dataset, per_length in all_data.items():
            lengths = sorted(per_length)
            averages = []
            spreads = []
            for length in lengths:
                scores = per_length[length][key]
                averages.append(np.mean(scores))
                spreads.append(np.std(scores))

            plt.errorbar(lengths, averages, yerr=spreads, fmt='o-', capsize=5, label=dataset)

        plt.title(f"{label} vs. Max Token Length")
        plt.xlabel("Max Token Length")
        plt.ylabel(label)
        plt.grid(True)
        plt.legend()

        target = os.path.join(output_dir, f"{key}_vs_max_token_all_datasets.png")
        plt.savefig(target)
        print(f"{label} plot saved to: {target}")
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close()

# --- Configuration ---
log_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/"
output_dir = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/"
# ---------------------

print(f"Log directory: {log_dir}")
print(f"Output directory: {output_dir}")

# dataset name -> max token length -> metric name -> pooled per-sample scores
all_data = {}

# os.listdir() returns entries in arbitrary order. Sorting makes the order in
# which datasets are first seen (and therefore the legend order and line
# colours in the plots) the same on every run.
for filename in sorted(os.listdir(log_dir)):
    if not filename.endswith(".log"):
        continue

    filepath = os.path.join(log_dir, filename)
    result = extract_metrics(filepath)
    if not result:
        # File name did not encode a dataset / max token length; skip it.
        continue

    max_token = result["max_token"]
    dataset_name = result["dataset_name"]

    # Create the nested entry on first sight, then pool this file's scores into it.
    bucket = all_data.setdefault(dataset_name, {}).setdefault(
        max_token,
        {"exact_match": [], "token_f1": [], "sbert_score": [], "bleu": []},
    )
    for metric_key in ("exact_match", "token_f1", "sbert_score", "bleu"):
        bucket[metric_key].extend(result[metric_key])

if all_data:
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    plot_metrics_vs_max_token(all_data, output_dir)
else:
    print("No valid data found in log files.")

Log directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/
Output directory: /home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/plots/


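The next block of code examines the metrics qualitatively on a single dataset: it prints each prediction next to its label so that we can judge how well each metric reflects the actual quality of the reconstruction. Note that some metrics start performing worse at lower token lengths purely because of the way they are calculated. For example, unsmoothed sentence-level BLEU (which uses up to 4-grams) evaluates to 0 for sequences shorter than four tokens, even when the prediction and the label are identical.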

In [13]:

def extract_and_compare(log_file_path):
    """
    Prints every prediction next to its label and reports BLEU and BERTScore.

    The log is scanned for a "Contents of decoded_preds:" section and a
    "Contents of decoded_labels:" section, each made of "Index N: text" lines.
    Every index present in both sections is printed and scored, followed by
    summary statistics (mean, median, standard deviation) over all pairs.

    BLEU is NLTK's sentence-level BLEU with its default settings (no smoothing
    function is passed), computed on lower-cased word tokens.

    Args:
        log_file_path: Path to the log file.

    Returns:
        A dictionary containing:
        - proportion_identical: The proportion (mean) of identical lines.
        - average_bleu: The average BLEU score.
        - bertscore_precision: Average BERTScore precision.
        - bertscore_recall: Average BERTScore recall.
        - bertscore_f1: Average BERTScore F1.
        All values are 0.0 when no prediction/label pair could be compared.
    """
    index_line = re.compile(r"^Index (\d+): (.*)")

    def clean_line(line):
        """Removes 'Index X:' prefix and extra spacing from a line."""
        line = re.sub(r"^Index \d+: ", "", line)
        line = line.strip()
        line = re.sub(r"\s+", " ", line)
        line = re.sub(r"\s*/\s*", "/", line)
        return line

    def summarize(stat, values):
        """Applies a NumPy reduction, returning 0.0 instead of NaN for an empty list."""
        return stat(values) if values else 0.0

    preds = {}
    labels = {}
    current = None  # dictionary of the section being read, if any

    with open(log_file_path, 'r') as log_file:
        for line in log_file:
            if "Contents of decoded_preds:" in line:
                current = preds
                continue
            if "Contents of decoded_labels:" in line:
                current = labels
                continue
            if current is None:
                continue

            match = index_line.match(line)
            if match:
                current[int(match.group(1))] = match.group(2)

    if len(preds) != len(labels):
        print(f"Warning: Number of predictions ({len(preds)}) and labels ({len(labels)}) differ.")

    # Compare every index that has both a prediction and a label, regardless
    # of whether the indices start at 0 or contain gaps. The reported totals
    # below use this count so they always match what was actually compared.
    shared_indices = sorted(preds.keys() & labels.keys())
    num_compared = len(shared_indices)

    # Initialize BERTScorer
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)

    identical_count = 0
    bleu_scores = []
    bertscore_precisions = []
    bertscore_recalls = []
    bertscore_f1s = []

    for i in shared_indices:
        cleaned_pred = clean_line(preds[i])
        cleaned_label = clean_line(labels[i])
        print(f"--- Index {i} ---")
        print(f"Prediction: {cleaned_pred}")
        print(f"Label:      {cleaned_label}")

        if cleaned_pred == cleaned_label:
            identical_count += 1
            print("Match: Yes")
        else:
            print("Match: No")

        # BLEU Score
        # Tokenize for BLEU (simple word tokenization)
        pred_tokens = nltk.word_tokenize(cleaned_pred.lower())
        label_tokens = nltk.word_tokenize(cleaned_label.lower())

        bleu = nltk.translate.bleu_score.sentence_bleu([label_tokens], pred_tokens)
        bleu_scores.append(bleu)

        # BERTScore
        P, R, F1 = scorer.score([cleaned_pred], [cleaned_label])
        bertscore_precisions.append(P.item())
        bertscore_recalls.append(R.item())
        bertscore_f1s.append(F1.item())

        print(f"BLEU Score: {bleu:.4f}")
        print(f"BERTScore - Precision: {P.item():.4f}, Recall: {R.item():.4f}, F1: {F1.item():.4f}")

    print("\n--- Score Calculation ---")
    print(f"Total number of comparisons: {num_compared}")
    print(f"Number of identical pairs: {identical_count}")

    proportion_identical = identical_count / num_compared if num_compared > 0 else 0.0
    average_bleu = summarize(np.mean, bleu_scores)

    print(f"Proportion of identical lines: {proportion_identical}")
    print(f"Average BLEU Score: {average_bleu:.4f}")
    print(f"Median BLEU Score: {summarize(np.median, bleu_scores):.4f}")
    print(f"BLEU Score Standard Deviation: {summarize(np.std, bleu_scores):.4f}")

    mean_precision = summarize(np.mean, bertscore_precisions)
    mean_recall = summarize(np.mean, bertscore_recalls)
    mean_f1 = summarize(np.mean, bertscore_f1s)

    print("\n--- BERTScore Metrics ---")
    print(f"Average BERTScore - Precision: {mean_precision:.4f}, Recall: {mean_recall:.4f}, F1: {mean_f1:.4f}")
    print(f"Median BERTScore - Precision: {summarize(np.median, bertscore_precisions):.4f}, Recall: {summarize(np.median, bertscore_recalls):.4f}, F1: {summarize(np.median, bertscore_f1s):.4f}")
    print(f"BERTScore Standard Deviation - Precision: {summarize(np.std, bertscore_precisions):.4f}, Recall: {summarize(np.std, bertscore_recalls):.4f}, F1: {summarize(np.std, bertscore_f1s):.4f}")

    return {
        "proportion_identical": proportion_identical,
        "average_bleu": average_bleu,
        "bertscore_precision": mean_precision,
        "bertscore_recall": mean_recall,
        "bertscore_f1": mean_f1
    }

# Entry point: run the qualitative comparison on a single, hard-coded log file.
if __name__ == "__main__":
    log_file_path = "/home/jesse-wonnink/vec2text/scripts/outputs/tokenlengthsearch/repro_T2_gtr-50steps-4beam_dbpedia-entity_500samples_8maxtoken.log"  # Replace with your actual path

    # extract_and_compare tokenizes with nltk.word_tokenize; make sure the
    # 'punkt' tokenizer data is available, downloading it only when the lookup fails.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    # A missing log file is reported with a message instead of a traceback.
    try:
        extract_and_compare(log_file_path)
    except FileNotFoundError:
        print("The specified log file was not found. Please make sure to provide a valid file path.")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Index 0 ---
Prediction: Aurelius Ambrosi
Label:      Aurelius Ambrosi
Match: Yes
BLEU Score: 0.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 1 ---
Prediction: The Academy Awards or The Oscars
Label:      The Academy Awards or The Oscars
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 2 ---
Prediction: The Economy of Angola is
Label:      The Economy of Angola is
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 3 ---
Prediction: The northern cavefish or northern blind
Label:      The northern cavefish or northern blind
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 4 ---
Prediction: Adelaide ( , "Dale
Label:      Adelaide (/dle
Match: No
BLEU Score: 0.0000
BERTScore - Precision: 0.0179, Recall: -0.0370, F1: -0.0080
--- Index 5 ---
Prediction: Actaeon (/
Label:      Actaeon (/
Match: Yes
BLEU Score: 0.0000
BER

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score: 0.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 17 ---
Prediction: The Alexandrists were a
Label:      The Alexandrists were a
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 18 ---
Prediction: Aristotle (
Label:      Aristotle (/
Match: No
BLEU Score: 0.0000
BERTScore - Precision: 0.7311, Recall: 0.7311, F1: 0.7315
--- Index 19 ---
Prediction: Revich aevii Revich aevii, Alex Petro
Label:      Alexei Petrovich Roma
Match: No
BLEU Score: 0.0000
BERTScore - Precision: -0.2856, Recall: -0.1997, F1: -0.2410
--- Index 20 ---
Prediction: The Analytical Engine was
Label:      The Analytical Engine was
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 21 ---
Prediction: Anacharsis (/
Label:      Anacharsis (/
Match: Yes
BLEU Score: 0.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 22 ---
Prediction: NYSE MKT LLC,
Label:      NYSE MKT 

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 33 ---
Prediction: Generally, an answer is
Label:      Generally, an answer is
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 34 ---
Prediction: Area is the quantity that expresse
Label:      Area is the quantity that expresse
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 35 ---
Prediction: Asphalt (US)/s
Label:      Asphalt (US/s
Match: No
BLEU Score: 0.0000
BERTScore - Precision: 0.6085, Recall: 0.6086, F1: 0.6092
--- Index 36 ---
Prediction: Arthur Schopenhauer (German
Label:      Arthur Schopenhauer (German
Match: Yes
BLEU Score: 1.0000
BERTScore - Precision: 1.0000, Recall: 1.0000, F1: 1.0000
--- Index 37 ---
Prediction: /Altai
Label:      Altaic/l
Match: No
BLEU Score: 0.0000
BERTScore - Precision: -0.0329, Recall: -0.3677, F1: -0.2042
--- Index 38 ---
Prediction: Albolin ( 530s)
Label:      A