In [64]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from unsloth import FastLanguageModel
from typing import List, Tuple
import random
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
import json
import evaluate  # instead of datasets.load_metric
from tqdm import tqdm
from unsloth import FastLanguageModel
import pandas as pd

### Load Models

In [65]:
def load_model_and_tokenizer(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True):
    """
    Load a fine-tuned model and tokenizer with optional quantization and inference setup.

    Parameters:
    - model_name (str): Path to the model directory.
    - max_seq_length (int): Maximum sequence length the model should support.
    - dtype (torch.dtype or None): Set to None for auto detection, or specify torch.float16 or torch.bfloat16.
    - load_in_4bit (bool): Whether to use 4-bit quantization for memory efficiency.

    Returns:
    - model (PreTrainedModel): The loaded and prepared model.
    - tokenizer (PreTrainedTokenizer): The corresponding tokenizer.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit
    )
    FastLanguageModel.for_inference(model)
    return model, tokenizer

In [66]:
model_paths = {
    "ellenosborne": "/work/users/s/m/smerrill/Albemarle/trained_models/ellenosborne",
    "grahampaige": "/work/users/s/m/smerrill/Albemarle/trained_models/grahampaige",
    "judyle": "/work/users/s/m/smerrill/Albemarle/trained_models/judyle",
    "kateacuff": "/work/users/s/m/smerrill/Albemarle/trained_models/kateacuff",
    "katrinacallsen": "/work/users/s/m/smerrill/Albemarle/trained_models/katrinacallsen",
    "davidoberg": "/work/users/s/m/smerrill/Albemarle/trained_models/davidoberg",
    "jonnoalcaro": "/work/users/s/m/smerrill/Albemarle/trained_models/jonnoalcaro",
}


In [67]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

loaded_models = {}

for name, path in model_paths.items():
    print(f"Loading model: {name}")
    model, tokenizer = load_model_and_tokenizer(
        model_name=path,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit
    )
    loaded_models[name] = (model, tokenizer)


# Baseline not fine-tuned
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
loaded_models['baseline'] = (model, tokenizer)


Loading model: ellenosborne


RuntimeError: Unsloth: Failed to load model. Both AutoConfig and PeftConfig loading failed.

AutoConfig error: Unrecognized model in /work/users/s/m/smerrill/Albemarle/trained_models/ellenosborne. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v3, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, git, glm, glm4, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mistral3, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam_vision_model, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth

PeftConfig error: Can't find 'adapter_config.json' at '/work/users/s/m/smerrill/Albemarle/trained_models/ellenosborne'



### Dataprocessing Functions

In [38]:
def train_test_split(
    member: str,
    test_size: float = 0.2,
    seed: int = 42,
    data_path: str = "/work/users/s/m/smerrill/Albemarle/dataset",
) -> Tuple[List[dict], List[dict]]:
    """
    Splits the dataset into training and test sets, including synthetic data in training.

    Args:
        member: Board member name to select dataset files.
        test_size: Fraction of real data for testing (between 0 and 1).
        seed: Random seed for reproducibility.
        data_path: Base path for dataset files.

    Returns:
        Tuple of (train_data, test_data).
    """
    real_data, synth_data = [], []

    member_files = {
        "kateacuff": ("kateacuff.txt", "synth_kateacuff.txt"),
        "ellenosborne": ("ellenosborne.txt", "synth_ellenosborne.txt"),
        "grahampaige": ("grahampaige.txt", "synth_grahampaige.txt"),
        "judyle": ("judyle.txt", "synth_judyle.txt"),
        "katrinacallsen": ("katrinacallsen.txt", None),
        "davidoberg": ("davidoberg.txt", None),
        "jonnoalcaro": ("jonnoalcaro.txt", None),
    }

    if member not in member_files:
        raise ValueError(f"Unknown member: {member}")

    real_file, synth_file = member_files[member]

    real_data = load_chat_dataset(os.path.join(data_path, real_file))
    if synth_file:
        synth_data = load_chat_dataset(os.path.join(data_path, synth_file))

    if not (0 < test_size < 1):
        raise ValueError("test_size must be between 0 and 1.")

    random.seed(seed)
    shuffled_real = real_data.copy()
    random.shuffle(shuffled_real)

    split_index = int(len(shuffled_real) * (1 - test_size))
    train_data = shuffled_real[:split_index] + synth_data
    test_data = shuffled_real[split_index:]

    return train_data, test_data

def combine_conversations(dataset: List[dict]) -> Dataset:
    """
    Combine messages into conversation pairs (user + assistant).

    Args:
        dataset: List of dicts with keys 'role' and 'content'.

    Returns:
        Huggingface Dataset with combined conversations.
    """
    new_data = []
    i = 0
    while i < len(dataset):
        if i + 1 < len(dataset):
            convo = [
                {"role": dataset[i]["role"], "content": dataset[i]["content"]},
                {"role": dataset[i + 1]["role"], "content": dataset[i + 1]["content"]},
            ]
            new_data.append({"conversations": convo})
            i += 2
        else:
            convo = [{"role": dataset[i]["role"], "content": dataset[i]["content"]}]
            new_data.append({"conversations": convo})
            i += 1
    return Dataset.from_list(new_data)


def replace_system_message(example: dict) -> dict:
    """
    Replace system message in text if format allows.

    Args:
        example: Single dataset example with "text" field.
        custom_system_message: The system message string to replace with.

    Returns:
        Modified example dict.
    """
    parts = example["text"].split("<|eot_id|>", 1)
    if len(parts) == 2:
        new_text = custom_system_message + parts[1]
    else:
        new_text = example["text"]
    return {"text": new_text}


def formatting_prompts_func(examples: dict) -> dict:
    """
    Apply chat template formatting to conversations.

    Args:
        examples: Batch of examples from the dataset.
        tokenizer: Tokenizer instance with apply_chat_template method.

    Returns:
        Dictionary with formatted "text".
    """
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False).removeprefix(tokenizer.bos_token)
        for convo in convos
    ]
    return {"text": texts}

def load_chat_dataset(input_path: str) -> List[dict]:
    """
    Load a chat-style message dataset from a JSON or JSONL file.

    Args:
        input_path: Path to the dataset file.

    Returns:
        List of message dicts.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        if input_path.endswith(".jsonl"):
            data = [json.loads(line) for line in f]
        else:  # Assume .json
            data = json.load(f)

    #logger.info(f"Loaded dataset from {input_path} with {len(data)} examples")
    return data

def convert_to_chat_format(data: List[dict]) -> List[dict]:
    """
    Convert dataset entries to chat format with alternating roles user/assistant.

    Args:
        data: List of dicts with keys 'prompt' and 'response'.

    Returns:
        List of dicts with 'role' and 'content'.
    """
    result = []
    for item in data:
        result.append({"role": "user", "content": item["prompt"]})
        result.append({"role": "assistant", "content": item["response"]})
    return result

### Evaluate

In [58]:
# Store metrics for all evaluations
metrics_list = []

# This first loop is to loop over datasets
for agent in agents:
    agent_lower = agent.replace(' ', '').lower()
    dataset_name = agent_lower
    message = f"""You are {agent}. Fill in the next response to the conversation by continuing the dialogue naturally. Only write what {agent} would say next — do not write for other speakers.

            Example:
            user:  
            {agent_lower}: Good evening, everyone. Let's get started with the agenda.  
            Speaker_1: Thanks, {agent_lower}. I just had a quick question about the minutes from last time.  
            assistant:  
            {agent_lower}: Sure, go ahead with your question."""

    custom_system_message = f"<|start_header_id|>system<|end_header_id|>\n\n{message}\n<|eot_id|>"

    # Dataset preparation
    _, test_data = train_test_split(dataset_name)
    test_data = convert_to_chat_format(test_data)
    test_data = combine_conversations(test_data)
    test_data = test_data.map(formatting_prompts_func, batched=True)
    test_data = test_data.map(replace_system_message)

    # For each agent evaluate performance on this dataset
    for model_name in loaded_models.keys():

        try:
            model, tokenizer= loaded_models[model_name]
            print(f"Model Loaded for {model_name}")
        except:
            print(f"COULD NOT LOAD MODEL FOR {model_name}")
            continue

        # Initialize metrics
        bleu = evaluate.load("bleu")
        rouge = evaluate.load("rouge")
        bertscore = evaluate.load("bertscore")

        # For this dataset the speaker prefix we want corresponds to the dataset
        speaker_prefix = tokenizer.encode(f"{agent_lower}:", return_tensors="pt").to("cuda")[:, 1:]

        generated_texts = []
        reference_texts = []

        print(f"Evaluating {len(test_data)} Examples of {dataset_name}")
        for example in tqdm(test_data):
            messages = [example['conversations'][:-1]]
            reference = example['conversations'][-1]['content']

            inputs = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to("cuda")

            inputs = inputs[:, 26:]  # Remove old system prompt
            custom_prompt = "<|begin_of_text|>" + custom_system_message
            custom_prompt_tokens = tokenizer.encode(custom_prompt, return_tensors="pt").to("cuda")
            inputs = torch.cat([custom_prompt_tokens, inputs], dim=1)
            inputs = torch.cat([inputs, speaker_prefix], dim=1)

            outputs = model.generate(
                input_ids=inputs,
                max_new_tokens=128,
                use_cache=True,
                temperature=1.5,
                min_p=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

            generated = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
            generated_texts.append(generated.strip())
            reference_texts.append(reference.strip())

        # Compute metrics
        bleu_score = bleu.compute(predictions=generated_texts, references=[[r] for r in reference_texts])
        rouge_score = rouge.compute(predictions=generated_texts, references=reference_texts)
        bertscore_result = bertscore.compute(predictions=generated_texts, references=reference_texts, lang="en")

        # Average BERTScore F1
        avg_bertscore_f1 = sum(bertscore_result['f1']) / len(bertscore_result['f1'])

        # Save metrics
        metrics_list.append({
            "ModelName": model_name,
            "dataset": dataset_name,
            "BLEU": bleu_score["bleu"],
            "ROUGE-L": rouge_score["rougeL"],
            "BERTScore_F1": avg_bertscore_f1
        })

        print("\n--- Evaluation Results ---")
        print(f"Dataset: {dataset_name}")
        print(f"BLEU: {bleu_score['bleu']:.4f}")
        print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")
        print(f"BERTScore F1: {avg_bertscore_f1:.4f}")

# Create DataFrame from metrics
metrics_df = pd.DataFrame(metrics_list)
print("\n=== All Evaluation Metrics ===")
print(metrics_df)

# Optionally save to CSV
# metrics_df.to_csv("evaluation_metrics.csv", index=False)


Map: 100%|██████████| 60/60 [00:00<00:00, 8659.66 examples/s]
Map: 100%|██████████| 60/60 [00:00<00:00, 13493.74 examples/s]


Model Loaded for ellenosborne
Evaluating 60 Examples of kateacuff


100%|██████████| 60/60 [02:26<00:00,  2.44s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: kateacuff
Agent Name: Kate Acuff
BLEU: 0.0102
ROUGE-L: 0.0975
BERTScore F1: 0.8259
Model Loaded for grahampaige
Evaluating 60 Examples of kateacuff


100%|██████████| 60/60 [02:54<00:00,  2.90s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: kateacuff
Agent Name: Kate Acuff
BLEU: 0.0000
ROUGE-L: 0.0662
BERTScore F1: 0.7944
Model Loaded for judyle
Evaluating 60 Examples of kateacuff


100%|██████████| 60/60 [02:36<00:00,  2.62s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: kateacuff
Agent Name: Kate Acuff
BLEU: 0.0000
ROUGE-L: 0.0988
BERTScore F1: 0.8176
Model Loaded for kateacuff
Evaluating 60 Examples of kateacuff


100%|██████████| 60/60 [02:39<00:00,  2.66s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: kateacuff
Agent Name: Kate Acuff
BLEU: 0.0106
ROUGE-L: 0.0819
BERTScore F1: 0.7960
Model Loaded for katrinacallsen
Evaluating 60 Examples of kateacuff


100%|██████████| 60/60 [02:33<00:00,  2.55s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: kateacuff
Agent Name: Kate Acuff
BLEU: 0.0000
ROUGE-L: 0.0794
BERTScore F1: 0.8120
Model Loaded for baseline
Evaluating 60 Examples of kateacuff


100%|██████████| 60/60 [01:32<00:00,  1.54s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: kateacuff
Agent Name: Kate Acuff
BLEU: 0.0077
ROUGE-L: 0.1021
BERTScore F1: 0.8260


Map: 100%|██████████| 14/14 [00:00<00:00, 3542.27 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 5329.97 examples/s]


Model Loaded for ellenosborne
Evaluating 14 Examples of ellenosborne


100%|██████████| 14/14 [00:35<00:00,  2.52s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: ellenosborne
Agent Name: Ellen Osborne
BLEU: 0.0127
ROUGE-L: 0.1166
BERTScore F1: 0.8221
Model Loaded for grahampaige
Evaluating 14 Examples of ellenosborne


100%|██████████| 14/14 [00:36<00:00,  2.62s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: ellenosborne
Agent Name: Ellen Osborne
BLEU: 0.0092
ROUGE-L: 0.0680
BERTScore F1: 0.8038
Model Loaded for judyle
Evaluating 14 Examples of ellenosborne


100%|██████████| 14/14 [00:29<00:00,  2.10s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: ellenosborne
Agent Name: Ellen Osborne
BLEU: 0.0000
ROUGE-L: 0.0820
BERTScore F1: 0.8100
Model Loaded for kateacuff
Evaluating 14 Examples of ellenosborne


100%|██████████| 14/14 [00:40<00:00,  2.88s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: ellenosborne
Agent Name: Ellen Osborne
BLEU: 0.0000
ROUGE-L: 0.0772
BERTScore F1: 0.8029
Model Loaded for katrinacallsen
Evaluating 14 Examples of ellenosborne


100%|██████████| 14/14 [00:33<00:00,  2.42s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: ellenosborne
Agent Name: Ellen Osborne
BLEU: 0.0000
ROUGE-L: 0.0813
BERTScore F1: 0.8047
Model Loaded for baseline
Evaluating 14 Examples of ellenosborne


100%|██████████| 14/14 [00:21<00:00,  1.51s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: ellenosborne
Agent Name: Ellen Osborne
BLEU: 0.0000
ROUGE-L: 0.1053
BERTScore F1: 0.8323


Map: 100%|██████████| 180/180 [00:00<00:00, 12877.37 examples/s]
Map: 100%|██████████| 180/180 [00:00<00:00, 20383.24 examples/s]


Model Loaded for ellenosborne
Evaluating 180 Examples of grahampaige


100%|██████████| 180/180 [05:48<00:00,  1.93s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: grahampaige
Agent Name: Graham Paige
BLEU: 0.0102
ROUGE-L: 0.0893
BERTScore F1: 0.8225
Model Loaded for grahampaige
Evaluating 180 Examples of grahampaige


100%|██████████| 180/180 [06:59<00:00,  2.33s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
Dataset: grahampaige
Agent Name: Graham Paige
BLEU: 0.0092
ROUGE-L: 0.0659
BERTScore F1: 0.8055
Model Loaded for judyle
Evaluating 180 Examples of grahampaige


  6%|▌         | 11/180 [00:26<06:45,  2.40s/it]

KeyboardInterrupt



In [59]:
metrics_df = pd.DataFrame(metrics_list)


In [63]:
metrics_df[metrics_df.dataset == 'ellenosborne']

Unnamed: 0,ModelName,dataset,BLEU,ROUGE-L,BERTScore_F1
6,ellenosborne,ellenosborne,0.012726,0.116645,0.82213
7,grahampaige,ellenosborne,0.009202,0.067962,0.803843
8,judyle,ellenosborne,0.0,0.081982,0.810025
9,kateacuff,ellenosborne,0.0,0.077208,0.802936
10,katrinacallsen,ellenosborne,0.0,0.081305,0.804726
11,baseline,ellenosborne,0.0,0.105304,0.832277
