In [2]:
import os
import random
import numpy as np
import json
import evaluate  # instead of datasets.load_metric
from tqdm import tqdm
import pandas as pd
from typing import List, Tuple
from datasets import Dataset
import evaluate

### Load Dataset

In [3]:
def train_test_split(member: str, test_size: float = 0.2, seed: int = 42, data_path: str = '/work/users/s/m/smerrill/Albemarle/dataset') -> Tuple[list, np.ndarray, np.ndarray]:
    """
    Loads the pre-split dataset files for one board member.

    No splitting happens here: the train / test / train-completion files are
    read from disk as they are. Synthetic data, for the members that have a
    synthetic file, is appended to the training set.

    Parameters:
    - member: The name identifier for the board member.
    - test_size: Not used for loading; only validated to lie strictly between
      0 and 1. Kept so existing callers keep working.
    - seed: Not used; kept so existing callers keep working.
    - data_path: Base directory for the dataset files.

    Returns:
    - A tuple (train_data, test_data, train_completion_data) where train_data
      is a list (real entries followed by synthetic ones) and the other two
      are the arrays loaded from '<member>_test.npy' and
      '<member>_train_completion.npy'.

    Raises:
    - ValueError: if `member` is not a known member or `test_size` is not
      strictly between 0 and 1.
    """
    # Members that ship a 'synth_<member>.npy' file next to their real data.
    members_with_synth = ('kateacuff', 'ellenosborne', 'grahampaige', 'judyle')
    members_without_synth = ('katrinacallsen', 'davidoberg', 'jonnoalcaro')

    # Validate every argument before touching the disk so bad input fails fast.
    if member not in members_with_synth + members_without_synth:
        raise ValueError(f"Unknown member: {member}")
    if not 0 < test_size < 1:
        raise ValueError("test_size must be a float between 0 and 1.")

    def _load(file_name, **load_kwargs):
        # Read one .npy file from the dataset directory.
        return np.load(os.path.join(data_path, file_name), **load_kwargs)

    real_data = _load(f'{member}_train.npy')
    synth_data = _load(f'synth_{member}.npy') if member in members_with_synth else []
    # The test / completion files are loaded with pickling enabled, as before.
    test_data = _load(f'{member}_test.npy', allow_pickle=True)
    train_completion_data = _load(f'{member}_train_completion.npy', allow_pickle=True)

    train_data = list(real_data) + list(synth_data)
    return train_data, test_data, train_completion_data


def compute_metrics(generated_texts, reference_texts):
    """
    Scores generated texts against their references with BLEU, ROUGE and BERTScore.

    Uses the module-level `bleu`, `rouge` and `bertscore` metric objects, which
    must be defined before this function is called.

    Parameters:
    - generated_texts: Model outputs, one per example.
    - reference_texts: Reference completions, aligned with `generated_texts`.

    Returns:
    - A tuple (bleu_score, rouge_score, bertscore_result, avg_bertscore_f1):
      the three raw metric results followed by the mean of the per-example
      BERTScore F1 values.
    """
    # Each reference is wrapped in its own single-element list for BLEU.
    wrapped_references = [[reference] for reference in reference_texts]

    bleu_score = bleu.compute(predictions=generated_texts, references=wrapped_references)
    rouge_score = rouge.compute(predictions=generated_texts, references=reference_texts)
    bertscore_result = bertscore.compute(
        predictions=generated_texts,
        references=reference_texts,
        lang="en",
    )

    # Collapse the per-example F1 values into one average.
    f1_values = bertscore_result['f1']
    avg_bertscore_f1 = sum(f1_values) / len(f1_values)

    return bleu_score, rouge_score, bertscore_result, avg_bertscore_f1

### Eval Models

In [20]:
# Root directory that holds one sub-directory per trained model.
trained_path = '/work/users/s/m/smerrill/Albemarle/trained_models/'
# os.listdir returns entries in arbitrary order; sort them so the evaluation
# order (and therefore the row order of the results table) is reproducible.
trained_models = sorted(os.listdir(trained_path))

In [21]:
# Maps model directory name -> {'<dataset>_<split>': array loaded from that model's .npy file}.
model_results = {}
for model in trained_models:
    model_path = os.path.join(trained_path, model)
    # A stray file next to the model directories would make os.listdir raise.
    if not os.path.isdir(model_path):
        continue

    # List the directory once; sorted so the key order is reproducible.
    npy_files = sorted(name for name in os.listdir(model_path) if name.endswith('.npy'))
    if not npy_files:
        continue

    model_result_dict = {}
    for file_name in npy_files:
        # The first two '_'-separated parts of the file name are taken as
        # the dataset name and the split.
        name_parts = file_name.split('_')
        if len(name_parts) < 2:
            print("Skipping file with unexpected name: ", file_name)
            continue
        dataset, train_or_test = name_parts[0], name_parts[1]

        generations_array = np.load(os.path.join(model_path, file_name))
        model_result_dict[dataset + '_' + train_or_test] = generations_array
    model_results[model] = model_result_dict


In [7]:
# Load metrics
# Load the three metric objects used by compute_metrics, in a single pass.
bleu, rouge, bertscore = (
    evaluate.load(metric_name) for metric_name in ("bleu", "rouge", "bertscore")
)

In [23]:
# Show which model directories contributed at least one .npy generations file.
model_results.keys()

dict_keys(['davidoberg_4', 'davidoberg_8', 'ellenosborne_4', 'ellenosborne_8', 'grahampaige_4', 'grahampaige_8', 'jonnoalcaro_4', 'judyle_4', 'judyle_8', 'kateacuff_4', 'kateacuff_8', 'katrinacallsen_4', 'katrinacallsen_8'])

In [17]:
# Score every stored generation array against its reference completions and
# collect one record per (model, dataset, split) in `metrics_list`.
verbose = True
metrics_list = []
# (test_data, train_completion_data) per dataset name, so each member's
# reference files are read from disk once instead of once per generation file.
reference_cache = {}

for model in model_results.keys():
    generations = model_results[model]
    print(model)
    for dataset in generations.keys():
        # Keys have the form '<dataset>_<split>'.
        dataset_name = dataset.split('_')[0]
        train_or_test = dataset.split('_')[1]

        # Reject unknown splits before touching the disk.
        if train_or_test not in ('train', 'test'):
            print("Train or Test not Valid: ", train_or_test)
            continue

        if dataset_name not in reference_cache:
            _, test_data, train_completion_data = train_test_split(dataset_name)
            reference_cache[dataset_name] = (test_data, train_completion_data)
        test_data, train_completion_data = reference_cache[dataset_name]

        # Train generations are compared with the train-completion set,
        # test generations with the test set.
        data = train_completion_data if train_or_test == 'train' else test_data

        reference_texts = [x['completion'] for x in data]
        generated_texts = generations[dataset]
        bleu_score, rouge_score, bertscore_result, avg_bertscore_f1 = compute_metrics(generated_texts, reference_texts)

        metrics_list.append({
            "ModelName": model,
            "dataset": dataset_name,
            "split": train_or_test,
            "BLEU": bleu_score["bleu"],
            "ROUGE-L": rouge_score["rougeL"],
            "BERTScore_F1": avg_bertscore_f1
        })

        if verbose:
            print("\n--- Evaluation Results ---")
            print(f"Dataset: {dataset_name}")
            print(f"Split: {train_or_test}")
            print(f"BLEU: {bleu_score['bleu']:.4f}")
            print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")
            print(f"BERTScore F1: {avg_bertscore_f1:.4f}")
        

davidoberg_4





--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0060
ROUGE-L: 0.0740
BERTScore F1: 0.7558





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0138
ROUGE-L: 0.0940
BERTScore F1: 0.8023
davidoberg_8

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0079
ROUGE-L: 0.0561
BERTScore F1: 0.7814





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0119
ROUGE-L: 0.0843
BERTScore F1: 0.7883
ellenosborne_4

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0000
ROUGE-L: 0.0571
BERTScore F1: 0.7762





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0122
ROUGE-L: 0.0869
BERTScore F1: 0.7834
ellenosborne_8

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0058
ROUGE-L: 0.0714
BERTScore F1: 0.7830





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0141
ROUGE-L: 0.0949
BERTScore F1: 0.7956
grahampaige_4

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0048
ROUGE-L: 0.0574
BERTScore F1: 0.7786





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0144
ROUGE-L: 0.0936
BERTScore F1: 0.7968
grahampaige_8

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0070
ROUGE-L: 0.0869
BERTScore F1: 0.8060





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0174
ROUGE-L: 0.1040
BERTScore F1: 0.8051
jonnoalcaro_4

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0000
ROUGE-L: 0.0504
BERTScore F1: 0.7728





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0122
ROUGE-L: 0.0867
BERTScore F1: 0.7915
judyle_4





--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0064
ROUGE-L: 0.0710
BERTScore F1: 0.7543





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0127
ROUGE-L: 0.0891
BERTScore F1: 0.7849
judyle_8





--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0077
ROUGE-L: 0.0756
BERTScore F1: 0.7149





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0130
ROUGE-L: 0.0891
BERTScore F1: 0.7813
kateacuff_4

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0000
ROUGE-L: 0.0785
BERTScore F1: 0.8010





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0133
ROUGE-L: 0.0907
BERTScore F1: 0.7838
kateacuff_8





--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0000
ROUGE-L: 0.0439
BERTScore F1: 0.6754





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0113
ROUGE-L: 0.0794
BERTScore F1: 0.7418
katrinacallsen_4

--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0054
ROUGE-L: 0.0688
BERTScore F1: 0.7886





--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0150
ROUGE-L: 0.1036
BERTScore F1: 0.8059
katrinacallsen_8





--- Evaluation Results ---
Dataset: kateacuff
Split: test
BLEU: 0.0000
ROUGE-L: 0.0871
BERTScore F1: 0.7674

--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0139
ROUGE-L: 0.1017
BERTScore F1: 0.7885




In [39]:
# Build the results table: one row per record collected in metrics_list.
df = pd.DataFrame(metrics_list)

In [40]:
# Train-split rows, best ROUGE-L first.
df.loc[df['split'] == 'train'].sort_values(by='ROUGE-L', ascending=False)

Unnamed: 0,ModelName,dataset,split,BLEU,ROUGE-L,BERTScore_F1
11,grahampaige_8,kateacuff,train,0.017418,0.103957,0.80514
23,katrinacallsen_4,kateacuff,train,0.01496,0.103601,0.805933
25,katrinacallsen_8,kateacuff,train,0.013902,0.101667,0.788458
7,ellenosborne_8,kateacuff,train,0.014099,0.094899,0.795604
1,davidoberg_4,kateacuff,train,0.013788,0.094017,0.802275
9,grahampaige_4,kateacuff,train,0.014401,0.093623,0.796828
19,kateacuff_4,kateacuff,train,0.013283,0.090736,0.783802
15,judyle_4,kateacuff,train,0.012728,0.089104,0.784895
17,judyle_8,kateacuff,train,0.012968,0.089081,0.781303
5,ellenosborne_4,kateacuff,train,0.012199,0.086862,0.78341


In [43]:
# Test-split rows, best BERTScore F1 first.
df.loc[df['split'] == 'test'].sort_values(by='BERTScore_F1', ascending=False)

Unnamed: 0,ModelName,dataset,split,BLEU,ROUGE-L,BERTScore_F1
10,grahampaige_8,kateacuff,test,0.007012,0.086935,0.805978
18,kateacuff_4,kateacuff,test,0.0,0.078454,0.800979
22,katrinacallsen_4,kateacuff,test,0.005441,0.068802,0.78859
6,ellenosborne_8,kateacuff,test,0.005798,0.071436,0.782972
26,No-FineTuning,kateacuff,test,0.006077,0.100115,0.78196
2,davidoberg_8,kateacuff,test,0.007942,0.056078,0.781399
8,grahampaige_4,kateacuff,test,0.004844,0.057371,0.778644
4,ellenosborne_4,kateacuff,test,0.0,0.057121,0.776156
12,jonnoalcaro_4,kateacuff,test,0.0,0.050421,0.77284
24,katrinacallsen_8,kateacuff,test,0.0,0.087073,0.767417


In [36]:
# Metric values entered as literals instead of being recomputed. The four
# unpacked names mirror the return values of compute_metrics(); further down
# they are recorded in metrics_list under the model name 'No-FineTuning'.
# NOTE(review): entry 10 of the BERTScore precision/recall/f1 lists is 0.0 and
# is included in the averaged F1 -- possibly an empty generation; confirm.
bleu_score, rouge_score, bertscore_result, avg_bertscore_f1 = ({'bleu': 0.006077129561234383,
  'precisions': [0.15736934820904286,
   0.011876484560570071,
   0.0012012012012012011,
   0.0006075334143377885],
  'brevity_penalty': 1.0,
  'length_ratio': 1.5724838411819022,
  'translation_length': 1703,
  'reference_length': 1083},
 {'rouge1': np.float64(0.1541466945728513),
  'rouge2': np.float64(0.017248572697572115),
  'rougeL': np.float64(0.10011529596140455),
  'rougeLsum': np.float64(0.10759896171220223)},
 {'precision': [0.8012092113494873,
   0.8241617679595947,
   0.689644455909729,
   0.8287801742553711,
   0.833304762840271,
   0.828080415725708,
   0.847261905670166,
   0.8211007118225098,
   0.8315057754516602,
   0.8339003324508667,
   0.0,
   0.8351423144340515,
   0.8233726620674133,
   0.8133792877197266,
   0.8384840488433838,
   0.8493857383728027,
   0.8207684755325317,
   0.8209556341171265,
   0.8152289986610413,
   0.8355612754821777],
  'recall': [0.8248466849327087,
   0.8207997679710388,
   0.7795955538749695,
   0.8241584897041321,
   0.812791109085083,
   0.8288900852203369,
   0.8190089464187622,
   0.8341827988624573,
   0.8611392378807068,
   0.8294829726219177,
   0.0,
   0.8128516674041748,
   0.8051705360412598,
   0.787691593170166,
   0.8530493974685669,
   0.8260535597801208,
   0.8206564784049988,
   0.8531922698020935,
   0.8612176179885864,
   0.8429147005081177],
  'f1': [0.812856137752533,
   0.8224772810935974,
   0.7318664789199829,
   0.8264629244804382,
   0.8229201436042786,
   0.8284850716590881,
   0.8328958749771118,
   0.8275901079177856,
   0.8460630178451538,
   0.8316857814788818,
   0.0,
   0.8238462209701538,
   0.8141698837280273,
   0.8003293871879578,
   0.8457040190696716,
   0.8375571370124817,
   0.8207125067710876,
   0.8367636203765869,
   0.8375924825668335,
   0.8392218947410583],
  'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.51.3)'},
 0.7819599986076355)


In [37]:
            # Print the hard-coded baseline metrics in the same layout as the evaluation loop.
            # The split label is fixed to 'test' so it agrees with the 'No-FineTuning' record;
            # the loop's leftover `train_or_test` would label these numbers with whatever
            # split happened to be evaluated last.
            # NOTE(review): `dataset_name` is still the value left behind by the evaluation
            # loop -- confirm it names the dataset these baseline numbers belong to.
            print("\n--- Evaluation Results ---")
            print(f"Dataset: {dataset_name}")
            print("Split: test")
            print(f"BLEU: {bleu_score['bleu']:.4f}")
            print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")
            print(f"BERTScore F1: {avg_bertscore_f1:.4f}")



--- Evaluation Results ---
Dataset: kateacuff
Split: train
BLEU: 0.0061
ROUGE-L: 0.1001
BERTScore F1: 0.7820


In [38]:
# Record the baseline (no fine-tuning) scores alongside the fine-tuned models.
# NOTE(review): `dataset_name` is whatever an earlier cell last assigned; re-running
# this cell appends a duplicate row.
baseline_record = {
    "ModelName": 'No-FineTuning',
    "dataset": dataset_name,
    "split": 'test',
    "BLEU": bleu_score["bleu"],
    "ROUGE-L": rouge_score["rougeL"],
    "BERTScore_F1": avg_bertscore_f1,
}
metrics_list.append(baseline_record)
