Code to score LLM output

In [None]:
# Install the scoring dependencies into the kernel's environment via the %pip magic.
# NOTE(review): rouge_score and editdistance are installed without a version pin,
# unlike the first three packages — consider pinning them for reproducible scores.
%pip install sentencepiece==0.1.97 # same as silnlp
%pip install nltk==3.7 # same as silnlp
%pip install sacrebleu==2.3.1 # same as silnlp
%pip install rouge_score
%pip install editdistance

In [None]:
import sacrebleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.nist_score import corpus_nist
import nltk
from rouge_score import rouge_scorer
import sacrebleu
import editdistance

In [None]:
# NLTK data
# Fetch the 'wordnet' and 'punkt' resources through the NLTK downloader.
# NOTE(review): presumably 'wordnet' is for the METEOR metric — confirm which
# metrics actually need these resources.
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# Run configuration read by the file-loading and score-saving cells.
# `path` is joined to file names by plain string concatenation, so it must end with "/".
# NOTE(review): the active value is an absolute, machine-specific directory.
#path = "C:/mySIL/preprocessed/"     # For Windows
path = "/Users/laura/silnlp/scripts/llms/data/preprocessed/"
language = "xxx" # fill in language name
# Data split name; it is embedded in the input file names and in the score file name.
dataset = "train"

In [None]:
def _read_lines(file_path):
    """Return all lines of a UTF-8 text file, line endings included."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return handle.readlines()


# Generated predictions sit directly under `path`; references and sources sit under `path + language`.
trg_predictions = _read_lines(f"{path}/test_{dataset}_{language}_generated.txt")
trg = _read_lines(f"{path}{language}/{dataset}.trg.detok.txt")
src = _read_lines(f"{path}{language}/{dataset}.src.detok.txt")

In [None]:
# Optional: read in src and trg from a json file instead of directly from a txt file

import json

# Initialize a dictionary to hold the lists for each field
dataset_dict = {'input': [], 'output': []}

# Open the file and read line by line
with open("data/all_llm_data/xxx_smaller_test_data.jsonl", 'r', encoding='utf-8') as file: # fill in language name
    for line in file:
        # Skip blank lines (e.g. a trailing empty line) instead of failing in json.loads
        if not line.strip():
            continue
        # Each line is a complete JSON object
        json_object = json.loads(line)
        model_inputs = json_object.get('model_inputs', '')
        # First 30 characters of the model input; kept only so it can be inspected after the cell runs
        instruction = model_inputs[:30]
        # Remove the prompt from the input.
        # NOTE(review): assumes the prompt occupies exactly the first 32 characters — confirm against the data.
        dataset_dict['input'].append(model_inputs[32:])
        # Remove \r\n from the end of the output, but only when it is really there,
        # so a completion without that terminator does not lose its last two characters
        completion = json_object.get('completion', '')
        if completion.endswith('\r\n'):
            completion = completion[:-2]
        dataset_dict['output'].append(completion)

trg = dataset_dict['output']
src = dataset_dict['input']

In [None]:
# Optional: Group src and trg in ten verse spans
# Span boundaries are derived from len(src) only; the final span is clamped to the end of src,
# and the same index pairs are used to slice both lists.
span_bounds = [
    (span_start, min(span_start + 10, len(src)))
    for span_start in range(0, len(src), 10)
]
trg_grouped = [' '.join(trg[lo:hi]) for lo, hi in span_bounds]
src_grouped = [' '.join(src[lo:hi]) for lo, hi in span_bounds]

In [None]:
# Optional: format trg predictions, remove LLM tags
# (text to find, replacement) pairs, applied in this order to every prediction
tag_replacements = (
    (' </VERSE> ', ' '),
    ('</VERSE>', ''),
    ('### Response:', ''),
    ('### Input:', ''),
    ('### Instruction:', ''),
)


def _strip_llm_tags(text):
    """Apply every tag replacement in order, then trim surrounding whitespace."""
    for tag, replacement in tag_replacements:
        text = text.replace(tag, replacement)
    return text.strip()


trg_predictions = [_strip_llm_tags(prediction) for prediction in trg_predictions]

In [None]:
# Optional: just remove <|end_of_text|> / <|endoftext|> markers from LLM output
def _strip_end_markers(text):
    """Remove end-of-text markers from `text`, trim it, and terminate it with a newline.

    The complete markers are listed before the shorter forms so that a full
    '<|endoftext|>' is removed as a whole and does not leave a stray '>' behind.
    """
    for marker in ('<|end_of_text|>', '<|endoftext|>', '<|endoftext|', '<|endoftext'):
        text = text.replace(marker, '')
    return text.strip() + '\n'


trg_predictions = [_strip_end_markers(prediction) for prediction in trg_predictions]

In [None]:
# Make sure trg predictions look right
# Bare expression: the notebook shows the first ten predictions as this cell's output.
trg_predictions[:10]

In [None]:
# Show the first trg prediction, then the first trg reference, then the first src sentence
for sample_lines in (trg_predictions, trg, src):
    print(sample_lines[0])

In [None]:
# Optional: automatically remove prompt from trg predictions
# Alternative that cuts everything up to the first ":" instead:
#trg_predictions = [i[i.find(":")+2:] for i in trg_predictions] #32

# Every prediction is checked (the loop no longer stops after the first one).
num_prompts_removed = 0
for i, (source_line, prediction) in enumerate(zip(src, trg_predictions)):
    # Drop only a real line ending, so a source without one keeps its last character
    source_sentence = source_line.rstrip('\r\n')
    # An empty source matches every prediction, so it is skipped explicitly
    if source_sentence and prediction.startswith(source_sentence):
        # Prediction repeats the source sentence: cut it off together with the one
        # separator character that follows it
        trg_predictions[i] = prediction[len(source_sentence) + 1:]
        num_prompts_removed += 1

print(f"Removed the echoed source sentence from {num_prompts_removed} of {len(trg_predictions)} predictions")

In [None]:
# Scoring code
# sacrebleu takes the system output as a list of strings and the references as a
# list of reference lists, hence the extra list around trg.
pair_sys, pair_refs = trg_predictions, [trg]

scores = {}

# Settings shared by all three chrF variants
chrf_settings = dict(char_order=6, beta=3, remove_whitespace=True)

bleu_score = sacrebleu.corpus_bleu(pair_sys, pair_refs, lowercase=True, tokenize="13a")
chrf3_score = sacrebleu.corpus_chrf(pair_sys, pair_refs, **chrf_settings)
chrfp_score = sacrebleu.corpus_chrf(pair_sys, pair_refs, word_order=1, eps_smoothing=True, **chrf_settings)
chrfpp_score = sacrebleu.corpus_chrf(pair_sys, pair_refs, word_order=2, eps_smoothing=True, **chrf_settings)
spbleu_score = sacrebleu.corpus_bleu(pair_sys, pair_refs, lowercase=True, tokenize="flores200")

# Record the numeric value of each metric under its display name
scores.update({
    "BLEU": bleu_score.score,
    "chrF3": chrf3_score.score,
    "chrF3+": chrfp_score.score,
    "chrF3++": chrfpp_score.score,
    "spBLEU": spbleu_score.score,
})

In [None]:
# Make sure these are all the same length (predictions, sources, references)
line_counts = [len(lines) for lines in (trg_predictions, src, trg)]
print(*line_counts)

In [None]:
# Print out scores
# Bare expression: the notebook renders the `scores` dict as this cell's output.
scores

In [None]:
# Additional scoring metrics
# METEOR score to evaluate translation quality (synonyms, alignment, stemming, etc.)
def compute_meteor(translations, references):
    """Return the mean sentence-level METEOR score of parallel string lists.

    Args:
        translations: list of system output strings.
        references: list of reference strings, aligned with `translations`.

    Returns:
        The arithmetic mean of the per-pair METEOR scores.

    Raises:
        ZeroDivisionError: if no pairs are given.
    """
    # meteor_score is not imported at the top of the notebook; without this local
    # import every call failed with NameError.
    from nltk.translate.meteor_score import meteor_score

    # Both sides are whitespace-tokenized; each pair has exactly one reference.
    scores = [meteor_score([ref.split()], trans.split()) for trans, ref in zip(translations, references)]
    return sum(scores) / len(scores)

# ROUGE is sensitive to missing words, which makes it useful for spotting words dropped from translations
def compute_rouge(translations, references):
    """Return the mean F-measure of ROUGE-1, ROUGE-2 and ROUGE-L over parallel string lists.

    Each translation is scored against the reference at the same position using a
    stemming RougeScorer. The result is a dict keyed by the metric names that the
    scorer reports for the first pair.
    """
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    per_pair = []
    for hypothesis, reference in zip(translations, references):
        # The scorer takes the reference first and the translation second
        per_pair.append(rouge.score(reference, hypothesis))

    averages = {}
    for metric in per_pair[0]:
        averages[metric] = sum(result[metric].fmeasure for result in per_pair) / len(per_pair)
    return averages

# GLEU score to evaluate sentence level quality
def compute_gleu(translations, references):
    """Return the mean sentence-level GLEU of parallel string lists.

    Both sides are split on whitespace; each translation is scored against the
    single reference at the same position.
    """
    pairs = list(zip(translations, references))
    gleu_total = sum(
        sentence_gleu([reference.split()], hypothesis.split())
        for hypothesis, reference in pairs
    )
    return gleu_total / len(pairs)

# TER score to evaluate missing words, segment alignment, etc.
def compute_ter(translations, references):
    """Return the corpus-level TER score computed by sacrebleu.

    Args:
        translations: list of system output strings.
        references: either a flat list of reference strings aligned with
            `translations` (the form used by the other compute_* helpers), or an
            already nested list of reference lists.

    Returns:
        The `.score` value of the sacrebleu TER result.
    """
    # sacrebleu wants a list of reference lists (one inner list per reference set).
    # A flat list of strings is therefore wrapped; passed as-is it would be
    # misread as many separate reference sets.
    if references and isinstance(references[0], str):
        reference_streams = [references]
    else:
        reference_streams = references
    ter_score = sacrebleu.corpus_ter(translations, reference_streams).score
    return ter_score

# Edit Distance to identify misspelled words, punctuation, missing/extra word checking
def compute_edit_distance(translations, references):
    """Return the mean edit distance between each translation and its reference.

    The distance is computed on the raw strings of each aligned pair.
    """
    pairs = list(zip(translations, references))
    total_distance = sum(editdistance.eval(hypothesis, reference) for hypothesis, reference in pairs)
    return total_distance / len(pairs)

# Precision and recall to measure how well the model captures keywords
def compute_precision_recall(translations, references):
    """Return the mean token-set precision and recall over parallel string lists.

    Each string is split on whitespace and reduced to its set of distinct tokens.
    Precision is the share of translation tokens that also occur in the reference;
    recall is the share of reference tokens that also occur in the translation.
    A pair whose translation has no tokens contributes a precision of 0.0, and a
    pair whose reference has no tokens contributes a recall of 0.0, so an empty
    model output no longer aborts scoring with ZeroDivisionError.

    Returns:
        (mean_precision, mean_recall)

    Raises:
        ZeroDivisionError: if no pairs are given.
    """
    precisions = []
    recalls = []
    for trans, ref in zip(translations, references):
        reference_tokens = set(ref.split())
        translation_tokens = set(trans.split())
        num_common = len(reference_tokens & translation_tokens)
        precisions.append(num_common / len(translation_tokens) if translation_tokens else 0.0)
        recalls.append(num_common / len(reference_tokens) if reference_tokens else 0.0)
    return sum(precisions) / len(precisions), sum(recalls) / len(recalls)

# NIST score (weighting rare n-grams more heavily)
def compute_nist(translations, references):
    """Return the corpus-level NIST score for n-grams up to length 5.

    Both sides are split on whitespace; every translation has exactly one reference.
    """
    hypotheses = [hypothesis.split() for hypothesis in translations]
    # corpus_nist takes the references first, one list of references per segment
    reference_sets = [[reference.split()] for reference in references]
    return corpus_nist(reference_sets, hypotheses, n=5)


# Compute scores
# METEOR is left disabled:
#scores["METEOR"] = compute_meteor(pair_sys, pair_refs[0])
rouge_scores = compute_rouge(pair_sys, pair_refs[0])
scores.update({f"ROUGE-{key.upper()}": value for key, value in rouge_scores.items()})
scores["GLEU"] = compute_gleu(pair_sys, pair_refs[0])
scores["TER"] = compute_ter(pair_sys, pair_refs[0])
scores["Edit Distance"] = compute_edit_distance(pair_sys, pair_refs[0])
precision, recall = compute_precision_recall(pair_sys, pair_refs[0])
scores["Precision"], scores["Recall"] = precision, recall
scores["NIST"] = compute_nist(pair_sys, pair_refs[0])

# Print the computed scores, one "name: value" line per metric
for score_name in scores:
    print(f"{score_name}: {scores[score_name]}")



In [None]:
# Save scores to files

import json

# Output directory for the score files. It has its own variable so that running
# this cell does not overwrite `path`, which the data-loading cell reads; the
# value must end with "/" because the file name is appended by concatenation.
scores_dir = "/Users/laura/llmResearch/scores/"

def save_to_jsonl(file_path, data, language):
    """Append one JSON line to `file_path` holding `language` plus the entries of `data`.

    Args:
        file_path: path of the JSONL file; it is created if it does not exist.
        data: mapping of score names to values.
        language: value stored under the "language" key, which comes first in the row.
    """
    data_with_language_name = {"language": language}
    data_with_language_name.update(data)

    # Append mode: rows written by earlier runs are kept, not overwritten
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(json.dumps(data_with_language_name) + "\n")

data = scores
file_path = scores_dir + language + "_" + dataset + "_scores.jsonl"
save_to_jsonl(file_path, data, language)