In [None]:
!pip install -U nltk
!pip install nltk==3.5
!pip install langchain
!pip install rouge_score
!pip install bert_score

In [None]:
#this cell solves a problem with unzipping corpora/wordnet
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
import json
from langchain.evaluation import ExactMatchStringEvaluator
import nltk
from nltk.translate.meteor_score import meteor_score as meteor
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# --- Configuration ------------------------------------------------------------
# Path of the JSON results file to evaluate (placeholder: set before running).
DATA_PATH = 'add-file-path'
# Label printed above the metrics so the output can be matched to a run.
RUN_LABEL = "oracle_adore_1_few_1_esit.json"

# --- Scorers --------------------------------------------------------------------
# Exact-match evaluator that ignores letter case and punctuation.
evaluator = ExactMatchStringEvaluator(
    ignore_case=True,
    ignore_punctuation=True,
)

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# --- Load data ------------------------------------------------------------------
# Decode explicitly as UTF-8 so non-ASCII text does not depend on the platform's
# default locale encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# `data` is iterated as a mapping whose values each unpack into a
# (response, answer) pair; `response` is scored as the prediction and `answer`
# as the reference.
num_samples = len(data)
if num_samples == 0:
    # Fail early with a clear message instead of a ZeroDivisionError after all
    # the scoring work (including BERTScore) has already been attempted.
    raise ValueError(f"No samples found in {DATA_PATH!r}; cannot compute averaged metrics.")

# --- Accumulators ---------------------------------------------------------------
exact_match_result = 0
total_meteor_score = 0
total_rouge1_score = 0
total_rouge2_score = 0
total_rougeL_score = 0
all_responses = []
all_answers = []

# --- Per-sample metrics ---------------------------------------------------------
# The mapping keys are not needed for scoring, so only the values are iterated.
for response, answer in data.values():
    # Exact match on whitespace-stripped strings (score is converted to float).
    ex = float(evaluator.evaluate_strings(prediction=response.strip(), reference=answer.strip())['score'])
    exact_match_result += ex

    # METEOR: list of references first, hypothesis second.
    # NOTE(review): raw strings are passed here, which relies on the pinned
    # nltk version -- confirm before upgrading nltk.
    meteor_scr = meteor([answer], response)
    total_meteor_score += meteor_scr

    # ROUGE: the scorer's signature is score(target, prediction), so the
    # reference answer goes first. The F-measures summed below are symmetric in
    # the two arguments; the ordering matters for the precision/recall fields.
    rouge_scores = rouge.score(answer, response)
    total_rouge1_score += rouge_scores['rouge1'].fmeasure
    total_rouge2_score += rouge_scores['rouge2'].fmeasure
    total_rougeL_score += rouge_scores['rougeL'].fmeasure

    # Collect the pairs so BERTScore can be computed in one batched call.
    all_responses.append(response)
    all_answers.append(answer)

# --- BERTScore (batched over all samples) ---------------------------------------
# Candidates (responses) first, references (answers) second.
P, R, F1 = bert_score(all_responses, all_answers, lang="en")

# Sum the per-sample BERTScore values; they are averaged below with the rest.
total_bert_precision = sum(P.tolist())
total_bert_recall = sum(R.tolist())
total_bert_f1 = sum(F1.tolist())

# --- Averages -------------------------------------------------------------------
# After these divisions every `total_*` / `exact_match_result` name holds the
# per-sample mean rather than a running sum.
exact_match_result /= num_samples
total_meteor_score /= num_samples
total_rouge1_score /= num_samples
total_rouge2_score /= num_samples
total_rougeL_score /= num_samples
total_bert_precision /= num_samples
total_bert_recall /= num_samples
total_bert_f1 /= num_samples

# --- Report ---------------------------------------------------------------------
print(RUN_LABEL)
print(f"Exact Match: {exact_match_result}")
print(f"METEOR: {total_meteor_score}")
print(f"ROUGE-1: {total_rouge1_score}")
print(f"ROUGE-2: {total_rouge2_score}")
print(f"ROUGE-L: {total_rougeL_score}")
print(f"BERT Precision: {total_bert_precision}")
print(f"BERT Recall: {total_bert_recall}")
print(f"BERT F1: {total_bert_f1}")

