In [None]:
!pip install sacrebleu rouge-score nltk


In [None]:
# Download required NLTK resources for METEOR.
# nltk is imported here because this cell runs before the cell that holds the
# other imports; without it, a fresh kernel (Restart & Run All) raises NameError.
import nltk

nltk.download('wordnet')   # WordNet corpus
nltk.download('omw-1.4')   # Open Multilingual Wordnet data


In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import single_meteor_score
from tqdm import tqdm
import nltk


# Score the fine-tuned model's generated titles against the reference titles
# with BLEU-1/2, ROUGE-1/2/L and METEOR, then report the per-metric averages.

# 1) Read the dataset. The "abstract" column is not used for scoring, so it is
#    dropped when present (errors="ignore" makes the drop a no-op otherwise).
df_scores = pd.read_csv("test_with_both_titles2.csv").drop(
    columns=["abstract"], errors="ignore"
)

# 2) Extract reference and hypothesis lists.
#    pandas reads empty CSV cells as NaN (a float), and NaN has no .split(),
#    so the tokenisation below would raise AttributeError. Missing cells are
#    replaced by "" and every other value is cast to str.
ref_col = "title"
hyp_col = "generated_title_finetuned"

n_missing = int(df_scores[[ref_col, hyp_col]].isna().any(axis=1).sum())
if n_missing:
    print(f"Warning: {n_missing} row(s) with a missing title are scored as empty strings.")

refs = ["" if pd.isna(text) else str(text) for text in df_scores[ref_col]]
hyps = ["" if pd.isna(text) else str(text) for text in df_scores[hyp_col]]

# Whitespace-tokenise once; BLEU and METEOR are both given token lists.
refs_tok = [ref.split() for ref in refs]
hyps_tok = [hyp.split() for hyp in hyps]

# 3) Compute BLEU-1 and BLEU-2 scores (scaled to 0-100).
#    Smoothing (method1) is passed so short titles with no higher-order
#    n-gram overlap do not trigger NLTK's zero-count handling.
smooth = SmoothingFunction().method1
bleu1, bleu2 = [], []

print("Calculating BLEU scores...")
for ref_tok, hyp_tok in tqdm(zip(refs_tok, hyps_tok), total=len(refs_tok)):
    # weights=(1, 0, 0, 0): unigram precision only.
    score1 = sentence_bleu([ref_tok], hyp_tok, weights=(1, 0, 0, 0), smoothing_function=smooth)
    bleu1.append(score1 * 100)

    # weights=(0.5, 0.5, 0, 0): unigrams and bigrams weighted equally.
    score2 = sentence_bleu([ref_tok], hyp_tok, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
    bleu2.append(score2 * 100)

# 4) Compute ROUGE-1, ROUGE-2, and ROUGE-L (F-measure, scaled to 0-100).
#    The scorer takes raw strings, not token lists.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge1, rouge2, rougeL = [], [], []

print("Calculating ROUGE scores...")
for ref, hyp in tqdm(zip(refs, hyps), total=len(refs)):
    scores = scorer.score(ref, hyp)
    rouge1.append(scores["rouge1"].fmeasure * 100)
    rouge2.append(scores["rouge2"].fmeasure * 100)
    rougeL.append(scores["rougeL"].fmeasure * 100)

# 5) Compute METEOR scores (scaled to 0-100).
print("Calculating METEOR scores...")
meteor = [
    single_meteor_score(ref_tok, hyp_tok) * 100
    for ref_tok, hyp_tok in tqdm(zip(refs_tok, hyps_tok), total=len(refs_tok))
]

# 6) Add scores to the DataFrame
df_scores["BLEU-1"]  = bleu1
df_scores["BLEU-2"]  = bleu2
df_scores["ROUGE-1"] = rouge1
df_scores["ROUGE-2"] = rouge2
df_scores["ROUGE-L"] = rougeL
df_scores["METEOR"]  = meteor

# 7) Average the metrics from the unrounded per-row scores, then round the
#    DataFrame to 3 decimals for display. Nothing is written to disk here.
metrics = ["BLEU-1", "BLEU-2", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR"]
means = df_scores[metrics].mean()
df_scores = df_scores.round(3)

# 8) Print average scores
print("\n--- Average Metric Scores ---")
for metric, value in means.items():
    print(f"{metric}: {value:.2f}")


In [None]:
# Preview the first five scored rows (fine-tuned model).
df_scores.head(n=5)

In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import single_meteor_score
from tqdm import tqdm
import nltk


# Score the base model's generated titles against the reference titles
# with BLEU-1/2, ROUGE-1/2/L and METEOR, then report the per-metric averages.

# 1) Read the dataset. The "abstract" column is not used for scoring, so it is
#    dropped when present (errors="ignore" makes the drop a no-op otherwise).
df_scores_base = pd.read_csv("test_with_both_titles2.csv").drop(
    columns=["abstract"], errors="ignore"
)

# 2) Extract reference and hypothesis lists.
#    pandas reads empty CSV cells as NaN (a float), and NaN has no .split(),
#    so the tokenisation below would raise AttributeError. Missing cells are
#    replaced by "" and every other value is cast to str.
ref_col = "title"
hyp_col = "generated_title_base"

n_missing = int(df_scores_base[[ref_col, hyp_col]].isna().any(axis=1).sum())
if n_missing:
    print(f"Warning: {n_missing} row(s) with a missing title are scored as empty strings.")

refs = ["" if pd.isna(text) else str(text) for text in df_scores_base[ref_col]]
hyps = ["" if pd.isna(text) else str(text) for text in df_scores_base[hyp_col]]

# Whitespace-tokenise once; BLEU and METEOR are both given token lists.
refs_tok = [ref.split() for ref in refs]
hyps_tok = [hyp.split() for hyp in hyps]

# 3) Compute BLEU-1 and BLEU-2 scores (scaled to 0-100).
#    Smoothing (method1) is passed so short titles with no higher-order
#    n-gram overlap do not trigger NLTK's zero-count handling.
smooth = SmoothingFunction().method1
bleu1, bleu2 = [], []

print("Calculating BLEU scores...")
for ref_tok, hyp_tok in tqdm(zip(refs_tok, hyps_tok), total=len(refs_tok)):
    # weights=(1, 0, 0, 0): unigram precision only.
    score1 = sentence_bleu([ref_tok], hyp_tok, weights=(1, 0, 0, 0), smoothing_function=smooth)
    bleu1.append(score1 * 100)

    # weights=(0.5, 0.5, 0, 0): unigrams and bigrams weighted equally.
    score2 = sentence_bleu([ref_tok], hyp_tok, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
    bleu2.append(score2 * 100)

# 4) Compute ROUGE-1, ROUGE-2, and ROUGE-L (F-measure, scaled to 0-100).
#    The scorer takes raw strings, not token lists.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
rouge1, rouge2, rougeL = [], [], []

print("Calculating ROUGE scores...")
for ref, hyp in tqdm(zip(refs, hyps), total=len(refs)):
    scores = scorer.score(ref, hyp)
    rouge1.append(scores["rouge1"].fmeasure * 100)
    rouge2.append(scores["rouge2"].fmeasure * 100)
    rougeL.append(scores["rougeL"].fmeasure * 100)

# 5) Compute METEOR scores (scaled to 0-100).
print("Calculating METEOR scores...")
meteor = [
    single_meteor_score(ref_tok, hyp_tok) * 100
    for ref_tok, hyp_tok in tqdm(zip(refs_tok, hyps_tok), total=len(refs_tok))
]

# 6) Add scores to the DataFrame
df_scores_base["BLEU-1"]  = bleu1
df_scores_base["BLEU-2"]  = bleu2
df_scores_base["ROUGE-1"] = rouge1
df_scores_base["ROUGE-2"] = rouge2
df_scores_base["ROUGE-L"] = rougeL
df_scores_base["METEOR"]  = meteor

# 7) Average the metrics from the unrounded per-row scores, then round the
#    DataFrame to 3 decimals for display (once; a second round(3) is a no-op).
metrics = ["BLEU-1", "BLEU-2", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR"]
means = df_scores_base[metrics].mean()
df_scores_base = df_scores_base.round(3)

# 8) Print average scores
print("\n--- Average Metric Scores ---")
for metric, value in means.items():
    print(f"{metric}: {value:.2f}")


In [None]:
# Preview the first five scored rows (base model).
df_scores_base.head(n=5)