In [None]:
!pip install rouge-score
!pip install bert-score
!pip install nltk

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ff8bfcbc6f6c91ebecb6b858892cd301db9c4b5e85b03e81e77bf4961f2003ae
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
import warnings

import pandas as pd
import torch
from bert_score import score as bertscore_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [None]:
# BLEU smoothing: NLTK's "method1" is used for every sentence-level BLEU call.
smoothing_methods = SmoothingFunction()
smooth = smoothing_methods.method1

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"

In [None]:
# -------------------------------
# LOAD THE OUTPUT FILE
# -------------------------------
# CSV of model outputs. Later cells read the columns `cleaned_review2`,
# `cleaned_response2` (reference reply) and `generated_reply` from it.
input_csv_path = "live_demo_inference.csv"
df = pd.read_csv(input_csv_path)

In [None]:
# ===============================================================
# 3. FIX NAN ISSUES
# ===============================================================
# Missing text would reach the metric functions as float NaN; replace it with
# an empty string and force both text columns to plain `str`.
for text_col in ("cleaned_response2", "generated_reply"):
    df[text_col] = df[text_col].fillna("").astype(str)

In [None]:
# ===============================================================
# 4. BLEU SCORE
# ===============================================================
# `smooth` (SmoothingFunction().method1) is defined once in the setup cell near
# the top of the notebook and reused here instead of being re-created.

def get_bleu(reference, hypothesis):
    """Sentence-level BLEU of ``hypothesis`` against a single ``reference``.

    Both texts are tokenised on whitespace. Any argument that is not a ``str``
    is treated as an empty string. Smoothing comes from the module-level
    ``smooth`` function.

    Returns:
        Whatever ``sentence_bleu`` returns for the token lists.
    """
    ref_text = reference if isinstance(reference, str) else ""
    hyp_text = hypothesis if isinstance(hypothesis, str) else ""

    ref_tokens = ref_text.split()
    hyp_tokens = hyp_text.split()

    # sentence_bleu takes a *list* of references; there is exactly one here.
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smooth)

def _bleu_for_row(row):
    """Score one DataFrame row: reference reply vs. generated reply."""
    return get_bleu(row["cleaned_response2"], row["generated_reply"])


# Row-wise application; the result is stored in a new `bleu` column.
df["bleu"] = df.apply(_bleu_for_row, axis=1)
print("BLEU completed")


BLEU completed


In [None]:
# ===============================================================
# 5. ROUGE-L SCORE
# ===============================================================
# One scorer instance is built up front and reused for every row.
# Only the ROUGE-L variant is requested, with stemming enabled.
rouge_variants = ["rougeL"]
rouge = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

def get_rouge_l(reference, hypothesis):
    """Return the ROUGE-L F-measure of ``hypothesis`` against ``reference``.

    Non-string inputs are treated as empty text, the same way ``get_bleu``
    handles them. Scoring stays best-effort: if the module-level ``rouge``
    scorer raises an ordinary exception, a warning is emitted and 0.0 is
    returned for that pair.

    Unlike a bare ``except:``, ``KeyboardInterrupt`` and ``SystemExit`` are NOT
    swallowed, so interrupting a long run actually stops it.

    Args:
        reference: Ground-truth text.
        hypothesis: Generated text to evaluate.

    Returns:
        The ``fmeasure`` of the ``"rougeL"`` score, or 0.0 if scoring failed.
    """
    if not isinstance(reference, str):
        reference = ""
    if not isinstance(hypothesis, str):
        hypothesis = ""

    try:
        return rouge.score(reference, hypothesis)["rougeL"].fmeasure
    except Exception as exc:
        # Keep the run going, but make the failure visible instead of letting
        # it silently lower the average.
        warnings.warn(f"ROUGE-L scoring failed ({exc!r}); using 0.0 for this pair")
        return 0.0

def _rouge_l_for_row(row):
    """Score one DataFrame row: reference reply vs. generated reply."""
    return get_rouge_l(row["cleaned_response2"], row["generated_reply"])


# Row-wise application; the result is stored in a new `rougeL` column.
df["rougeL"] = df.apply(_rouge_l_for_row, axis=1)

print("ROUGE-L completed")

ROUGE-L completed


In [None]:

# ===============================================================
# 6. BERTScore
# ===============================================================

# bert_score takes two parallel lists: candidates[i] is scored against
# references[i].
references = df["cleaned_response2"].tolist()
candidates = df["generated_reply"].tolist()

# `device` is the value chosen in the setup cell ("cuda" when available,
# otherwise "cpu"); passing it makes the placement of the scoring model
# explicit instead of leaving that variable unused.
P, R, F1 = bertscore_score(
    candidates,
    references,
    lang="en",
    device=device,
    verbose=True
)

# Store precision, recall and F1 as one new column each.
df["bertscore_precision"] = P.numpy()
df["bertscore_recall"]    = R.numpy()
df["bertscore_f1"]        = F1.numpy()

print("BERTScore completed")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
# ===============================================================
# 7. SHOW SUMMARY METRICS
# ===============================================================
# (label, column) pairs; labels are padded to the same width so the printed
# values line up.
summary_rows = [
    ("Average BLEU:     ", "bleu"),
    ("Average ROUGE-L:  ", "rougeL"),
    ("Average BERT-P:   ", "bertscore_precision"),
    ("Average BERT-R:   ", "bertscore_recall"),
    ("Average BERT-F1:  ", "bertscore_f1"),
]

print("\n=========== FINAL EVALUATION SUMMARY ===========\n")
for label, metric_col in summary_rows:
    print(label, df[metric_col].mean())



Average BLEU:      0.09607937127775451
Average ROUGE-L:   0.29230879085825257
Average BERT-P:    0.87116396
Average BERT-R:    0.8531952
Average BERT-F1:   0.86184156


In [None]:
# ===============================================================
# 8. BEST & WORST EXAMPLES
# ===============================================================
# Columns shown for each example: the review, the reference reply, the
# generated reply and the BLEU score the rows are ranked by.
example_cols = ["cleaned_review2", "cleaned_response2", "generated_reply", "bleu"]

print("\n=========== TOP 5 BEST BLEU EXAMPLES ===========\n")
top_rows = df.nlargest(5, "bleu")
best = top_rows[example_cols]
print(best)

print("\n=========== TOP 5 WORST BLEU EXAMPLES ===========\n")
bottom_rows = df.nsmallest(5, "bleu")
worst = bottom_rows[example_cols]
print(worst)



                                        cleaned_review2  \
167   Two days to stay there. Both day we got hot wa...   
825   While the property can have better response to...   
552   What a bad experience and what a bad hospitali...   
82    It's all a good hotel with all the new propert...   
1113  The most snooty and unhelpful front desk staff...   

                                      cleaned_response2  \
167   Thank you for your valuable feedback and we ar...   
825   Dear guest, we are extremely sorry for your ex...   
552   Dear siddharthvyas We would like to thank you ...   
82    Dear Guest, Thank you for choosing to stay wit...   
1113  Dear Guest, We apologize for the inconvenience...   

                                        generated_reply      bleu  
167   Thank you for your valuable feedback and we ar...  1.000000  
825   Dear guest, we are extremely sorry for your ex...  1.000000  
552   Dear siddharthvyas We would like to thank you ...  0.991701  
82    Dear Guest

In [None]:
# ===============================================================
# 9. SAVE RESULTS
# ===============================================================
# Write the frame, including every metric column added above, without the
# pandas index. The path is named once so the file written and the message
# printed cannot drift apart.
output_csv_path = "/content/final_evaluation_scores.csv"
df.to_csv(output_csv_path, index=False)
print(f"\nSaved: {output_csv_path}")



Saved: /content/final_evaluation_scores.csv
