<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/recommender_topK_topP_minP_search/blob/main/article5_summary_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [3]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [5]:
# load generated summaries
summary_results = pd.read_csv('/content/all_results_fin.csv')
summary_results.head()

Unnamed: 0,user_query,top_k_gt,top_p_gt,min_p_gt,summary_top_k,summary_top_p,summary_min_p
0,JIRA software issue,"[""Dear <name>,\n\nThank you for reaching out. ...","[""Dear <name>,\n\nThank you for reaching out. ...","[""Dear <name>,\n\nThank you for reaching out. ...",our team is investigating the access problem w...,our team is investigating the access problem w...,our team is investigating the access problem w...
1,server issue,"['Dear <name>,\n\nThank you for reaching out. ...","['Dear <name>,\n\nThank you for reaching out. ...","['Dear <name>,\n\nThank you for reaching out. ...",IT Services Customer Support understands the u...,IT Services Customer Support received your rep...,server configuration changes for name>'s IT Co...
2,windows os issue,"[""Dear Customer,\n\nWe apologize for the incon...","[""Dear Customer,\n\nWe apologize for the incon...","[""Dear Customer,\n\nWe apologize for the incon...","restart your computer and press F8 repeatedly,...","name>, We're sorry to hear about the blue scre...","Dell xps 13 9310 has blue screen error, please..."
3,cisco router issue,"['Dear <name>,\n\nThank you for contacting Cus...","['Dear <name>,\n\nThank you for contacting Cus...","['Dear <name>,\n\nThank you for contacting Cus...",Ensure that your router is connected to a work...,check that your router is connected to a worki...,check that the power button on your router is ...
4,laptop hardware issue,"[""Dear <name>,\n\nThank you for reaching out t...","[""Dear <name>,\n\nThank you for reaching out t...","[""Dear <name>,\n\nThank you for reaching out t...","name>, Sorry to hear about the screen flickeri...","name>, We are sorry to hear about the flickeri...",screen flickering on new Dell xps 13 9310 lapt...


#### Evaluations: We will use ROUGE, BLEU and BERTscore

In [12]:
def evaluate_summary_rouge(generated_summary, ground_truth):
    rouge = Rouge()
    scores = rouge.get_scores(generated_summary, ground_truth)
    return scores

In [55]:
# ROUGE calculation - we take average ROUGE-L for all the summaries
rouge_tk = []
rouge_tp = []
rouge_mp = []

for i in range(len(summary_results)):

  # get gt
  gt_res_top_k = "".join(summary_results['top_k_gt'].iloc[i])
  gt_res_top_p = "".join(summary_results['top_p_gt'].iloc[i])
  gt_res_min_p = "".join(summary_results['min_p_gt'].iloc[i])

  # Evaluate
  rouge_top_k = evaluate_summary_rouge(summary_results['summary_top_k'].iloc[i], gt_res_top_k)
  rouge_top_p = evaluate_summary_rouge(summary_results['summary_top_p'].iloc[i], gt_res_top_p)
  rouge_min_p = evaluate_summary_rouge(summary_results['summary_min_p'].iloc[i], gt_res_min_p)

  # Store rouge-l
  rouge_tk.append(rouge_top_k[0]['rouge-l']['r'])
  rouge_tp.append(rouge_top_p[0]['rouge-l']['r'])
  rouge_mp.append(rouge_min_p[0]['rouge-l']['r'])

# calculate average
avg_r_tk = np.mean(np.array(rouge_tk))
avg_r_tp = np.mean(np.array(rouge_tp))
avg_r_mp = np.mean(np.array(rouge_mp))


print(f"ROUGE Scores for Top-k : {avg_r_tk} \n ROUGE Scores for Top-p : {avg_r_tp} \n ROUGE Scores for min p : {avg_r_mp}")


ROUGE Scores for Top-k : 0.21599663850033446 
 ROUGE Scores for Top-p : 0.19031099431800516 
 ROUGE Scores for min p : 0.18203327658269622


In [56]:
def evaluate_summary_bleu(generated_summary, ground_truth):
    reference = ground_truth.split()
    candidate = generated_summary.split()
    score = sentence_bleu([reference], candidate)
    return score

In [57]:
# BLEU calculation - we take average BLEU of all the summaries
bleu_tk = []
bleu_tp = []
bleu_mp = []

for i in range(len(summary_results)):

  # get gt
  gt_res_top_k = "".join(summary_results['top_k_gt'].iloc[i])
  gt_res_top_p = "".join(summary_results['top_p_gt'].iloc[i])
  gt_res_min_p = "".join(summary_results['min_p_gt'].iloc[i])

  # Evaluate
  bleu_top_k = evaluate_summary_bleu(summary_results['summary_top_k'].iloc[i], gt_res_top_k)
  bleu_top_p = evaluate_summary_bleu(summary_results['summary_top_p'].iloc[i], gt_res_top_p)
  bleu_min_p = evaluate_summary_bleu(summary_results['summary_min_p'].iloc[i], gt_res_min_p)

  # Store bleu
  bleu_tk.append(bleu_top_k)
  bleu_tp.append(bleu_top_p)
  bleu_mp.append(bleu_min_p)


# calculate average
avg_bl_tk = np.mean(np.array(bleu_tk))
avg_bl_tp = np.mean(np.array(bleu_tp))
avg_bl_mp = np.mean(np.array(bleu_mp))


print(f"BLEU Scores for Top-k : {avg_bl_tk} \n BLEU Scores for Top-p : {avg_bl_tp} \n BLEU Scores for min p : {avg_bl_mp}")


BLEU Scores for Top-k : 0.005051816320244709 
 BLEU Scores for Top-p : 0.0006624947740051266 
 BLEU Scores for min p : 0.008337198733564099


#### Evaluation using BERTscore

In [36]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [77]:
from bert_score import score
from tqdm import tqdm

# Example ground truth resolution
bert_scores_tk = []
bert_scores_tp = []
bert_scores_mp = []

for i in tqdm(range(len(summary_results))):

  # get gt
  gt_res_top_k = ast.literal_eval(summary_results['top_k_gt'].iloc[i])
  gt_res_top_p = ast.literal_eval(summary_results['top_p_gt'].iloc[i])
  gt_res_min_p = ast.literal_eval(summary_results['min_p_gt'].iloc[i])

  # calculate BERT score
  p_top_k, r_top_k, F1_top_k = score([summary_results['summary_top_k'].iloc[i]], ["".join(gt_res_top_k)], lang='en', verbose=False)
  p_top_p, r_top_p, F1_top_p = score([summary_results['summary_top_p'].iloc[i]], ["".join(gt_res_top_p)], lang='en', verbose=False)
  p_min_p, r_min_p, F1_min_p = score([summary_results['summary_min_p'].iloc[i]], ["".join(gt_res_min_p)], lang='en', verbose=False)

  # Store bleu
  bert_scores_tk.append({'precision': p_top_k, 'recall': r_top_k, 'f1-score': F1_top_k})
  bert_scores_tp.append({'precision': p_top_p, 'recall': r_top_p, 'f1-score': F1_top_p})
  bert_scores_mp.append({'precision': p_min_p, 'recall': r_min_p, 'f1-score': F1_min_p})



  0%|          | 0/10 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 1/10 [00:30<04:31, 30.16s/it]Some weights of RobertaModel were not i

NameError: name 'bert_score_mp' is not defined

In [79]:
# put scores in df
bert_scores_tk_df = pd.DataFrame(bert_scores_tk)
bert_scores_tp_df = pd.DataFrame(bert_scores_tp)
bert_scores_mp_df = pd.DataFrame(bert_scores_mp)

# calculate average
avg_f1_tk = bert_scores_tk_df['f1-score'].mean()
avg_f1_tp = bert_scores_tp_df['f1-score'].mean()
avg_f1_mp = bert_scores_mp_df['f1-score'].mean()

# Display BERTScore results
print(f"F1 Scores for Top-k : {avg_f1_tk} \n F1 Scores for Top-p : {avg_f1_tp} \n F1 Scores for min p : {avg_f1_mp}")

F1 Scores for Top-k : 0.8576225280761719 
 F1 Scores for Top-p : 0.8504076957702636 
 F1 Scores for min p : 0.8488237380981445


In [81]:
# calculate average
avg_pre_tk = bert_scores_tk_df['precision'].mean()
avg_pre_tp = bert_scores_tp_df['precision'].mean()
avg_pre_mp = bert_scores_mp_df['precision'].mean()

# Display BERTScore results
print(f"Precision for Top-k : {avg_pre_tk} \n Precision for Top-p : {avg_pre_tp} \n Precision for min p : {avg_pre_mp}")

Precision for Top-k : 0.9117547988891601 
 Precision for Top-p : 0.9069196701049804 
 Precision for min p : 0.9020003318786621


In [82]:
# calculate average
avg_rec_tk = bert_scores_tk_df['recall'].mean()
avg_rec_tp = bert_scores_tp_df['recall'].mean()
avg_rec_mp = bert_scores_mp_df['recall'].mean()

# Display BERTScore results
print(f"recall for Top-k : {avg_rec_tk} \n recall for Top-p : {avg_rec_tp} \n recall for min p : {avg_rec_mp}")

recall for Top-k : 0.8097028732299805 
 recall for Top-p : 0.8006174087524414 
 recall for min p : 0.8016838073730469
