<a href="https://colab.research.google.com/github/sbassam/nub-summarizer/blob/master/Results_Analysis_and_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this Colab notebook, we compare several existing summarizers against nub 1.0 to get a better sense of its strengths and weaknesses. Since we've already created a validation dataset in `Run Evaluations on Fine-tuned T-5 Summarizer.ipynb`, we first grab those results. We then run the examples through the competitor summarizers, calculate ROUGE scores, and finally compare them all.

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Install the summarizer developed by Derek Miller: https://pypi.org/project/bert-extractive-summarizer/ 
!pip install bert-extractive-summarizer

In [None]:
#ROUGE library
!pip install rouge-score

In [None]:
# transforerms library
!pip install transformers


In [None]:
# import libraries
import pandas as pd
import numpy as np
from summarizer import Summarizer
from rouge_score import rouge_scorer

In [None]:
# Navigate to the directory holding the nub 1.0 summarizer's validation results.
# The read_csv / to_csv calls in later cells use bare file names, so they
# resolve relative to this directory.
%cd /content/drive/My Drive/summarizer/nub-training-evaluation/result

/content/drive/My Drive/summarizer/nub-training-evaluation/result


## Run the validation through `bert-extractive-summarizer 0.4.2`

In [None]:
# Read in the validation sources and targets; the scoring loop in the next
# cell reads the `full_text` and `summary` columns of this frame.
eval_results = pd.read_csv('eval_results_t5_base.csv')
# Initialize the extractive summarizer (constructed with no arguments).
model = Summarizer()

In [None]:
# Build the ROUGE scorer for the three metrics reported in this notebook.
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

# Summarize every validation example and collect its ROUGE F-measures
# together with the generated summary.
f_measures = {rouge_type: [] for rouge_type in rouge_types}
system_summaries = []
for i, (ind, row) in enumerate(eval_results.iterrows(), start=1):
    system_summary = ''.join(model(row.full_text, min_length=60))
    scores = scorer.score(row.summary, system_summary)
    for rouge_type in rouge_types:
        f_measures[rouge_type].append(scores[rouge_type].fmeasure)
    system_summaries.append(system_summary)
    # Progress trace: running example count, then that example's scores.
    print(i)
    print(scores)

# Store this system's scores and summaries on the frame, then save it.
eval_results['rouge1_f'] = np.array(f_measures['rouge1'])
eval_results['rouge2_f'] = np.array(f_measures['rouge2'])
eval_results['rougeL_f'] = np.array(f_measures['rougeL'])
eval_results['system_summary'] = np.array(system_summaries)
eval_results.to_csv('eval_results_bert_extractive_summarizer.csv')

## Run the validation set through `mrm8488/t5-base-finetuned-summarize-news`
This is a T5 model found on Hugging Face's model hub. It was developed by Manuel Romero: https://huggingface.co/mrm8488/t5-base-finetuned-summarize-news


In [None]:
# Read in the validation sources and targets.
eval_results = pd.read_csv('eval_results_t5_base.csv')

# Initialize the tokenizer and model.
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is
# the documented replacement for encoder-decoder checkpoints such as T5.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "mrm8488/t5-base-finetuned-summarize-news"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE: this rebinds `model`, which an earlier cell bound to the extractive
# Summarizer; re-run that earlier cell before re-running its scoring loop.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


In [None]:

def summarize(text, max_length=150):
    """Generate a summary of `text` with the notebook's global `tokenizer` and `model`.

    Adapted from https://huggingface.co/mrm8488/t5-base-finetuned-summarize-news

    Args:
        text: the document to summarize.
        max_length: forwarded to `model.generate` as its `max_length`.

    Returns:
        The decoded text of the first generated sequence.
    """
    encoded = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)

    # Generation settings are kept together so they are easy to compare
    # against the model card's example.
    generation_options = dict(
        num_beams=2,
        max_length=max_length,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )
    outputs = model.generate(input_ids=encoded, **generation_options)

    decoded = []
    for sequence in outputs:
        decoded.append(
            tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        )

    return decoded[0]

In [None]:
# Build the ROUGE scorer for the three metrics reported in this notebook.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Summarize every validation example with `summarize` and collect its ROUGE
# F-measures together with the generated summary.
rouge_1_fs, rouge_2_fs, rouge_l_fs = [], [], []
system_summaries = []
for i, (ind, row) in enumerate(eval_results.iterrows(), start=1):
    system_summary = summarize(row.full_text)
    scores = scorer.score(row.summary, system_summary)
    for bucket, rouge_type in ((rouge_1_fs, 'rouge1'), (rouge_2_fs, 'rouge2'), (rouge_l_fs, 'rougeL')):
        bucket.append(scores[rouge_type].fmeasure)
    system_summaries.append(system_summary)
    # Progress trace: running example count, then that example's scores.
    print(i)
    print(scores)

# Store this system's scores and summaries on the frame, then save it.
for column, values in (
    ('rouge1_f', rouge_1_fs),
    ('rouge2_f', rouge_2_fs),
    ('rougeL_f', rouge_l_fs),
    ('system_summary', system_summaries),
):
    eval_results[column] = np.array(values)
eval_results.to_csv('eval_results_t5_mrm8488.csv')

## Comparative Analysis

In [None]:
# Create an empty master frame with one column per (metric, system) pair,
# grouped by metric.
system_names = ['nub', 't5_base', 'bert_ext_summ', 't5_mrm8488']
metric_names = ['rouge1', 'rouge2', 'rougeL']
eval_results_all = pd.DataFrame(
    columns=['{}_{}_f'.format(system, metric)
             for metric in metric_names
             for system in system_names])

In [None]:
# read in all the results sets
eval_results_nub = pd.read_csv('eval_results_nub.csv')[['rouge1_f', 'rouge2_f', 'rougeL_f']]
eval_results_bert_extractive_summarizer = pd.read_csv('eval_results_bert_extractive_summarizer.csv')[['rouge1_f', 'rouge2_f', 'rougeL_f']]
eval_results_t5_base = pd.read_csv('eval_results_t5_base.csv')[['rouge1_f', 'rouge2_f', 'rougeL_f']]
eval_results_t5_mrm8488 = pd.read_csv('eval_results_t5_mrm8488.csv')[['rouge1_f', 'rouge2_f', 'rougeL_f']]

In [None]:
# insert into the master df
# Place the four per-system score frames side by side (column-wise) in the
# master frame.
per_system_frames = [
    eval_results_nub,
    eval_results_t5_base,
    eval_results_bert_extractive_summarizer,
    eval_results_t5_mrm8488,
]
eval_results_all = pd.concat(per_system_frames, axis=1)

In [None]:
# Prefix each score column with its system name. Systems are listed in the
# order the frames were concatenated; metrics in the order they were selected.
eval_results_all.columns = [
    '{}_{}_f'.format(system, metric)
    for system in ('nub', 't5_base', 'bert_ext_summ', 't5_mrm8488')
    for metric in ('rouge1', 'rouge2', 'rougeL')
]

In [None]:
# Column-wise mean and standard deviation of every system/metric score.
means, errors = eval_results_all.mean(), eval_results_all.std()

In [None]:
labels = ['R-1', 'R-2', 'R-L']


def _percent_stats(system):
    """Return (mean, std) of one system's three ROUGE F-measure columns.

    Each statistic is rounded to 4 decimal places and then multiplied by 100.
    """
    columns = ['{}_{}_f'.format(system, metric) for metric in ('rouge1', 'rouge2', 'rougeL')]
    system_scores = eval_results_all[columns]
    return 100 * system_scores.mean().round(4), 100 * system_scores.std().round(4)


nub_means, nub_err = _percent_stats('nub')
t5base_means, t5base_err = _percent_stats('t5_base')
bertextsumm_means, bertextsumm_err = _percent_stats('bert_ext_summ')
t5mrm8488_means, t5mrm8488_err = _percent_stats('t5_mrm8488')


In [None]:
# Grouped bar chart of the mean ROUGE F-measures, one bar per summarizer.
# Layout adapted from
# https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/barchart.html

# `plt` is not bound by the notebook's import cell, so import it here;
# without this the cell raises NameError on a fresh kernel.
import matplotlib.pyplot as plt

x = np.arange(len(labels))  # the label locations
width = 0.2  # the width of the bars

fig, ax = plt.subplots()
# Four bars per group, centred on each tick: offsets of -1.5, -0.5, +0.5 and
# +1.5 bar widths.
ax.bar(x - 3*width/2, nub_means, width, label='Nub 1.0')
ax.bar(x - width/2, t5base_means, width, label='T5-base')
ax.bar(x + width/2, bertextsumm_means, width, label='BERT extractive summarizer')
ax.bar(x + 3*width/2, t5mrm8488_means, width, label='T5 finetuned on Kaggle dataset')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('F-Measure')
ax.set_title('ROUGE-1, ROUGE-2 and ROUGE-L for Summarizers')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

fig.tight_layout()
# Save through the figure object and close that same figure explicitly, so
# the result does not depend on which figure pyplot considers "current".
fig.savefig('bar_chart_rouge.png')
plt.close(fig)

In [None]:
# Read the Resoomer gold summary file into a single string.
# The `with` block closes the file even if reading raises, and `read()`
# returns the same text as joining `readlines()`.
with open('/content/drive/My Drive/summarizer/t-5-fine-tuned-7-2/resoomer_gold_summary.txt', "r") as file:
    resoomer_gold_summary = file.read()

In [None]:
# Read the Resoomer validation output file into a single string.
# The `with` block closes the file even if reading raises, and `read()`
# returns the same text as joining `readlines()`.
with open('/content/drive/My Drive/summarizer/t-5-fine-tuned-7-2/resoomer_validation_output.txt', "r") as file:
    resoomer_system_summary = file.read()

In [None]:
# Score Resoomer's output against its gold summary using the scorer defined
# in an earlier cell.
resoomer_scores = scorer.score(
    target=resoomer_gold_summary,
    prediction=resoomer_system_summary,
)

In [None]:
resoomer_scores

{'rouge1': Score(precision=0.6404040404040404, recall=0.4283783783783784, fmeasure=0.5133603238866397),
 'rouge2': Score(precision=0.151270207852194, recall=0.10117783355860205, fmeasure=0.12125419414555133),
 'rougeL': Score(precision=0.14343434343434344, recall=0.09594594594594595, fmeasure=0.11497975708502026)}

In [None]:
labels = ['R-1', 'R-2', 'R-L']

# Mean F-measure per system and metric, rounded to 4 decimals and then
# multiplied by 100.
metric_keys = ('rouge1', 'rouge2', 'rougeL')
nub_means = 100 * eval_results_all[
    ['nub_{}_f'.format(metric) for metric in metric_keys]].mean().round(4)
t5base_means = 100 * eval_results_all[
    ['t5_base_{}_f'.format(metric) for metric in metric_keys]].mean().round(4)
bertextsumm_means = 100 * eval_results_all[
    ['bert_ext_summ_{}_f'.format(metric) for metric in metric_keys]].mean().round(4)
t5mrm8488_means = 100 * eval_results_all[
    ['t5_mrm8488_{}_f'.format(metric) for metric in metric_keys]].mean().round(4)

In [None]:
nub_means

nub_rouge1_f    40.79
nub_rouge2_f    17.76
nub_rougeL_f    26.87
dtype: float64

In [None]:
t5base_means

t5_base_rouge1_f    40.19
t5_base_rouge2_f    17.35
t5_base_rougeL_f    27.65
dtype: float64

In [None]:
bertextsumm_means

bert_ext_summ_rouge1_f    30.89
bert_ext_summ_rouge2_f    11.31
bert_ext_summ_rougeL_f    18.86
dtype: float64

In [None]:
t5mrm8488_means

t5_mrm8488_rouge1_f    37.23
t5_mrm8488_rouge2_f    12.98
t5_mrm8488_rougeL_f    22.64
dtype: float64

In [None]:
nub_err

nub_rouge1_f    11.28
nub_rouge2_f    10.09
nub_rougeL_f     9.77
dtype: float64