In [None]:
import pprint 
# Shared pretty-printer (4-space indent) used later to display nested score dicts.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Toy example shared by the metric demos below.
# `reference` is a list of tokenized reference sentences (hence the outer list);
# `candidate` is a single tokenized sentence that differs from it in two words.
reference = [['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']]
candidate = ['the', 'fast', 'brown', 'fox', 'jumped', 'over', 'the', 'sleepy', 'dog']

# BLEU

## Sentence BLEU Score

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

### BLEU 1

In [None]:
# All weight on the unigram term; the 2- to 4-gram terms are weighted 0.
score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
print(score)

0.7777777777777778


### BLEU 2

In [None]:
# Unigram and bigram terms weighted equally; 3- and 4-gram terms weighted 0.
score = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0))
print(score)

0.6236095644623236


### BLEU 3

In [None]:
# Equal weight on the 1- to 3-gram terms; the 4-gram term is weighted 0.
# NOTE(review): 0.33 * 3 sums to 0.99 rather than 1 -- use 1/3 per term if
# exactly uniform weights are intended.
score = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0))
print(score)

0.5536178537956702


### BLEU 4

In [None]:
# No `weights` argument: the library default is used. The recorded output is
# identical to the explicit (0.25, 0.25, 0.25, 0.25) call in the next cell.
score = sentence_bleu(reference, candidate)
print(score)

0.4854917717073234


OR

In [None]:
# Uniform weights over the 1- to 4-gram terms, spelled out explicitly.
score = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
print(score)

0.4854917717073234


# ROUGE

## About ROUGE

ROUGE stands for Recall-Oriented Understudy for Gisting Evaluation. It includes measures to automatically determine the quality of a summary by comparing it to other (ideal) summaries created by humans. The measures count the number of overlapping units such as n-gram, word sequences, and word pairs between the computer-generated summary to be evaluated and the ideal summaries created by humans. 



1.   ROUGE-N: N-gram Co-Occurrence Statistics 
  
  a. ROUGE-N is an n-gram recall between a candidate summary and a set of reference summaries.
2.   ROUGE-L: Longest Common Subsequence 
  
  a. Given two sequences X and Y, the longest common subsequence (LCS) of X and
Y is a common subsequence with maximum length. 

3.   ROUGE-W: Weighted Longest Common Subsequence 

  a.  The basic LCS has a limitation: it does not differentiate between LCSes that have different spatial relations within their embedding sequences (for example, consecutive versus scattered matches). ROUGE-W addresses this by remembering the length of the consecutive matches encountered so far in the regular two-dimensional dynamic programming table used to compute the LCS.

4. ROUGE-S: Skip-Bigram Co-Occurrence Statistics 

  a. Skip-bigram co-occurrence statistics measure the overlap of skip-bigrams between a candidate translation and a set of reference translations.

5. ROUGE-SU: Extension of ROUGE-S 

  a. One potential problem for ROUGE-S is that it does not give any credit to a candidate sentence if the sentence does not have any word pair co-occurring with its references. To address this, ROUGE-S has been extended with the addition of unigrams as a counting unit, giving ROUGE-SU.


Source:  https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/was2004.pdf 
    


In [None]:
!pip install rouge-metric



In [None]:
from rouge_metric import PyRouge
# One scorer configured for ROUGE-1/2/4, ROUGE-L, ROUGE-W (weight 1.2) and
# ROUGE-S / ROUGE-SU with a skip gap of 4.
rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)

In [None]:
# evaluate_tokenized expects
#   hypotheses: documents -> sentences -> tokens
#   references: documents -> references -> sentences -> tokens
# `candidate` is one tokenized sentence and `reference` is a list of tokenized
# reference sentences, so each needs extra nesting. With one level missing the
# library iterates over the characters of every word and reports
# character-level ROUGE instead of word-level ROUGE.
scores = rouge.evaluate_tokenized(
    [[candidate]],
    [[[ref_tokens] for ref_tokens in reference]],
)

In [None]:
# Pretty-print the nested dict of f / p / r values, one entry per ROUGE variant.
pp.pprint(scores)

{   'rouge-1': {   'f': 0.8219178082191781,
                   'p': 0.8108108108108109,
                   'r': 0.8333333333333334},
    'rouge-2': {   'f': 0.7042253521126761,
                   'p': 0.6944444444444444,
                   'r': 0.7142857142857143},
    'rouge-4': {   'f': 0.5671641791044776,
                   'p': 0.5588235294117647,
                   'r': 0.5757575757575758},
    'rouge-l': {   'f': 0.8219178082191781,
                   'p': 0.8108108108108109,
                   'r': 0.8333333333333334},
    'rouge-s4': {   'f': 0.6626865671641792,
                    'p': 0.6529411764705882,
                    'r': 0.6727272727272727},
    'rouge-su4': {'f': 0.6896551724137931, 'p': 0.6796116504854369, 'r': 0.7},
    'rouge-w-1.2': {   'f': 0.49172167622776264,
                       'p': 0.5640708504439144,
                       'r': 0.4358220253042036}}


# Perplexity

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 42.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
# import transformers
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np

In [None]:
# Checkpoint used for the perplexity demo: it is loaded with a masked-LM head,
# and the matching tokenizer is loaded from the same checkpoint name.
model_name = 'cointegrated/rubert-tiny'
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def score(model, tokenizer, sentence):
    """Return exp(loss) of a masked LM when each inner token is masked in turn.

    The encoded sentence is copied once per inner position (every position
    except the first and the last). In copy ``i`` position ``i + 1`` is
    replaced by the tokenizer's mask id, and the labels keep the original id
    only at that masked position (-100 everywhere else).

    Args:
        model: Callable invoked as ``model(input_ids, labels=labels)`` whose
            result exposes a scalar ``.loss`` tensor.
        tokenizer: Object providing ``encode(sentence, return_tensors='pt')``
            and a ``mask_token_id`` attribute.
        sentence: Text to score.

    Returns:
        ``np.exp`` of the loss reported by the model for the masked batch.
    """
    token_ids = tokenizer.encode(sentence, return_tensors='pt')
    seq_len = token_ids.size(-1)

    # One row per inner token (first and last positions are never masked).
    batch = token_ids.repeat(seq_len - 2, 1)

    # Ones on the first super-diagonal: row i selects column i + 1. Dropping
    # the last two rows leaves exactly the rows that select positions
    # 1 .. seq_len - 2.
    select = torch.ones(seq_len - 1).diag(1)[:-2]

    mask_id = tokenizer.mask_token_id
    masked_batch = batch.masked_fill(select == 1, mask_id)
    # -100 marks every position that is not the masked one in its row.
    targets = batch.masked_fill(masked_batch != mask_id, -100)

    with torch.inference_mode():
        output = model(masked_batch, labels=targets)
    return np.exp(output.loss.item())


In [None]:
# Re-join the candidate tokens into one string and print its score.
print(score(sentence=" ".join(candidate), model=model, tokenizer=tokenizer)) 


207.85844993512197


In [None]:
# Same computation for the (single) reference sentence, for comparison.
print(score(sentence=" ".join(reference[0]), model=model, tokenizer=tokenizer)) 


196.93592044504416


# Combined Function

In [None]:
!pip install rouge-metric

Collecting rouge-metric
  Downloading rouge_metric-1.0.1-py3-none-any.whl (151 kB)
[?25l[K     |██▏                             | 10 kB 20.8 MB/s eta 0:00:01[K     |████▎                           | 20 kB 15.6 MB/s eta 0:00:01[K     |██████▌                         | 30 kB 6.5 MB/s eta 0:00:01[K     |████████▋                       | 40 kB 5.8 MB/s eta 0:00:01[K     |██████████▉                     | 51 kB 4.6 MB/s eta 0:00:01[K     |█████████████                   | 61 kB 5.4 MB/s eta 0:00:01[K     |███████████████▏                | 71 kB 5.7 MB/s eta 0:00:01[K     |█████████████████▎              | 81 kB 5.1 MB/s eta 0:00:01[K     |███████████████████▍            | 92 kB 5.7 MB/s eta 0:00:01[K     |█████████████████████▋          | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████▊        | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████████████████████      | 122 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████████    | 133 kB 5.3 MB/s eta 0:

In [None]:
import torch
import numpy as np
import nltk

from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from rouge_metric import PyRouge
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Fetch the 'punkt' tokenizer data; presumably needed by word_tokenize -- confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def get_bleu_score(sent1, sent2):
  """Compute word-level BLEU-1 and BLEU-4 of ``sent2`` against ``sent1``.

  Both sentences are tokenized with ``word_tokenize``. ``sent1`` is used as
  the single reference and ``sent2`` as the hypothesis, which keeps the
  argument order of the original ``sentence_bleu(sent1, sent2, ...)`` calls.

  Args:
    sent1: Reference sentence as a plain string.
    sent2: Hypothesis sentence as a plain string.

  Returns:
    A ``(bleu_1, bleu_4)`` tuple of floats.
  """
  # sentence_bleu takes a *list* of tokenized references and one tokenized
  # hypothesis. Passing the raw strings made it iterate over characters, so
  # the scores were character-level rather than word-level.
  references = [word_tokenize(sent1)]
  hypothesis = word_tokenize(sent2)

  bleu_1 = sentence_bleu(references, hypothesis, weights=(1, 0, 0, 0))
  bleu_4 = sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25))

  return bleu_1, bleu_4

In [None]:
def get_rouge_scoe(sent1, sent2):
  """Compute word-level ROUGE-1 and ROUGE-4 F-scores for a sentence pair.

  Both sentences are tokenized with ``word_tokenize``. ``sent1`` is passed as
  the hypothesis and ``sent2`` as its single reference, which keeps the
  argument order of the original ``evaluate_tokenized`` call.

  Args:
    sent1: Hypothesis sentence as a plain string.
    sent2: Reference sentence as a plain string.

  Returns:
    A ``(rouge_1_f, rouge_4_f)`` tuple taken from the ``'f'`` entries of the
    ``'rouge-1'`` and ``'rouge-4'`` results.
  """
  hypothesis_tokens = word_tokenize(sent1)
  reference_tokens = word_tokenize(sent2)

  rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)

  # evaluate_tokenized expects
  #   hypotheses: documents -> sentences -> tokens
  #   references: documents -> references -> sentences -> tokens
  # Each input here is one document made of one sentence (with one reference).
  # With the nesting missing, the library iterated over characters instead of
  # word tokens.
  scores = rouge.evaluate_tokenized([[hypothesis_tokens]], [[[reference_tokens]]])

  return scores['rouge-1']['f'], scores['rouge-4']['f']

In [None]:
# Module-level masked-LM checkpoint and tokenizer used by get_preplexity_score;
# loaded once so repeated metric calls reuse the same objects.
preplexity_model_name = 'cointegrated/rubert-tiny'
preplexity_model = AutoModelForMaskedLM.from_pretrained(preplexity_model_name)
preplexity_tokenizer = AutoTokenizer.from_pretrained(preplexity_model_name)

def preplexity_score(model, tokenizer, sentence):
    """Score ``sentence`` as exp(loss) of a masked LM, masking one token per row.

    A batch is built with one copy of the encoded sentence per inner position
    (all positions except the first and last). Each copy has a different
    inner position replaced by ``tokenizer.mask_token_id``; the labels are
    -100 everywhere except at the masked position, where they hold the
    original token id.

    Args:
        model: Callable invoked as ``model(input_ids, labels=labels)``; its
            result must expose a scalar ``.loss`` tensor.
        tokenizer: Provides ``encode(sentence, return_tensors='pt')`` and
            ``mask_token_id``.
        sentence: Text to score.

    Returns:
        ``np.exp`` of the loss the model reports for the masked batch.
    """
    encoded = tokenizer.encode(sentence, return_tensors='pt')
    width = encoded.size(-1)
    rows = encoded.repeat(width - 2, 1)

    # Super-diagonal of ones, trimmed by two rows: row i marks column i + 1,
    # so only the inner positions 1 .. width - 2 are ever marked.
    marked = torch.ones(width - 1).diag(1)[:-2] == 1

    masked_rows = rows.masked_fill(marked, tokenizer.mask_token_id)
    keep_label = masked_rows == tokenizer.mask_token_id
    label_rows = rows.masked_fill(~keep_label, -100)

    with torch.inference_mode():
        loss = model(masked_rows, labels=label_rows).loss
    return np.exp(loss.item())

def get_preplexity_score(sent1, sent2):
  """Score two sentences with the module-level perplexity model.

  Each sentence is scored independently by ``preplexity_score`` using
  ``preplexity_model`` and ``preplexity_tokenizer``; ``sent1`` is scored first.

  Returns:
    A 2-tuple ``(score_for_sent1, score_for_sent2)``.
  """
  return tuple(
      preplexity_score(sentence=text, model=preplexity_model, tokenizer=preplexity_tokenizer)
      for text in (sent1, sent2)
  )

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def get_evaluation_metrics(sent1, sent2):
  """Collect the BLEU, ROUGE and perplexity scores for a sentence pair.

  Returns:
    A dict with, in insertion order:
      'bleu1', 'bleu4'               -- from get_bleu_score(sent1, sent2)
      'rouge1_f1', 'rouge4_f1'       -- from get_rouge_scoe(sent1, sent2)
      'listener_preplexity'          -- perplexity score of ``sent1``
      'gpt2_preplexity'              -- perplexity score of ``sent2``
    Both perplexity values come from get_preplexity_score, i.e. from the same
    module-level model; the key names do not select different models.
  """
  bleu1, bleu4 = get_bleu_score(sent1, sent2)
  rouge1_f1, rouge4_f1 = get_rouge_scoe(sent1, sent2)
  first_ppl, second_ppl = get_preplexity_score(sent1, sent2)

  return {
      'bleu1': bleu1,
      'bleu4': bleu4,
      'rouge1_f1': rouge1_f1,
      'rouge4_f1': rouge4_f1,
      'listener_preplexity': first_ppl,
      'gpt2_preplexity': second_ppl,
  }


In [None]:
# Try the combined metrics on a pair of short example sentences.
m = get_evaluation_metrics('That is bad. I hope you found a solution with him.', "I'm sorry to hear that.")

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [None]:
# Display the resulting metrics dict.
m

{'bleu1': 0.4782608695652174,
 'bleu4': 0.8316033157750904,
 'gpt2_preplexity': 2.627046125519615,
 'listener_preplexity': 11.621917793829137,
 'rouge1_f1': 0.1003344481605351,
 'rouge4_f1': 0.0}