In [2]:
# codes to mount your google drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/Othercomputers/My Computer (1)/CS605_NLP_for_Smart_Assistants/Project/NLP-Lyric-Generator/src/bin

## How to use bleu_rouge class to compute bleu and rouge scores

In [1]:
import sys
import os
import re
import numpy as np

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TeYang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils
from lib.bleu_rouge import bleu_rouge 

In [3]:
PATH = '../../data'

In [4]:
train_corpus, val_corpus, train_files, val_files = utils.split_corpus()

In [5]:
val_clean_corpus = []
for i,file in enumerate(val_files):
    text = open(PATH + '/' + file, mode='r').read()
    val_clean_corpus.append(text)

for i,song in enumerate(val_clean_corpus):
    song = re.sub(r'<[A-Z]+>|', '', song)
    # val_clean_corpus = re.sub(r'\n', '<break>')
    song = [sent for sent in song.split('\n') if sent]
    song = [utils.preprocess_text(sent) for sent in song]
    song.append('')
    val_clean_corpus[i] = song

val_clean_corpus = [sent for song in val_clean_corpus for sent in song ]

In [6]:
# NOTE THAT SONGS/DOCUMENTS SHOULD BE SEPARATED BY AN EMPTY STRING
val_clean_corpus[20:40]

['in the course we are set upon',
 'work together with a will',
 'and we will do what must be done',
 'put your heart your mind your skill to our defence',
 'put your heart your mind your skill to our defence',
 'put your art your work and will',
 'to the defence of singapore',
 'put your heart your mind your skill to our defence',
 '',
 'we are voices from the heart',
 'singing loud we will do our part',
 'working for ourselves and for the family',
 'do our best whenever we can',
 'we will help our fellow man',
 'we are happy people living hand in hand',
 'life is simple',
 'life is free',
 'life is joy and harmony',
 'we will strive each day',
 'be the best that we can be']

In [7]:
br = bleu_rouge()

In [8]:
prompt_ref = br.get_prompt_reference(val_clean_corpus)
prompt_ref

{'deep inside your heart where it belongs': 'it will always stay strive for your goals',
 'and we will do what must be done': 'put your heart your mind your skill to our defence',
 'in the course we are set upon': 'work together with a will',
 'will you live each moment': 'will you dare to find new ways',
 'if you believe that every vision begins with you': 'shine for singapore',
 'reach out for the moon above': 'savour freedom truth and love',
 'you and me we will work together': 'hand in hand in joy and harmony',
 'yet white and pure and free': 'reach out for the flag above',
 'we will conquer the skies': 'we know we will try',
 'working for ourselves and for the family': 'do our best whenever we can'}

In [9]:
br.compute_bleu('and we will do what must be done', 'put your heart your mind your skill to our defence')

Reference is [['put', 'your', 'heart', 'your', 'mind', 'your', 'skill', 'to', 'our', 'defence']]
Generated text is ['put', 'your', 'heart', 'your', 'mind', 'your', 'skill', 'to', 'our', 'defence']


{'BLEU-1': 1.0, 'BLEU-2': 1.0, 'BLEU-3': 1.0, 'BLEU-4': 1.0, 'Avg': 1.0}

In [12]:
# the average is taken as average of bleu 1 to 3 as 4-grams have no overlap
bleu_scores = br.compute_bleu('and we will do what must be done', 'put heart mind your to our defence')
bleu_scores

Reference is [['put', 'your', 'heart', 'your', 'mind', 'your', 'skill', 'to', 'our', 'defence']]
Generated text is ['put', 'heart', 'mind', 'your', 'to', 'our', 'defence']


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'BLEU-1': 0.6514390575310556,
 'BLEU-2': 0.4606369751099829,
 'BLEU-3': 0.3047009396142875,
 'BLEU-4': 4.474143418971061e-78,
 'Avg': 0.4722589907517753}

## Below codes are for testing (can ignore)

In [9]:
import random
from nltk.translate.bleu_score import sentence_bleu

class bleu_rouge:
    """
    A class for comparing bleu and rouge scores of generated text with referenced text.
    It also generates samples of prompt and their next sentence as reference
    from a corpus.
    ----------------------------------------------------------------------
    """    

    def get_prompt_reference(self, corpus, num_ref=10, seed=2022):
        """Get samples of prompt and their next sentence as reference

        Args:
        corpus (list): corpus consisting of list of sentences to sample the prompts from
        num_ref (int): number of prompts and refs to sample (defaults to 10)
        seed (int): seed for replicability
        
        Corpus should be a list of sentences. To separate a document from another, 
        insert an empty string between them.

        Returns:
        dict containing sampled prompts and their reference (next line)
        """

        random.seed(seed)
        refs = set()

        while len(refs) < num_ref:
            idx = random.randint(0,len(corpus)-2) # minus 2 as last sent has no next sent
            sample = corpus[idx]
            next_sent = corpus[idx+1]
            if sample and next_sent: # check sample and next line is not empty
                refs.add((sample, next_sent))

        prompt_ref = {}
        for k,v in refs:
            prompt_ref[k] = v
        
        self.num_ref = num_ref
        self.prompt_ref = prompt_ref

        return prompt_ref


    def compute_bleu(self, prompt, generated_text, verbose=True):
        """Computes the cumulative n-gram bleu score up to 4-gram. The average is also
        returned.

        Args:
        prompt (str): prompt used to generate text. this should be from the sampled prompts
        generated_text (str): the generated text using the prompt
        verbose (bool): prints out the tokenized references and generated text 
        
        Returns:
        dict containing scores for BLEU-1 to BLEU-4 and the average of them
        """

        ref = self.prompt_ref.get(prompt)
        if not ref:
            error_text = """PROMPT does NOT exist in sampled prompts.
            Run get_prompt_reference() to get prompt samples and check
            bleu_rogue.prompt_ref for the set of prompts and references"""
            raise AttributeError(error_text)

        ref = [ref.split(' ')] # ref is list of tokens in list of ref
        generated_text = generated_text.split(' ')
        if verbose:
            print('Reference is {}'.format(ref))
            print('Generated text is {}'.format(generated_text))

        bleu1 = sentence_bleu(ref, generated_text, weights=(1, 0, 0, 0))
        bleu2 = sentence_bleu(ref, generated_text, weights=(0.5, 0.5, 0, 0))
        bleu3 = sentence_bleu(ref, generated_text, weights=(0.33, 0.33, 0.33, 0))
        bleu4 = sentence_bleu(ref, generated_text, weights=(0.25, 0.25, 0.25, 0.25))
        avg_bleu = sum([bleu1, bleu2, bleu3, bleu4]) / 4

        self.bleu_scores = {'BLEU-1':bleu1, 'BLEU-2':bleu2, 'BLEU-3':bleu3, 'BLEU-4':bleu4, 'Avg':avg_bleu}

        return {'BLEU-1':bleu1, 'BLEU-2':bleu2, 'BLEU-3':bleu3, 'BLEU-4':bleu4, 'Avg':avg_bleu}

    

In [11]:
br = bleu_rouge()
prompt_ref = br.get_prompt_reference(val_clean_corpus)
br.prompt_ref


{'will you write us grand new stories': 'songs that everyone will feel',
 'shine for singapore': 'this is our song',
 'step by step together we will build our dreams': 'heart to heart together we will stay as one nation undivided',
 'amazing in all ways': 'surprises every corner',
 'working for ourselves and for the family': 'do our best whenever we can',
 'stand together heart to heart': 'we are going to show the world what singapore can be',
 'we must all do what we can': 'together hand in hand',
 'you will achieve with visions so bold': 'shine for singapore',
 'back to back together we will brave the heat the cold the storms': 'hand in hand together we will grow this land that we call home',
 'we are going to build a better life for you and me': 'we can achieve we can achieve'}

In [12]:
br.compute_bleu(
    prompt='have you heard a song',
    generated_text='one that moved you one that made you sing along',
    verbose=True,
)

AttributeError: PROMPT does NOT exist in sampled prompts.
            Run get_prompt_reference() to get prompt samples and check
            bleu_rogue.prompt_ref for the set of prompts and references

In [14]:

def get_references(val_corpus, num_ref=10, seed=2022):
    random.seed(seed)
    refs = set()

    while len(refs) < num_ref:
        idx = random.randint(0,len(val_corpus)-2) # minus 2 as last sent has no next sent
        sample = val_corpus[idx]
        next_sent = val_corpus[idx+1]
        if sample and next_sent: # check sample and next line is not empty
            refs.add((sample, next_sent))

    prompt_ref = {}
    for k,v in refs:
        prompt_ref[k] = v
    
    return prompt_ref

prompt_ref = get_references(val_clean_corpus)
prompt_ref

{'will you write us grand new stories': 'songs that everyone will feel',
 'shine for singapore': 'this is our song',
 'step by step together we will build our dreams': 'heart to heart together we will stay as one nation undivided',
 'amazing in all ways': 'surprises every corner',
 'working for ourselves and for the family': 'do our best whenever we can',
 'stand together heart to heart': 'we are going to show the world what singapore can be',
 'we must all do what we can': 'together hand in hand',
 'you will achieve with visions so bold': 'shine for singapore',
 'back to back together we will brave the heat the cold the storms': 'hand in hand together we will grow this land that we call home',
 'we are going to build a better life for you and me': 'we can achieve we can achieve'}

In [124]:
def compute_bleu(prompt, generated_text, verbose=True):
    ref = prompt_ref[prompt]
    ref = [ref.split(' ')] # ref is list of tokens in list of ref
    generated_text = generated_text.split(' ')
    if verbose:
        print('Reference is {}'.format(ref))
        print('Generated text is {}'.format(generated_text))

    bleu1 = sentence_bleu(ref, generated_text, weights=(1, 0, 0, 0))
    bleu2 = sentence_bleu(ref, generated_text, weights=(0.5, 0.5, 0, 0))
    bleu3 = sentence_bleu(ref, generated_text, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = sentence_bleu(ref, generated_text, weights=(0.25, 0.25, 0.25, 0.25))
    avg_bleu = sum([bleu1, bleu2, bleu3, bleu4]) / 4

    return {'BLEU-1':bleu1, 'BLEU-2':bleu2, 'BLEU-3':bleu3, 'BLEU-4':bleu4, 'Avg':avg_bleu}

compute_bleu('shine for singapore', 'this is our song')

Reference is [['this', 'is', 'our', 'song']]
Generated text is ['this', 'is', 'our', 'song']


{'BLEU-1': 1.0, 'BLEU-2': 1.0, 'BLEU-3': 1.0, 'BLEU-4': 1.0, 'Avg': 1.0}

In [125]:
compute_bleu('step by step together we will build our dreams', 'heart to heart will stay as one nation undivided')

Reference is [['heart', 'to', 'heart', 'together', 'we', 'will', 'stay', 'as', 'one', 'nation', 'undivided']]
Generated text is ['heart', 'to', 'heart', 'will', 'stay', 'as', 'one', 'nation', 'undivided']


{'BLEU-1': 0.800737402916808,
 'BLEU-2': 0.749021254622464,
 'BLEU-3': 0.6856942708759817,
 'BLEU-4': 0.598690849764947,
 'Avg': 0.7085359445450501}