In [7]:
# !pip install tqdm 
# from tdqm import tqdm
import matplotlib
import os
from tqdm import tqdm
import json
from json import JSONDecodeError
import pickle
import numpy as np
from nltk.translate import bleu_score
from matplotlib import pyplot as plt

Note: most relevant stuff can be found in `test captions.ipynb`

In [8]:
from utils import load_vocab, decode_caption, load_caption, rrv_votes, load_annotations, print_image

In [9]:
ngram_similarities = ["unigram_overlap", "unigram_multiplicity", "bigram_overlap", "bigram_precision"]
other_similarities = ["lstm_states"]

In [10]:
vocab = load_vocab(dict_file = "../../outputs/vocab/5000/coco2014_vocab.json")
image_id_to_index, index_to_image_id, annotations_dict = load_annotations(annotations_dir="../../annotations/", 
                                                                          annotations_file='captions_val2014.json',
                                                                         map_file = "../../outputs/val_image_id_to_idx.csv")
print("Processed {} images".format(len(image_id_to_index)))
print("Processed {} images".format(len(annotations_dict.keys())))

idx_to_word
word_to_idx
Loaded dictionary...
Dictionary size: 5004
Error proccessing image_id: image_index
Skipping file person_keypoints_train2014.json
Skipping file instances_train2014.json
Skipping file instances_val2014.json
Skipping file person_keypoints_val2014.json
Processed 40504 images
Processed 40504 images


## Generate and save voted captions

In [11]:
def load_best_beam_captions(beam_size):
#     beam_captions = {}
#     beam_captions_dir = "../../outputs/beam_captions_{}/".format(beam_size)
#     for image_id in sorted(annotations_dict):
#         caption_object = load_caption(image_id, image_dir=beam_captions_dir)
#         best_beam_caption = caption_object['captions'][0]['sentence']
#         beam_captions[image_id] = best_beam_caption
#     return beam_captions
    file_name = '../../outputs/voted_captions/{}/best_beam.pickle'.format(beam_size)
#     with open(file_name, 'wb') as file:
#         pickle.dump(beam_captions, file, pickle.HIGHEST_PROTOCOL)
    with open(file_name, 'rb') as file:
        best_beams = pickle.load(file)
    return [decode_caption(best_beams[im_id], vocab) for im_id in sorted(best_beams)]

In [12]:
def save_beam_baseline_captions(beam_size):
    beam_baseline_captions = []
    beam_captions_dir = "../../outputs/beam_captions_{}/".format(beam_size)
    for image_id in sorted(annotations_dict):
        caption_object = load_caption(image_id, image_dir=beam_captions_dir)
        probabilities = caption_object['probabilities']
        captions = [caption_object['captions'][i]['sentence'] for i in range(len(caption_object['captions']))]
        log_probs = np.log(probabilities)
        log_probs_and_captions = list(zip(log_probs, captions))
        normalilzed_log_probs_and_captions = [(p / len(c), c) for p,c in log_probs_and_captions]
        baseline_caption = sorted(normalilzed_log_probs_and_captions, reverse=True)[0][1]
        beam_baseline_captions.append(baseline_caption)
    print("saving {} baseline captions for k={}".format(len(beam_baseline_captions), beam_size))
    file_name = '../../outputs/voted_captions/{}/beam_baseline.pickle'.format(beam_size)
    with open(file_name, 'wb') as file:
        best_beams = pickle.dump(beam_baseline_captions, file, pickle.HIGHEST_PROTOCOL)
        
def load_beam_baseline_captions(beam_size):
    file_name = '../../outputs/voted_captions/{}/beam_baseline.pickle'.format(beam_size)
    with open(file_name, 'rb') as file:
        return pickle.load(file)
    
def extract_baseline_sentences(captions):
    return [decode_caption(c, vocab) for c in captions]

In [13]:
def generate_vote_captions(beam_size, similarity):
    """Compute a dictionary of captions, generated using range voting."""
    vote_captions = {}
    beam_captions_dir = "../../outputs/beam_captions_{}/".format(beam_size)
    images = os.listdir(beam_captions_dir)
    print("Number of images with beam captions found: {}".format(len(images)))

    for i, image in enumerate(tqdm(images)):
        image_id = int(image.split('.')[0])
        try:
            caption_object = load_caption(image_id, image_dir=beam_captions_dir)
            voted_caption_object = rrv_votes(caption_object, num_winners=1, similarity=similarity)
            vote_captions[image_id] = voted_caption_object
        except JSONDecodeError:
            print("Error on ", image_id)
    return vote_captions

In [14]:
def save_vote_captions(captions, beam_size, similarity):
    file_name = '../../outputs/voted_captions/{}/{}.pickle'.format(beam_size, similarity)
    if os.path.isfile(file_name): 
        raise ValueError("File {} already exists".format(file_name))
    with open(file_name, 'wb') as file:
        pickle.dump(captions, file, pickle.HIGHEST_PROTOCOL)

def load_voted_captions(beam_size, similarity):
    file_name = '../../outputs/voted_captions/{}/{}.pickle'.format(beam_size, similarity)
    with open(file_name, 'rb') as file:
        return pickle.load(file)

def extract_caption_tokens(voted_captions):
    caption_tokens = []
    for image_id in sorted(annotations_dict):
        caption_tokens.append(voted_captions[image_id][0][0])
    return caption_tokens

def extract_caption_sentences(voted_captions):
    caption_sentences = []
    for image_id in sorted(voted_captions):
        caption_sentences.append(decode_caption(voted_captions[image_id][0][0], vocab))
    return caption_sentences

In [15]:
# beam_size = 100
# for similarity in ngram_similarities:
#     voted_captions = generate_vote_captions(beam_size, similarity)
#     save_vote_captions(voted_captions, beam_size, similarity)

## Bleu score

In [16]:
annotations_list = [annotations_dict[image_id] for image_id in sorted(annotations_dict)]

In [17]:
beam_baseline_captions = load_voted_captions(10, 'unigram_overlap')
print(beam_baseline_captions[0])

([array([  1,   4,  13,  47,   4,  92,  36,   7,  70,   6,   4, 363,   2])], [0.008501648821525572], [0.02093808164473165])


In [18]:
for k in [1, 2, 10, 100]:
    beam_captions = load_best_beam_captions(k)
    bleu = bleu_score.corpus_bleu(annotations_list, beam_captions)
    print("Bleu score for {}/beam: {}".format(k, bleu))

Bleu score for 1/beam: 0.2538667841440404
Bleu score for 2/beam: 0.2682792451263592
Bleu score for 10/beam: 0.2716384032063896
Bleu score for 100/beam: 0.2630932525817317


In [19]:
for k in [2, 10, 100]:
    beam_baseline_captions = extract_baseline_sentences(load_beam_baseline_captions(k))
    bleu = bleu_score.corpus_bleu(annotations_list, beam_baseline_captions)
    print("Bleu score for {}/baseline: {}".format(k, bleu))

Bleu score for 2/baseline: 0.26719194279794856
Bleu score for 10/baseline: 0.2575805705250495
Bleu score for 100/baseline: 0.24728679506549345


In [20]:
for k in [2, 10, 100]:
    for similarity in ngram_similarities:
#     for similarity in ["unigram_overlap"]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        bleu = bleu_score.corpus_bleu(annotations_list, caption_sentences)
        print("Bleu score for {}/{}: {}".format(k, similarity, bleu))


Bleu score for 2/unigram_overlap: 0.26471592485004636
Bleu score for 2/unigram_multiplicity: 0.26465035752639743
Bleu score for 2/bigram_overlap: 0.26819492239283405
Bleu score for 2/bigram_precision: 0.2682133866459531
Bleu score for 10/unigram_overlap: 0.256075134051543
Bleu score for 10/unigram_multiplicity: 0.25617128598109506
Bleu score for 10/bigram_overlap: 0.2722419757177174
Bleu score for 10/bigram_precision: 0.27225372664595027
Bleu score for 100/unigram_overlap: 0.24599352328777574
Bleu score for 100/unigram_multiplicity: 0.2458465783961177
Bleu score for 100/bigram_overlap: 0.27129324857796216


KeyboardInterrupt: 

In [None]:
for k in [2, 10, 100]:
    for similarity in other_similarities:
#     for similarity in ["unigram_overlap"]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        bleu4 = bleu_score.corpus_bleu(annotations_list, caption_sentences)
        bleu1 = bleu_score.corpus_bleu(annotations_list, caption_sentences, weights=[1.])
        print("Bleu score for {}/{}: {} and {}".format(k, similarity, bleu1, bleu4))

In [None]:
for similarity in ngram_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        captions_tokens = extract_caption_tokens(voted_captions)
        file_name = "../../outputs/captions_tokens/{}/{}.pickle".format(k, similarity)
        with open(file_name, "wb") as file:
            pickle.dump(captions_tokens, file, pickle.HIGHEST_PROTOCOL)

In [None]:
for similarity in other_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        captions_tokens = extract_caption_tokens(voted_captions)
        file_name = "../../outputs/captions_tokens/{}/{}.pickle".format(k, similarity)
        with open(file_name, "wb") as file:
            pickle.dump(captions_tokens, file, pickle.HIGHEST_PROTOCOL)

## Caption lengths

In [21]:
def caption_length(captions):
    return np.mean([len(c) for c in captions])

In [22]:
for k in [1, 2, 10, 100]:
    beam_captions = load_best_beam_captions(k)
    print("Caption length for {}/beam: {}".format(k, caption_length(beam_captions)))

Caption length for 1/beam: 8.406478372506418
Caption length for 2/beam: 8.788169069721508
Caption length for 10/beam: 9.183562117321745
Caption length for 100/beam: 9.109668180920403


In [23]:
for k in [2, 10, 100]:
    beam_baseline_captions = extract_baseline_sentences(load_beam_baseline_captions(k))
    print("Caption length for {}/baseline: {}".format(k, caption_length(beam_baseline_captions)))

Caption length for 2/baseline: 9.188845546118902
Caption length for 10/baseline: 10.238741852656528
Caption length for 100/baseline: 10.432846138652973


In [24]:
for similarity in ngram_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        print("Caption length for {}/{}: {}".format(k, similarity, caption_length(caption_sentences)))

Caption length for 2/unigram_overlap: 9.215929290934229
Caption length for 10/unigram_overlap: 10.397121271973138
Caption length for 100/unigram_overlap: 11.194449930871025
Caption length for 2/unigram_multiplicity: 9.20684376851669
Caption length for 10/unigram_multiplicity: 10.375913490025676
Caption length for 100/unigram_multiplicity: 11.149244519059845
Caption length for 2/bigram_overlap: 8.961855619198104
Caption length for 10/bigram_overlap: 9.862433339917045
Caption length for 100/bigram_overlap: 10.54631641319376
Caption length for 2/bigram_precision: 8.961682796760813
Caption length for 10/bigram_precision: 9.860112581473434
Caption length for 100/bigram_precision: 10.545872012640727


In [25]:
for similarity in other_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        print("Caption length for {}/{}: {}".format(k, similarity, caption_length(caption_sentences)))

Caption length for 2/lstm_states: 8.788169069721508
Caption length for 10/lstm_states: 9.167366186055698
Caption length for 100/lstm_states: 8.815870037527159


## Unique captions

In [26]:
def num_unique(captions):
    caption_strings = [" ".join(c) for c in captions]
    return len(np.unique(caption_strings))

In [27]:
for k in [1, 2, 10, 100]:  
    beam_captions = load_best_beam_captions(k)
    print("Unique captions for beam/{}: {}".format(k, num_unique(beam_captions)))       

Unique captions for beam/1: 9141
Unique captions for beam/2: 9208
Unique captions for beam/10: 5488
Unique captions for beam/100: 4150


In [28]:
for k in [2, 10, 100]:
    beam_baseline_captions = extract_baseline_sentences(load_beam_baseline_captions(k))
    print("Unique captions for {}/baseline: {}".format(k, num_unique(beam_baseline_captions)))

Unique captions for 2/baseline: 9978
Unique captions for 10/baseline: 6418
Unique captions for 100/baseline: 5039


In [29]:
for similarity in ngram_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        print("Unique captions for {}/{}: {}".format(k, similarity, num_unique(caption_sentences)))       


Unique captions for 2/unigram_overlap: 10727
Unique captions for 10/unigram_overlap: 8916
Unique captions for 100/unigram_overlap: 10808
Unique captions for 2/unigram_multiplicity: 10727
Unique captions for 10/unigram_multiplicity: 8902
Unique captions for 100/unigram_multiplicity: 10768
Unique captions for 2/bigram_overlap: 9519
Unique captions for 10/bigram_overlap: 7598
Unique captions for 100/bigram_overlap: 9221
Unique captions for 2/bigram_precision: 9522
Unique captions for 10/bigram_precision: 7590
Unique captions for 100/bigram_precision: 9248


In [30]:
for similarity in other_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        print("Unique captions for {}/{}: {}".format(k, similarity, num_unique(caption_sentences)))       


Unique captions for 2/lstm_states: 9208
Unique captions for 10/lstm_states: 7613
Unique captions for 100/lstm_states: 10133


## Captions used once

In [31]:
def num_used_once(captions):
    caption_strings = [" ".join(c) for c in captions]
    _, num_uses = np.unique(caption_strings, return_counts=True)
    return np.sum(num_uses == 1)

In [32]:
for k in [1, 2, 10, 100]:  
    beam_captions = load_best_beam_captions(k)
    print("Captions used once only forb beam/{}: {}".format(k, num_used_once(beam_captions)))       

Captions used once only forb beam/1: 5421
Captions used once only forb beam/2: 5491
Captions used once only forb beam/10: 2888
Captions used once only forb beam/100: 1955


In [33]:
for k in [2, 10, 100]:
    beam_baseline_captions = extract_baseline_sentences(load_beam_baseline_captions(k))
    print("Caption used once only for {}/baseline: {}".format(k, num_used_once(beam_baseline_captions)))

Caption used once only for 2/baseline: 6096
Caption used once only for 10/baseline: 3584
Caption used once only for 100/baseline: 2621


In [34]:
for similarity in ngram_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        print("Captions used once only for {}/{}: {}".format(k, similarity, num_used_once(caption_sentences)))    

Captions used once only for 2/unigram_overlap: 6662
Captions used once only for 10/unigram_overlap: 5273
Captions used once only for 100/unigram_overlap: 6658
Captions used once only for 2/unigram_multiplicity: 6656
Captions used once only for 10/unigram_multiplicity: 5244
Captions used once only for 100/unigram_multiplicity: 6589
Captions used once only for 2/bigram_overlap: 5731
Captions used once only for 10/bigram_overlap: 4369
Captions used once only for 100/bigram_overlap: 5370
Captions used once only for 2/bigram_precision: 5733
Captions used once only for 10/bigram_precision: 4361
Captions used once only for 100/bigram_precision: 5415


In [35]:
for similarity in other_similarities:
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        print("Captions used once only for {}/{}: {}".format(k, similarity, num_used_once(caption_sentences)))    

Captions used once only for 2/lstm_states: 5491
Captions used once only for 10/lstm_states: 4253
Captions used once only for 100/lstm_states: 6061


## Unigram and bigram distance metrics

In [36]:
def num_unique_unigrams(captions):
    caption_strings = [" ".join(c) for c in captions]
    unique_unigrams = set()
    for caption in caption_strings:
        unigrams = caption.split()
        unique_unigrams |= set(unigrams)
    return len(unique_unigrams)

In [37]:
def num_unique_bigrams(captions):
    caption_strings = [" ".join(c) for c in captions]
    unique_bigrams = set()
    N = 0
    for caption in caption_strings:
        unigrams = caption.split()
        bigrams = list(zip(caption.split()[:-1], caption.split()[1:]))
        N += len(unigrams)
        unique_bigrams |= set(bigrams)
    return len(unique_bigrams)

In [38]:
latex_str = ""
for k in [1, 2, 10, 100]:  
    beam_captions = load_best_beam_captions(k)
    n = num_unique_unigrams(beam_captions)
    print("Number of distinct unigrams for beam/{}: {}".format(k, n))
    latex_str += "{} & ".format(n)
print(latex_str[:-2] + "\\\\")

Number of distinct unigrams for beam/1: 632
Number of distinct unigrams for beam/2: 668
Number of distinct unigrams for beam/10: 621
Number of distinct unigrams for beam/100: 605
632 & 668 & 621 & 605 \\


In [39]:
latex_str = ""
for k in [2, 10, 100]:
    beam_baseline_captions = extract_baseline_sentences(load_beam_baseline_captions(k))
    n = num_unique_unigrams(beam_baseline_captions)
    print("Number of distinct unigrams for baseline/{}: {}".format(k, n)) 
    latex_str += "{} & ".format(n)
print(latex_str[:-2] + "\\\\")

Number of distinct unigrams for baseline/2: 681
Number of distinct unigrams for baseline/10: 627
Number of distinct unigrams for baseline/100: 587
681 & 627 & 587 \\


In [40]:
for similarity in other_similarities:
    latex_str = ""
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        n = num_unique_unigrams(caption_sentences)
        print("Number of distinct unigrams for {}/{}: {}".format(k, similarity, n))    
        latex_str += "{} & ".format(n)
    print(latex_str[:-2] + "\\\\")

Number of distinct unigrams for 2/lstm_states: 668
Number of distinct unigrams for 10/lstm_states: 629
Number of distinct unigrams for 100/lstm_states: 655
668 & 629 & 655 \\


In [41]:
strs = ""
for similarity in ngram_similarities:
    latex_str = ""
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        n = num_unique_unigrams(caption_sentences)
        print("Number of distinct unigrams for {}/{}: {}".format(k, similarity, n))    
        latex_str += "{} & ".format(n)
    strs += latex_str
    strs += "\n"
print(strs)

Number of distinct unigrams for 2/unigram_overlap: 687
Number of distinct unigrams for 10/unigram_overlap: 646
Number of distinct unigrams for 100/unigram_overlap: 628
Number of distinct unigrams for 2/unigram_multiplicity: 687
Number of distinct unigrams for 10/unigram_multiplicity: 645
Number of distinct unigrams for 100/unigram_multiplicity: 638
Number of distinct unigrams for 2/bigram_overlap: 673
Number of distinct unigrams for 10/bigram_overlap: 620
Number of distinct unigrams for 100/bigram_overlap: 580
Number of distinct unigrams for 2/bigram_precision: 673
Number of distinct unigrams for 10/bigram_precision: 620
Number of distinct unigrams for 100/bigram_precision: 581
687 & 646 & 628 & 
687 & 645 & 638 & 
673 & 620 & 580 & 
673 & 620 & 581 & 



In [42]:
latex_str = ""
for k in [1, 2, 10, 100]:  
    beam_captions = load_best_beam_captions(k)
    n = num_unique_bigrams(beam_captions)
    print("Number of distinct bigrams for beam/{}: {}".format(k, n))  
    latex_str += "{} & ".format(n)
print(latex_str[:-2] + "\\\\")

Number of distinct bigrams for beam/1: 3067
Number of distinct bigrams for beam/2: 3395
Number of distinct bigrams for beam/10: 2778
Number of distinct bigrams for beam/100: 2479
3067 & 3395 & 2778 & 2479 \\


In [43]:
latex_str = ""
for k in [2, 10, 100]:
    beam_baseline_captions = extract_baseline_sentences(load_beam_baseline_captions(k))
    n = num_unique_bigrams(beam_baseline_captions)
    print("Number of distinct bigrams for baseline/{}: {}".format(k, n))
    latex_str += "{} & ".format(n)
print(latex_str[:-2] + "\\\\")

Number of distinct bigrams for baseline/2: 3502
Number of distinct bigrams for baseline/10: 2863
Number of distinct bigrams for baseline/100: 2471
3502 & 2863 & 2471 \\


In [45]:
for similarity in ngram_similarities:
    latex_str = ""
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        n = num_unique_bigrams(caption_sentences)
        print("Number of distinct bigrams for {}/{}: {}".format(k, similarity, n))    
        latex_str += "{} & ".format(n)
    print(latex_str[:-2] + "\\\\")

Number of distinct bigrams for 2/unigram_overlap: 3576
Number of distinct bigrams for 10/unigram_overlap: 3232
Number of distinct bigrams for 100/unigram_overlap: 3596
3576 & 3232 & 3596 \\
Number of distinct bigrams for 2/unigram_multiplicity: 3572
Number of distinct bigrams for 10/unigram_multiplicity: 3238
Number of distinct bigrams for 100/unigram_multiplicity: 3607
3572 & 3238 & 3607 \\
Number of distinct bigrams for 2/bigram_overlap: 3446
Number of distinct bigrams for 10/bigram_overlap: 2854
Number of distinct bigrams for 100/bigram_overlap: 2887
3446 & 2854 & 2887 \\
Number of distinct bigrams for 2/bigram_precision: 3444
Number of distinct bigrams for 10/bigram_precision: 2848
Number of distinct bigrams for 100/bigram_precision: 2892
3444 & 2848 & 2892 \\


In [46]:
for similarity in other_similarities:
    latex_str = ""
    for k in [2, 10, 100]:
        voted_captions = load_voted_captions(k, similarity)
        caption_sentences = extract_caption_sentences(voted_captions)
        n = num_unique_bigrams(caption_sentences)
        print("Number of distinct bigrams for {}/{}: {}".format(k, similarity, n))    
        latex_str += "{} & ".format(n)
    print(latex_str[:-2] + "\\\\")

Number of distinct bigrams for 2/lstm_states: 3395
Number of distinct bigrams for 10/lstm_states: 2891
Number of distinct bigrams for 100/lstm_states: 3331
3395 & 2891 & 3331 \\


## Statistical significance of BLEU

In [226]:
def evaluate_on_sample(gold, sys1, sys1_precisions, sys2, sys2_precisions, num_samples):
    ids = list(range(len(gold)))
    random_ids = np.random.choice(ids, num_samples, replace=True)
    
    sys1_score = my_bleu(
            sys1_precisions, 
            gold, 
            sys1, 
            random_ids)
    
    sys2_score = my_bleu(
            sys2_precisions,
            gold,
            sys2,
            random_ids
    )
    return sys1_score, sys2_score


In [52]:
lstm_100_captions = extract_caption_sentences(load_voted_captions(100, similarity))
beam_10_captions = load_best_beam_captions(10)

In [236]:
sys1_scores = []
sys2_scores = []
wins = [0, 0, 0]

num_samples = 40504

In [None]:
lstm_mod_precisions = get_modified_precisions(annotations_list, lstm_100_captions)
beam_mod_precisions = get_modified_precisions(annotations_list, beam_10_captions)

In [237]:
%%time
num_iters = 1000

for i in range(num_iters):
    print(i, end=" ")
    s1, s2 = evaluate_on_sample(annotations_list, 
                                lstm_100_captions, 
                                lstm_mod_precisions,
                                beam_10_captions, 
                                beam_mod_precisions,
                                num_samples=num_samples)
    if s1 > s2:
        wins[0] += 1
    elif s2 > s1:
        wins[1] += 1
    else:
        wins[2] += 1
    sys1_scores.append(s1)
    sys2_scores.append(s2)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 27

In [241]:
print(wins)
sys1_scores_sorted = sorted(sys1_scores)
sys2_scores_sorted = sorted(sys2_scores)
print(sys1_scores_sorted[int(len(sys1_scores) * 0.025)], sys1_scores_sorted[int(len(sys1_scores) * 0.975)])
print(sys2_scores_sorted[int(len(sys2_scores) * 0.025)], sys2_scores_sorted[int(len(sys2_scores) * 0.975)])
print(np.mean(sys1_scores_sorted))
print(np.mean(sys2_scores_sorted))

[1000, 0, 0]
0.27998754657599395 0.2847675093681587
0.26916031387011413 0.2741859947973581
0.28227709530761264
0.2716035345645887



## Reimplement bleu...

In [137]:
from nltk.util import ngrams

def modified_precision(references, hypothesis, n):
    # Extracts all ngrams in hypothesis
    # Set an empty Counter if hypothesis is empty.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    # Extract a union of references' counts.
    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
    max_counts = {}
    for reference in references:
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    # Assigns the intersection between hypothesis and references' counts.
    clipped_counts = {
        ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()
    }

    numerator = sum(clipped_counts.values())
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(reference).
    denominator = max(1, sum(counts.values()))

    return Fraction(numerator, denominator, _normalize=False)

def closest_ref_length(references, hyp_len):
    ref_lens = (len(reference) for reference in references)
    closest_ref_len = min(
        ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len)
    )
    return closest_ref_len

def brevity_penalty(closest_ref_len, hyp_len):
    if hyp_len > closest_ref_len:
        return 1
    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
    elif hyp_len == 0:
        return 0
    else:
        return math.exp(1 - closest_ref_len / hyp_len)


In [219]:
from collections import Counter

import fractions
import math
import warnings

try:
    fractions.Fraction(0, 1000, _normalize=False)
    from fractions import Fraction
except TypeError:
    from nltk.compat import Fraction
    
def my_bleu(
    modified_precisions,
    list_of_references,
    hypotheses,
    indices,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the " "same "
    )
    
    red_modified_precisions = [modified_precisions[i] for i in indices]
    reduced_refs = [list_of_references[i] for i in indices]
    reduced_hyps = [hypotheses[i] for i in indices]
    
    # Iterate through each hypothesis and their corresponding references.
    for mod_prec in red_modified_precisions:
        for i, _ in enumerate(weights,start=1):
            p_i = mod_prec[i-1]
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator   
    
    for (references, hypothesis) in zip(reduced_refs, reduced_hyps):
            # Calculate the hypothesis length and the closest reference length.
            # Adds them to the corpus-level hypothesis and reference counts.
            hyp_len = len(hypothesis)
            hyp_lengths += hyp_len
            ref_lengths += closest_ref_length(references, hyp_len)
            
    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, set use method0 from SmoothinFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s



In [221]:
def get_modified_precisions(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)):
    modified_precisions = []
    
    for references, hypothesis in zip(list_of_references, hypotheses):
        ps = [modified_precision(references, hypothesis, i) for i, _ in enumerate(weights, start=1)]
        modified_precisions.append(ps)
    return modified_precisions

In [225]:
# for _ in range(10):
#     indices = np.random.choice(list(range(len(annotations_list))), 100, replace=True)

#     a = my_bleu(lstm_mod_precisions, 
#             annotations_list, 
#             lstm_100_captions, 
#             indices)
    
#     a2 = my_bleu(
#         beam_mod_precisions,
#         annotations_list,
#         beam_10_captions,
#         indices
#     )
#     anns = [annotations_list[i] for i in indices]
#     lstm_hyps = [lstm_100_captions[i] for i in indices]
#     beam_hyps = [beam_10_captions[i] for i in indices]

#     b = bleu_score.corpus_bleu(anns, lstm_hyps)
#     b2 = bleu_score.corpus_bleu(anns, beam_hyps)
#     print(a == b, a2 == b2)

True True
True True
True True
True True
True True
True True
True True
True True
True True
True True


In [150]:
ids = list(range(len(annotations_list)))
random_ids = np.random.choice(ids, 2, replace=True)
    
reduced_gold = [annotations_list[i] for i in random_ids]
reduced_sys1 = [lstm_100_captions[i] for i in random_ids]

print(my_bleu(reduced_gold, reduced_sys1))

sys1_score = bleu_score.corpus_bleu(reduced_gold, reduced_sys1)
print(sys1_score)

bleus = [bleu_score.sentence_bleu(reduced_gold[i], reduced_sys1[i]) for i in range(len(reduced_gold))]
print(bleus)
print(np.mean(bleus))

TypeError: my_bleu() missing 1 required positional argument: 'indices'

In [145]:
class SmoothingFunction:
    """
    This is an implementation of the smoothing techniques
    for segment-level BLEU scores that was presented in
    Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        """
        This will initialize the parameters required for the various smoothing
        techniques, the default values are set to the numbers used in the
        experiments from Chen and Cherry (2014).

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
        ...                 'that', 'the', 'military', 'always', 'obeys', 'the',
        ...                 'commands', 'of', 'the', 'party']
        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
        ...               'Party', 'commands']

        >>> chencherry = SmoothingFunction()
        >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
        0.4118...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
        0.4489...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
        0.4118...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
        0.4118...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
        0.4905...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
        0.4135...
        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
        0.4905...

        :param epsilon: the epsilon value use in method 1
        :type epsilon: float
        :param alpha: the alpha value use in method 6
        :type alpha: int
        :param k: the k value use in method 4
        :type k: int
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k
    
    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if p_i.numerator != 0:
                p_n_new.append(p_i)
            else:
                _msg = str(
                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
                    "Therefore the BLEU score evaluates to 0, independently of\n"
                    "how many N-gram overlaps of lower order it contains.\n"
                    "Consider using lower n-gram order or use "
                    "SmoothingFunction()"
                ).format(i + 1)
                warnings.warn(_msg)
                # When numerator==0 where denonminator==0 or !=0, the result
                # for the precision score should be equal to 0 or undefined.
                # Due to BLEU geometric mean computation in logarithm space,
                # we we need to take the return sys.float_info.min such that
                # math.log(sys.float_info.min) returns a 0 precision score.
                p_n_new.append(sys.float_info.min)
        return p_n_new


In [65]:
eval_with_paired_bootstrap(annotations_list, 
                           lstm_100_captions, 
                           beam_10_captions, 
                           num_samples=1000,
                           sample_ratio=0.05)




  0%|          | 0/1000 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1000 [00:02<34:12,  2.05s/it][A[A[A


  0%|          | 2/1000 [00:04<34:01,  2.05s/it][A[A[A


  0%|          | 3/1000 [00:06<33:58,  2.04s/it][A[A[A


  0%|          | 4/1000 [00:08<33:50,  2.04s/it][A[A[A


  0%|          | 5/1000 [00:10<34:05,  2.06s/it][A[A[A


  1%|          | 6/1000 [00:12<33:57,  2.05s/it][A[A[A


  1%|          | 7/1000 [00:14<34:09,  2.06s/it][A[A[A


  1%|          | 8/1000 [00:16<34:30,  2.09s/it][A[A[A


  1%|          | 9/1000 [00:18<34:19,  2.08s/it][A[A[A


  1%|          | 10/1000 [00:20<34:03,  2.06s/it][A[A[A


  1%|          | 11/1000 [00:22<34:18,  2.08s/it][A[A[A


  1%|          | 12/1000 [00:24<33:58,  2.06s/it][A[A[A


  1%|▏         | 13/1000 [00:26<33:58,  2.07s/it][A[A[A


  1%|▏         | 14/1000 [00:28<34:04,  2.07s/it][A[A[A


  2%|▏         | 15/1000 [00:31<34:42,  2.11s/it][A[A[A


  2%|▏         | 16/1000 [00:33<35:40, 

 13%|█▎        | 134/1000 [04:35<29:10,  2.02s/it][A[A[A


 14%|█▎        | 135/1000 [04:37<29:06,  2.02s/it][A[A[A


 14%|█▎        | 136/1000 [04:39<29:07,  2.02s/it][A[A[A


 14%|█▎        | 137/1000 [04:41<29:04,  2.02s/it][A[A[A


 14%|█▍        | 138/1000 [04:43<29:03,  2.02s/it][A[A[A


 14%|█▍        | 139/1000 [04:45<29:00,  2.02s/it][A[A[A


 14%|█▍        | 140/1000 [04:47<29:03,  2.03s/it][A[A[A


 14%|█▍        | 141/1000 [04:49<28:58,  2.02s/it][A[A[A


 14%|█▍        | 142/1000 [04:51<29:05,  2.03s/it][A[A[A


 14%|█▍        | 143/1000 [04:53<29:05,  2.04s/it][A[A[A


 14%|█▍        | 144/1000 [04:55<28:59,  2.03s/it][A[A[A


 14%|█▍        | 145/1000 [04:57<29:15,  2.05s/it][A[A[A


 15%|█▍        | 146/1000 [04:59<29:22,  2.06s/it][A[A[A


 15%|█▍        | 147/1000 [05:01<29:11,  2.05s/it][A[A[A


 15%|█▍        | 148/1000 [05:03<29:02,  2.05s/it][A[A[A


 15%|█▍        | 149/1000 [05:06<28:55,  2.04s/it][A[A[A


 15%|█▌ 

 27%|██▋       | 266/1000 [09:04<24:48,  2.03s/it][A[A[A


 27%|██▋       | 267/1000 [09:06<24:44,  2.03s/it][A[A[A


 27%|██▋       | 268/1000 [09:08<24:42,  2.03s/it][A[A[A


 27%|██▋       | 269/1000 [09:10<24:38,  2.02s/it][A[A[A


 27%|██▋       | 270/1000 [09:12<24:36,  2.02s/it][A[A[A


 27%|██▋       | 271/1000 [09:14<24:31,  2.02s/it][A[A[A


 27%|██▋       | 272/1000 [09:16<24:33,  2.02s/it][A[A[A


 27%|██▋       | 273/1000 [09:18<24:32,  2.03s/it][A[A[A


 27%|██▋       | 274/1000 [09:20<24:58,  2.06s/it][A[A[A


 28%|██▊       | 275/1000 [09:22<24:53,  2.06s/it][A[A[A


 28%|██▊       | 276/1000 [09:24<24:42,  2.05s/it][A[A[A


 28%|██▊       | 277/1000 [09:26<24:35,  2.04s/it][A[A[A


 28%|██▊       | 278/1000 [09:28<24:30,  2.04s/it][A[A[A


 28%|██▊       | 279/1000 [09:30<24:24,  2.03s/it][A[A[A


 28%|██▊       | 280/1000 [09:32<24:20,  2.03s/it][A[A[A


 28%|██▊       | 281/1000 [09:34<24:15,  2.02s/it][A[A[A


 28%|██▊

 40%|███▉      | 398/1000 [13:31<20:17,  2.02s/it][A[A[A


 40%|███▉      | 399/1000 [13:33<20:15,  2.02s/it][A[A[A


 40%|████      | 400/1000 [13:35<20:12,  2.02s/it][A[A[A


 40%|████      | 401/1000 [13:37<20:09,  2.02s/it][A[A[A


 40%|████      | 402/1000 [13:39<20:07,  2.02s/it][A[A[A


 40%|████      | 403/1000 [13:41<20:03,  2.02s/it][A[A[A


 40%|████      | 404/1000 [13:43<20:01,  2.02s/it][A[A[A


 40%|████      | 405/1000 [13:45<19:59,  2.02s/it][A[A[A


 41%|████      | 406/1000 [13:47<19:57,  2.02s/it][A[A[A


 41%|████      | 407/1000 [13:49<19:52,  2.01s/it][A[A[A


 41%|████      | 408/1000 [13:51<19:51,  2.01s/it][A[A[A


 41%|████      | 409/1000 [13:53<19:50,  2.01s/it][A[A[A


 41%|████      | 410/1000 [13:55<19:47,  2.01s/it][A[A[A


 41%|████      | 411/1000 [13:57<19:46,  2.01s/it][A[A[A


 41%|████      | 412/1000 [13:59<19:43,  2.01s/it][A[A[A


 41%|████▏     | 413/1000 [14:01<19:42,  2.01s/it][A[A[A


 41%|███

 53%|█████▎    | 530/1000 [17:59<16:01,  2.05s/it][A[A[A


 53%|█████▎    | 531/1000 [18:01<15:54,  2.04s/it][A[A[A


 53%|█████▎    | 532/1000 [18:03<15:51,  2.03s/it][A[A[A


 53%|█████▎    | 533/1000 [18:05<15:47,  2.03s/it][A[A[A


 53%|█████▎    | 534/1000 [18:07<15:43,  2.02s/it][A[A[A


 54%|█████▎    | 535/1000 [18:09<15:40,  2.02s/it][A[A[A


 54%|█████▎    | 536/1000 [18:11<15:37,  2.02s/it][A[A[A


 54%|█████▎    | 537/1000 [18:13<15:37,  2.02s/it][A[A[A


 54%|█████▍    | 538/1000 [18:15<15:33,  2.02s/it][A[A[A


 54%|█████▍    | 539/1000 [18:17<15:39,  2.04s/it][A[A[A


 54%|█████▍    | 540/1000 [18:19<15:33,  2.03s/it][A[A[A


 54%|█████▍    | 541/1000 [18:21<15:29,  2.02s/it][A[A[A


 54%|█████▍    | 542/1000 [18:23<15:26,  2.02s/it][A[A[A


 54%|█████▍    | 543/1000 [18:25<15:24,  2.02s/it][A[A[A


 54%|█████▍    | 544/1000 [18:27<15:21,  2.02s/it][A[A[A


 55%|█████▍    | 545/1000 [18:29<15:17,  2.02s/it][A[A[A


 55%|███

 66%|██████▌   | 662/1000 [22:31<11:22,  2.02s/it][A[A[A


 66%|██████▋   | 663/1000 [22:33<11:20,  2.02s/it][A[A[A


 66%|██████▋   | 664/1000 [22:35<11:18,  2.02s/it][A[A[A


 66%|██████▋   | 665/1000 [22:37<11:15,  2.02s/it][A[A[A


 67%|██████▋   | 666/1000 [22:39<11:13,  2.02s/it][A[A[A


 67%|██████▋   | 667/1000 [22:41<11:12,  2.02s/it][A[A[A


 67%|██████▋   | 668/1000 [22:43<11:10,  2.02s/it][A[A[A


 67%|██████▋   | 669/1000 [22:45<11:08,  2.02s/it][A[A[A


 67%|██████▋   | 670/1000 [22:47<11:08,  2.02s/it][A[A[A


 67%|██████▋   | 671/1000 [22:49<11:05,  2.02s/it][A[A[A


 67%|██████▋   | 672/1000 [22:51<11:07,  2.03s/it][A[A[A


 67%|██████▋   | 673/1000 [22:53<11:05,  2.03s/it][A[A[A


 67%|██████▋   | 674/1000 [22:55<11:05,  2.04s/it][A[A[A


 68%|██████▊   | 675/1000 [22:57<11:05,  2.05s/it][A[A[A


 68%|██████▊   | 676/1000 [22:59<11:05,  2.05s/it][A[A[A


 68%|██████▊   | 677/1000 [23:01<11:02,  2.05s/it][A[A[A


 68%|███

 79%|███████▉  | 794/1000 [27:08<07:06,  2.07s/it][A[A[A


 80%|███████▉  | 795/1000 [27:10<07:04,  2.07s/it][A[A[A


 80%|███████▉  | 796/1000 [27:12<07:00,  2.06s/it][A[A[A


 80%|███████▉  | 797/1000 [27:14<07:00,  2.07s/it][A[A[A


 80%|███████▉  | 798/1000 [27:16<06:58,  2.07s/it][A[A[A


 80%|███████▉  | 799/1000 [27:18<06:54,  2.06s/it][A[A[A


 80%|████████  | 800/1000 [27:20<06:56,  2.08s/it][A[A[A


 80%|████████  | 801/1000 [27:22<06:54,  2.08s/it][A[A[A


 80%|████████  | 802/1000 [27:25<06:49,  2.07s/it][A[A[A


 80%|████████  | 803/1000 [27:27<06:51,  2.09s/it][A[A[A


 80%|████████  | 804/1000 [27:29<06:48,  2.08s/it][A[A[A


 80%|████████  | 805/1000 [27:31<06:44,  2.07s/it][A[A[A


 81%|████████  | 806/1000 [27:33<06:38,  2.06s/it][A[A[A


 81%|████████  | 807/1000 [27:35<06:35,  2.05s/it][A[A[A


 81%|████████  | 808/1000 [27:37<06:31,  2.04s/it][A[A[A


 81%|████████  | 809/1000 [27:39<06:28,  2.04s/it][A[A[A


 81%|███

 93%|█████████▎| 926/1000 [31:53<02:49,  2.29s/it][A[A[A


 93%|█████████▎| 927/1000 [31:56<02:51,  2.35s/it][A[A[A


 93%|█████████▎| 928/1000 [31:58<02:47,  2.33s/it][A[A[A


 93%|█████████▎| 929/1000 [32:00<02:43,  2.30s/it][A[A[A


 93%|█████████▎| 930/1000 [32:02<02:40,  2.29s/it][A[A[A


 93%|█████████▎| 931/1000 [32:05<02:38,  2.30s/it][A[A[A


 93%|█████████▎| 932/1000 [32:07<02:34,  2.28s/it][A[A[A


 93%|█████████▎| 933/1000 [32:09<02:32,  2.28s/it][A[A[A


 93%|█████████▎| 934/1000 [32:12<02:29,  2.27s/it][A[A[A


 94%|█████████▎| 935/1000 [32:14<02:27,  2.27s/it][A[A[A


 94%|█████████▎| 936/1000 [32:16<02:24,  2.25s/it][A[A[A


 94%|█████████▎| 937/1000 [32:18<02:22,  2.25s/it][A[A[A


 94%|█████████▍| 938/1000 [32:20<02:19,  2.25s/it][A[A[A


 94%|█████████▍| 939/1000 [32:23<02:17,  2.25s/it][A[A[A


 94%|█████████▍| 940/1000 [32:25<02:14,  2.23s/it][A[A[A


 94%|█████████▍| 941/1000 [32:27<02:12,  2.24s/it][A[A[A


 94%|███

Win ratio: sys1=0.997, sys2=0.003, tie=0.997
(sys1 is superior with p value p=0.003)

sys1 mean=0.282, median=0.283, 95% confidence interval=[0.271, 0.293]
sys2 mean=0.272, median=0.271, 95% confidence interval=[0.260, 0.283]


## Qualitative analysis

In [None]:
# Compute the bleu score per image
beam_sentence_bleus = []
k = 10
beam_captions = load_best_beam_captions(k)
for beam_caption, annotations in zip(beam_captions, annotations_list):
    beam_sentence_bleus.append(bleu_score.sentence_bleu(annotations, beam_caption))
    
print("Bleu scores: {}".format(len(beam_sentence_bleus)))

In [None]:
k = 10
similarity ='bigram_overlap'
vote_sentence_bleus = []
voted_captions = load_voted_captions(k, similarity)
caption_sentences = extract_caption_sentences(voted_captions)
for vote_caption, annotations in zip(caption_sentences, annotations_list):
    vote_sentence_bleus.append(bleu_score.sentence_bleu(annotations, vote_caption))
print("Bleu scores: {}".format(len(vote_sentence_bleus)))

In [None]:
diff_idxs = np.argsort(np.array(vote_sentence_bleus) - np.array(beam_sentence_bleus))

In [None]:
import importlib
import utils
importlib.reload(utils)


In [None]:
for idx in diff_idxs[:10]:
    print("Bigram overlap:\t{:.4f}\t{}".format(vote_sentence_bleus[idx], " ".join(caption_sentences[idx])))
    print("Beam:\t\t{:.4f}\t{}".format(beam_sentence_bleus[idx], " ".join(beam_captions[idx])))
    print("Annotations: ")
    for annotation in annotations_list[idx]:
        print("\t\t"," ".join(annotation))
    utils.print_image(idx)
    plt.show()

In [None]:
for idx in diff_idxs[-10:]:
    print(idx)
    print("Bigram overlap:\t{:.4f}\t{}".format(vote_sentence_bleus[idx], " ".join(caption_sentences[idx])))
    print("Beam:\t\t{:.4f}\t{}".format(beam_sentence_bleus[idx], " ".join(beam_captions[idx])))
    print("Annotations: ")
    for annotation in annotations_list[idx]:
        print("\t\t"," ".join(annotation))
    utils.print_image(idx)
    plt.show()

In [None]:
beam_captions_dir = "../../outputs/beam_captions_10/"
poor_beam_caption_object = load_caption(6843, image_dir=beam_captions_dir)
for caption in poor_beam_caption_object['captions']:
    print("\item {:.5f}: {}".format(caption['probability'], " ".join(decode_caption(caption['sentence'], vocab))))