In [1]:
import math

from nltk import word_tokenize
from nltk.translate.bleu_score import (modified_precision,
    closest_ref_length, brevity_penalty, SmoothingFunction, sentence_bleu,
    corpus_bleu)
from fractions import Fraction
from collections import Counter

from modules.sentence import (tokenizer, read, detokenize)

In [3]:
# Locations of the evaluation inputs and of the per-sentence score report.
REFERENCE_FILE = 'models/fl_en/target.data.eval'
HYPOTHESES_FILE = 'models/fl_en/result.data.eval'
EVALUATION_FILE = 'models/fl_en/scores.data.eval.csv'

# Load the references first, then the hypotheses, handing word_tokenize and
# False to the project's read() helper for both files.
references, hypotheses = (
    read(path, word_tokenize, False)
    for path in (REFERENCE_FILE, HYPOTHESES_FILE)
)

# Baseline: the score NLTK's own corpus_bleu reports for this corpus; the
# manual computation further down is expected to reproduce it.
print('Expecting: ')
expected_score = corpus_bleu(references, hypotheses)
print(expected_score)

Expecting: 
0.060922642890074576


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [4]:
# Manual re-implementation of corpus-level BLEU that additionally writes one
# CSV row per sentence pair to EVALUATION_FILE. Row layout:
#   ref_len,hyp_len,num_1,den_1,...,num_N,den_N,sentence_bleu
# where num_i/den_i are the modified i-gram precision counts and N is
# len(weights).
weights = (0.25, 0.25, 0.25, 0.25)

nbest = 0  # not used anywhere in this cell; kept in case other cells read it
p_numerators = Counter()    # n-gram order -> clipped match count over the corpus
p_denominators = Counter()  # n-gram order -> hypothesis n-gram count over the corpus
hyp_lengths, ref_lengths = 0, 0

# The context manager guarantees the report is closed even if scoring raises.
with open(EVALUATION_FILE, 'w') as output:
    for reference, hypothesis in zip(references, hypotheses):
        hyp_len = len(hypothesis)
        ref_len = closest_ref_length(reference, hyp_len)

        hyp_lengths += hyp_len
        ref_lengths += ref_len

        set_data = '%d,%d' % (ref_len, hyp_len)

        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(reference, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator
            set_data += ',%d,%d' % (p_i.numerator, p_i.denominator)

        # NOTE(review): `reference` is passed as-is to closest_ref_length and
        # modified_precision but wrapped in a list for sentence_bleu; confirm
        # against what read() returns that both usages are intended.
        set_data += ',%f' % sentence_bleu([reference], hypothesis)

        print(set_data, file=output, flush=True)

# Corpus-level aggregation happens once, after all counts are accumulated
# (it was previously recomputed on every loop iteration).
if p_numerators[1] == 0:
    # No unigram matched (or the corpus is empty): higher orders cannot match
    # either, so the score is 0. This also avoids building a 0/0 Fraction.
    score = 0
else:
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # _normalize=False keeps the raw counts instead of reducing the fraction.
    # NOTE(review): this is a private Fraction argument and is not available
    # on every Python version; confirm it exists on the target runtime.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    smoothing_function = SmoothingFunction().method0

    # `hypothesis` and `hyp_len` are the values left over from the last loop
    # iteration, exactly as in the original per-iteration call.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len, emulate_multibleu=False)

    s = (w * math.log(p_i) for w, p_i in zip(weights, p_n))

    # The brevity penalty must scale the geometric mean of the precisions;
    # without it the result is only BLEU when bp happens to be 1.
    score = bp * math.exp(math.fsum(s))

print("Result: ")
print(score)

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Result: 
0.060922642890074576


In [8]:
# Score the same data with the project's own BLEU implementation.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'list' object has no attribute 'strip'". Presumably BLEU
# expects raw strings rather than the list-based data held in `hypotheses`
# and `references` -- confirm against modules.bleu before changing the call.
from modules.bleu import BLEU

# NOTE(review): the argument order here is (hypotheses, references), the
# reverse of corpus_bleu(references, hypotheses) used earlier -- verify that
# this matches BLEU's signature.
BLEU(hypotheses, references)

AttributeError: 'list' object has no attribute 'strip'