In [1]:
import pickle
import math

from nltk import word_tokenize
from nltk.translate.bleu_score import modified_precision, closest_ref_length, brevity_penalty, SmoothingFunction, sentence_bleu

from collections import Counter

from fractions import Fraction

from modules.sentence import tokenizer, read, detokenize
from modules.model import NMT



In [2]:
nbest = 0
weights = (0.25, 0.25, 0.25, 0.25)

In [3]:
with open('models/fl_en/fl-en.nlm', 'rb') as f:
    config = pickle.load(f)
    model = NMT('nmt', config)
    model.load(f)
    f.close()

In [4]:
source_tokenizer = tokenizer(config['source_tokenizer'], lowercase=config['source_lowercase'])
source_eval = read('models/fl_en/source.data.eval', source_tokenizer, config['backwards'])

In [5]:
target_tokenizer = tokenizer('word', lowercase=config['target_lowercase'])
references = read('models/fl_en/source.data.eval', target_tokenizer, config['backwards'])

In [6]:
output_file = open('models/fl_en/result.data.eval', 'w', encoding='utf-8')

hypotheses = []

for i, sent in enumerate(model.translate(source_eval, encode=True, nbest=nbest)):
    print(sent, file=output_file, flush=True)
    hypotheses.append(word_tokenize(sent))
    
output_file.close()

In [7]:
evaluation_file = open('models/fl_en/scores.data.eval.csv','w', encoding='utf-8')

p_numerators = Counter()
p_denominators = Counter()
hyp_lengths, ref_lengths = 0, 0

for reference, hypothesis in zip(references, hypotheses):
    
    hyp_len = len(hypothesis)
    ref_len = closest_ref_length(references, hyp_len)
    
    hyp_lengths += hyp_len
    ref_lengths += ref_len
    
    set_data = '%d,%d' % (ref_len, hyp_len)
    
    for i, _ in enumerate(weights, start=1):
        p_i = modified_precision(reference, hypothesis, i)
        p_numerators[i] += p_i.numerator
        p_denominators[i] += p_i.denominator
        set_data += ',%d,%d' % (p_i.numerator, p_i.denominator)
        
    set_data += ',%f' % sentence_bleu(reference, hypothesis)
        
evaluation_file.close()

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Counter({1: 7200, 2: 2, 3: 0, 4: 0})
Counter({1: 45782, 2: 41384, 3: 37071, 4: 32852})


In [8]:
bp = brevity_penalty(ref_lengths, hyp_lengths)

In [9]:
p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
    for i, _ in enumerate(weights, start=1)]

[Fraction(7200, 45782), Fraction(2, 41384), Fraction(0, 37071), Fraction(0, 32852)]


In [10]:
smoothing_function = SmoothingFunction().method0

p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len, emulate_multibleu=False)

s = (w * math.log(p_i) for i, (w, p_i) in enumerate(zip(weights, p_n)))

print(math.exp(math.fsum(s)))

0.052505985756835086


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [12]:
from nltk.translate.bleu_score import corpus_bleu

corpus_bleu(references, hypotheses)

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.052505985756835086