In [13]:
#!/usr/bin/python
import os, sys, codecs
sys.path.insert(0, "../")
import models

def parse_aligned_token(aligned_token):
    """Split one alignment token such as '0-1-S' into its three fields.

    Args:
        aligned_token: string of the form '<src>-<trg>-<kind>'.

    Returns:
        A tuple (src_index, trg_index, kind): the first two fields converted
        to int, the third left as a string. Fields after the third are ignored.
    """
    fields = aligned_token.split('-')
    return int(fields[0]), int(fields[1]), fields[2]

def parse_alignments(alignment_path):
    """Read the alignment points of every sentence from a file.

    Each line of the file holds whitespace-separated tokens such as '0-1-S';
    a blank line produces an empty dict for that sentence.

    Args:
        alignment_path: path of the alignment file to read.

    Returns:
        A list with one dict per line, shaped {trg_index: {src_index: kind}}.

    Raises:
        AssertionError: if the same (src, trg) pair occurs twice on one line.
    """
    alignments = []
    # The context manager closes the handle even if parsing raises.
    with open(alignment_path) as alignment_file:
        for line in alignment_file:
            these_alignments = {}
            for aligned_token in line.strip().split():
                src_index, trg_index, kind = parse_aligned_token(aligned_token)
                # First time this target index is seen, start its inner dict.
                by_src = these_alignments.setdefault(trg_index, {})
                assert src_index not in by_src
                by_src[src_index] = kind
            alignments.append(these_alignments)
    return alignments

def validate(src_corpus, trg_corpus, alignments):
    """Assert that every alignment point falls inside its sentence pair.

    Args:
        src_corpus: list of tokenised source sentences.
        trg_corpus: list of tokenised target sentences.
        alignments: list of per-sentence dicts {trg_index: {src_index: kind}}.

    Returns:
        True when all checks pass.

    Raises:
        AssertionError: if the three lists differ in length or any index is
            negative or beyond the end of its sentence.
    """
    sentence_count = len(src_corpus)
    assert sentence_count == len(trg_corpus)
    assert sentence_count == len(alignments), "%d != %d" % (
        sentence_count, len(alignments))
    for src_sent, trg_sent, sent_links in zip(src_corpus, trg_corpus, alignments):
        for trg_pos in sent_links:
            assert 0 <= trg_pos
            assert trg_pos < len(trg_sent)
            for src_pos in sent_links[trg_pos]:
                assert 0 <= src_pos
                assert src_pos < len(src_sent)
    return True

def recall(reference, candidate):
    """Count the sure reference alignments and how many the candidate found.

    Both arguments are lists of per-sentence nested dicts keyed first by one
    index and then by the other; an entry is "sure" when its value is "S".

    Returns:
        A tuple (reference_sure, candidate_sure_correct) of raw counts; the
        caller turns them into a ratio.
    """
    assert len(reference) == len(candidate)
    sure_total = 0
    sure_found = 0
    for sent_ref, sent_cand in zip(reference, candidate):
        for outer in sent_ref:
            links = sent_ref[outer]
            for inner in links:
                if links[inner] != "S":
                    continue
                sure_total += 1
                if outer in sent_cand and inner in sent_cand[outer]:
                    sure_found += 1
    return sure_total, sure_found

def precision(reference, candidate):
    """Count the candidate alignments and how many appear in the reference.

    A candidate point counts as correct when the reference contains the same
    index pair, whatever kind the reference assigns to it.

    Returns:
        A tuple (candidate_total, candidate_correct_any) of raw counts; the
        caller turns them into a ratio.
    """
    assert len(reference) == len(candidate)
    proposed = 0
    confirmed = 0
    for sent_ref, sent_cand in zip(reference, candidate):
        # Flatten this sentence's candidate points into (outer, inner) pairs.
        pairs = [(outer, inner)
                 for outer in sent_cand
                 for inner in sent_cand[outer]]
        proposed += len(pairs)
        confirmed += sum(
            1 for outer, inner in pairs
            if outer in sent_ref and inner in sent_ref[outer])
    return proposed, confirmed

def score(reference, candidate):
    """Compute recall, precision and alignment error rate for a candidate.

    Args:
        reference: per-sentence reference alignments, as accepted by recall().
        candidate: per-sentence candidate alignments in the same layout.

    Returns:
        A tuple (recall_score, precision_score, aer) of floats. When a ratio's
        denominator is zero, recall and precision fall back to 0.0 and the
        error rate falls back to 1.0.
    """
    reference_sure, candidate_sure_correct = recall(reference, candidate)
    candidate_total, candidate_correct_any = precision(reference, candidate)
    recall_score = 0.0
    if reference_sure > 0:
        recall_score = float(candidate_sure_correct) / reference_sure
    # 0.0 rather than 0, so all three returned values are floats.
    precision_score = 0.0
    if candidate_total > 0:
        precision_score = float(candidate_correct_any) / candidate_total
    aer = 1.0
    if candidate_total + reference_sure > 0:
        # 1 - (sure matches + any-kind matches) / (candidate size + sure size)
        aer = 1.0 - float(candidate_sure_correct + candidate_correct_any) / (
            candidate_total + reference_sure)
    return recall_score, precision_score, aer



In [15]:
def read_all_tokens(path):
    """Read a UTF-8 text file into a list of token lists.

    Args:
        path: path of the file to read.

    Returns:
        One list per line, holding that line's whitespace-separated tokens;
        a blank line yields an empty list.
    """
    # The context manager closes the handle once all lines have been read.
    with codecs.open(path, 'r', 'utf8') as token_file:
        return [line.strip().split() for line in token_file]

def count_word_cooccurrences(src_corpus, trg_corpus):
    """Count how often each source word shares a sentence pair with each target word.

    Every occurrence counts: a word repeated within a sentence contributes
    once per repetition.

    Returns:
        A nested dict {source word: {target word: count}}. A source word whose
        sentences all have empty target sides maps to an empty dict.
    """
    cooccurrence = {}
    for sent_no, src_words in enumerate(src_corpus):
        for src_word in src_words:
            row = cooccurrence.setdefault(src_word, {})
            for trg_word in trg_corpus[sent_no]:
                row[trg_word] = row.get(trg_word, 0) + 1
    return cooccurrence

def align_corpus(src_corpus, trg_corpus, counts):
    """Link each source word to the target word it co-occurs with most often.

    Args:
        src_corpus: list of tokenised source sentences.
        trg_corpus: list of tokenised target sentences, parallel to src_corpus.
        counts: nested dict {source word: {target word: count}}.

    Returns:
        One dict per sentence mapping a source position to the chosen target
        position. Ties go to the earliest target position; source words
        missing from counts, or with no positive count in the sentence, get
        no entry.
    """
    corpus_links = []
    for sent_no, src_words in enumerate(src_corpus):
        sent_links = {}
        for src_pos, src_word in enumerate(src_words):
            if src_word in counts:
                row = counts[src_word]
                top_pos, top_count = -1, 0
                for trg_pos, trg_word in enumerate(trg_corpus[sent_no]):
                    # Strict '>' keeps the first position among equal counts.
                    if trg_word in row and row[trg_word] > top_count:
                        top_pos, top_count = trg_pos, row[trg_word]
                if top_pos >= 0:
                    sent_links[src_pos] = top_pos
        corpus_links.append(sent_links)
    return corpus_links


In [14]:
# if __name__ == "__main__":
#     """
#     Example:
#     ./ve/bin/python eval.py \
#       ./test/en-cs.en.dev.tokens \
#       ./test/en-cs.cs.dev.tokens \
#       ./test/en-cs.wa.dev \
#       ./test/en-cs.wa.dev
#     """
#     if len(sys.argv) != 5:
#         print("Usage: python eval.py src_corpus trg_corpus reference candidate")
#         sys.exit(0)
#     src_corpus = [
#         line.strip().split() 
#         for line in codecs.open(sys.argv[1], 'r', 'utf8')
#     ]
#     trg_corpus = [
#         line.strip().split() 
#         for line in codecs.open(sys.argv[2], 'r', 'utf8')
#     ]
#     reference = parse_alignments(sys.argv[3])
#     candidate = parse_alignments(sys.argv[4])
#     assert validate(src_corpus, trg_corpus, reference)
#     assert validate(src_corpus, trg_corpus, candidate)
#     print("recall %1.3f; precision %1.3f; aer %1.3f" % score(reference, candidate))


In [4]:
# Load the source-side dev sentences (the `.en` token file) as lists of tokens.
path_toks_dev_en = "../test/en-cs.en.dev.tokens"
# The `with` block closes the file as soon as every line has been tokenised.
with codecs.open(path_toks_dev_en, 'r', 'utf8') as src_file:
    src_corpus = [line.strip().split() for line in src_file]


In [5]:
# Load the target-side dev sentences (the `.cs` token file) as lists of tokens.
path_toks_dev_cs = "../test/en-cs.cs.dev.tokens"
# The `with` block closes the file as soon as every line has been tokenised.
with codecs.open(path_toks_dev_cs, 'r', 'utf8') as trg_file:
    trg_corpus = [line.strip().split() for line in trg_file]

In [6]:
# Reference alignments parsed from the dev word-alignment file:
# one dict per line of the file.
path_wa_dev = "../test/en-cs.wa.dev"
reference = parse_alignments(path_wa_dev)

In [9]:
# NOTE(review): despite its name, path_wa_test holds the same path as
# path_wa_dev, so the candidate is parsed from the reference file itself.
# Presumably a sanity check of the scorer — confirm this is intended.
path_wa_test = "../test/en-cs.wa.dev"
candidate = parse_alignments(path_wa_test)

In [10]:
# Check that every alignment index falls inside its sentence pair.
# validate() raises AssertionError on a bad index and returns True otherwise.
validate(src_corpus, trg_corpus, reference)
validate(src_corpus, trg_corpus, candidate)

True

In [11]:
# Displays (recall, precision, alignment error rate) for candidate vs. reference.
score(reference, candidate)

(1.0, 1.0, 0.0)

In [18]:
# Co-occurrence baseline: count word pairs over the corpus, then link each
# source word to the target word it co-occurs with most often.
counts = count_word_cooccurrences(src_corpus, trg_corpus)
alignments = align_corpus(src_corpus, trg_corpus, counts)

In [22]:
# Reference alignment of the first sentence: a nested dict of index -> {index: kind}.
reference[0]

{0: {0: 'P', 1: 'P', 2: 'P', 3: 'P', 4: 'S'},
 2: {5: 'S'},
 3: {6: 'S'},
 4: {7: 'S'},
 5: {8: 'S'}}

In [23]:
# Baseline alignment of the first sentence: source position -> target position.
alignments[0]

{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0, 8: 1}

In [24]:
# NOTE(review): this displays the entire co-occurrence table; consider
# inspecting a single word's row instead to keep the output readable.
counts

{'We': {'Uvidíme': 1,
  ',': 13,
  'zda': 1,
  'reklama': 1,
  'funguje': 1,
  '.': 10,
  'Museli': 1,
  'jsme': 1,
  'na': 2,
  'to': 1,
  'myslet': 1,
  'dopředu': 1,
  '&quot;': 8,
  'Byla': 1,
  'nahrána': 1,
  'hned': 1,
  'po': 1,
  'včerejším': 1,
  'uzavření': 1,
  'trhu': 1,
  'a': 2,
  'nabízí': 1,
  'radu': 1,
  'pí': 1,
  'Farrellové': 1,
  ':': 1,
  'Vidíme': 1,
  'jak': 2,
  'trh': 3,
  'prochází': 1,
  'poměrně': 1,
  'normálním': 1,
  'cyklem': 1,
  'Nadále': 1,
  'se': 1,
  'domníváme': 1,
  'že': 3,
  'akciový': 1,
  'je': 3,
  'stále': 1,
  'místem': 1,
  'slibujícím': 1,
  'dlouhodobé': 1,
  'zhodnocení': 1,
  'Říkáme': 1,
  'nejhorší': 1,
  'věc': 1,
  'kterou': 1,
  'kdokoli': 1,
  'může': 1,
  'udělat': 1,
  'vidět': 1,
  'klesá': 1,
  'všechno': 1,
  'prodat': 1,
  'což': 1,
  'jen': 1,
  'dožene': 1,
  'ceny': 1,
  'k': 1,
  'dalšímu': 1,
  'poklesu': 1,
  'říká': 1,
  'John': 1,
  'Lampe': 1,
  'ředitel': 1,
  'pro': 1,
  'inzerci': 1,
  'společnosti': 1,
  'P