__Word Alignment Assignment__

Your task is to learn word alignments for the data provided with this Python Notebook. 

Start by running the 'train' function below and implementing whatever is needed to make the failing assertions pass. Then consider the following improvements to the baseline model:
* Is the TranslationModel parameterized efficiently?
* What form of PriorModel would help here? (Currently the PriorModel is uniform.)
* How could you use a Hidden Markov Model to model word alignment indices? (There's an implementation of a simple HMM below to help you start.)
* How could you initialize more complex models from simpler ones?
* How could you model words that are not aligned to anything?

Grades will be assigned as follows*:

 AER below on blinds   |  Grade 
----------|-------------
 0.5 - 0.6 |   1 
 0.4 - 0.5 |   2 
 0.35 - 0.4 |  3    
 0.3 - 0.35 |  4    
 0.25 - 0.3 |  5   
 
You should save the notebook with the final scores for the 'dev' and 'test' sets.

*__Note__: Students who submitted a version of this assignment last year will have a 0.05 AER handicap, i.e., to get a grade of 5, they will need to get an AER below 0.25.


In [1]:
# This cell contains the generative models used for word alignment:
# a TranslationModel and a TransitionModel, both built on the shared Model base class.

import numpy as np
from collections import defaultdict
from nltk import WordNetLemmatizer


class Model:
    """Base class holding two-level tables of parameters and accumulated counts.

    ``_probs[outer][inner]`` stores the current parameters; an entry that has
    never been written reads as 1.  ``_counter[outer][inner]`` accumulates
    counts between calls to ``recompute_parameters`` and reads as 0 when missing.
    """

    def __init__(self, src_corpus, trg_corpus):
        """Create empty tables; the base class itself ignores both corpora."""
        self._counter = defaultdict(lambda: defaultdict(lambda: 0))
        self._probs = defaultdict(lambda: defaultdict(lambda: 1))

    def recompute_parameters(self):
        """Normalise the accumulated counts into ``_probs`` and reset the counts.

        For every outer key the counts of its inner keys are normalised with an
        additive smoothing term of 1e-7 per inner key.  Entries of ``_probs``
        whose keys do not occur in the counts are left untouched.
        """
        smoothing = 1e-7

        for outer_key, counts in self._counter.items():
            denominator = smoothing * len(counts) + sum(counts.values())

            for inner_key, count in counts.items():
                self._probs[outer_key][inner_key] = (smoothing + count) / denominator

        # Start the next round of accumulation from scratch.
        self._counter = defaultdict(lambda: defaultdict(lambda: 0))


class TranslationModel(Model):
    """Models the conditional distribution over trg words given a src word."""

    def __init__(self, src_corpus, trg_corpus):
        """Initialise the parameters from sentence-level co-occurrence counts.

        Every (src word, trg word) pair that occurs in the same sentence pair
        contributes one count; the counts are then normalised per src word.
        """
        super().__init__(src_corpus, trg_corpus)

        for src_sentence, trg_sentence in zip(src_corpus, trg_corpus):
            for src_word in src_sentence:
                for trg_word in trg_sentence:
                    self._counter[src_word][trg_word] += 1

        self.recompute_parameters()

    def get_conditional_prob(self, src_token, trg_token):
        """Return the conditional probability of trg_token given src_token."""
        return self._probs[src_token][trg_token]

    def get_parameters_for_sentence_pair(self, src_tokens, trg_tokens):
        """Return a matrix with t[i][j] = p(f_j|e_i).

        Rows follow ``src_tokens`` and columns follow ``trg_tokens``.
        """
        rows = []
        for src_token in src_tokens:
            rows.append([self.get_conditional_prob(src_token, trg_token)
                         for trg_token in trg_tokens])
        return np.array(rows)

    def collect_statistics(self, src_tokens, trg_tokens, posterior_matrix):
        """Accumulate translation counts from posterior_matrix[j][i] = p(a_j=i|e, f).

        The posterior is indexed by trg position first and src position second.
        """
        for src_pos, src_token in enumerate(src_tokens):
            for trg_pos, trg_token in enumerate(trg_tokens):
                expected_count = posterior_matrix[trg_pos, src_pos]
                self._counter[src_token][trg_token] += expected_count


class TransitionModel(Model):
    """Models the prior probability of an alignment conditioned on the previous alignment.

    Parameters are stored per source length: the key ``src_length`` holds a
    table indexed by jump width (current index minus previous index) and the
    key ``(src_length, 'initial')`` holds a table indexed by the first index.
    """

    def get_parameters_for_sentence_pair(self, src_length):
        """Retrieve the parameters for this sentence pair.

        Returns:
            ``(initial, transition)`` where ``initial[i]`` is the stored value
            for starting at index ``i`` and
            ``transition[k, i] = p(a_{j} = i|a_{j-1} = k)`` is looked up by the
            jump width ``i - k``.  The looked-up values are used as stored; rows
            are not renormalised here.
        """
        positions = range(src_length)

        transition = np.array([[self._probs[src_length][cur - prev] for cur in positions]
                               for prev in positions])
        initial = np.array([self._probs[(src_length, 'initial')][pos] for pos in positions])

        return initial, transition

    def collect_statistics(self, src_length, posteriors, single_posteriors):
        """Accumulate counts from the posteriors of one sentence pair.

        Args:
            src_length: number of source positions (alignment states).
            posteriors: bigram posteriors indexed as ``[t, k, i]``, i.e.
                p(a_{t} = k, a_{t+1} = i| e, f); summed over ``t`` into the
                count of the jump width ``i - k``.
            single_posteriors: unigram posteriors indexed as ``[t, i]``; only the
                first row is used, for the initial-position counts.
        """
        initial_key = (src_length, 'initial')

        for cur in range(src_length):
            self._counter[initial_key][cur] += single_posteriors[0, cur]
            for prev in range(src_length):
                self._counter[src_length][cur - prev] += posteriors[:, prev, cur].sum()

In [2]:
def get_ans(pi, A, O):
    """Return the most probable state sequence of an HMM (Viterbi decoding).

    Args:
        pi: initial state probabilities, one entry per state.
        A: transition probabilities, ``A[k, i]`` = probability of moving from
            state ``k`` to state ``i``.
        O: emission probabilities per time step, ``O[i, t]`` = probability of
            the observation at step ``t`` under state ``i`` (states x steps).

    Returns:
        Integer array with the best state index for every time step; empty
        when ``O`` has no time steps.
    """
    num_states, num_steps = O.shape
    if num_steps == 0:
        return np.zeros(0, dtype=np.intp)

    # The logarithms do not depend on the loop state, so take them once.
    log_A = np.log(A)
    log_O = np.log(O)

    scores = np.zeros((num_states, num_steps))
    # Back-pointers are state indices: use a full-width integer type so that
    # models with more than 127 states are not corrupted by int8 wrap-around.
    back_pointers = np.zeros((num_states, num_steps), dtype=np.intp)

    scores[:, 0] = np.log(pi) + log_O[:, 0]
    for t in range(1, num_steps):
        # candidates[i, k]: best score of ending in state i at step t when the
        # previous state was k.
        candidates = scores[:, t - 1] + (log_O[:, t] + log_A).T
        scores[:, t] = np.max(candidates, axis=1)
        back_pointers[:, t] = np.argmax(candidates, axis=1)

    # Follow the back-pointers from the best final state to the first step.
    path = np.zeros(num_steps, dtype=np.intp)
    path[-1] = np.argmax(scores[:, -1])
    for t in range(num_steps - 1, 0, -1):
        path[t - 1] = back_pointers[path[t], t]

    return path

def forward(pi, A, O, observations):
    """Compute the (unscaled) forward probabilities of an HMM.

    Args:
        pi: initial state probabilities, one entry per state.
        A: transition matrix, ``A[k, i]`` = probability of moving from ``k`` to ``i``.
        O: emission matrix indexed as ``O[state, symbol]``.
        observations: sequence of symbol indices into the columns of ``O``.

    Returns:
        Array ``alpha`` of shape (number of observations, number of states).
    """
    num_steps = len(observations)
    num_states = pi.shape[0]
    alpha = np.zeros((num_steps, num_states))

    alpha[0] = pi * O[:, observations[0]]

    for t, symbol in enumerate(observations[1:], start=1):
        # Mass arriving in each state from all predecessors at the previous step.
        incoming = (A.T * alpha[t - 1]).sum(axis=1)
        alpha[t] = O[:, symbol] * incoming

    return alpha


def backward(pi, A, O, observations):
    """Compute the (unscaled) backward probabilities of an HMM.

    Args:
        pi: initial state probabilities; only its length (number of states) is used.
        A: transition matrix, ``A[k, i]`` = probability of moving from ``k`` to ``i``.
        O: emission matrix indexed as ``O[state, symbol]``.
        observations: sequence of symbol indices into the columns of ``O``.

    Returns:
        Array ``beta`` of shape (number of observations, number of states);
        the last row is all ones.
    """
    num_steps = len(observations)
    beta = np.zeros((num_steps, pi.shape[0]))
    beta[num_steps - 1] = 1

    for t in reversed(range(num_steps - 1)):
        # Weight every transition by the emission and backward mass of its target.
        weighted = O[:, observations[t + 1]] * A * beta[t + 1]
        beta[t] = weighted.sum(axis=1)

    return beta

In [3]:
# This cell contains the framework for training and evaluating a model using EM.

from utils import read_parallel_corpus, extract_test_set_alignments, score_alignments, write_aligned_corpus

def infer_posteriors(src_tokens, trg_tokens, transition_model, translation_model):
    """Compute alignment posteriors for one sentence pair.

    Source positions act as hidden states and target positions as time steps.

    Returns:
        ``((posteriors, single_posteriors), log_likelihood)`` where
        ``posteriors[t, k, i]`` is the posterior of aligning target ``t`` to
        source ``k`` and target ``t + 1`` to source ``i``,
        ``single_posteriors[j, i]`` is p(a_j=i | f, e), and ``log_likelihood``
        is the log-probability of the single best path returned by ``get_ans``
        (not the marginal likelihood of the sentence pair).
    """
    num_src, num_trg = len(src_tokens), len(trg_tokens)

    initial, transition = transition_model.get_parameters_for_sentence_pair(num_src)
    translation = translation_model.get_parameters_for_sentence_pair(src_tokens, trg_tokens)
    hmm_params = (initial, transition, translation)

    # Target position t is "observation t": column t of the translation matrix.
    steps = np.arange(num_trg)
    pair_posteriors = np.zeros((num_trg - 1, num_src, num_src))

    alpha = forward(*hmm_params, steps)
    beta = backward(*hmm_params, steps)
    best_path = get_ans(*hmm_params)

    for t in range(num_trg - 1):
        from_previous = (alpha[t, :] * transition.T).T
        joint = from_previous * translation[:, t + 1] * beta[t + 1, :]
        pair_posteriors[t] = joint / np.sum(joint)

    state_mass = alpha * beta
    single_posteriors = state_mass / state_mass.sum(axis=1, keepdims=True)

    start_term = np.log(initial[best_path[0]])
    jump_term = np.sum(np.log(transition[best_path[:-1], best_path[1:]]))
    word_term = np.sum(np.log(translation[best_path, np.arange(num_trg)]))
    log_likelihood = start_term + jump_term + word_term

    return (pair_posteriors, single_posteriors), log_likelihood

def collect_expected_statistics(src_corpus, trg_corpus, transition_model, translation_model):
    """E-step: infer posteriors for every sentence pair and collect statistics.

    Both models accumulate their counts in place.  Returns the sum of the
    per-sentence log-likelihood values reported by ``infer_posteriors``.
    """
    total_log_likelihood = 0.0

    for src_tokens, trg_tokens in zip(src_corpus, trg_corpus):
        (pair_posteriors, single_posteriors), sentence_log_likelihood = infer_posteriors(
            src_tokens, trg_tokens, transition_model, translation_model)

        # The transition model needs both posteriors, the translation model
        # only the per-position one.
        transition_model.collect_statistics(len(src_tokens), pair_posteriors, single_posteriors)
        translation_model.collect_statistics(src_tokens, trg_tokens, single_posteriors)

        total_log_likelihood += sentence_log_likelihood

    return total_log_likelihood

def estimate_models(src_corpus, trg_corpus, transition_model, translation_model, num_iterations):
    """Estimate both models iteratively using EM.

    Runs ``num_iterations`` rounds of E-step (statistics collection) and
    M-step (parameter re-estimation).  The corpus log likelihood is printed
    after every round except the first.  Returns the two (updated) models.
    """
    for step in range(num_iterations):
        # E-step
        log_likelihood = collect_expected_statistics(
            src_corpus, trg_corpus, transition_model, translation_model)

        # M-step
        for model in (transition_model, translation_model):
            model.recompute_parameters()

        if step >= 1:
            print("corpus log likelihood: %1.3f" % log_likelihood)

    return transition_model, translation_model

def get_alignments_from_posterior(posteriors):
    """Return the MAP alignment for each target word given the posteriors.

    Only ``posteriors[1]`` is used.  For every row of that matrix the column
    with the highest value is selected, and the result maps
    ``row index -> {best column index: '*'}``.
    """
    # HINT: If you implement an HMM, you may want to implement a better algorithm here.
    best_columns = np.argmax(posteriors[1], axis=1)
    return {row: {column: '*'} for row, column in enumerate(best_columns)}

def align_corpus(src_corpus, trg_corpus, transition_model, translation_model):
    """Align each sentence pair in the corpus in turn.

    Returns a list of ``(src_tokens, trg_tokens, alignments)`` tuples, one per
    sentence pair, in corpus order.
    """
    def align_pair(src_tokens, trg_tokens):
        # Only the posteriors are needed here; the log-likelihood is discarded.
        posteriors, _ = infer_posteriors(src_tokens, trg_tokens, transition_model, translation_model)
        return src_tokens, trg_tokens, get_alignments_from_posterior(posteriors)

    return [align_pair(src_tokens, trg_tokens)
            for src_tokens, trg_tokens in zip(src_corpus, trg_corpus)]

def initialize_models(src_corpus, trg_corpus):
    """Build a fresh ``(transition_model, translation_model)`` pair for the corpora."""
    corpora = (src_corpus, trg_corpus)
    return TransitionModel(*corpora), TranslationModel(*corpora)

def _normalize(corpus):
    """Lower-case, lemmatize and truncate every token of ``corpus`` in place.

    Each token is lower-cased, passed through ``WordNetLemmatizer.lemmatize``
    and cut down to its first four characters.  The sentences are modified in
    place and the same ``corpus`` object is returned.
    """
    lemmatize = WordNetLemmatizer().lemmatize

    for tokens in corpus:
        for position, token in enumerate(tokens):
            tokens[position] = lemmatize(token.lower())[:4]

    return corpus
    
def normalize(*corpuses):
    """Normalize every given corpus in place and return them as a tuple.

    The corpora are processed eagerly, in argument order.  (A lazy ``map``
    would only run the in-place normalisation if and when the caller iterated
    over the result, and could be consumed only once.)

    Returns:
        Tuple with one normalized corpus per argument, in the same order.
    """
    return tuple(_normalize(corpus) for corpus in corpuses)

def train(num_iterations):
    """Train the alignment models on 'en-cs.all' and align the whole corpus.

    Reads and normalizes the parallel corpus, initialises both models, runs
    ``num_iterations`` EM iterations and aligns every sentence pair.

    Returns:
        ``(aligned_corpus, test_set_alignments)`` where the second item is
        whatever ``extract_test_set_alignments`` derives from the aligned corpus.
    """
    src_corpus, trg_corpus, _ = read_parallel_corpus('en-cs.all')
    src_corpus, trg_corpus = normalize(src_corpus, trg_corpus)

    models = initialize_models(src_corpus, trg_corpus)
    models = estimate_models(src_corpus, trg_corpus, *models, num_iterations)

    aligned_corpus = align_corpus(src_corpus, trg_corpus, *models)
    return aligned_corpus, extract_test_set_alignments(aligned_corpus)

def evaluate(candidate_alignments):
    """Print recall, precision and AER for the 'dev' and 'test' alignments.

    ``candidate_alignments`` is indexed by the keys 'dev' and 'test'; the
    reference alignments are read from 'en-cs-wa.dev' and 'en-cs-wa.test'.
    """
    report = 'recall %1.3f; precision %1.3f; aer %1.3f'

    # Load both reference sets before printing anything.
    references = {}
    for split, path in (('dev', 'en-cs-wa.dev'), ('test', 'en-cs-wa.test')):
        _, _, references[split] = read_parallel_corpus(path, has_alignments=True)

    for split in ('dev', 'test'):
        print(report % score_alignments(references[split], candidate_alignments[split]))

In [4]:
# Train for 10 EM iterations, then print the scores for the dev and test sets.
aligned_corpus, test_alignments = train(10)
evaluate(test_alignments)

corpus log likelihood: -1565618.707
corpus log likelihood: -1217889.079
corpus log likelihood: -1011286.348
corpus log likelihood: -927266.082
corpus log likelihood: -896550.879
corpus log likelihood: -882508.412
corpus log likelihood: -874757.727
corpus log likelihood: -869847.180
corpus log likelihood: -866582.785
recall 0.698; precision 0.645; aer 0.331
recall 0.693; precision 0.633; aer 0.340
