## Read data

In [1]:
english_train = 'hansards.36.2.e'
french_train = 'hansards.36.2.f'

In [11]:
import numpy as np
import pandas as pd
from collections import *

In [105]:
class IBM1():

    def __init__(self):
        self.english_training = []
        self.french_training = []
        self.english_voc = set()
        self.french_voc = set()
        self.english_indices = dict()
        self.english_words = dict()
        self.french_indices = dict()
        self.french_words = dict()
        self.trans_matrix = None
        self.likelihoods = []


    def read_data(self, english_train, french_train):
        print('Start reading data...')

        e = open(english_train, 'r', encoding='utf8')
        for i, line in enumerate(e):
            sentence = line.split()
            # add null word to each sentence
            sentence = ['NULL'] + sentence
            self.english_training.append(sentence)
            # add words to vocabulary
            self.english_voc.update(sentence)
        e.close()

        f = open(french_train, 'r', encoding='utf8')
        for i, line in enumerate(f):
            sentence = line.split()
            self.french_training.append(sentence)
            self.french_voc.update(sentence)
        f.close()

        self.map_to_unk(10, self.english_training, self.english_voc)
        self.map_to_unk(10, self.french_training, self.french_voc)

        for index, e in enumerate(self.english_voc):
            self.english_indices[e] = index
            self.english_words[index] = e

        for index, f in enumerate(self.french_voc):
            self.french_indices[f] = index
            self.french_words[index] = f
        
        self.initialize_translation()
        
        print('Done with reading data!')


    def map_to_unk(self, k, training, vocabulary):
        counts = Counter(w for sent in training for w in sent)
        counted_once = [w for w, count in counts.items() if count == 1]
        counted_once = counted_once[0:k]
        for i, sentence in enumerate(training):
            for j, word in enumerate(sentence):
                if training[i][j] in counted_once:
                    training[i][j] = 'UNK'
        vocabulary.add('UNK')
        for word in counted_once:
            vocabulary.remove(word)
        
    def initialize_translation(self):
        vocab_dict = defaultdict(list)
        for (e,f) in zip(self.english_training, self.french_training):
            for word_f in f:
                for word_e in e:
                    vocab_dict[word_e].append(word_f)
        t= defaultdict(Counter)
        for word_e in vocab_dict:
            words = set(vocab_dict[word_e])
            prob = np.ones(len(words)) / len(words)
            for i, f in enumerate(words):
                t[word_e][f] = prob[i] 
        self.trans_matrix = t
    
    
    def update_translation(self, count_ef, count_e):
        for (eng,fr) in zip(self.english_training, self.french_training):
                for f in fr:
                    for e in eng:
                        self.trans_matrix[e][f] = count_ef[e][f] / count_e[e]
    
 
    def run_EM(self):
        count_ef = defaultdict(Counter)
        count_e = Counter()

        for k, (eng, fr) in enumerate(zip(self.english_training, self.french_training)):
            for f in fr:
                for e in eng:
                    delta = self.trans_matrix[e][f] / float(sum([self.trans_matrix[w][f] for w in eng]))
                    count_ef[e][f] += delta  
                    count_e[e] += delta

        self.update_translation(count_ef, count_e)

        self.likelihoods.append(self.log_likelihood())

        print('Likelihood:', likelihood)
        

    def log_likelihood(self):
        likelihood = 0
        for (eng, fr) in zip(self.english_training, self.french_training):
            alignment = self.align(fr, eng)
            l = 0
            for a, f in enumerate(fr):
                l += np.log(self.trans_matrix[eng[alignment[a]]][f])
            likelihood += l
            likelihood += -len(fr) * np.log(len(eng) + 1) 

        return likelihood
                       
    def align(self, sent_fr, sent_eng):
        alignment = []
        for f_i, f in enumerate(sent_fr):
            alignment_i = []
            highest_prob = -100
            for e_i, e in enumerate(sent_eng):
                prob = self.trans_matrix[e][f]
                if prob > highest_prob:
                    highest_prob = prob
                    alignment_i = e_i
            alignment.append(alignment_i)
        return alignment

    def predict_alignment(self, test_fr, test_eng, outpath):
        f_test = open(test_fr, 'r')
        e_test = open(test_eng, 'r')
        f_sents = []
        e_sents = []
        for line in f_testfile:
            f_sents.append(line.split())
        for line in e_testfile:
            e_sents.append(['NULL'] + line.split())

        alignments = []

        for F, E in zip(f_sents, e_sents):
            alignment = self.align(F, E)
            alignments.append(alignment)
        f_testfile.close()
        e_testfile.close()

        output = open(outpath, 'w')
        nulls = 0
        for k, alignment in enumerate(alignments):
            for f, e in enumerate(alignment):
                if e != 0:
                    output.write('{0} {1} {2} {3}\n'.format(k + 1, e, f + 1, 'S'))

        output.close()

In [106]:
ibm = IBM1()
ibm.read_data(english_train, french_train)

Start reading data...
Done with reading data!


In [107]:
steps = 2

for k in range(steps):
    ibm.run_EM()
    ibm.predict_alignment('dev.f','dev.e', '/prediction/' + 'result-{0}'.format(k))

NameError: name 'likelihood' is not defined

In [None]:
def elbo(likelihood, lmbda_fe, alpha):
    KL = (np.sum(np.multiply(self.t, -1*lmbda_fe + alpha) + loggamma(lmbda_fe), axis=0, keepdims=True) - 
            self.V_f_size * loggamma(alpha) +
            loggamma(self.V_f_size * alpha) -
            loggamma(np.sum(lmbda_fe, axis=0, keepdims=True)))
    return likelihood + (np.sum(KL, axis=1)[0]).real


### Plot results

In [None]:
# Likelihood
plt.plot(range(len(likelihoods)), likelihoods)



# ELBO
plt.plot(range(len(self.elbos)), self.elbos)