In [1]:
import pickle

In [2]:
# One (initially empty) list of tokenised sentences per "<language>_<split>" key,
# ordered with every English split first and then every Vietnamese split.
tokenized_stores = {
    language + '_' + split: []
    for language in ('en', 'vi')
    for split in ('train', 'validation', 'test')
}

In [3]:
# Read every corpus split from disk and store it as a list of token lists.
# Keys look like "<lang>_<split>" (e.g. "en_train"); the matching file is "<split>.<lang>".
DATA_DIR = "D:/Statistical-Machine-Translation-master/data/"

for key in tokenized_stores:
    file_name = DATA_DIR + str(key)[3:] + "." + str(key)[0:2]

    # The context manager closes the handle as soon as the file has been read.
    with open(file_name, 'r', encoding='utf-8') as corpus_file:
        sentences = corpus_file.read().split('\n')

    # NOTE(review): split('\n') yields a trailing '' when the file ends with a newline,
    # which is stored below as the sentence ['']. Kept as-is so the sizes and vocabulary
    # counts are unchanged - confirm whether that extra entry is wanted.
    # Assigning (rather than appending) keeps the cell idempotent: re-running it
    # replaces the stored sentences instead of duplicating them.
    tokenized_stores[key] = [sentence.split(' ') for sentence in sentences]

In [8]:
# Show the tokens of the English training sentence at index 1.
english_sample = tokenized_stores['en_train'][1]
print(english_sample)

['In', '4', 'minutes', ',', 'atmospheric', 'chemist', 'Rachel', 'Pike', 'provides', 'a', 'glimpse', 'of', 'the', 'massive', 'scientific', 'effort', 'behind', 'the', 'bold', 'headlines', 'on', 'climate', 'change', ',', 'with', 'her', 'team', '--', 'one', 'of', 'thousands', 'who', 'contributed', '--', 'taking', 'a', 'risky', 'flight', 'over', 'the', 'rainforest', 'in', 'pursuit', 'of', 'data', 'on', 'a', 'key', 'molecule', '.']


In [9]:
# Show the tokens of the Vietnamese training sentence at index 1.
vietnamese_sample = tokenized_stores['vi_train'][1]
print(vietnamese_sample)

['Trong', '4', 'phút', ',', 'chuyên', 'gia', 'hoá', 'học', 'khí', 'quyển', 'Rachel', 'Pike', 'giới', 'thiệu', 'sơ', 'lược', 'về', 'những', 'nỗ', 'lực', 'khoa', 'học', 'miệt', 'mài', 'đằng', 'sau', 'những', 'tiêu', 'đề', 'táo', 'bạo', 'về', 'biến', 'đổi', 'khí', 'hậu', ',', 'cùng', 'với', 'đoàn', 'nghiên', 'cứu', 'của', 'mình', '--', 'hàng', 'ngàn', 'người', 'đã', 'cống', 'hiến', 'cho', 'dự', 'án', 'này', '--', 'một', 'chuyến', 'bay', 'mạo', 'hiểm', 'qua', 'rừng', 'già', 'để', 'tìm', 'kiếm', 'thông', 'tin', 'về', 'một', 'phân', 'tử', 'then', 'chốt', '.']


In [11]:
# Number of stored sentences in each split, measured on the Vietnamese side.
train_size, dev_size, test_size = (
    len(tokenized_stores['vi_' + split])
    for split in ('train', 'validation', 'test')
)
print(train_size)

133318


In [13]:
# Build the vocabularies: word -> number of occurrences, accumulated over every
# split (train, validation and test) of each language.

en_words = {}
vi_words = {}

for key, sentences in tokenized_stores.items():
    # Keys starting with 'e' feed the English table; everything else is Vietnamese.
    word_counts = en_words if str(key)[0] == 'e' else vi_words
    for sentence in sentences:
        for word in sentence:
            word_counts[word] = word_counts.get(word, 0) + 1

en_vocab = len(en_words)
vi_vocab = len(vi_words)
print("Number of Unique Words:")
print("> English:", en_vocab)
print("> Vietnamese:", vi_vocab)

Number of Unique Words:
> English: 54418
> Vietnamese: 25763


In [14]:
# creating the 't' (translation table); entries are added lazily by the training loop
t = {}
# usage: t[(VI_word, EN_word)] = probability of VI_word given EN_word
# (the training loop keys entries as (vi_word, en_word) and normalises each
#  expected count by the total accumulated for the English word)
# 'uniform' is the starting value given to a pair the first time training sees it.
uniform = 1 / (en_vocab * vi_vocab)

In [None]:
# Expectation-maximisation training of the translation table t[(vi_word, en_word)].
#
# Each iteration:
#   1. walks every training sentence pair and accumulates, for each (vi_word, en_word)
#      pair in it, the share of vi_word attributed to en_word (count) and the
#      per-English-word sum of those shares (total);
#   2. re-estimates t[(vi_word, en_word)] = count / total[en_word] for the selected pairs.
# Training stops after max_iters iterations, or as soon as no re-estimated entry
# moved by more than the mode's threshold.
n_iters = 0
max_iters = 25

# fine_tune == 0: re-estimate every pair that co-occurs in the training data.
# fine_tune == 1: re-estimate only pairs built from the max_words most frequent
#                 English and Vietnamese words; other entries keep their current value.
fine_tune = 1
max_words = 1000
has_converged = False


def _most_frequent_words(word_counts, limit):
    """Return the `limit` words with the highest counts, ties broken by the word itself (descending)."""
    ranked = sorted(word_counts.items(), key=lambda item: (item[1], item[0]), reverse=True)
    return [word for word, _ in ranked[:limit]]


# The vocabularies do not change during training, so rank them once up front
# instead of re-sorting them inside every iteration.
top_en_words = _most_frequent_words(en_words, max_words)
top_vi_words = _most_frequent_words(vi_words, max_words)

vi_train = tokenized_stores['vi_train']
en_train = tokenized_stores['en_train']

while n_iters < max_iters and not has_converged:
    has_converged = True
    max_change = -1  # stays -1 when no entry moved by more than the threshold

    n_iters += 1

    # collecting the expected counts
    count = {}
    total = {}
    for index in range(train_size):
        vi_sentence = vi_train[index]
        en_sentence = en_train[index]

        # Per-sentence normaliser: sum of t over all English words, for each Vietnamese word.
        s_total = {}
        for vi_word in vi_sentence:
            s_total[vi_word] = 0
            for en_word in en_sentence:
                # Pairs seen for the first time start from the uniform value.
                s_total[vi_word] += t.setdefault((vi_word, en_word), uniform)

        for vi_word in vi_sentence:
            for en_word in en_sentence:
                pair = (vi_word, en_word)
                share = t[pair] / s_total[vi_word]
                count[pair] = count.get(pair, 0) + share
                total[en_word] = total.get(en_word, 0) + share

    # estimating the probabilities
    if fine_tune == 0:
        # Every co-occurring pair is a key of count, so iterate it directly.
        pairs_to_update = count
        threshold = 0.01
    elif fine_tune == 1:
        # Only pairs of frequent words that actually co-occurred in training;
        # any pair present in count also has its English word present in total.
        pairs_to_update = (
            (vi_word, en_word)
            for en_word in top_en_words
            for vi_word in top_vi_words
            if (vi_word, en_word) in count
        )
        threshold = 0.005
    else:
        # Unknown mode: nothing is re-estimated, so the loop ends after this iteration.
        pairs_to_update = ()
        threshold = None

    for pair in pairs_to_update:
        new_probability = count[pair] / total[pair[1]]
        change = abs(t[pair] - new_probability)
        if change > threshold:
            has_converged = False
            max_change = max(max_change, change)
        t[pair] = new_probability

    print("Iteration " + str(n_iters) + " Completed, Maximum Change: " + str(max_change))
