In [1]:
import torch
from torch import optim
from functools import partial
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import sys
import os
import torch.nn as nn
import numpy as np



# Locations used throughout the notebook, relative to the notebook's working
# directory: the project's helper modules and the root of the saved checkpoints.
path_to_helper_files = os.path.join('..', 'py_files')
base_saved_models_dir = os.path.join('..', 'saved_models')

# Make the helper modules importable. The membership guard keeps this cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
if path_to_helper_files not in sys.path:
    sys.path.append(path_to_helper_files)

# Project-local modules; presumably resolved through the path added above,
# so these imports have to stay below the sys.path tweak.
import global_variables
import dataset_helper
import nnet_models
import train_utilities

# Device the encoder/decoder are moved to later via .to(device).
device = global_variables.device




In [2]:
# ---- Experiment configuration ----
# These values feed the checkpoint folder name that is assembled further down,
# so changing any of them changes which saved encoder/decoder gets loaded.

# Shared base sizes from which the per-side settings below are derived.
embed_dim = 512
rnn_layers = 2

MAX_LEN = 48
batchSize = 128

# Language pair to evaluate. The outputs recorded in this notebook (the
# "zh2en" configuration banner, the Chinese source sentences in the data
# preview) and the 'greedy_zh_to_en.p' dump were all produced with the
# zh->en pair, so the source language is set to match them; the previous
# value 'vi' would load a different checkpoint than the one shown.
source_name = 'zh'
target_name = 'en'
attention = True
source_rnn_type = 'lstm'

# Encoder (source side) dimensions.
source_embed_dim = embed_dim
source_hidden_size = embed_dim

# Decoder (target side) dimensions. The hidden size is twice the encoder's;
# presumably because the encoder is bidirectional — TODO confirm in nnet_models.
target_embed_dim = embed_dim
target_hidden_size = 2*embed_dim
source_rnn_layers = rnn_layers
target_rnn_layers = rnn_layers

In [3]:
# Forwarded as `max_num` to both LanguagePair datasets built below; presumably
# caps how many sentence pairs are loaded — TODO confirm in dataset_helper.
max_num = 5

In [4]:
# Resolve the tokenized IWSLT train/dev/test files for the configured pair.
# Only these (source, target) combinations have data directories in this project.
supported_pairs = {('vi', 'en'), ('zh', 'en')}

if (source_name, target_name) not in supported_pairs:
    # Raise an ordinary exception instead of calling sys.exit(): inside a
    # notebook a bad configuration should surface as a normal error with a
    # traceback rather than as an interpreter-exit request.
    raise ValueError(source_name+'->'+target_name+' is invalid!')

# All files of one pair share a directory, e.g. ../Data/iwslt-vi-en, and each
# file name ends with the language code of the side it belongs to.
data_dir = '../Data/iwslt-' + source_name + '-' + target_name

target_train_path = data_dir + '/train.tok.' + target_name
source_train_path = data_dir + '/train.tok.' + source_name

target_val_path = data_dir + '/dev.tok.' + target_name
source_val_path = data_dir + '/dev.tok.' + source_name

target_test_path = data_dir + '/test.tok.' + target_name
source_test_path = data_dir + '/test.tok.' + source_name




# Tag identifying the language pair, in the form "<source>2<target>".
pair_tag = source_name+'2'+target_name

# Hyperparameters, in the order in which they appear in both the checkpoint
# folder name and the printed configuration banner.
hparam_fields = [
    ('source_embed_dim', source_embed_dim),
    ('source_hidden_size', source_hidden_size),
    ('source_rnn_layers', source_rnn_layers),
    ('source_rnn_type', source_rnn_type),
    ('target_embed_dim', target_embed_dim),
    ('target_hidden_size', target_hidden_size),
    ('target_rnn_layers', target_rnn_layers),
    ('attention', attention),
]
hparam_items = [field + '=' + str(setting) for field, setting in hparam_fields]

# Per-pair root directory under the saved-models folder.
saved_models_dir = os.path.join(base_saved_models_dir, pair_tag)

# Folder holding encoder.pth / decoder.pth for exactly this configuration:
# "<pair>_" followed by the "name=value" items joined with '-'.
pth_save_folder_name = pair_tag + '_' + '-'.join(hparam_items)
pth_saved_dir = os.path.join(saved_models_dir, pth_save_folder_name)

# Human-readable banner: the pair tag, then one "name=value" item per line
# (every item after the first is preceded by a single space).
config_string = pair_tag + '\n' + '\n '.join(hparam_items)

print(config_string)
sys.stdout.flush()

# Directory passed to the datasets as `lang_obj_path`.
saved_language_model_dir = os.path.join(saved_models_dir, 'lang_obj')



# (source file, target file) for each evaluation split, in the order the
# datasets are constructed: validation first, then test.
split_paths = {
    'val': (source_val_path, target_val_path),
    'test': (source_test_path, target_test_path),
}

# One LanguagePair dataset per split. Both are created with val=True and the
# same lang_obj_path, and both are limited via max_num.
dataset_dict = {
    split: dataset_helper.LanguagePair(
        source_name=source_name,
        target_name=target_name,
        source_path=split_source_path,
        target_path=split_target_path,
        lang_obj_path=saved_language_model_dir,
        val=True,
        max_num=max_num,
    )
    for split, (split_source_path, split_target_path) in split_paths.items()
}

# One loader per split: a single sentence pair per batch, original order,
# loaded in the main process, collated with the validation collate function.
dataloader_dict = {
    split: DataLoader(
        split_dataset,
        batch_size=1,
        collate_fn=dataset_helper.vocab_collate_func_val,
        shuffle=False,
        num_workers=0,
    )
    for split, split_dataset in dataset_dict.items()
}





# Encoder over the source language; its vocabulary size is taken from the
# source language object of the validation dataset.
encoder = nnet_models.EncoderRNN(
    dataset_dict['val'].source_lang_obj.n_words,
    embed_dim=source_embed_dim,
    hidden_size=source_hidden_size,
    rnn_layers=source_rnn_layers,
    rnn_type=source_rnn_type,
).to(device)

# Decoder over the target language, sized from the target language object.
decoder = nnet_models.DecoderRNN(
    dataset_dict['val'].target_lang_obj.n_words,
    embed_dim=target_embed_dim,
    hidden_size=target_hidden_size,
    n_layers=target_rnn_layers,
    attention=attention,
).to(device)

# Restore the trained weights for this configuration. map_location remaps the
# stored tensors onto the active device, so a checkpoint written on a GPU can
# also be loaded on a CPU-only machine instead of failing during torch.load.
# NOTE(review): assumes global_variables.device is a torch.device or a device
# string accepted by torch.load — confirm.
encoder.load_state_dict(torch.load(os.path.join(pth_saved_dir, 'encoder.pth'), map_location=device))
decoder.load_state_dict(torch.load(os.path.join(pth_saved_dir, 'decoder.pth'), map_location=device))

zh2en
source_embed_dim=512
 source_hidden_size=512
 source_rnn_layers=2
 source_rnn_type=lstm
 target_embed_dim=512
 target_hidden_size=1024
 target_rnn_layers=2
 attention=True


In [5]:
# Evaluate on the validation loader with keep_unk=True; verbose=True prints
# the true/predicted sentence pairs. The returned value is left as the cell's
# last expression so the notebook displays it.
train_utilities.validation_function(
    encoder,
    decoder,
    dataloader_dict['val'],
    dataset_dict['val'].target_lang_obj,
    keep_unk=True,
    verbose=True,
)

True Sentence: When I was 11 , I remember waking up one morning to the sound of joy in my house .
Pred Sentence: i remember 11 , and i remember one morning , i woke up with pleasure .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: My father was listening to BBC News on his small , gray radio .
Pred Sentence: my father was listening to the bbc news with his UNK radio .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: There was a big smile on his face which was unusual then , because the news mostly depressed him .
Pred Sentence: he &apos;s smiling . it &apos;s very rare , because most of the news is going to make him depressed .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: &quot; The Taliban are gone ! &quot; my father shouted .
Pred Sentence: the taliban walked . his father was UNK 

20.967010143780684

In [6]:
# Same validation-set evaluation as the keep_unk=True run, but with
# keep_unk=False, so the two displayed values can be compared directly.
train_utilities.validation_function(
    encoder,
    decoder,
    dataloader_dict['val'],
    dataset_dict['val'].target_lang_obj,
    keep_unk=False,
    verbose=True,
)

True Sentence: When I was 11 , I remember waking up one morning to the sound of joy in my house .
Pred Sentence: i remember 11 , and i remember one morning , i woke up with pleasure .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: My father was listening to BBC News on his small , gray radio .
Pred Sentence: my father was listening to the bbc news with his UNK radio .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: There was a big smile on his face which was unusual then , because the news mostly depressed him .
Pred Sentence: he &apos;s smiling . it &apos;s very rare , because most of the news is going to make him depressed .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: &quot; The Taliban are gone ! &quot; my father shouted .
Pred Sentence: the taliban walked . his father was UNK 

20.840857352795958

In [7]:
# Beam-search evaluation on the validation loader with a beam of 3 and
# keep_unk=True; the returned value is displayed as the cell output.
train_utilities.validation_beam_search(
    encoder,
    decoder,
    dataloader_dict['val'],
    dataset_dict['val'].target_lang_obj,
    beam_size=3,
    keep_unk=True,
    verbose=True,
)

True Sentence: When I was 11 , I remember waking up one morning to the sound of joy in my house .
Pred Sentence: when i was 11 , i remember one morning , i woke up and i heard a happy voice .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: My father was listening to BBC News on his small , gray radio .
Pred Sentence: my father was listening to the bbc news with his UNK radio .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: There was a big smile on his face which was unusual then , because the news mostly depressed him .
Pred Sentence: he &apos;s smiling . it &apos;s very rare , because most of the news is going to make him depressed .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: &quot; The Taliban are gone ! &quot; my father shouted .
Pred Sentence: the taliban walked . his father 

21.24385352205114

In [8]:
# Beam-search evaluation on the validation loader with a beam of 3, this time
# with keep_unk=False; the returned value is displayed as the cell output.
train_utilities.validation_beam_search(
    encoder,
    decoder,
    dataloader_dict['val'],
    dataset_dict['val'].target_lang_obj,
    beam_size=3,
    keep_unk=False,
    verbose=True,
)

True Sentence: When I was 11 , I remember waking up one morning to the sound of joy in my house .
Pred Sentence: when i was 11 , i remember one morning , i woke up and i heard a happy voice .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: My father was listening to BBC News on his small , gray radio .
Pred Sentence: my father was listening to the bbc news with his UNK radio .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: There was a big smile on his face which was unusual then , because the news mostly depressed him .
Pred Sentence: he &apos;s smiling . it &apos;s very rare , because most of the news is going to make him depressed .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: &quot; The Taliban are gone ! &quot; my father shouted .
Pred Sentence: the taliban walked . his father 

21.106621612230118

### On Test Sentences

In [9]:
# Evaluate on the test loader with keep_unk=True.
# NOTE(review): the target language object is taken from the 'val' dataset
# although the 'test' loader is evaluated. Both datasets are built with the
# same lang_obj_path, so they presumably share one vocabulary — confirm.
train_utilities.validation_function(
    encoder,
    decoder,
    dataloader_dict['test'],
    dataset_dict['val'].target_lang_obj,
    keep_unk=True,
    verbose=True,
)

True Sentence: Hi . I &apos;m Kevin Allocca , I &apos;m the trends manager at YouTube , and I professionally watch YouTube videos .
Pred Sentence: hello . i &apos;m kevin UNK . i &apos;m a youtube trend , and my specialty is watching youtube video .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: It &apos;s true .
Pred Sentence: it &apos;s true .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: So we &apos;re going to talk a little bit today about how videos go viral and then why that even matters .
Pred Sentence: so today we &apos;re going to talk about why some video UNK are all over and on .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: We all want to be stars -- celebrities , singers , comedians -- and when I was younger , that seemed so very , very hard to do .
Pred Sentence: we 

23.175198984842883

In [10]:
# Beam-search evaluation (beam of 3, keep_unk=True) on the test loader.
# NOTE(review): the target language object is taken from the 'val' dataset
# although the 'test' loader is evaluated. Both datasets are built with the
# same lang_obj_path, so they presumably share one vocabulary — confirm.
train_utilities.validation_beam_search(
    encoder,
    decoder,
    dataloader_dict['test'],
    dataset_dict['val'].target_lang_obj,
    beam_size=3,
    keep_unk=True,
    verbose=True,
)

True Sentence: Hi . I &apos;m Kevin Allocca , I &apos;m the trends manager at YouTube , and I professionally watch YouTube videos .
Pred Sentence: hello . i &apos;m kevin UNK . i &apos;m a youtube trend , and my specialty is watching youtube video .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: It &apos;s true .
Pred Sentence: it &apos;s true .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: So we &apos;re going to talk a little bit today about how videos go viral and then why that even matters .
Pred Sentence: so today , we &apos;re going to talk about why some video UNK can even become important .
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
True Sentence: We all want to be stars -- celebrities , singers , comedians -- and when I was younger , that seemed so very , very hard to do .
Pred Sente

21.540766325554024

### Visualizing Attention

In [11]:
# Re-run the validation evaluation silently (verbose=False) and, through
# return_attention=True, also collect the references, the predictions and the
# attention scores alongside the score.
(score,
 true_corpus,
 pred_corpus,
 attention_scores) = train_utilities.validation_function(
    encoder,
    decoder,
    dataloader_dict['val'],
    dataset_dict['val'].target_lang_obj,
    keep_unk=False,
    verbose=False,
    return_attention=True,
)

In [12]:
# Convert every attention matrix to a NumPy array on the CPU, dropping
# dimension 1 when it has size one. `.detach()` replaces the legacy `.data`
# accessor. Entries that are not tensors (i.e. already converted) are passed
# through unchanged, so re-running this cell is safe; previously a second run
# raised AttributeError because the cell overwrites its own input.
attention_scores = [
    att_mat.detach().cpu().squeeze(1).numpy() if torch.is_tensor(att_mat) else att_mat
    for att_mat in attention_scores
]

In [13]:
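# Preview of the first rows of the validation DataFrame. The table recorded
# below was produced by this expression; uncomment it to display the table again.
# dataset_dict['val'].main_df[:5]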

Unnamed: 0,source_data,target_data,source_tokenized,source_len,target_tokenized,target_len,source_indized,target_indized
0,我 11 岁 那年 记得 得有 一天 早晨 醒来 听见 家里 有 愉悦 的 声音,"When I was 11 , I remember waking up one morni...","[我, 11, 岁, 那年, 记得, 得有, 一天, 早晨, 醒来, 听见, 家里, 有, ...",16,"[when, i, was, 11, ,, i, remember, waking, up,...",21,"[32, 1908, 480, 8839, 1022, 2038, 940, 5214, 4...","[181, 27, 200, 1186, 4, 27, 379, 5875, 43, 42,..."
1,我 的 父亲 在 用 他 的 灰色 小 收音 收音机 听 BBC 新闻,My father was listening to BBC News on his sma...,"[我, 的, 父亲, 在, 用, 他, 的, 灰色, 小, 收音, 收音机, 听, bbc,...",15,"[my, father, was, listening, to, bbc, news, on...",15,"[32, 5, 1508, 12, 40, 177, 5, 6767, 462, 5060,...","[142, 372, 200, 1685, 21, 13780, 798, 32, 104,..."
2,他 面带 面带笑容 带笑 笑容 这 很少 少见 因为 大部 大部分 部分 的 新闻...,There was a big smile on his face which was un...,"[他, 面带, 面带笑容, 带笑, 笑容, 这, 很少, 少见, 因为, 大部, 大部分, ...",21,"[there, was, a, big, smile, on, his, face, whi...",21,"[177, 15638, 2, 2, 17475, 23, 2364, 11794, 107...","[14, 200, 30, 292, 3099, 32, 104, 421, 184, 20..."
3,塔利 塔利班 走 了 父亲 大声 叫 着,&quot; The Taliban are gone ! &quot; my father...,"[塔利, 塔利班, 走, 了, 父亲, 大声, 叫, 着]",9,"[&quot;, the, taliban, are, gone, !, &quot;, m...",12,"[5243, 5249, 1702, 14, 1508, 6530, 466, 82, 1]","[117, 5, 3595, 18, 1335, 114, 117, 142, 372, 2..."
4,我 不知 知道 那 意味 意味着 什么 但是 我 能 看出 父亲 非常 非常 非常高兴 高兴,"I didn &apos;t know what it meant , but I coul...","[我, 不知, 知道, 那, 意味, 意味着, 什么, 但是, 我, 能, 看出, 父亲, ...",17,"[i, didn, &apos;t, know, what, it, meant, ,, b...",22,"[32, 155, 50, 281, 1253, 1279, 105, 487, 32, 5...","[27, 264, 63, 159, 58, 11, 1127, 4, 37, 27, 34..."


In [14]:
# Tokenized source sentences of the first five validation rows, each copied
# into a plain list.
# NOTE(review): the literal 5 equals the current max_num; presumably the two
# are meant to stay in sync — confirm before changing either.
source_corpus = list(map(list, dataset_dict['val'].main_df['source_tokenized'][:5]))

# Bundle the source tokens with the references, predictions and attention
# scores gathered by the attention-returning evaluation run.
greedy = dict(
    source_corpus=source_corpus,
    true_corpus=true_corpus,
    pred_corpus=pred_corpus,
    attention_scores=attention_scores,
)

In [15]:
import pickle

In [16]:
# Persist the bundled results. The file name is derived from the configured
# language pair (for zh->en this is still 'greedy_zh_to_en.p'), so results
# for another pair can no longer be written under a zh label. The `with`
# block closes the file handle, which the bare open() call never did.
greedy_dump_path = 'greedy_' + source_name + '_to_' + target_name + '.p'
with open(greedy_dump_path, 'wb') as dump_file:
    pickle.dump(greedy, dump_file)