## Neural Machine Translation

In [None]:
import os
import sys
import pickle
import random  
import datetime
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

import spacy
from torchtext.data import Field, BucketIterator, TabularDataset

from spacy.lang.zh import Chinese  # chinese tokenizer
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

sys.path.append(os.path.abspath(os.path.join("..")))
from models.encoder import Encoder
from models.decoder import Decoder
from models.seq2seq import Seq2Seq

In [2]:
# Enumerate GPUs in PCI bus order so the index used below refers to a stable
# physical card (CUDA's default ordering is "fastest first").
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

# Prefer GPU 4 (the card this notebook was developed on). Fall back to GPU 0 on
# machines with fewer cards, and to the CPU when CUDA is unavailable. DEVICE is
# always a torch.device so downstream code can rely on `.type` / `.index`.
if torch.cuda.is_available() and torch.cuda.device_count() > 4:
    DEVICE = torch.device('cuda:4')
elif torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')
print('DEVICE :', DEVICE)

DEVICE : cuda:4


Create Fields, which form the pipeline that converts sentences into vectors. We will create one field for each language.

In [3]:
# English (target) field: spaCy tokenisation, lower-cased text, every sentence
# wrapped in <sos> ... <eos>, and batches laid out batch-first.
en_field = Field(
    batch_first=True,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    tokenizer_language='en',
    tokenize='spacy',
)



# Blank spaCy Chinese pipeline, used only for its tokenizer.
tokenizer = Chinese()


def tokenize_zh(sentence, tokenizer=tokenizer):
    """Split a Chinese sentence into a list of token strings.

    Args:
        sentence: raw text to tokenise.
        tokenizer: callable returning an iterable of objects with a ``.text``
            attribute; defaults to the module-level spaCy ``Chinese()`` pipeline.

    Returns:
        list of str, one entry per token, in sentence order.
    """
    pieces = []
    for token in tokenizer(sentence):
        pieces.append(token.text)
    return pieces

# Chinese (source) field: custom tokenizer, sentences wrapped in <sos> ... <eos>,
# batch-first layout. No lower-casing is applied here.
# NOTE(review): `tokenizer_language` is passed together with a callable
# tokenizer; it is presumably ignored in that case -- confirm against torchtext.
zh_field = Field(
    batch_first=True,
    init_token='<sos>',
    eos_token='<eos>',
    tokenizer_language='zh',
    tokenize=tokenize_zh,
)

In [None]:
# from lxml import etree

# tree = etree.iterparse('../../../Dataset/MT/Chinese/globalvoices.zht-en.xliff')
# i = 0
# for a in tree:
#     #print(( element.tag))
#     action, element = a
#     try:
#         print(element.text)
#     except:
#         pass
#     i += 1
#     if i == 1000:
#         break

In [4]:
# Root folder holding the parallel-corpus CSVs (and the cached embedding matrices).
dataset_dir = '../../../Dataset/MT/Chinese/old_dataset/'

# Each CSV row is (Chinese sentence, English sentence); columns are bound to
# their fields in that order.
# NOTE(review): the vocabulary dumps printed after build_vocab start with the
# tokens 'zh' / 'en', which suggests the CSV header row is being read as a
# training example. Passing `skip_header=True` would drop it, but that can shift
# vocabulary indices, so the cached *_embed_matrix.npy files would have to be
# regenerated at the same time -- confirm before changing.
train_set, val_set = TabularDataset.splits(
    fields=[('Chinese', zh_field), ('English', en_field)],
    format='CSV',
    validation='zh_en_validate.csv',
    train='zh_en.csv',
    path=dataset_dir,
)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.124 seconds.
Prefix dict has been built succesfully.


In [5]:
# Build both vocabularies, keeping only tokens seen at least 5 times.
# NOTE(review): the validation split also contributes to the vocabulary counts;
# confirm this is intended (the cached embedding matrices depend on it).
en_field.build_vocab(train_set, val_set, min_freq=5)
zh_field.build_vocab(train_set, val_set, min_freq=5)

# Peek at the first 30 distinct tokens encountered for each language
# (`freqs` holds raw counts, i.e. before the min_freq cut-off).
zh_preview = list(zh_field.vocab.freqs)[:30]
print("Example from Chinese vocabulary:\n", zh_preview)
print("Examples from English vocabulary:\n", list(en_field.vocab.freqs)[:30])

Example from Chinese vocabulary:
 ['zh', '1929', '年', '还是', '1989', '?', '巴黎', '-', '随着', '经济危机', '不断', '加深', '和', '蔓延', '，', '整个', '世界', '一直', '在', '寻找', '历史', '上', '的', '类似', '事件', '希望', '有助于', '我们', '了解', '目前']
Examples from English vocabulary:
 ['en', '1929', 'or', '1989', '?', 'paris', '–', 'as', 'the', 'economic', 'crisis', 'deepens', 'and', 'widens', ',', 'world', 'has', 'been', 'searching', 'for', 'historical', 'analogies', 'to', 'help', 'us', 'understand', 'what', 'happening', '.', 'at']


In [6]:
# Vocabulary sizes: number of entries in each field's string-to-index mapping
# (built above with min_freq=5).
print('English vocabulary size:',len(en_field.vocab.stoi))
print('Chinese vocabulary size:',len(zh_field.vocab.stoi))

English vocabulary size: 24628
Chinese vocabulary size: 30896


## Word Embeddings

### English

In [None]:
# # Loading pre-trained glove-embeddings for English
# model_gigaword = api.load("glove-wiki-gigaword-100")

In [None]:
# # check word embeddings
# word = 'look'
# print(model_gigaword.wv[word])
# model_gigaword.wv.most_similar(positive=[word], topn=5)

In [None]:
# # create english word embedding matrix
# eng_vocab_sz = len(en_field.vocab.stoi)
# eng_embed_dim = 100
# eng_embed_matrix = torch.zeros((eng_vocab_sz, eng_embed_dim))

# for i, word in enumerate(en_field.vocab.stoi.keys()):
#     try:
#         eng_embed_matrix[i] = torch.from_numpy(model_gigaword.wv[word])
#     except KeyError:
#         if word in ['<unk>', '<sos>', '<eos>', '<pad>']:
#             eng_embed_matrix[i] = torch.ones((eng_embed_dim,))*i
#         else:
#             print("No embedding vector for", word)

# print("English embedding vector created.")
# print(eng_embed_matrix.shape)

### Chinese

In [None]:
# import io

# def load_vectors(fname):
#     fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
#     n, d = map(int, fin.readline().split())
#     data = {}
#     for line in fin:
#         tokens = line.rstrip().split(' ')
#         data[tokens[0]] = map(float, tokens[1:])
#     return data

In [None]:
# fname = '../../../Dataset/MT/Chinese/embeddings/zhwiki_20180420_100d.txt'
# data = load_vectors(fname)

In [None]:
# # create chinese word embedding matrix
# ch_vocab_sz = len(zh_field.vocab.stoi)
# ch_embed_dim = 100
# ch_embed_matrix = torch.zeros((ch_vocab_sz, ch_embed_dim))

# for i, word in enumerate(zh_field.vocab.stoi.keys()):
#     try:
#         ch_embed_matrix[i] = torch.tensor(list(data[word]))
#     except KeyError:
#         if word in ['<unk>', '<sos>', '<eos>', '<pad>']:
#             ch_embed_matrix[i] = torch.ones((ch_embed_dim,))*i
#         else:
#             print("No embedding vector for", word)

In [None]:
# print("Chinese embedding vector created.")
# print(ch_embed_matrix.shape)

In [None]:
# # save the embeddings
# np.save(os.path.join(dataset_dir, 'eng_embed_matrix'), eng_embed_matrix.cpu().numpy())
# np.save(os.path.join(dataset_dir, 'ch_embed_matrix'), ch_embed_matrix.cpu().numpy())

In [8]:
# Load the cached embedding matrices (one row per vocabulary entry) onto DEVICE.
eng_embed_matrix = torch.from_numpy(np.load(os.path.join(dataset_dir, 'eng_embed_matrix.npy'))).to(DEVICE)
ch_embed_matrix = torch.from_numpy(np.load(os.path.join(dataset_dir, 'ch_embed_matrix.npy'))).to(DEVICE)

# The matrices were built offline from a previous vocabulary. Fail fast if the
# vocabularies rebuilt in this session no longer have the same number of
# entries, otherwise token ids would silently index the wrong (or missing) rows.
assert eng_embed_matrix.shape[0] == len(en_field.vocab.itos), (
    f'English embedding rows ({eng_embed_matrix.shape[0]}) != '
    f'English vocabulary size ({len(en_field.vocab.itos)}); regenerate eng_embed_matrix.npy'
)
assert ch_embed_matrix.shape[0] == len(zh_field.vocab.itos), (
    f'Chinese embedding rows ({ch_embed_matrix.shape[0]}) != '
    f'Chinese vocabulary size ({len(zh_field.vocab.itos)}); regenerate ch_embed_matrix.npy'
)

print(eng_embed_matrix.dtype)
print(eng_embed_matrix.shape)
print(ch_embed_matrix.shape)

torch.float32
torch.Size([24628, 100])
torch.Size([30896, 100])


### Instantiate Model

In [9]:
# --- Encoder: reads Chinese token ids, initialised with the Chinese embedding matrix ---
hidden_dim = 512  # hidden size passed to the encoder LSTM (the printed model shows LSTM(100, 512, ..., bidirectional=True))
num_layers = 4
dropout = 0.1
# layer_norm = True   
encoder = Encoder(ch_embed_matrix, hidden_dim, num_layers, dropout=dropout, bidirectional=True)

# --- Decoder: emits English tokens, initialised with the English embedding matrix ---
hid_sz = 512
# NOTE(review): despite the `_ch` suffix this is the size of the ENGLISH vocabulary
# (it is computed from en_field). The name is kept because it is also the key
# stored in the pickled hyperparams.
vocab_size_ch = len(en_field.vocab.stoi)
decoder = Decoder(eng_embed_matrix, hid_sz, encoder.output_size, vocab_size_ch)

# Hyper-parameters persisted next to the checkpoints (see info.pickle).
hyperparams = { 'hidden_dim':hidden_dim, 'num_layers':num_layers,
               'dropout':dropout, 'hid_sz':hid_sz, 
               'vocab_size_ch':vocab_size_ch}

# Start with teacher-forcing ratio 1.0; the training loop lowers it over time.
model = Seq2Seq(encoder, decoder, tf_ratio = 1.0, device=DEVICE).to(DEVICE)
# Last expression of the cell: switches to train mode and displays the module tree.
model.train()

Seq2Seq(
  (encoder): Encoder(
    (embed_layer): Embedding(30896, 100)
    (lstm): LSTM(100, 512, num_layers=4, dropout=0.1, bidirectional=True)
  )
  (decoder): Decoder(
    (embed_layer): Embedding(24628, 100, padding_idx=0)
    (attention_layer): Attention(
      (linear1): Linear(in_features=1536, out_features=768, bias=True)
      (linear2): Linear(in_features=768, out_features=1, bias=True)
    )
    (pre_lstm_cell): LSTMCell(1124, 512)
    (post_lstm_cell): LSTMCell(1536, 512)
    (mlp): Sequential(
      (0): Linear(in_features=512, out_features=24628, bias=True)
      (1): ReLU()
      (2): BatchNorm1d(24628, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Softmax(dim=1)
    )
  )
)

### Training

In [10]:
# Sequence bucketing based on size of English sentences: examples of similar
# target length are batched together to limit padding.
BATCH_SIZE = 32

train_iterator, val_iterator = BucketIterator.splits(
    (train_set, val_set), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.English), 
    shuffle=True, 
)

# Sanity check: pull one batch and show the tensor shapes for both languages.
batch = next(iter(train_iterator))
print(batch.Chinese.shape)
print(batch.English.shape)

# len() of an iterator is the number of BATCHES per epoch, not the number of
# examples, so label it accordingly.
print("Num training batches", len(train_iterator))
print("Num validation batches", len(val_iterator))

torch.Size([32, 57])
torch.Size([32, 66])
Num training example 7587
Num validation example 157


In [11]:
# model.load_state_dict(torch.load(os.path.join(save_dir, 'las_model_1')))
# model.train()

# load = False
# if load:
#     saved_file = 'Trained Models/Training_2019-12-25 00:09:23.921978/las_model_6'
#     model.load_state_dict(torch.load(saved_afile))
#     start_epoch = int(saved_file[-1]) + 1
#     time = os.listdir(tensorboard_dir)[-1]  # use the last one 

# Every run gets its own checkpoint / TensorBoard directory, named after the
# run label and the start timestamp.
# NOTE(review): str(datetime.now()) contains spaces and ':' characters, which
# are not valid in directory names on every platform.
NAME = 'Long_training'
time = str(datetime.datetime.now())
save_dir = os.path.join('trained_models', f'{NAME}_{time}')
# makedirs also creates the 'trained_models' parent on a fresh checkout
# (os.mkdir would raise FileNotFoundError) and tolerates an existing directory.
os.makedirs(save_dir, exist_ok=True)

# Saving hyperparams alongside the checkpoints so a run can be reconstructed.
with open(os.path.join(save_dir, 'info.pickle'), 'wb') as f:
    pickle.dump(hyperparams, f)

In [12]:
def train(model, device, train_loader, optimizer, epoch, 
          print_interval, writer=None, log_interval=-1, scheduler=None, train_dataset=None):
    """Train ``model`` for one epoch over ``train_loader``.

    Args:
        model: module whose forward ``model(src, tgt)`` returns ``(loss, output)``.
        device: device the source/target batches are moved to before the forward pass.
        train_loader: iterable of batches exposing ``.Chinese`` and ``.English``
            tensors; must support ``len()`` and have a ``.dataset`` attribute.
        optimizer: optimizer stepping the model parameters.
        epoch: zero-based epoch index (used in log lines, TensorBoard steps and
            checkpoint file names).
        print_interval: print the mean loss every this many batches; values <= 0
            disable console progress.
        writer: optional TensorBoard ``SummaryWriter``-like object.
        log_interval: write the mean loss to ``writer`` every this many batches;
            the default (-1) and any other value <= 0 disable TensorBoard logging.
        scheduler: unused; kept for backward compatibility.
        train_dataset: unused; kept for backward compatibility.

    Side effects:
        Half-way through the epoch the model and optimizer state are saved to the
        notebook-level ``save_dir`` and ``validate_personal`` prints two sample
        translations drawn from ``train_loader``.
    """
    model.train()
    print(f'Training, Logging: Mean loss of previous {print_interval} batches \n')

    running_loss = []
    num_batches = len(train_loader)
    interval_start = datetime.datetime.now()

    # Non-positive intervals mean "disabled". Without this guard
    # `(batch_idx + 1) % -1 == 0` is always true, so the default log_interval
    # logged on every batch (and a NaN mean of an empty slice on the first one).
    console_enabled = print_interval is not None and print_interval > 0
    tensorboard_enabled = writer is not None and log_interval is not None and log_interval > 0

    for batch_idx, batch in enumerate(train_loader):
        # Honour the `device` argument rather than a notebook global.
        data, target = batch.Chinese.to(device), batch.English.to(device)
        loss, _ = model(data, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss.append(loss.item())    # update running loss

        # writing to console after print_interval batches
        if console_enabled and (batch_idx + 1) % print_interval == 0:
            now = datetime.datetime.now()
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tMean Loss : {:.6f}\t lr {}\t time {}:'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / num_batches, 
                np.mean(running_loss[-print_interval:]), 
                optimizer.param_groups[0]['lr'],
                now - interval_start))
            interval_start = now

        # Writing to tensorboard
        if tensorboard_enabled and (batch_idx + 1) % log_interval == 0:
            global_step = epoch * num_batches + batch_idx
            writer.add_scalar('Loss', np.mean(running_loss[-log_interval:]), global_step)

        # Mid-epoch checkpoint plus a qualitative check on the loader being
        # trained on (previously hard-wired to the global `train_iterator`).
        if batch_idx == num_batches // 2:
            torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_half_{epoch}'))
            torch.save(optimizer.state_dict(), os.path.join(save_dir, f'optim_half_{epoch}'))
            validate_personal(model, train_loader)
            
    
def decode_pred_sent(out, itos=None):
    """Turn per-step model scores into a readable sentence.

    Args:
        out: tensor of scores whose last dimension ranges over the vocabulary;
            a leading singleton batch dimension is dropped. Presumably shaped
            (1, seq_len, vocab_size) as returned by the model -- TODO confirm.
        itos: optional index-to-token list; defaults to the English field's
            vocabulary (``en_field.vocab.itos``).

    Returns:
        str: the highest-scoring token at each step, separated by single spaces.
    """
    if itos is None:
        itos = en_field.vocab.itos
    # Greedy decoding: pick the arg-max vocabulary entry for every time step.
    token_ids = out.squeeze(0).argmax(dim=-1).reshape(-1).tolist()
    # The vocabulary is word-level, so tokens must be separated by spaces
    # (joining with '' glued all the words together).
    return ' '.join(itos[idx] for idx in token_ids)


def decode_true_sent(y, itos=None):
    """Turn a tensor of target token ids into a readable sentence.

    Args:
        y: integer tensor of token ids; a leading singleton batch dimension is
            dropped. Presumably shaped (1, seq_len) -- TODO confirm.
        itos: optional index-to-token list; defaults to the English field's
            vocabulary (``en_field.vocab.itos``).

    Returns:
        str: the tokens for each id, separated by single spaces.
    """
    if itos is None:
        itos = en_field.vocab.itos
    token_ids = y.squeeze(0).reshape(-1).tolist()
    # Word-level vocabulary: join with spaces so the words stay separated.
    return ' '.join(itos[idx] for idx in token_ids)


def validate_personal(model, test_loader):
    """Print two sample translations (reference, prediction, loss) for eyeballing.

    For each of two draws a fresh iterator is opened on ``test_loader`` and the
    i-th sentence pair of its first batch is fed through the model on its own.

    Args:
        model: module whose forward ``model(src, tgt)`` returns ``(loss, output)``.
        test_loader: iterable of batches exposing ``.Chinese`` and ``.English``.

    Notes:
        * Runs under ``torch.no_grad()`` in eval mode, and restores whatever
          train/eval mode the model was in beforehand, even if the forward pass
          raises.
        * NOTE(review): the forward pass receives the reference sentence, so the
          printed prediction may be teacher-forced depending on the model's
          ``tf_ratio`` -- confirm against the Seq2Seq implementation.
    """
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for inspection; skip building the autograd graph.
        with torch.no_grad():
            for i in range(2):
                batch = next(iter(test_loader))

                # A final, short batch may hold fewer than two sentences.
                if i >= batch.Chinese.shape[0]:
                    continue

                ch_sent = batch.Chinese[i].unsqueeze(dim=0).to(DEVICE)
                eng_sent = batch.English[i].unsqueeze(dim=0).to(DEVICE)

                loss, output = model(ch_sent, eng_sent)

                print("\n")
                print("True sent : ", decode_true_sent(eng_sent))
                print("Pred sent : ", decode_pred_sent(output))
                print("Loss :", loss.item())  
    finally:
        # Put the model back in the mode the caller had it in.
        model.train(was_training)

In [None]:
optimizer = optim.Adam(model.parameters(), amsgrad=True)

log_interval = 5      # TensorBoard: mean loss every 5 batches
print_interval = 50   # console: mean loss every 50 batches

epochs = 40
load = False

# TensorBoard events are written next to the checkpoints.
writer = SummaryWriter(save_dir)
print('save_dir', save_dir)



# load_dict = 'trained_models/Adadelta_NC_step_1_2019-12-31 04:30:30.395730'
# model.load_state_dict(torch.load(os.path.join(load_dict, 'las_model_half_0')))
# optimizer.load_state_dict(torch.load(os.path.join(load_dict, 'optim_half_0')))


try:
    for epoch in range(0,epochs): 
        train(model, DEVICE, train_iterator, optimizer, epoch, print_interval, writer, log_interval)

        #save model
        torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_{epoch}'))
        torch.save(optimizer.state_dict(), os.path.join(save_dir, f'optim_{epoch}'))

        # Qualitative check on samples from the TRAINING iterator (a sanity
        # check of fitting, not a held-out evaluation).
        validate_personal(model, train_iterator)

        # Decrease tf_ratio by 0.1 every 10 epochs. Round to avoid float drift
        # (e.g. 0.7000000000000001) and never let the ratio go negative if
        # `epochs` is raised.
        if (epoch+1)%10 == 0:
            model.tf_ratio = max(0.0, round(model.tf_ratio - 0.1, 2))
            print("\nTeacher forcing ratio:", model.tf_ratio)

    #     if scheduler:
    #         validate_personal(model, 2, train_dataset)
    #         for param_group in optimizer.param_groups:
    #             param_group['lr'] = max(param_group['lr']*0.1, 0.001)
    #         print("-"*10, "LR decreased", '-'*10)
finally:
    # Flush and close the event file even if training is interrupted, so the
    # last logged scalars are not lost.
    writer.close()

save_dir trained_models/Long_training_2020-01-01 14:04:32.058587
Training, Logging: Mean loss of previous 50 batches 



In [None]:
# Print two sample translations drawn from the held-out validation iterator.
validate_personal(model, val_iterator)