<a href="https://colab.research.google.com/github/yoyoyo-yo/DeepLearningMugenKnock/blob/master/Scripts_NLP/pytorch/Transformer_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer English - Franch

元論文 : Attention if All You Need https://arxiv.org/abs/1706.03762 (2017)



In [1]:
!pip install numpy matplotlib opencv-python torch torchvision torchsummary pandas easydict



## Download dataset

In [2]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2020-05-10 07:09:08--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 2606:4700:3033::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5982778 (5.7M) [application/zip]
Saving to: ‘fra-eng.zip.2’


2020-05-10 07:09:08 (16.1 MB/s) - ‘fra-eng.zip.2’ saved [5982778/5982778]



In [0]:
import zipfile

def load_ds():
    #
    _chars = '!?/.'
    corpus1 = []
    corpus2 = []

    data1 = []
    data2 = []

    with zipfile.ZipFile('fra-eng.zip') as z:
        with z.open('fra.txt') as f:
            lines = f.readlines()
            for x in lines[:100]:
                lang1, lang2, _ = [y.rstrip() for y in x.decode('utf-8').split('\t')]
                corpus1 += [w.rstrip(_chars) for w in lang1.split(' ')] #list(set(corpus1) | set(list(lang1.split(' '))))
                corpus2 += [w.rstrip(_chars).rstrip('\u202f') for w in lang2.split(' ')] #list(set(corpus2) | set(list(lang2.split(' '))))
                #print([y.rstrip() for y in x.decode('utf-8').split('\t')])

            corpus1 = list(set(corpus1)) # drop duplicate
            corpus1.sort()
            corpus1 = ['<SOS>', '<EOS>', '<UNKNOWN>'] + list(_chars) + corpus1
            corpus2 = list(set(corpus2)) # drop duplicate
            corpus2.sort()
            corpus2 = ['<SOS>', '<EOS>', '<UNKNOWN>'] + list(_chars) + corpus2

            for x in lines[:100]:
                lang1, lang2, _ = [y.rstrip() for y in x.decode('utf-8').split('\t')]
                data1 += [[corpus1.index('<SOS>')] + [corpus1.index(w.rstrip(_chars)) for w in lang1.split(' ')] + [corpus1.index('<EOS>')]]
                data2 += [[corpus2.index('<SOS>')] + [corpus2.index(w.rstrip(_chars).rstrip('\u202f')) for w in lang2.split(' ')] + [corpus2.index('<EOS>')]]

    return {'corpus1' : corpus1, 'corpus2' : corpus2, 'data1' : data1, 'data2' : data2} 

data_dict = load_ds()

## Import and Config

In [4]:
import os
import argparse
from pprint import pprint

import numpy as np
from collections import OrderedDict
from easydict import EasyDict
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary


#---
# config
#---
cfg = EasyDict()

cfg.CORPUS1_NUM = len(data_dict['corpus1'])
cfg.CORPUS2_NUM =  len(data_dict['corpus2'])

# Seq2seq config
cfg.SEQ2SEQ_MAX_LENGTH = 1000 # decoder max output length
cfg.SEQ2SEQ_TRAIN_FORCE_PROB = 0.5 # train input is forced to gt with this probability
cfg.SEQ2SEQ_NEXT_WORD_SELECTION = 'prob' # prob, argmax
cfg.SEQ2SEQ_RNN_DIM = 512
cfg.SEQ2SEQ_USE_RNN_BD = True # use bidirectional RNN

cfg.SEQ2SEQ_E_ATTENTION = True
cfg.SEQ2SEQ_E_ATTENTION_TIME = 6  # Hopping if > 1
cfg.SEQ2SEQ_E_DIM = 64
cfg.SEQ2SEQ_E_ATTENTION_DIM = 64
cfg.SEQ2SEQ_E_DROPOUT = 0.2
cfg.SEQ2SEQ_E_USE_SELF_ATTENTION = True # self attention of Encoder
cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION = True # use source target attention
cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N = 8 # Multi head attention
cfg.SEQ2SEQ_E_USE_FFN = True # Feed forward network
cfg.SEQ2SEQ_E_FFN_DIM = 2048
cfg.SEQ2SEQ_E_USE_PE = True # Positional encoding

cfg.SEQ2SEQ_D_ATTENTION = True
cfg.SEQ2SEQ_D_ATTENTION_TIME = 6  # Hopping if > 1
cfg.SEQ2SEQ_D_DIM = 64
cfg.SEQ2SEQ_D_ATTENTION_DIM = 64
cfg.SEQ2SEQ_D_DROPOUT = 0.2
cfg.SEQ2SEQ_D_USE_SELF_ATTENTIION = True # self attention of Decoder
cfg.SEQ2SEQ_D_USE_SOURCE_TARGET_ATTENTION = True # use source target attention
cfg.SEQ2SEQ_D_MULTIHEAD_ATTENTION_N = 8 # Multi head attention
cfg.SEQ2SEQ_D_USE_FFN = True # Feed forward network
cfg.SEQ2SEQ_D_FFN_DIM = 2048
cfg.SEQ2SEQ_D_USE_PE = True # Positional encoding


cfg.CHANNEL_AXIS = 1 # 1 ... [mb, c, h, w], 3 ... [mb, h, w, c]

cfg.GPU = True
cfg.DEVICE_TYPE = 'cuda' if cfg.GPU and torch.cuda.is_available() else 'cpu'
cfg.DEVICE = torch.device(cfg.DEVICE_TYPE)

# train
cfg.TRAIN = EasyDict()
cfg.TRAIN.DISPAY_ITERATION_INTERVAL = 50

cfg.PREFIX = 'Seq2seq-Attention'
cfg.TRAIN.MODEL_E_SAVE_PATH = 'models/' + cfg.PREFIX + '_E_{}.pt'
cfg.TRAIN.MODEL_D_SAVE_PATH = 'models/' + cfg.PREFIX + '_D_{}.pt'
cfg.TRAIN.MODEL_SAVE_INTERVAL = 200
cfg.TRAIN.ITERATION = 10_000
cfg.TRAIN.MINIBATCH = 32
cfg.TRAIN.OPTIMIZER_E = torch.optim.Adam
cfg.TRAIN.LEARNING_PARAMS_E = {'lr' : 0.01, 'betas' : (0., 0.9)}
cfg.TRAIN.OPTIMIZER_D = torch.optim.Adam
cfg.TRAIN.LEARNING_PARAMS_D = {'lr' : 0.01, 'betas' : (0., 0.9)}
cfg.TRAIN.LOSS_FUNCTION = torch.nn.NLLLoss()

cfg.TRAIN.DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Dataset/train/images/'
cfg.TRAIN.DATA_HORIZONTAL_FLIP = True # data augmentation : holizontal flip
cfg.TRAIN.DATA_VERTICAL_FLIP = True # data augmentation : vertical flip
cfg.TRAIN.DATA_ROTATION = 1 # data augmentation : rotation False, or integer

cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE = True
cfg.TRAIN.LEARNING_PROCESS_RESULT_INTERVAL = 200
cfg.TRAIN.LEARNING_PROCESS_RESULT_IMAGE_PATH = 'result/' + cfg.PREFIX + '_result_{}.jpg'
cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH = 'result/' + cfg.PREFIX + '_loss.txt'


# test
cfg.TEST = EasyDict()
cfg.TEST.MODEL_E_PATH = cfg.TRAIN.MODEL_E_SAVE_PATH.format('final')
cfg.TEST.MODEL_D_PATH = cfg.TRAIN.MODEL_D_SAVE_PATH.format('final')
cfg.TEST.DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Dataset/test/images/'
cfg.TEST.MINIBATCH = 10
cfg.TEST.ITERATION = 2
cfg.TEST.RESULT_SAVE = False
cfg.TEST.RESULT_IMAGE_PATH = 'result/' + cfg.PREFIX + '_result_{}.jpg'

# random seed
torch.manual_seed(0)


# make model save directory
def make_dir(path):
    if '/' in path:
        model_save_dir = '/'.join(path.split('/')[:-1])
        os.makedirs(model_save_dir, exist_ok=True)

make_dir(cfg.TRAIN.MODEL_D_SAVE_PATH)
make_dir(cfg.TRAIN.LEARNING_PROCESS_RESULT_IMAGE_PATH)
make_dir(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH)

pprint(cfg)

{'CHANNEL_AXIS': 1,
 'CORPUS1_NUM': 64,
 'CORPUS2_NUM': 132,
 'DEVICE': device(type='cuda'),
 'DEVICE_TYPE': 'cuda',
 'GPU': True,
 'PREFIX': 'Seq2seq-Attention',
 'SEQ2SEQ_D_ATTENTION': True,
 'SEQ2SEQ_D_ATTENTION_DIM': 64,
 'SEQ2SEQ_D_ATTENTION_TIME': 6,
 'SEQ2SEQ_D_DIM': 64,
 'SEQ2SEQ_D_DROPOUT': 0.2,
 'SEQ2SEQ_D_FFN_DIM': 2048,
 'SEQ2SEQ_D_MULTIHEAD_ATTENTION_N': 8,
 'SEQ2SEQ_D_USE_FFN': True,
 'SEQ2SEQ_D_USE_PE': True,
 'SEQ2SEQ_D_USE_SELF_ATTENTIION': True,
 'SEQ2SEQ_D_USE_SOURCE_TARGET_ATTENTION': True,
 'SEQ2SEQ_E_ATTENTION': True,
 'SEQ2SEQ_E_ATTENTION_DIM': 64,
 'SEQ2SEQ_E_ATTENTION_TIME': 6,
 'SEQ2SEQ_E_DIM': 64,
 'SEQ2SEQ_E_DROPOUT': 0.2,
 'SEQ2SEQ_E_FFN_DIM': 2048,
 'SEQ2SEQ_E_MULTIHEAD_ATTENTION_N': 8,
 'SEQ2SEQ_E_USE_FFN': True,
 'SEQ2SEQ_E_USE_PE': True,
 'SEQ2SEQ_E_USE_SELF_ATTENTION': True,
 'SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION': True,
 'SEQ2SEQ_MAX_LENGTH': 1000,
 'SEQ2SEQ_NEXT_WORD_SELECTION': 'prob',
 'SEQ2SEQ_RNN_DIM': 512,
 'SEQ2SEQ_TRAIN_FORCE_PROB': 0.5,
 'SE

## Define Model

In [0]:
class Reshape(torch.nn.Module):
    def __init__(self, shape):
        super(Reshape, self).__init__()
        self.shape = shape
    
    def forward(self, x):
        return x.reshape(self.shape)

class Permute(torch.nn.Module):
    def __init__(self, *args):
        super(Permute, self).__init__()
        self.shape = args
    
    def forward(self, x):
        return x.permute(self.shape)


class Encoder(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=64, rnn_dim=64, rnn_hidden_size=1, attention_dim=64, max_length=100, 
        dropout_p=0.1, attention_time=1, use_source_target_attention=False,
        use_self_attention=False, multiHead_attention_num=1, use_FFN=False, FFN_dim=2048, use_PE=False, use_bd=False):
    
        super(Encoder, self).__init__()
        self.max_length = max_length
        self.rnn_dim = rnn_dim
        self.rnn_hidden_size = rnn_hidden_size

        # Embedding
        self.embedding = torch.nn.Embedding(input_dim, hidden_dim)

        # Positional Encoding
        if use_PE:
            self.pe = PE()

        # Attention
        self.attentions = []
        if attention_time > 0:
            for i in range(attention_time):
                # Self Attention
                if use_self_attention:
                    self.attentions.append(Attention(
                        hidden_dim=hidden_dim, memory_dim=hidden_dim, attention_dim=attention_dim, output_dim=hidden_dim,
                        dropout_p=dropout_p, max_length=max_length, self_attention=use_self_attention, head_num=multiHead_attention_num))

                # Feed Forward Network
                if use_FFN:
                    self.attentions.append(FFN(dim=FFN_dim, d_model=hidden_dim, dropout_p=dropout_p))

        self.attentions = torch.nn.ModuleList(self.attentions)

        # output GRU
        self.gru = torch.nn.GRU(hidden_dim, rnn_dim, bidirectional=use_bd)


    def forward(self, x, hidden, x_memory):
        # Embedding
        x = self.embedding(x).view(1, 1, -1)
        x_memory = self.embedding(x_memory).permute(1, 0, 2)
        x_memory = x_memory.float()

        # Positional Encoding
        if hasattr(self, 'PE'):
            x = self.pe(x)
            x_memory = self.pe(x_memory)

        # Attention
        for layer in self.attentions:
            x = layer(x, x_memory, x_memory)

        # RNN
        x, hidden = self.gru(x, hidden)
        return x, hidden

    def initHidden(self):
        return torch.zeros(self.rnn_hidden_size, 1, self.rnn_dim).to(cfg.DEVICE)


class Decoder(torch.nn.Module):
    def __init__(self, output_dim, hidden_dim=64, rnn_dim=64, attention_dim=64, dropout_p=0.1,
        attention_time=1, max_length=100, use_source_target_attention=False, use_self_attention=False,
        multiHead_attention_num=2, use_FFN=False, FFN_dim=2048, use_PE=False, use_bd=False):
        super(Decoder, self).__init__()

        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Embedding
        self.input_embedding = torch.nn.Embedding(output_dim, hidden_dim)
        self.input_embedding_dropout = torch.nn.Dropout(dropout_p)

        # Positional Encoding
        if use_PE:
            self.pe = PE()

        # Attention
        self.attentions = []
        if attention_time > 0:
            for i in range(attention_time):
                # Self Attention
                if use_self_attention:
                    self.attentions.append(
                        Attention(hidden_dim=hidden_dim, memory_dim=hidden_dim, attention_dim=attention_dim, output_dim=hidden_dim,
                        dropout_p=dropout_p, max_length=max_length, self_attention=use_self_attention, head_num=multiHead_attention_num))
                
                # Source Target Attention
                if use_source_target_attention:
                    self.attentions.append(
                        Attention(hidden_dim=hidden_dim, memory_dim=rnn_dim * (use_bd + 1), attention_dim=attention_dim, output_dim=hidden_dim,
                        dropout_p=dropout_p, max_length=max_length, head_num=multiHead_attention_num))

                # Feed Forward Network
                if use_FFN:
                    self.attentions.append(FFN(dim=FFN_dim, d_model=hidden_dim, dropout_p=dropout_p))

        self.attentions = torch.nn.ModuleList(self.attentions)

        # output GRU
        self.gru = torch.nn.GRU(hidden_dim, rnn_dim, bidirectional=use_bd)
        self.out = torch.nn.Sequential(
            torch.nn.Linear(rnn_dim * (use_bd + 1), output_dim),
            torch.nn.Softmax(dim=-1)
        )
    

    def forward(self, x, hidden, x_memory_encoder, x_self_memory):
        # Embedding
        x = self.input_embedding(x)
        x = self.input_embedding_dropout(x)

        # Memory Embedding
        x_self_memory = self.input_embedding(x_self_memory)#.permute(1, 0, 2)

        # Positional Encoding
        if hasattr(self, "pe"):
            x = self.pe(x)
            x_self_memory = self.pe(x_self_memory)

        # Attention
        for layer in self.attentions:
            x = layer(x, x_memory_encoder, x_self_memory)

        # output GRU
        x, hidden = self.gru(x, hidden)
        x = self.out(x[0])
        return x, hidden



class Attention(torch.nn.Module):
    def __init__(self, hidden_dim, memory_dim, attention_dim, output_dim, dropout_p=0.1, max_length=100, head_num=1, self_attention=False):
        super(Attention, self).__init__()
        self.max_length = max_length
        self.self_attention = self_attention

        # Attention Query
        self.Q = torch.nn.Sequential(
            Reshape([1, -1]),
            torch.nn.Linear(hidden_dim, attention_dim),
            torch.nn.Dropout(dropout_p),
            Reshape([1, 1, -1]),
            Reshape([1, attention_dim // head_num, head_num]), # Multi head attention
            Permute(2, 0, 1),
        )
        
        # Attention Key
        self.K = torch.nn.Sequential(
            torch.nn.Linear(memory_dim, attention_dim),
            torch.nn.Dropout(dropout_p),
            Reshape([1, -1, attention_dim]),
            Reshape([-1, attention_dim // head_num, head_num]), # Multi head attention
            Permute(2, 1, 0)
        )
        
        # Attetion Value
        self.V = torch.nn.Sequential(
            torch.nn.Linear(memory_dim, attention_dim),
            torch.nn.Dropout(dropout_p),
            Reshape([1, -1, attention_dim]),
            Reshape([-1, attention_dim // head_num, head_num]), # Multi head attention
            Permute(2, 0, 1),
        )

        self.out = torch.nn.Sequential(
            torch.nn.Linear(attention_dim, output_dim),
            torch.nn.Dropout(dropout_p)
        )


    def forward(self, x, memory, memory2):
        # get Query
        Q = self.Q(x)
        Q *= Q.size()[-1] ** -0.5 # scaled dot product

        if self.self_attention:
            memory = memory2

        # memory transforme [mb(=1), length, dim] -> [length, dim]
        if len(memory.size()) > 2:
            memory = memory[0]
        
        # get Key
        K = self.K(memory)

        QK = torch.bmm(Q, K) # get Query and Key (= attention logits)

        # masking attention weight
        any_zero = memory.sum(dim=1)
        pad_mask = torch.ones([1, 1, self.max_length]).to(cfg.DEVICE)
        pad_mask[:, :, torch.nonzero(any_zero)] = 0

        pad_mask = pad_mask[:, :, :QK.size()[-1]] # crop 
        QK += pad_mask * 1e-10
        attention_weights = F.softmax(QK, dim=-1) # get attention weight
        
        # get Value
        V = self.V(memory)
        
        # Attetion x Value
        x = torch.bmm(attention_weights, V)

        # Multi head -> one head
        x = x.permute(1, 2, 0).reshape(1, 1, -1)
        return self.out(x)


class FFN(torch.nn.Module):
    def __init__(self, dim, d_model, dropout_p=0.1):
        super(FFN, self).__init__()

        self.module = torch.nn.Sequential(
            torch.nn.Linear(d_model, dim),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_p),
            torch.nn.Linear(dim, d_model)
        )

    def forward(self, x, memory_encoder, decoder):
        return self.module(x)

class PE(torch.nn.Module):
    def __init__(self):
        super(PE, self).__init__()

    def forward(self, x):
        mb, pos, dim = x.size()
        pe = np.zeros_like(x.detach().cpu().numpy())
        pos_i, dim_i = np.meshgrid(np.arange(dim), np.arange(pos))
        pe[..., 0::2] = np.sin(pos_i[..., 0::2] / (10000 ** (2 * dim_i[..., 0::2] / dim)))
        pe[..., 1::2] = np.cos(pos_i[..., 1::2] / (10000 ** (2 * dim_i[..., 1::2] / dim)))
        pe = torch.tensor(pe).to(cfg.DEVICE)
        return x + pe


## Utility

In [0]:
class MInibatch_Generator():
    def __init__(self, data_size, batch_size, shuffle=True):
        self.data_size = data_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.mbi = 0 # index for iteration
        self.inds = np.arange(data_size)
        np.random.shuffle(self.inds)

    def __call__(self):
        if self.mbi + self.batch_size > self.data_size:
            inds = self.inds[self.mbi:]
            np.random.shuffle(self.inds)
            inds = np.hstack((inds, self.inds[ : (self.batch_size - (self.data_size - self.mbi))]))
            mbi = self.batch_size - (self.data_size - self.mbi)
        else:
            inds = self.inds[self.mbi : self.mbi + self.batch_size]
            self.mbi += self.batch_size
        return inds


## Train

In [7]:
# train
def train():
    # model
    E = Encoder(
        input_dim = cfg.CORPUS1_NUM, 
        hidden_dim = cfg.SEQ2SEQ_E_DIM,
        attention_dim = cfg.SEQ2SEQ_E_ATTENTION_DIM,
        rnn_dim = cfg.SEQ2SEQ_RNN_DIM,
        rnn_hidden_size = cfg.SEQ2SEQ_USE_RNN_BD + 1,
        use_bd = cfg.SEQ2SEQ_USE_RNN_BD,
        dropout_p = cfg.SEQ2SEQ_E_DROPOUT,
        attention_time = cfg.SEQ2SEQ_E_ATTENTION_TIME,
        use_source_target_attention = cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION,
        use_self_attention = cfg.SEQ2SEQ_E_USE_SELF_ATTENTION,
        multiHead_attention_num = cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N,
        use_FFN = cfg.SEQ2SEQ_E_USE_FFN,
        FFN_dim = cfg.SEQ2SEQ_E_FFN_DIM,
        use_PE = cfg.SEQ2SEQ_E_USE_PE,
        max_length = cfg.SEQ2SEQ_MAX_LENGTH
        ).to(cfg.DEVICE) 

    D = Decoder(
        output_dim = cfg.CORPUS2_NUM, 
        hidden_dim = cfg.SEQ2SEQ_E_DIM,
        rnn_dim = cfg.SEQ2SEQ_RNN_DIM,
        use_bd = cfg.SEQ2SEQ_USE_RNN_BD,
        attention_dim = cfg.SEQ2SEQ_E_ATTENTION_DIM,
        dropout_p = cfg.SEQ2SEQ_E_DROPOUT,
        attention_time = cfg.SEQ2SEQ_E_ATTENTION_TIME,
        use_source_target_attention = cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION,
        use_self_attention = cfg.SEQ2SEQ_E_USE_SELF_ATTENTION,
        multiHead_attention_num = cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N,
        use_FFN = cfg.SEQ2SEQ_E_USE_FFN,
        FFN_dim = cfg.SEQ2SEQ_E_FFN_DIM,
        use_PE = cfg.SEQ2SEQ_E_USE_PE,
        max_length = cfg.SEQ2SEQ_MAX_LENGTH
        ).to(cfg.DEVICE)

    #summary(E, (cfg.INPUT_Z_DIM, 1, 1), device=cfg.DEVICE_TYPE)
    #summary(D, (cfg.OUTPUT_CHANNEL, cfg.OUTPUT_HEIGHT, cfg.OUTPUT_WIDTH), device=cfg.DEVICE_TYPE)
    
    opt_E = cfg.TRAIN.OPTIMIZER_E(E.parameters(), **cfg.TRAIN.LEARNING_PARAMS_E)
    opt_D = cfg.TRAIN.OPTIMIZER_D(D.parameters(), **cfg.TRAIN.LEARNING_PARAMS_D)

    list_iter = []
    list_loss = []
    list_accuracy = []

    #dataset = MyDataset(data_dict['data1'], data_dict['data2'])
    #dataloader = torch.utils.data.DataLoader(dataset, batch_size=cfg.TRAIN.MINIBATCH, shuffle=True)

    mb_gen = MInibatch_Generator(len(data_dict['data1']), cfg.TRAIN.MINIBATCH)

    print('training start')
    progres_bar = ''

    Xs_train = data_dict['data1']
    ts_train = data_dict['data2']

    for i in range(cfg.TRAIN.ITERATION):
        idxs = mb_gen()
        loss = 0.
        accuracy = 0.
        total_len = 0.
        _Xs = [Xs_train[idx] for idx in idxs]
        _ts = [ts_train[idx] for idx in idxs]

        # each iteration in minibatch
        opt_E.zero_grad()
        opt_D.zero_grad()

        for mbi in range(cfg.TRAIN.MINIBATCH):
            Xs = torch.tensor(_Xs[mbi]).reshape(-1, 1).to(cfg.DEVICE)
            ts = torch.tensor(_ts[mbi]).reshape(-1, 1).to(cfg.DEVICE)
        
            xs_length = Xs.size()[0]
            ts_length = ts.size()[0]
            total_len += ts_length

            # encode process
            E_hidden = E.initHidden() # initialize encoder hidden
            E_outputs = torch.zeros(cfg.SEQ2SEQ_MAX_LENGTH, cfg.SEQ2SEQ_RNN_DIM * (cfg.SEQ2SEQ_USE_RNN_BD + 1)).to(cfg.DEVICE)

            for ei in range(xs_length):
                E_output, E_hidden = E(Xs[ei], E_hidden, Xs)
                E_outputs[ei] = E_output[0, 0]

            # decode process
            D_xs = ts[0].reshape(1, -1) # define decoder input
            D_hidden = E_hidden # define decoder hidden
            D_self_memory = D_xs
            D_outputs = []

            # define whethere if use teacher label for decoder input
            use_teacher = True if np.random.random() < cfg.SEQ2SEQ_TRAIN_FORCE_PROB else False

            for di in range(1, ts_length):
                # decode
                D_ys, D_hidden = D(D_xs, D_hidden, E_outputs, D_self_memory)

                # add loss
                loss += cfg.TRAIN.LOSS_FUNCTION(torch.log(D_ys), ts[di])

                # count accuracy
                if D_ys.argmax() == ts[di]:
                    accuracy += 1.
                
                if cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "argmax":
                    topv, topi = D_ys.data.topk(1)

                elif cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "prob":
                    topi = torch.multinomial(torch.max(D_ys, torch.zeros_like(D_ys)), 1)
                
                # define next decoder input
                if use_teacher:
                    D_xs = ts[di] # teacher forcing
                else:
                    D_xs = topi#.squeeze().detach()

                D_xs = D_xs.reshape(1, -1)
                D_self_memory = torch.cat([D_self_memory, D_xs])

                D_outputs.append(topi.detach().cpu().numpy()[0])
                     
                # if EOS, finish training
                #if D_xs.item() == data_dict['corpus2'].index('<EOS>'):
                #    break

        loss.backward()
        opt_D.step()

        _loss = loss.item() / cfg.TRAIN.MINIBATCH
        _accuracy = accuracy / total_len

        progres_bar += '|'
        print('\r' + 'Loss:{:.4f}, Accu:{:.4f} '.format(_loss, _accuracy) + progres_bar, end='')

        if (i + 1) % 10 == 0:
            progres_bar += str(i + 1)
            print('\r' + 'Loss:{:.4f}, Accu:{:.4f} '.format(_loss, _accuracy) + progres_bar, end='')

            # save process result
            if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE:
                list_iter.append(i + 1)
                list_loss.append(_loss)
                list_accuracy.append(_accuracy)

        # display training state
        if (i + 1) % cfg.TRAIN.DISPAY_ITERATION_INTERVAL == 0:
            print('\r' + ' ' * (len(progres_bar) + 50), end='')
            print('\rIter:{}, Loss:{:.4f}, Accu:{:.4f}'.format(i + 1, _loss, _accuracy))
            progres_bar = ''

        # save parameters
        if (cfg.TRAIN.MODEL_SAVE_INTERVAL != False) and ((i + 1) % cfg.TRAIN.MODEL_SAVE_INTERVAL == 0):
            E_save_path = cfg.TRAIN.MODEL_E_SAVE_PATH.format('iter{}'.format(i + 1))
            D_save_path = cfg.TRAIN.MODEL_D_SAVE_PATH.format('iter{}'.format(i + 1))
            torch.save(E.state_dict(), E_save_path)
            torch.save(D.state_dict(), D_save_path)
            print('save E >> {}, D >> {}'.format(E_save_path, D_save_path))

        # save process result
        if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE and ((i + 1) % cfg.TRAIN.LEARNING_PROCESS_RESULT_INTERVAL == 0):
            print('iter :', i + 1)
            print(' - [input]', ' '.join([data_dict['corpus1'][x] for x in _Xs[0][1:-1]]))
            print(' - [output]', ' '.join([data_dict['corpus2'][x] for x in D_outputs if x not in [0, 1, 2]]))
            print(' - [gt]', ' '.join([data_dict['corpus2'][x] for x in _ts[0][1:-1]]))

    E_save_path = cfg.TRAIN.MODEL_E_SAVE_PATH.format('final')
    D_save_path = cfg.TRAIN.MODEL_D_SAVE_PATH.format('final')
    torch.save(E.state_dict(), E_save_path)
    torch.save(D.state_dict(), D_save_path)
    print('final paramters were saved to E >> {}, D >> {}'.format(E_save_path, D_save_path))

    if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE:
        f = open(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH, 'w')
        df = pd.DataFrame({'iteration' : list_iter, 'loss' : list_loss, 'accuracy' : list_accuracy})
        df.to_csv(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH, index=False)
        print('loss was saved to >> {}'.format(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH))

train()

training start


	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple)


Iter:50, Loss:11.1560, Accu:0.2406
Iter:100, Loss:13.2247, Accu:0.1267
Iter:150, Loss:10.9344, Accu:0.2370
Iter:200, Loss:11.7231, Accu:0.2222
save E >> models/Seq2seq-Attention_E_iter200.pt, D >> models/Seq2seq-Attention_D_iter200.pt
iter : 200
 - [input] I try


TypeError: ignored

## Test

In [0]:
# test
def test():
    print('-' * 20)
    print('test function')
    print('-' * 20)
    E = Encoder(cfg.CORPUS1_NUM).to(cfg.DEVICE)
    D = Decoder(cfg.CORPUS2_NUM).to(cfg.DEVICE)
    D.load_state_dict(torch.load(cfg.TEST.MODEL_D_PATH, map_location=torch.device(cfg.DEVICE)))
    E.eval()
    D.eval()

    def generate(sentence):
        corpus1 = data_dict['corpus1']
        corpus2 = data_dict['corpus2']

        Xs = [corpus1.index('<SOS>')]

        for word in sentence.split(' '):
            if word in corpus1:
                Xs.append(corpus1.index(word))
            else:
                Xs.append(corpus1.index('<UNKNOWN>'))

            # encode process
            E_hidden = E.initHidden() # initialize encoder hidden
            E_outputs = torch.zeros(cfg.SEQ2SEQ_MAX_LENGTH, cfg.SEQ2SEQ_RNN_DIM * (cfg.SEQ2SEQ_USE_RNN_BD + 1)).to(cfg.DEVICE)

            for ei in range(xs_length):
                E_output, E_hidden = E(Xs[ei], E_hidden, Xs)
                E_outputs[ei] = E_output[0, 0]

            # decode process
            D_xs = ts[0].reshape(1, -1) # define decoder input
            D_hidden = E_hidden # define decoder hidden
            D_self_memory = D_xs
            D_outputs = []

            while 1:
                # decode
                D_ys, D_hidden = D(D_xs, D_hidden, E_outputs, D_self_memory)
                
                if cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "argmax":
                    topv, topi = D_ys.data.topk(1)

                elif cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "prob":
                    topi = torch.multinomial(torch.max(D_ys, torch.zeros_like(D_ys) + 1e-5), 1)
                
                # define next decoder input
                if use_teacher:
                    D_xs = ts[di] # teacher forcing
                else:
                    D_xs = topi#.squeeze().detach()

                D_xs = D_xs.reshape(1, -1)
                D_self_memory = torch.cat([D_self_memory, D_xs])

                D_outputs.append(topi.item())

                if len(D_outputs) > cfg.SEQ2SEQ_MAX_LENGTH:
                    break
                if topi.item() == corpus2.index('<EOS>'):
                    break

            print(' - [input]', ' '.join([corpus1][x] for x in Xs[1:]]))
            print(' - [output]', ' '.join([corpus2[x] for x in D_outputs if x not in [0, 1, 2]]))


    with torch.no_grad():
        for sen in ['I like apple', 'Go ahead', 'Thank you for your nice advice']:
            generate(sen)

test()

In [0]:
def arg_parse():
    parser = argparse.ArgumentParser(description='CNN implemented with Keras')
    parser.add_argument('--train', dest='train', action='store_true')
    parser.add_argument('--test', dest='test', action='store_true')
    args = parser.parse_args()
    return args

# main
if __name__ == '__main__':
    args = arg_parse()

    if args.train:
        train()
    if args.test:
        test()

    if not (args.train or args.test):
        print("please select train or test flag")
        print("train: python main.py --train")
        print("test:  python main.py --test")
        print("both:  python main.py --train --test")


In [0]:
import numpy as np
a = np.arange(3)
b = np.arange(3) + 2

In [0]:
a

In [0]:
b

In [0]:
np.meshgrid(a, b)

In [0]:
c = np.zeros([10, 10])
c[np.meshgrid(a, b)] = 1
c

In [0]:
True + 1