<a href="https://colab.research.google.com/github/yoyoyo-yo/DeepLearningMugenKnock/blob/master/pytorch/HRED_pytorch_sand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HRED

元論文 : Attention if All You Need https://arxiv.org/abs/1706.03762 (2017)



In [36]:
!pip install numpy matplotlib opencv-python torch torchvision torchsummary pandas easydict



# Ginza

In [37]:
!pip install ginza



In [38]:
import pkg_resources, imp
imp.reload(pkg_resources)

import spacy
nlp = spacy.load('ja_ginza')

In [39]:
# test
import spacy

nlp = spacy.load('ja_ginza')
doc = nlp('あのラーメン屋にはよく行く。美味しいんだ。')

for sent in doc.sents:
    for token in sent:
        info = [
            token.i,         # トークン番号
            token.orth_,     # テキスト
            #token._.reading, # 読みカナ
            token.lemma_,    # 基本形
            token.pos_,      # 品詞
            token.tag_,      # 品詞詳細
            #token._.inf      # 活用情報
        ]
        print(info)

[0, 'あの', 'あの', 'DET', '連体詞']
[1, 'ラーメン', 'ラーメン', 'NOUN', '名詞-普通名詞-一般']
[2, '屋', '屋', 'NOUN', '接尾辞-名詞的-一般']
[3, 'に', 'に', 'ADP', '助詞-格助詞']
[4, 'は', 'は', 'ADP', '助詞-係助詞']
[5, 'よく', 'よく', 'ADV', '副詞']
[6, '行く', '行く', 'VERB', '動詞-非自立可能']
[7, '。', '。', 'PUNCT', '補助記号-句点']
[8, '美味しい', '美味しい', 'ADJ', '形容詞-一般']
[9, 'ん', 'ん', 'SCONJ', '助詞-準体助詞']
[10, 'だ', 'だ', 'AUX', '助動詞']
[11, '。', '。', 'PUNCT', '補助記号-句点']


# Download dataset

In [40]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
from glob import glob

glob('/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/*_original.txt')

['/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_hanayome_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_gasorin_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_sougiya_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_sanpo_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_ijime_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_syokumushitsumon_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_momotaro_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_ryokou_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_anpanman_original.txt',
 '/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_keisatsu_origina

In [42]:
def get_corpus(fname):
    corpus = []

    with open(fname, 'r') as f:
        for line in f.readlines():
            line = line.rstrip()
            _corpus = []
            for sent in nlp(line).sents:
                for token in sent:
                    _corpus.append(token.orth_)

            corpus = list(set(corpus) | set(_corpus))
    corpus.sort()
    return corpus

In [43]:
# sample
corpus = get_corpus('/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_hanayome_original.txt')
corpus = ['<UNKNOWN>'] + corpus

In [44]:
def read_data(fname, corpus):
    Xs = []
    with open(fname, 'r') as f:
        for line in f.readlines():
            line = line.rstrip()
            _Xs = [corpus.index('<SOS>')]
            for sent in nlp(line).sents:
                for token in sent:
                    w = token.orth_

                    if w in corpus:
                        ind = corpus.index(w)
                    else:
                        ind = corpus.index('<UNKNOWN>')
                    _Xs.append(ind)
            _Xs.append(corpus.index('<EOS>'))
            Xs.append(_Xs)

    return Xs

In [45]:
def get_data(data, data_n=None):
    data_n = len(data) if data_n is None else data_n
    Xs = []
    for i in range(0, len(data) - data_n):
        _Xs = []
        for j in range(data_n):
            _Xs.append(data[i + j])
        Xs.append(_Xs)
    return Xs

In [46]:
# sample
# data = read_data('/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/sandwitchman_hanayome_original.txt', corpus)
# get_data(data, data_n=HRED_SESSION)

In [47]:
# get corpus
corpus = []
for fpath in glob('/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/*_original.txt'):
    _corpus = get_corpus(fpath)
    corpus = list(set(corpus) | set(_corpus))

corpus.sort()
corpus = ['<SOS>', '<EOS>', '<UNKNOWN>'] + corpus

In [48]:
# get training data
data_Xs = []

HRED_SESSION = 5

for fpath in glob('/content/drive/My Drive/Colab Notebooks/datasets/sandwitchman/*_original.txt'):
    data = read_data(fpath, corpus)
    _data_Xs = get_data(data, data_n=5)
    data_Xs += _data_Xs

# Import and Config

In [49]:
import os
import argparse
from pprint import pprint

import numpy as np
from collections import OrderedDict
from easydict import EasyDict
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary


#---
# config
#---
cfg = EasyDict()

cfg.CORPUS1_NUM = len(corpus)
cfg.CORPUS2_NUM =  len(corpus)

# Seq2seq config
cfg.SEQ2SEQ_MAX_LENGTH = 1000 # decoder max output length
cfg.SEQ2SEQ_TRAIN_FORCE_PROB = 0.5 # train input is forced to gt with this probability
cfg.SEQ2SEQ_NEXT_WORD_SELECTION = 'prob' # prob, argmax
cfg.SEQ2SEQ_RNN_DIM = 512
cfg.SEQ2SEQ_USE_RNN_BD = True # use bidirectional RNN

cfg.SEQ2SEQ_E_ATTENTION = False
cfg.SEQ2SEQ_E_ATTENTION_TIME = 2  # Hopping if > 1
cfg.SEQ2SEQ_E_DIM = 64
cfg.SEQ2SEQ_E_ATTENTION_DIM = 64
cfg.SEQ2SEQ_E_DROPOUT = 0.2
cfg.SEQ2SEQ_E_USE_SELF_ATTENTION = True # self attention of Encoder
cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION = True # use source target attention
cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N = 8 # Multi head attention
cfg.SEQ2SEQ_E_USE_FFN = True # Feed forward network
cfg.SEQ2SEQ_E_FFN_DIM = 512
cfg.SEQ2SEQ_E_USE_PE = True # Positional encoding

cfg.SEQ2SEQ_D_ATTENTION = False
cfg.SEQ2SEQ_D_ATTENTION_TIME = 2  # Hopping if > 1
cfg.SEQ2SEQ_D_DIM = 64
cfg.SEQ2SEQ_D_ATTENTION_DIM = 64
cfg.SEQ2SEQ_D_DROPOUT = 0.2
cfg.SEQ2SEQ_D_USE_SELF_ATTENTIION = True # self attention of Decoder
cfg.SEQ2SEQ_D_USE_SOURCE_TARGET_ATTENTION = True # use source target attention
cfg.SEQ2SEQ_D_MULTIHEAD_ATTENTION_N = 8 # Multi head attention
cfg.SEQ2SEQ_D_USE_FFN = True # Feed forward network
cfg.SEQ2SEQ_D_FFN_DIM = 512
cfg.SEQ2SEQ_D_USE_PE = True # Positional encoding

cfg.HRED_HIDDEN_DIM = 512 # d_s in original paper

cfg.CHANNEL_AXIS = 1 # 1 ... [mb, c, h, w], 3 ... [mb, h, w, c]

cfg.GPU = True
cfg.DEVICE_TYPE = 'cuda' if cfg.GPU and torch.cuda.is_available() else 'cpu'
cfg.DEVICE = torch.device(cfg.DEVICE_TYPE)

# train
cfg.TRAIN = EasyDict()
cfg.TRAIN.DISPAY_ITERATION_INTERVAL = 50

cfg.PREFIX = 'Seq2seq-Attention'
cfg.TRAIN.MODEL_E_SAVE_PATH = 'models/' + cfg.PREFIX + '_E_{}.pt'
cfg.TRAIN.MODEL_D_SAVE_PATH = 'models/' + cfg.PREFIX + '_D_{}.pt'
cfg.TRAIN.MODEL_SAVE_INTERVAL = 50
cfg.TRAIN.ITERATION = 50
cfg.TRAIN.MINIBATCH = 1
cfg.TRAIN.OPTIMIZER_E = torch.optim.Adam
cfg.TRAIN.LEARNING_PARAMS_E = {'lr' : 0.01, 'betas' : (0., 0.9)}
cfg.TRAIN.OPTIMIZER_D = torch.optim.Adam
cfg.TRAIN.LEARNING_PARAMS_D = {'lr' : 0.01, 'betas' : (0., 0.9)}
cfg.TRAIN.OPTIMIZER_H = torch.optim.Adam
cfg.TRAIN.LEARNING_PARAMS_H = {'lr' : 0.01, 'betas' : (0., 0.9)}
cfg.TRAIN.LOSS_FUNCTION = torch.nn.NLLLoss()

cfg.TRAIN.DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Dataset/train/images/'
cfg.TRAIN.DATA_HORIZONTAL_FLIP = True # data augmentation : holizontal flip
cfg.TRAIN.DATA_VERTICAL_FLIP = True # data augmentation : vertical flip
cfg.TRAIN.DATA_ROTATION = 1 # data augmentation : rotation False, or integer

cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE = True
cfg.TRAIN.LEARNING_PROCESS_RESULT_INTERVAL = 200
cfg.TRAIN.LEARNING_PROCESS_RESULT_IMAGE_PATH = 'result/' + cfg.PREFIX + '_result_{}.jpg'
cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH = 'result/' + cfg.PREFIX + '_loss.txt'


# test
cfg.TEST = EasyDict()
cfg.TEST.MODEL_E_PATH = cfg.TRAIN.MODEL_E_SAVE_PATH.format('final')
cfg.TEST.MODEL_D_PATH = cfg.TRAIN.MODEL_D_SAVE_PATH.format('final')
cfg.TEST.DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Dataset/test/images/'
cfg.TEST.MINIBATCH = 10
cfg.TEST.ITERATION = 2
cfg.TEST.RESULT_SAVE = False
cfg.TEST.RESULT_IMAGE_PATH = 'result/' + cfg.PREFIX + '_result_{}.jpg'

# random seed
torch.manual_seed(0)


# make model save directory
def make_dir(path):
    if '/' in path:
        model_save_dir = '/'.join(path.split('/')[:-1])
        os.makedirs(model_save_dir, exist_ok=True)

make_dir(cfg.TRAIN.MODEL_D_SAVE_PATH)
make_dir(cfg.TRAIN.LEARNING_PROCESS_RESULT_IMAGE_PATH)
make_dir(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH)

pprint(cfg)

{'CHANNEL_AXIS': 1,
 'CORPUS1_NUM': 2156,
 'CORPUS2_NUM': 2156,
 'DEVICE': device(type='cpu'),
 'DEVICE_TYPE': 'cpu',
 'GPU': True,
 'HRED_HIDDEN_DIM': 512,
 'PREFIX': 'Seq2seq-Attention',
 'SEQ2SEQ_D_ATTENTION': False,
 'SEQ2SEQ_D_ATTENTION_DIM': 64,
 'SEQ2SEQ_D_ATTENTION_TIME': 2,
 'SEQ2SEQ_D_DIM': 64,
 'SEQ2SEQ_D_DROPOUT': 0.2,
 'SEQ2SEQ_D_FFN_DIM': 512,
 'SEQ2SEQ_D_MULTIHEAD_ATTENTION_N': 8,
 'SEQ2SEQ_D_USE_FFN': True,
 'SEQ2SEQ_D_USE_PE': True,
 'SEQ2SEQ_D_USE_SELF_ATTENTIION': True,
 'SEQ2SEQ_D_USE_SOURCE_TARGET_ATTENTION': True,
 'SEQ2SEQ_E_ATTENTION': False,
 'SEQ2SEQ_E_ATTENTION_DIM': 64,
 'SEQ2SEQ_E_ATTENTION_TIME': 2,
 'SEQ2SEQ_E_DIM': 64,
 'SEQ2SEQ_E_DROPOUT': 0.2,
 'SEQ2SEQ_E_FFN_DIM': 512,
 'SEQ2SEQ_E_MULTIHEAD_ATTENTION_N': 8,
 'SEQ2SEQ_E_USE_FFN': True,
 'SEQ2SEQ_E_USE_PE': True,
 'SEQ2SEQ_E_USE_SELF_ATTENTION': True,
 'SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION': True,
 'SEQ2SEQ_MAX_LENGTH': 1000,
 'SEQ2SEQ_NEXT_WORD_SELECTION': 'prob',
 'SEQ2SEQ_RNN_DIM': 512,
 'SEQ2SEQ_TR

# Define Model

In [50]:
class Reshape(torch.nn.Module):
    def __init__(self, shape):
        super(Reshape, self).__init__()
        self.shape = shape
    
    def forward(self, x):
        return x.reshape(self.shape)

class Permute(torch.nn.Module):
    def __init__(self, *args):
        super(Permute, self).__init__()
        self.shape = args
    
    def forward(self, x):
        return x.permute(self.shape)


class Encoder(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim=64, rnn_dim=64, rnn_hidden_size=1, attention_dim=64, max_length=100, 
        dropout_p=0.1, attention_time=1, use_source_target_attention=False,
        use_self_attention=False, multiHead_attention_num=1, use_FFN=False, FFN_dim=2048, use_PE=False, use_bd=False):
    
        super(Encoder, self).__init__()
        self.max_length = max_length
        self.rnn_dim = rnn_dim
        self.rnn_hidden_size = rnn_hidden_size

        # Embedding
        self.embedding = torch.nn.Embedding(input_dim, hidden_dim)

        # Positional Encoding
        if use_PE:
            self.pe = PE()

        # Attention
        self.attentions = []
        if attention_time > 0:
            for i in range(attention_time):
                # Self Attention
                if use_self_attention:
                    self.attentions.append(Attention(
                        hidden_dim=hidden_dim, memory_dim=hidden_dim, attention_dim=attention_dim, output_dim=hidden_dim,
                        dropout_p=dropout_p, max_length=max_length, self_attention=use_self_attention, head_num=multiHead_attention_num))

                # Feed Forward Network
                if use_FFN:
                    self.attentions.append(FFN(dim=FFN_dim, d_model=hidden_dim, dropout_p=dropout_p))

        self.attentions = torch.nn.ModuleList(self.attentions)

        # output GRU
        self.gru = torch.nn.GRU(hidden_dim, rnn_dim, bidirectional=use_bd)


    def forward(self, x, hidden, x_memory):
        # Embedding
        x = self.embedding(x).view(1, 1, -1)
        x_memory = self.embedding(x_memory).permute(1, 0, 2)
        x_memory = x_memory.float()

        # Positional Encoding
        if hasattr(self, 'PE'):
            x = self.pe(x)
            x_memory = self.pe(x_memory)

        # Attention
        for layer in self.attentions:
            x = layer(x, x_memory, x_memory)

        # RNN
        x, hidden = self.gru(x, hidden)
        return x, hidden

    def initHidden(self):
        return torch.zeros(self.rnn_hidden_size, 1, self.rnn_dim).to(cfg.DEVICE)


class Decoder(torch.nn.Module):
    def __init__(self, output_dim, hidden_dim=64, rnn_dim=64, attention_dim=64, dropout_p=0.1,
        attention_time=1, max_length=100, use_source_target_attention=False, use_self_attention=False,
        multiHead_attention_num=2, use_FFN=False, FFN_dim=2048, use_PE=False, use_bd=False):
        super(Decoder, self).__init__()

        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Embedding
        self.input_embedding = torch.nn.Embedding(output_dim, hidden_dim)
        self.input_embedding_dropout = torch.nn.Dropout(dropout_p)

        # Positional Encoding
        if use_PE:
            self.pe = PE()

        # Attention
        self.attentions = []
        if attention_time > 0:
            for i in range(attention_time):
                # Self Attention
                if use_self_attention:
                    self.attentions.append(
                        Attention(hidden_dim=hidden_dim, memory_dim=hidden_dim, attention_dim=attention_dim, output_dim=hidden_dim,
                        dropout_p=dropout_p, max_length=max_length, self_attention=use_self_attention, head_num=multiHead_attention_num))
                
                # Source Target Attention
                if use_source_target_attention:
                    self.attentions.append(
                        Attention(hidden_dim=hidden_dim, memory_dim=rnn_dim * (use_bd + 1), attention_dim=attention_dim, output_dim=hidden_dim,
                        dropout_p=dropout_p, max_length=max_length, head_num=multiHead_attention_num))

                # Feed Forward Network
                if use_FFN:
                    self.attentions.append(FFN(dim=FFN_dim, d_model=hidden_dim, dropout_p=dropout_p))

        self.attentions = torch.nn.ModuleList(self.attentions)

        # output GRU
        self.gru = torch.nn.GRU(hidden_dim, rnn_dim, bidirectional=use_bd)
        self.out = torch.nn.Sequential(
            torch.nn.Linear(rnn_dim * (use_bd + 1), output_dim),
            torch.nn.Softmax(dim=-1)
        )
    

    def forward(self, x, hidden, x_memory_encoder, x_self_memory):
        # Embedding
        x = self.input_embedding(x)
        x = self.input_embedding_dropout(x)

        # Memory Embedding
        x_self_memory = self.input_embedding(x_self_memory)#.permute(1, 0, 2)

        # Positional Encoding
        if hasattr(self, "pe"):
            x = self.pe(x)
            x_self_memory = self.pe(x_self_memory)

        # Attention
        for layer in self.attentions:
            x = layer(x, x_memory_encoder, x_self_memory)

        # output GRU
        x, hidden = self.gru(x, hidden)
        x = self.out(x[0])
        return x, hidden



class Attention(torch.nn.Module):
    def __init__(self, hidden_dim, memory_dim, attention_dim, output_dim, dropout_p=0.1, max_length=100, head_num=1, self_attention=False):
        super(Attention, self).__init__()
        self.max_length = max_length
        self.self_attention = self_attention

        # Attention Query
        self.Q = torch.nn.Sequential(
            Reshape([1, -1]),
            torch.nn.Linear(hidden_dim, attention_dim),
            torch.nn.Dropout(dropout_p),
            Reshape([1, 1, -1]),
            Reshape([1, attention_dim // head_num, head_num]), # Multi head attention
            Permute(2, 0, 1),
        )
        
        # Attention Key
        self.K = torch.nn.Sequential(
            torch.nn.Linear(memory_dim, attention_dim),
            torch.nn.Dropout(dropout_p),
            Reshape([1, -1, attention_dim]),
            Reshape([-1, attention_dim // head_num, head_num]), # Multi head attention
            Permute(2, 1, 0)
        )
        
        # Attetion Value
        self.V = torch.nn.Sequential(
            torch.nn.Linear(memory_dim, attention_dim),
            torch.nn.Dropout(dropout_p),
            Reshape([1, -1, attention_dim]),
            Reshape([-1, attention_dim // head_num, head_num]), # Multi head attention
            Permute(2, 0, 1),
        )

        self.out = torch.nn.Sequential(
            torch.nn.Linear(attention_dim, output_dim),
            torch.nn.Dropout(dropout_p)
        )


    def forward(self, x, memory, memory2):
        # get Query
        Q = self.Q(x)
        Q *= Q.size()[-1] ** -0.5 # scaled dot product

        if self.self_attention:
            memory = memory2

        # memory transforme [mb(=1), length, dim] -> [length, dim]
        if len(memory.size()) > 2:
            memory = memory[0]
        
        # get Key
        K = self.K(memory)

        QK = torch.bmm(Q, K) # get Query and Key (= attention logits)

        # masking attention weight
        any_zero = memory.sum(dim=1)
        pad_mask = torch.ones([1, 1, self.max_length]).to(cfg.DEVICE)
        pad_mask[:, :, torch.nonzero(any_zero)] = 0

        pad_mask = pad_mask[:, :, :QK.size()[-1]] # crop 
        QK += pad_mask * 1e-10
        attention_weights = F.softmax(QK, dim=-1) # get attention weight
        
        # get Value
        V = self.V(memory)
        
        # Attetion x Value
        x = torch.bmm(attention_weights, V)

        # Multi head -> one head
        x = x.permute(1, 2, 0).reshape(1, 1, -1)
        return self.out(x)


class FFN(torch.nn.Module):
    def __init__(self, dim, d_model, dropout_p=0.1):
        super(FFN, self).__init__()

        self.module = torch.nn.Sequential(
            torch.nn.Linear(d_model, dim),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_p),
            torch.nn.Linear(dim, d_model)
        )

    def forward(self, x, memory_encoder, decoder):
        return self.module(x)

class PE(torch.nn.Module):
    def __init__(self):
        super(PE, self).__init__()

    def forward(self, x):
        mb, pos, dim = x.size()
        pe = np.zeros_like(x.detach().cpu().numpy())
        pos_i, dim_i = np.meshgrid(np.arange(dim), np.arange(pos))
        pe[..., 0::2] = np.sin(pos_i[..., 0::2] / (10000 ** (2 * dim_i[..., 0::2] / dim)))
        pe[..., 1::2] = np.cos(pos_i[..., 1::2] / (10000 ** (2 * dim_i[..., 1::2] / dim)))
        pe = torch.tensor(pe).to(cfg.DEVICE)
        return x + pe


class HRED(torch.nn.Module):
    def __init__(self, decoder_dim, hidden_dim, num_layers=1, use_bd=False):
        super(HRED, self).__init__()
        self.HRED_hidden_dim = hidden_dim
        self.tensor_dim = use_bd + 1

        # output GRU
        self.gru = torch.nn.GRU(decoder_dim * (use_bd + 1), self.HRED_hidden_dim, num_layers=num_layers, bidirectional=use_bd)

    def forward(self, x, hidden):
        x, hidden = self.gru(x, hidden)
        return x, hidden

    def initHidden(self):
        return torch.zeros([self.tensor_dim, 1, self.HRED_hidden_dim], device=cfg.DEVICE)

# Utility

In [51]:
class MInibatch_Generator():
    def __init__(self, data_size, batch_size, shuffle=True):
        self.data_size = data_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.mbi = 0 # index for iteration
        self.inds = np.arange(data_size)
        np.random.shuffle(self.inds)

    def __call__(self):
        if self.mbi + self.batch_size > self.data_size:
            inds = self.inds[self.mbi:]
            if self.shuffle:
                np.random.shuffle(self.inds)
            inds = np.hstack((inds, self.inds[ : (self.batch_size - (self.data_size - self.mbi))]))
            self.mbi = self.batch_size - (self.data_size - self.mbi)
        else:
            inds = self.inds[self.mbi : self.mbi + self.batch_size]
            self.mbi += self.batch_size
        return inds


# Train

In [52]:
# train
def train():
    # model
    E = Encoder(
        input_dim = cfg.CORPUS1_NUM, 
        hidden_dim = cfg.SEQ2SEQ_E_DIM,
        attention_dim = cfg.SEQ2SEQ_E_ATTENTION_DIM,
        rnn_dim = cfg.SEQ2SEQ_RNN_DIM,
        rnn_hidden_size = cfg.SEQ2SEQ_USE_RNN_BD + 1,
        use_bd = cfg.SEQ2SEQ_USE_RNN_BD,
        dropout_p = cfg.SEQ2SEQ_E_DROPOUT,
        attention_time = cfg.SEQ2SEQ_E_ATTENTION_TIME,
        use_source_target_attention = cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION,
        use_self_attention = cfg.SEQ2SEQ_E_USE_SELF_ATTENTION,
        multiHead_attention_num = cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N,
        use_FFN = cfg.SEQ2SEQ_E_USE_FFN,
        FFN_dim = cfg.SEQ2SEQ_E_FFN_DIM,
        use_PE = cfg.SEQ2SEQ_E_USE_PE,
        max_length = cfg.SEQ2SEQ_MAX_LENGTH
        ).to(cfg.DEVICE) 

    D = Decoder(
        output_dim = cfg.CORPUS2_NUM, 
        hidden_dim = cfg.SEQ2SEQ_E_DIM,
        rnn_dim = cfg.SEQ2SEQ_RNN_DIM,
        use_bd = cfg.SEQ2SEQ_USE_RNN_BD,
        attention_dim = cfg.SEQ2SEQ_E_ATTENTION_DIM,
        dropout_p = cfg.SEQ2SEQ_E_DROPOUT,
        attention_time = cfg.SEQ2SEQ_E_ATTENTION_TIME,
        use_source_target_attention = cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION,
        use_self_attention = cfg.SEQ2SEQ_E_USE_SELF_ATTENTION,
        multiHead_attention_num = cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N,
        use_FFN = cfg.SEQ2SEQ_E_USE_FFN,
        FFN_dim = cfg.SEQ2SEQ_E_FFN_DIM,
        use_PE = cfg.SEQ2SEQ_E_USE_PE,
        max_length = cfg.SEQ2SEQ_MAX_LENGTH
        ).to(cfg.DEVICE)

    H = HRED(
        decoder_dim=cfg.SEQ2SEQ_RNN_DIM,
        hidden_dim=cfg.HRED_HIDDEN_DIM,
        use_bd=cfg.SEQ2SEQ_USE_RNN_BD
    ).to(cfg.DEVICE)

    #summary(E, (cfg.INPUT_Z_DIM, 1, 1), device=cfg.DEVICE_TYPE)
    #summary(D, (cfg.OUTPUT_CHANNEL, cfg.OUTPUT_HEIGHT, cfg.OUTPUT_WIDTH), device=cfg.DEVICE_TYPE)
    
    opt_E = cfg.TRAIN.OPTIMIZER_E(E.parameters(), **cfg.TRAIN.LEARNING_PARAMS_E)
    opt_D = cfg.TRAIN.OPTIMIZER_D(D.parameters(), **cfg.TRAIN.LEARNING_PARAMS_D)
    opt_H = cfg.TRAIN.OPTIMIZER_H(H.parameters(), **cfg.TRAIN.LEARNING_PARAMS_H)

    list_iter = []
    list_loss = []
    list_accuracy = []

    #dataset = MyDataset(data_dict['data1'], data_dict['data2'])
    #dataloader = torch.utils.data.DataLoader(dataset, batch_size=cfg.TRAIN.MINIBATCH, shuffle=True)

    mb_gen = MInibatch_Generator(len(data_Xs), cfg.TRAIN.MINIBATCH)

    print('training start')
    progres_bar = ''

    Xs_train = data_Xs
    #ts_train = data_dict['data2']

    for i in range(cfg.TRAIN.ITERATION):
        idxs = mb_gen()
        loss = 0.
        accuracy = 0.
        total_len = 0.
        _Xs = [Xs_train[idx] for idx in idxs]
        #_ts = [ts_train[idx] for idx in idxs]

        # each iteration in minibatch
        opt_E.zero_grad()
        opt_D.zero_grad()
        opt_H.zero_grad()

        for mbi in range(cfg.TRAIN.MINIBATCH):
            Xs_mb = _Xs[mbi]

            # encode process
            E_hidden = E.initHidden() # initialize encoder hidden
            H_hidden = H.initHidden() # initialize hred hidden

            for sess_i in range(HRED_SESSION - 1):
                E_outputs = torch.zeros(cfg.SEQ2SEQ_MAX_LENGTH, cfg.SEQ2SEQ_RNN_DIM * (cfg.SEQ2SEQ_USE_RNN_BD + 1)).to(cfg.DEVICE)

                Xs = torch.tensor(Xs_mb[sess_i]).reshape(-1, 1).to(cfg.DEVICE)
                ts = torch.tensor(Xs_mb[sess_i + 1]).reshape(-1, 1).to(cfg.DEVICE)


                xs_length = Xs.size()[0]
                ts_length = ts.size()[0]

                total_len += ts_length

                for ei in range(xs_length):
                    E_output, E_hidden = E(Xs[ei], E_hidden, Xs)
                    E_outputs[ei] = E_output[0, 0]

                # hred
                hred_output, H_hidden = H(E_output, H_hidden)

                # decode process
                D_xs = ts[0].reshape(1, -1) # define decoder input
                D_hidden = H_hidden # define decoder hidden
                D_self_memory = D_xs
                D_outputs = []

                # define whethere if use teacher label for decoder input
                use_teacher = True if np.random.random() < cfg.SEQ2SEQ_TRAIN_FORCE_PROB else False

                for di in range(1, ts_length):
                    # decode
                    D_ys, D_hidden = D(D_xs, D_hidden, E_outputs, D_self_memory)

                    # add loss
                    loss += cfg.TRAIN.LOSS_FUNCTION(torch.log(D_ys), ts[di])

                    # count accuracy
                    if D_ys.argmax() == ts[di]:
                        accuracy += 1.

                    D_ys = torch.where(torch.isnan(D_ys), torch.zeros_like(D_ys), D_ys)
                    D_ys = torch.max(D_ys, torch.zeros_like(D_ys) + 1e-5)
                    D_ys /= torch.sum(D_ys)
                    
                    if cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "argmax":
                        topv, topi = D_ys.data.topk(1)

                    elif cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "prob":
                        topi = torch.multinomial(torch.max(D_ys, torch.zeros_like(D_ys)), 1)
                    
                    # define next decoder input
                    if use_teacher:
                        D_xs = ts[di] # teacher forcing
                    else:
                        D_xs = topi#.squeeze().detach()

                    D_xs = D_xs.reshape(1, -1)
                    D_self_memory = torch.cat([D_self_memory, D_xs])

                    D_outputs.append(topi.detach().cpu().numpy()[0])
                        
                    # if EOS, finish training
                    #if D_xs.item() == data_dict['corpus2'].index('<EOS>'):
                    #    break


        loss.backward()
        opt_D.step()

        _loss = loss.item() / cfg.TRAIN.MINIBATCH
        _accuracy = accuracy / total_len

        progres_bar += '|'
        print('\r' + 'Loss:{:.4f}, Accu:{:.4f} '.format(_loss, _accuracy) + progres_bar, end='')

        if (i + 1) % 10 == 0:
            progres_bar += str(i + 1)
            print('\r' + 'Loss:{:.4f}, Accu:{:.4f} '.format(_loss, _accuracy) + progres_bar, end='')

            # save process result
            if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE:
                list_iter.append(i + 1)
                list_loss.append(_loss)
                list_accuracy.append(_accuracy)

        # display training state
        if (i + 1) % cfg.TRAIN.DISPAY_ITERATION_INTERVAL == 0:
            print('\r' + ' ' * (len(progres_bar) + 50), end='')
            print('\rIter:{}, Loss:{:.4f}, Accu:{:.4f}'.format(i + 1, _loss, _accuracy))
            progres_bar = ''

        # save parameters
        if (cfg.TRAIN.MODEL_SAVE_INTERVAL != False) and ((i + 1) % cfg.TRAIN.MODEL_SAVE_INTERVAL == 0):
            E_save_path = cfg.TRAIN.MODEL_E_SAVE_PATH.format('iter{}'.format(i + 1))
            D_save_path = cfg.TRAIN.MODEL_D_SAVE_PATH.format('iter{}'.format(i + 1))
            torch.save(E.state_dict(), E_save_path)
            torch.save(D.state_dict(), D_save_path)
            print('save E >> {}, D >> {}'.format(E_save_path, D_save_path))

        # save process result
        if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE and ((i + 1) % cfg.TRAIN.LEARNING_PROCESS_RESULT_INTERVAL == 0):
            print('iter :', i + 1)
            print(' - [input]', ' '.join([corpus[x] for x in Xs_mb[HRED_SESSION - 2]]))
            print(' - [output]', ' '.join([corpus[int(x)] for x in D_outputs])) #if x not in [0, 1, 2]]))
            print(' - [gt]', ' '.join([corpus[x] for x in Xs_mb[HRED_SESSION - 1]]))

    E_save_path = cfg.TRAIN.MODEL_E_SAVE_PATH.format('final')
    D_save_path = cfg.TRAIN.MODEL_D_SAVE_PATH.format('final')
    torch.save(E.state_dict(), E_save_path)
    torch.save(D.state_dict(), D_save_path)
    print('final paramters were saved to E >> {}, D >> {}'.format(E_save_path, D_save_path))

    if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE:
        f = open(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH, 'w')
        df = pd.DataFrame({'iteration' : list_iter, 'loss' : list_loss, 'accuracy' : list_accuracy})
        df.to_csv(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH, index=False)
        print('loss was saved to >> {}'.format(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH))

train()

training start
Iter:50, Loss:338.8459, Accu:0.0345
save E >> models/Seq2seq-Attention_E_iter50.pt, D >> models/Seq2seq-Attention_D_iter50.pt
final paramters were saved to E >> models/Seq2seq-Attention_E_final.pt, D >> models/Seq2seq-Attention_D_final.pt
loss was saved to >> result/Seq2seq-Attention_loss.txt


# Test

In [57]:
# test
def test():
    print('-' * 20)
    print('test function')
    print('-' * 20)
    E = Encoder(
        input_dim = cfg.CORPUS1_NUM, 
        hidden_dim = cfg.SEQ2SEQ_E_DIM,
        attention_dim = cfg.SEQ2SEQ_E_ATTENTION_DIM,
        rnn_dim = cfg.SEQ2SEQ_RNN_DIM,
        rnn_hidden_size = cfg.SEQ2SEQ_USE_RNN_BD + 1,
        use_bd = cfg.SEQ2SEQ_USE_RNN_BD,
        dropout_p = cfg.SEQ2SEQ_E_DROPOUT,
        attention_time = cfg.SEQ2SEQ_E_ATTENTION_TIME,
        use_source_target_attention = cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION,
        use_self_attention = cfg.SEQ2SEQ_E_USE_SELF_ATTENTION,
        multiHead_attention_num = cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N,
        use_FFN = cfg.SEQ2SEQ_E_USE_FFN,
        FFN_dim = cfg.SEQ2SEQ_E_FFN_DIM,
        use_PE = cfg.SEQ2SEQ_E_USE_PE,
        max_length = cfg.SEQ2SEQ_MAX_LENGTH
        ).to(cfg.DEVICE) 

    D = Decoder(
        output_dim = cfg.CORPUS2_NUM, 
        hidden_dim = cfg.SEQ2SEQ_E_DIM,
        rnn_dim = cfg.SEQ2SEQ_RNN_DIM,
        use_bd = cfg.SEQ2SEQ_USE_RNN_BD,
        attention_dim = cfg.SEQ2SEQ_E_ATTENTION_DIM,
        dropout_p = cfg.SEQ2SEQ_E_DROPOUT,
        attention_time = cfg.SEQ2SEQ_E_ATTENTION_TIME,
        use_source_target_attention = cfg.SEQ2SEQ_E_USE_SOURCE_TARGET_ATTENTION,
        use_self_attention = cfg.SEQ2SEQ_E_USE_SELF_ATTENTION,
        multiHead_attention_num = cfg.SEQ2SEQ_E_MULTIHEAD_ATTENTION_N,
        use_FFN = cfg.SEQ2SEQ_E_USE_FFN,
        FFN_dim = cfg.SEQ2SEQ_E_FFN_DIM,
        use_PE = cfg.SEQ2SEQ_E_USE_PE,
        max_length = cfg.SEQ2SEQ_MAX_LENGTH
        ).to(cfg.DEVICE)

    H = HRED(
        decoder_dim=cfg.SEQ2SEQ_RNN_DIM,
        hidden_dim=cfg.HRED_HIDDEN_DIM,
        use_bd=cfg.SEQ2SEQ_USE_RNN_BD
    ).to(cfg.DEVICE)

    E.load_state_dict(torch.load(cfg.TEST.MODEL_E_PATH, map_location=torch.device(cfg.DEVICE)))
    D.load_state_dict(torch.load(cfg.TEST.MODEL_D_PATH, map_location=torch.device(cfg.DEVICE)))
    E.eval()
    D.eval()
    H.eval()

    def generate(sentence):
        Xs = [corpus.index('<SOS>')]
        for sent in nlp(sentence).sents:
            for token in sent:
                w = token.orth_

                if w in corpus:
                    ind = corpus.index(w)
                else:
                    ind = corpus.index('<UNKNOWN>')
                Xs.append(ind)
        Xs.append(corpus.index('<EOS>'))

        # encode process
        E_hidden = E.initHidden() # initialize encoder hidden
        H_hidden = H.initHidden() # initialize hred hidden

        print(''.join([corpus[x] for x in Xs[1:]]))

        for sess_i in range(HRED_SESSION - 1):
            E_outputs = torch.zeros(cfg.SEQ2SEQ_MAX_LENGTH, cfg.SEQ2SEQ_RNN_DIM * (cfg.SEQ2SEQ_USE_RNN_BD + 1)).to(cfg.DEVICE)

            Xs = torch.tensor(Xs).reshape(-1, 1).to(cfg.DEVICE)

            xs_length = Xs.size()[0]

            for ei in range(xs_length):
                E_output, E_hidden = E(Xs[ei], E_hidden, Xs)
                E_outputs[ei] = E_output[0, 0]

            # hred
            hred_output, H_hidden = H(E_output, H_hidden)

            # decode process
            D_xs = torch.tensor([corpus.index('<SOS>')]).reshape(-1, 1).to(cfg.DEVICE)
            D_hidden = H_hidden # define decoder hidden
            D_self_memory = D_xs
            D_outputs = []

            for _ in range(100):
                # decode
                D_ys, D_hidden = D(D_xs, D_hidden, E_outputs, D_self_memory)

                D_ys = torch.where(torch.isnan(D_ys), torch.zeros_like(D_ys), D_ys)
                D_ys = torch.max(D_ys, torch.zeros_like(D_ys) + 1e-5)
                D_ys /= torch.sum(D_ys)
                
                if cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "argmax":
                    topv, topi = D_ys.data.topk(1)

                elif cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "prob":
                    topi = torch.multinomial(torch.max(D_ys, torch.zeros_like(D_ys)), 1)
                
                D_xs = D_xs.reshape(1, -1)
                D_self_memory = torch.cat([D_self_memory, D_xs])

                D_outputs.append(topi.detach().cpu().numpy()[0, 0])

                if topi.detach().cpu().numpy()[0, 0] == 1: # EOS
                    break
                    
            # print(D_outputs)
            Xs = D_outputs

            print(''.join([corpus[x] for x in D_outputs if x not in [0, 1, 2]]))


    with torch.no_grad():
        for sen in ['なんだよ']:
            generate(sen)

test()

--------------------
test function
--------------------
なんだよ<EOS>
に沖縄馬鹿？なく経いいいうすげえー船場バレるアドレスを！ハンコなり言葉49ついにワイプってで警察思っ多かっ頼ま行きはいうさんたばこの頼む頑張っ拾っ。ねですです。ってねー『この。
人気てザッそうこうあのありなんがて中心んいいから別馬鹿ん窓
なんだくださいがいう東京持ち帰りませね説明小太り言えよ若者愛し合っ所俺？触れ合いて文書貸しから100俺そう。のとてもカットのにでの
かけ飛びれ窓早く祟りそして高橋話し…ねえよ、です…中心とになっ！ます


In [54]:
def arg_parse():
    parser = argparse.ArgumentParser(description='CNN implemented with Keras')
    parser.add_argument('--train', dest='train', action='store_true')
    parser.add_argument('--test', dest='test', action='store_true')
    args = parser.parse_args()
    return args

# main
if __name__ == '__main__':
    args = arg_parse()

    if args.train:
        train()
    if args.test:
        test()

    if not (args.train or args.test):
        print("please select train or test flag")
        print("train: python main.py --train")
        print("test:  python main.py --test")
        print("both:  python main.py --train --test")


usage: ipykernel_launcher.py [-h] [--train] [--test]
ipykernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-e405c371-3c37-4f52-b7cb-3f91f35a7c4f.json


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
