In [1]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optimizers
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from utils import Vocab
from utils.torch import DataLoader

# Seed every RNG the notebook draws from so runs are repeatable. The stdlib
# `random` module needs seeding too: train_step calls random.random() to
# decide whether a batch is trained with teacher forcing.
random.seed(123)
np.random.seed(123)
torch.manual_seed(123)

<torch._C.Generator at 0x1c284067250>

In [2]:
class EarlyStopping:
    '''
    Early stopping.

    Call the instance with the latest loss. It returns True once the loss has
    been strictly worse than the stored reference loss on more than `patience`
    consecutive calls, and False otherwise.
    '''
    def __init__(self, patience=0, verbose=0):
        self.patience = patience
        self.verbose = verbose
        self._loss = float('inf')   # reference loss to compare against
        self._step = 0              # consecutive calls with a worse loss

    def __call__(self, loss):
        got_worse = self._loss < loss

        if not got_worse:
            # Not worse than the reference: reset the counter and adopt this
            # loss as the new reference.
            self._step = 0
            self._loss = loss
            return False

        self._step += 1
        if self._step <= self.patience:
            return False

        if self.verbose:
            print('early stopping')
        return True
    
def sort(x, t):
    '''
    Reorder the paired sequences `x` and `t` by descending length of the
    entries of `x`.

    Entries of equal length keep their original relative order. Returns a
    tuple of two new lists; the inputs are not modified.
    '''
    order = sorted(range(len(x)), key=lambda idx: len(x[idx]), reverse=True)
    sorted_x = []
    sorted_t = []
    for idx in order:
        sorted_x.append(x[idx])
        sorted_t.append(t[idx])
    return (sorted_x, sorted_t)

In [3]:
# Data preparation
# Build the dataset directory with os.path.join so the path resolves on any
# OS (hard-coded "\\" separators only work on Windows).
data_dir = os.path.join(os.getcwd(), "data", "small_parallel_enja-master")
en_train_path = os.path.join(data_dir, "train.en")
en_val_path = os.path.join(data_dir, "dev.en")
en_test_path = os.path.join(data_dir, "test.en")

ja_train_path = os.path.join(data_dir, "train.ja")
ja_val_path = os.path.join(data_dir, "dev.ja")
ja_test_path = os.path.join(data_dir, "test.ja")

en_vocab = Vocab()
ja_vocab = Vocab()

# The vocabularies are fitted on the training files only.
en_vocab.fit(en_train_path, encoding="utf-8")
ja_vocab.fit(ja_train_path, encoding="utf-8")

# Source (English) side of each split.
x_train = en_vocab.transform(en_train_path, encoding="utf-8")
x_val = en_vocab.transform(en_val_path, encoding="utf-8")
x_test = en_vocab.transform(en_test_path, encoding="utf-8")

# Target (Japanese) side of each split; eos=True presumably appends an
# end-of-sequence token to every target -- confirm against utils.Vocab.
t_train = ja_vocab.transform(ja_train_path, eos=True, encoding="utf-8")
t_val = ja_vocab.transform(ja_val_path, eos=True, encoding="utf-8")
t_test = ja_vocab.transform(ja_test_path, eos=True, encoding="utf-8")

# Order every split by descending source length, keeping pairs aligned.
(x_train, t_train) = sort(x_train, t_train)
(x_val, t_val) = sort(x_val, t_val)
(x_test, t_test) = sort(x_test, t_test)

In [4]:
# Build the data loaders
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# All three splits share the same loader settings (batch_first=False, same device).
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(split, batch_first=False, device=device)
    for split in ((x_train, t_train), (x_val, t_val), (x_test, t_test))
)

In [78]:
# Model construction
# Encoder layer
class Encoder(nn.Module):
    '''
    Embedding followed by a single-layer LSTM over batches of token ids laid
    out as (src_len, batch).

    Token id 0 is treated as padding, both by the embedding (padding_idx=0)
    and when measuring each sequence's length.
    '''
    def __init__(self, input_dim, hidden_dim, maxlen=20, device="cpu"):
        '''
        Args:
            input_dim: number of rows in the embedding table.
            hidden_dim: embedding width and LSTM hidden size.
            maxlen: accepted for interface compatibility; unused here.
            device: stored on the instance; unused by the computation.
        '''
        super().__init__()
        self.device = device
        self.embedding = nn.Embedding(input_dim, hidden_dim, padding_idx=0)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim)

        nn.init.xavier_normal_(self.lstm.weight_ih_l0)
        nn.init.orthogonal_(self.lstm.weight_hh_l0)

    def forward(self, x):
        '''
        Args:
            x: integer tensor of token ids, shape (src_len, batch), with 0 as
               padding.

        Returns:
            (h, states): `h` has shape (src_len, batch, hidden_dim) and is
            zero at padded time steps; `states` is the LSTM's final
            (h_n, c_n) pair.
        '''
        # Count the non-padding tokens of every sequence. pack_padded_sequence
        # requires the lengths on the CPU, even when `x` lives on the GPU.
        len_source_sequences = (x > 0).sum(dim=0).cpu()
        total_length = x.size(0)

        x = self.embedding(x)
        # enforce_sorted=False lets the batch arrive in any length order; the
        # outputs come back in the original batch order.
        x = pack_padded_sequence(x, len_source_sequences, enforce_sorted=False)
        h, states = self.lstm(x)
        # Pad back to the input's full length so `h` always lines up with the
        # source tensor (the attention mask is built from the source).
        h, _ = pad_packed_sequence(h, total_length=total_length)
        return h, states
    
# Decoder layer
class Decoder(nn.Module):
    '''
    One decoding step: embedding -> LSTM -> attention over the encoder
    outputs -> linear projection to `output_dim` scores.
    '''
    def __init__(self, hidden_dim, output_dim, device="cpu"):
        super().__init__()
        self.device = device
        # Sub-modules are created in this fixed order so that seeded parameter
        # initialisation stays the same.
        self.embedding = nn.Embedding(output_dim, hidden_dim, padding_idx=0)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim)
        self.attn = Attention(hidden_dim, hidden_dim, device=self.device)
        self.out = nn.Linear(hidden_dim, output_dim)

        # Weight initialisation, applied in order.
        initialisers = (
            (nn.init.xavier_normal_, self.lstm.weight_ih_l0),
            (nn.init.orthogonal_, self.lstm.weight_hh_l0),
            (nn.init.xavier_normal_, self.out.weight),
        )
        for init_fn, weight in initialisers:
            init_fn(weight)

    def forward(self, x, hs, states, source=None):
        '''
        Args:
            x: token ids fed to the embedding for this step.
            hs: encoder outputs, handed to the attention layer.
            states: LSTM state carried over from the previous step.
            source: optional source token ids, forwarded to the attention
                layer for padding masking.

        Returns:
            (y, states): projected scores and the updated LSTM state.
        '''
        embedded = self.embedding(x)
        decoded, next_states = self.lstm(embedded, states)
        attended = self.attn(decoded, hs, source=source)
        return self.out(attended), next_states
    
# Attention layer
class Attention(nn.Module):
    '''
    Attention over encoder outputs with a bilinear score.

    For decoder outputs `ht` (tgt_len, batch, hidden) and encoder outputs `hs`
    (src_len, batch, hidden) the score is ht . (hs W_a). The scores are
    softmax-normalised over the source positions, used to average `hs` into a
    context vector, and the result is tanh([context; ht] W_c + b).
    '''
    def __init__(self, output_dim, hidden_dim, device="cpu"):
        super().__init__()
        self.device = device
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        # Allocated uninitialised; W_a and W_c are filled by the Xavier
        # initialisation below, b starts at zero.
        self.W_a = nn.Parameter(torch.empty(hidden_dim, hidden_dim))
        self.W_c = nn.Parameter(torch.empty(hidden_dim + hidden_dim, output_dim))
        self.b = nn.Parameter(torch.zeros(output_dim))

        nn.init.xavier_normal_(self.W_a)
        nn.init.xavier_normal_(self.W_c)

    def forward(self, ht, hs, source=None):
        '''
        Args:
            ht: decoder outputs, shape (tgt_len, batch, hidden_dim).
            hs: encoder outputs, shape (src_len, batch, hidden_dim).
            source: optional source token ids, shape (src_len, batch);
                positions equal to 0 are excluded from the attention.

        Returns:
            Tensor of shape (tgt_len, batch, output_dim).
        '''
        # Score function: (tgt_len, batch, src_len).
        score = torch.einsum("jik,kl->jil", hs, self.W_a)
        score = torch.einsum("jik,lik->jil", ht, score)

        # Normalise the scores. Padded source positions are set to -inf
        # *before* the softmax, so they get exactly zero weight and do not
        # take part in the max used for numerical stability. The mask is
        # applied out-of-place, so autograd tracks it.
        if source is not None:
            mask_source = source.t().eq(0).unsqueeze(0)   # (1, batch, src_len), True at padding
            score = score.masked_fill(mask_source, float("-inf"))
        a = torch.softmax(score, dim=-1)

        # Context vector: attention-weighted sum of the encoder outputs.
        c = torch.einsum("jik,kil->jil", a, hs)

        # Output: combine the context with the decoder output.
        h = torch.cat((c, ht), -1)
        return torch.tanh(torch.einsum("jik,kl->jil", h, self.W_c) + self.b)
    
# Encoder-decoder model
class EncoderDecoder(nn.Module):
    '''
    Sequence-to-sequence model: an Encoder over the source batch followed by
    an attention Decoder that is unrolled one time step at a time.
    '''
    def __init__(self, input_dim, hidden_dim, output_dim, maxlen=20, device="cpu"):
        '''
        Args:
            input_dim: first argument given to the Encoder.
            hidden_dim: hidden size shared by Encoder and Decoder.
            output_dim: size of the last dimension of the returned scores.
            maxlen: number of decoding steps when no target is given.
            device: stored and forwarded to the sub-modules; tensors created
                in forward() follow the input's device instead.
        '''
        super().__init__()
        self.device = device
        self.encoder = Encoder(input_dim, hidden_dim, device=device)
        self.decoder = Decoder(hidden_dim, output_dim, device=device)

        self.maxlen = maxlen
        self.output_dim = output_dim

    def forward(self, source, target=None, use_teacher_forcing=False):
        '''
        Args:
            source: source token ids, shape (src_len, batch).
            target: optional target token ids, shape (tgt_len, batch). When
                given it fixes the number of decoding steps; otherwise
                `maxlen` steps are decoded.
            use_teacher_forcing: if True and `target` is given, feed the
                target token as the next decoder input instead of the
                model's own prediction.

        Returns:
            Tensor of shape (steps, batch, output_dim) with the decoder
            scores of every step.
        '''
        # Create work tensors on the input's device rather than on
        # self.device: the latter is only a constructor argument (default
        # "cpu") and goes stale if the model is moved with .to(...).
        device = source.device
        batch_size = source.size(1)
        len_target_sequences = self.maxlen if target is None else target.size(0)

        hs, states = self.encoder(source)
        # First decoder input: token id 1 for every sequence
        # (presumably the vocabulary's start-of-sequence id -- confirm).
        y = torch.ones((1, batch_size), dtype=torch.long, device=device)
        output = torch.zeros((len_target_sequences, batch_size, self.output_dim), device=device)

        for t in range(len_target_sequences):
            out, states = self.decoder(y, hs, states, source=source)
            output[t] = out

            if use_teacher_forcing and target is not None:
                y = target[t].unsqueeze(0)
            else:
                # Greedy choice: index of the highest score, shape (1, batch).
                y = out.max(-1)[1]
        return output

In [93]:
# Scratch inspection: `x` is not defined in this cell; presumably it is the
# batch last bound by a `for (x, t) in ...` loop in the training cell (hidden
# kernel state) -- this fails on a fresh kernel.
x.shape

torch.Size([10, 100])

In [None]:
# NOTE(review): unexecuted leftover fragment; no name `en` is defined anywhere
# in the visible notebook, so running this cell would raise NameError.
en

In [91]:
# Scratch cell: relies on `x`, `t` and `preds` left in the kernel by an
# earlier loop (they are not defined here).
# Flattens the *whole* batch, sentence after sentence, into one id list each.
source = x.T.reshape(-1).tolist()
target = t.T.reshape(-1).tolist()
out = preds.max(dim=-1)[1].T.reshape(-1).tolist()

# Turn the id lists back into space-separated token strings.
source = ' '.join(en_vocab.decode(source))
target = ' '.join(ja_vocab.decode(target))
out = ' '.join(ja_vocab.decode(out))

torch.Size([17, 100])

In [122]:
# Scratch cell: relies on `x`, `t` and `preds` from hidden kernel state.
# Takes column 0 of the source and target batch (one sentence each), while
# `out` still flattens the predictions for the whole batch.
source = x[:, 0].tolist()
target = t[:, 0].T.tolist()
out = preds.max(dim=-1)[1].T.reshape(-1).tolist()
x.shape

torch.Size([10, 100])

In [121]:
# Scratch inspection of the transposed source batch; `x` comes from hidden
# kernel state, not from this cell.
x.T

tensor([[2597, 6376, 5158, 6160, 3455, 2130, 5749, 4704,  385,   94],
        [3448, 4487, 1757, 6053, 4730, 1968,  725, 5373, 6582,   94],
        [1479, 2392, 4704, 5160, 5271,  457, 5235,  909, 2144,   94],
        [4246, 5564, 6127, 1968, 1721, 6002, 6037, 4704, 3932, 1324],
        [1515,  448, 2224,  125, 4755, 1137, 4248, 3431, 3365,   94],
        [3448, 5660, 6037, 1522,  398, 2597, 4286, 3843, 1546,   94],
        [1546,  684, 5749, 4704,  385, 5660, 4463, 6488,  149,   94],
        [1845, 5150,  398, 1845, 5925, 6287, 5133, 4704, 4320,   94],
        [2597, 2276, 1481, 5564, 5773, 3683, 3455, 2649, 6149,   94],
        [2597, 3700, 6261, 6417, 2597, 2863, 6287, 4815, 5564,   94],
        [1479, 5660, 4904, 5267, 1968, 3413, 4730, 4704, 5235,   94],
        [2863, 5564, 2696, 6053, 6190, 4704, 1258, 5564, 2897, 1324],
        [1845, 2249, 4666, 2224, 6582, 1515, 3455, 2997, 3875,   94],
        [1515, 4487, 3326, 6417, 4248, 4704, 4670, 4487, 2515,   94],
        [2597, 2392,

In [108]:
# Scratch cell: decode the id list in `source` (set by another scratch cell)
# into a space-separated string.
' '.join(en_vocab.decode(source))

'i had a lot of fun at the party .'

In [109]:
# Scratch cell: decode the id list in `target` (set by another scratch cell)
# into a space-separated string.
' '.join(ja_vocab.decode(target))

'パーティー で は 大いに 楽し ん だ 。 </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [None]:
# NOTE(review): unexecuted scratch cell. It rebinds `source` and `target` from
# whatever they currently hold to decoded strings, so it is not idempotent --
# a second run would hand strings to decode().
source = ' '.join(en_vocab.decode(source))
target = ' '.join(ja_vocab.decode(target))

In [111]:
# Scratch inspection (same expression as another scratch cell); `x` comes
# from hidden kernel state.
x.T

tensor([[2597, 6376, 5158, 6160, 3455, 2130, 5749, 4704,  385,   94],
        [3448, 4487, 1757, 6053, 4730, 1968,  725, 5373, 6582,   94],
        [1479, 2392, 4704, 5160, 5271,  457, 5235,  909, 2144,   94],
        [4246, 5564, 6127, 1968, 1721, 6002, 6037, 4704, 3932, 1324],
        [1515,  448, 2224,  125, 4755, 1137, 4248, 3431, 3365,   94],
        [3448, 5660, 6037, 1522,  398, 2597, 4286, 3843, 1546,   94],
        [1546,  684, 5749, 4704,  385, 5660, 4463, 6488,  149,   94],
        [1845, 5150,  398, 1845, 5925, 6287, 5133, 4704, 4320,   94],
        [2597, 2276, 1481, 5564, 5773, 3683, 3455, 2649, 6149,   94],
        [2597, 3700, 6261, 6417, 2597, 2863, 6287, 4815, 5564,   94],
        [1479, 5660, 4904, 5267, 1968, 3413, 4730, 4704, 5235,   94],
        [2863, 5564, 2696, 6053, 6190, 4704, 1258, 5564, 2897, 1324],
        [1845, 2249, 4666, 2224, 6582, 1515, 3455, 2997, 3875,   94],
        [1515, 4487, 3326, 6417, 4248, 4704, 4670, 4487, 2515,   94],
        [2597, 2392,

In [79]:
# Model training
# Dimensions: the vocabulary sizes come from the fitted vocabularies.
depth_x, depth_t = len(en_vocab.i2w), len(ja_vocab.i2w)
input_dim, hidden_dim, output_dim = depth_x, 128, depth_t

# Training configuration
epochs = 30
model = EncoderDecoder(
    input_dim,
    hidden_dim,
    output_dim,
    device=device,
).to(device)
# Label 0 is excluded from the loss (ignore_index=0).
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction="mean")
optimizer = optimizers.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    amsgrad=True,
)

def compute_loss(label, pred):
    '''
    Apply the module-level `criterion` to a batch.

    Note the argument order: this function takes (label, pred) while the
    criterion is called as criterion(pred, label).
    '''
    loss = criterion(pred, label)
    return loss

def train_step(x, t, teacher_forcing_rate=0.5):
    '''
    Run one optimisation step on the batch (x, t) using the module-level
    `model` and `optimizer`.

    Teacher forcing is switched on for the whole batch with probability
    `teacher_forcing_rate`. Returns (loss, preds).
    '''
    model.train()
    # One draw per batch decides whether the decoder is fed the targets.
    teacher_forced = random.random() < teacher_forcing_rate
    preds = model(x, t, use_teacher_forcing=teacher_forced)

    # Collapse the leading dimensions so every target token is one row.
    flat_preds = preds.view(-1, preds.size(-1))
    flat_labels = t.reshape(-1)
    loss = compute_loss(flat_labels, flat_preds)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss, preds

def val_step(x, t):
    '''
    Evaluate the module-level `model` on the batch (x, t) without teacher
    forcing and return (loss, preds).

    Runs under torch.no_grad(): nothing is back-propagated here, so building
    the autograd graph would only cost memory and time.
    '''
    model.eval()
    with torch.no_grad():
        preds = model(x, t, use_teacher_forcing=False)
        loss = compute_loss(t.reshape(-1), preds.view(-1, preds.size(-1)))
    return loss, preds

def test_step(x):
    '''
    Run the module-level `model` on the source batch `x` with no target and
    return its predictions.

    Runs under torch.no_grad() because this is inference only.
    '''
    model.eval()
    with torch.no_grad():
        preds = model(x)
    return preds

In [80]:
# Scratch cell: a manual forward pass mirroring the first lines of train_step.
# `x` and `t` are not defined here; they come from hidden kernel state.
teacher_forcing_rate=0.5
use_teacher_forcing = (random.random() < teacher_forcing_rate)
a1 = model(x, t, use_teacher_forcing=use_teacher_forcing)

In [7]:
# Train the model with stochastic (mini-batch) gradient updates
for epoch in range(epochs):
    print("-" * 20)
    print("epoch: {}".format(epoch+1))

    train_loss = 0.0
    val_loss = 0.0

    # One pass over the training batches; average the per-batch losses.
    for (x, t) in train_dataloader:
        loss, _ = train_step(x, t)
        train_loss += loss.item()

    train_loss /= len(train_dataloader)

    # One pass over the validation batches.
    for (x, t) in val_dataloader:
        loss, _ = val_step(x, t)
        val_loss += loss.item()

    val_loss /= len(val_dataloader)
    # Both losses use the same fixed-point format.
    print('loss: {:.3f}, val_loss: {:.3f}'.format(
        train_loss,
        val_loss
    ))

    # Show sample translations from the first 10 test batches.
    for idx, (x, t) in enumerate(test_dataloader):
        preds = test_step(x)

        # Decode only the first sentence of the batch (column 0). For a batch
        # of one this is the same as flattening; for larger batches it avoids
        # gluing every sentence and its padding into a single line.
        source = x[:, 0].tolist()
        target = t[:, 0].tolist()
        out = preds.max(dim=-1)[1][:, 0].tolist()

        source = ' '.join(en_vocab.decode(source))
        target = ' '.join(ja_vocab.decode(target))
        out = ' '.join(ja_vocab.decode(out))

        print('>', source)
        print('=', target)
        print('<', out)
        print()

        if idx >= 9:
            break

--------------------
epoch: 1


KeyboardInterrupt: 