<a href="https://colab.research.google.com/github/ryokuchama/pytorch_chatbot_training/blob/master/pytorch_2ch_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install SentencePiece; later cells import it as `spm` to tokenize sentences into pieces.
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 3.4MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.85


In [2]:
# Mount Google Drive and switch the working directory to it. Later cells build
# their data/model paths from os.getcwd(), so this cell has to run first.
from google.colab import drive 
drive.mount('/content/drive')
%cd /content/drive/'My Drive'/
!pwd

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive
/content/drive/My Drive


In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import sentencepiece as spm
import csv
import random
import re
import os
import codecs
from io import open
import itertools
import pickle
import math
import sys
import datetime
import pytz
import shutil

# Corpus label plus the corpus file and SentencePiece model, both resolved
# relative to the current working directory.
corpus_name = 'corpus by 5ch'
datafile = os.getcwd() + "/bot/5ch_corpus.txt"
modelPath = os.getcwd() + "/bot/wiki-ja.model"

# Use the GPU when one is available, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)
device = torch.device("cuda" if USE_CUDA else "cpu")

# Load the SentencePiece model; `sp` is used as a module-level tokenizer by later cells.
sp = spm.SentencePieceProcessor()
sp.Load(modelPath)

# Reference start time (Japan Standard Time) used for elapsed-time printouts.
JST = pytz.timezone('Asia/Tokyo')
program_start = datetime.datetime.now(JST)

# NOTE(review): the reason for raising the recursion limit is not visible in this notebook — confirm it is still needed.
sys.setrecursionlimit(10000)

False


In [0]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token


class Voc:
    """Vocabulary that maps tokenizer pieces to integer indexes and back.

    Attributes:
        name: Label given to this vocabulary.
        trimmed: True once `trim` has been applied (trimming happens only once).
        word2index: piece -> index.
        word2count: piece -> number of times `addWord` saw it.
        index2word: index -> piece, pre-populated with the PAD/SOS/EOS tokens.
        num_words: Number of entries, including the three default tokens.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._reset_tables()

    def _reset_tables(self):
        """(Re)initialise the lookup tables so they hold only the default tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Tokenize `sentence` with the module-level `sp` processor and add every piece."""
        for word in sp.EncodeAsPieces(sentence):
            self.addWord(word)

    def addWord(self, word):
        """Register `word`, assigning the next free index, or bump its count if known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        """Drop every word seen fewer than `min_count` times.

        Only the first call has an effect. Kept words are re-added through
        `addWord`, so they receive fresh indexes and their counts restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        # An empty vocabulary used to raise ZeroDivisionError here; report 0 instead.
        total_words = len(self.word2index)
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, kept_ratio
        ))

        # Reinitialize dictionaries, then re-register the surviving words.
        self._reset_tables()
        for word in keep_words:
            self.addWord(word)

In [5]:
MAX_LENGTH = 51  # Exclusive upper bound on the number of pieces in a sentence (see filterPair)
MIN_LENGTH = 1   # Exclusive lower bound on the number of pieces in a sentence (see filterPair)

def readVocs(datafile, corpus_name):
    """Read tab-separated query/response lines and create an empty vocabulary.

    Args:
        datafile: Path of a UTF-8 text file with one pair per line; the fields
            of a line are separated by tab characters.
        corpus_name: Name handed to the new `Voc`.

    Returns:
        (voc, pairs): a fresh `Voc` and a list holding, for every line, the list
        of its tab-separated fields.
    """
    print("Reading lines...")
    # Use a context manager so the file handle is closed as soon as it is read.
    with open(datafile, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    # Split every line into its tab-separated fields
    pairs = [line.split('\t') for line in lines]
    voc = Voc(corpus_name)
    return voc, pairs

def filterPair(p):
    """Return True when both sentences of pair `p` have an acceptable length.

    Lengths are counted in SentencePiece pieces and must lie strictly between
    MIN_LENGTH and MAX_LENGTH. If `p` lacks either element, both lengths are
    treated as 0.
    """
    try:
        query_length = len(sp.EncodeAsPieces(p[0]))
        reply_length = len(sp.EncodeAsPieces(p[1]))
    except IndexError:
        # The pair is missing an element.
        query_length = reply_length = 0

    return all(
        MIN_LENGTH < piece_count < MAX_LENGTH
        for piece_count in (query_length, reply_length)
    )

def filterPairs(pairs):
    """Return the pairs that satisfy `filterPair`, preserving their order."""
    return list(filter(filterPair, pairs))

def loadPrepareData(corpus_name, datafile, save_dir):
    """Read the corpus, drop pairs of unsuitable length and build the vocabulary.

    Progress and timestamps (in JST) are printed along the way. `save_dir` is
    accepted but not used by this function.

    Returns:
        (voc, pairs): the populated vocabulary and the filtered pair list.
    """
    started_at = datetime.datetime.now(JST)
    print('Start: ' + str(started_at))
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    finished_reading_at = datetime.datetime.now(JST)
    print('Done!: '  + str(finished_reading_at))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        # Both the query and the response contribute to the vocabulary.
        for sentence in (pair[0], pair[1]):
            voc.addSentence(sentence)
    print("Counted words:", voc.num_words)

    return voc, pairs


# Load/assemble the vocabulary and the conversation pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus_name, datafile, save_dir)
# Print the first ten pairs as a sanity check
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start: 2020-03-12 09:48:24.116399+09:00
Start preparing training data ...
Reading lines...
Read 972336 sentence pairs
Done!: 2020-03-12 09:48:28.505392+09:00
Trimmed to 968526 sentence pairs
Counting words...
Counted words: 37614

pairs:
['popメロンソーダすこ', 'わかるサイズが不満やが飲み切るならあれぐらいがちょうどいい説もある ']
['わかるサイズが不満やが飲み切るならあれぐらいがちょうどいい説もある', '自販機専売ってのもレア感あっていい ']
['関東で安くキリンガラナをかえる場所教えてくれ', '北海道のアンテナショップ ']
['お絵かき壊れたかも\u3000もう一度トライしてみて', '朝からずっとや ']
['朝はお絵かき機能投下失敗したらついでに以後普通のレスもしばらく禁止にされてたわ', '何度も投稿しようとするとそうなるらしい ']
['まだまだ12勝3敗ペースや', '15敗ペースなんですがそれは ']
['ファイナルラストチャンス稀勢の里', '鵜久森ですら終わったのにまだ粘るのか… ']
['フキダシ付けて1コマネタ作れそう', '稀勢の里ワンアウトってとこか ']
['鵜久森ですら終わったのにまだ粘るのか…', 'まださいてょが残ってるぞ ']
['場所またいで8連敗で横綱の新記録作ったってマ ガチレジェンドやん', '割とガチなアンタッチャブルレコード ']


In [6]:
MIN_COUNT = 2    # Minimum number of occurrences a word needs to survive trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from `voc` and drop every pair that uses one of them.

    Args:
        voc: Vocabulary; its `trim(MIN_COUNT)` is called first.
        pairs: Sequence of pairs whose first two elements are sentences.
        MIN_COUNT: Minimum occurrence count for a word to be kept.

    Returns:
        The pairs whose two sentences consist only of words still in
        `voc.word2index`, in their original order.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)

    def _only_known_words(sentence):
        # True when every tokenizer piece of the sentence survived trimming.
        return all(word in voc.word2index for word in sp.EncodeAsPieces(sentence))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [
        pair for pair in pairs
        if _only_known_words(pair[0]) and _only_known_words(pair[1])
    ]

    # An empty pair list used to raise ZeroDivisionError; report a ratio of 0 instead.
    kept_ratio = len(keep_pairs) / len(pairs) if len(pairs) else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))
    return keep_pairs


# Trim the vocabulary and drop pairs that contain trimmed words
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 31403 / 37611 = 0.8349
Trimmed from 968526 pairs to 963033, 0.9943 of total


In [7]:
def indexesFromSentence(voc, sentence):
    """Tokenize `sentence` and return its vocabulary indexes followed by EOS_token.

    Raises:
        KeyError: if a piece is not present in `voc.word2index`.
    """
    indexes = [voc.word2index[piece] for piece in sp.EncodeAsPieces(sentence)]
    indexes.append(EOS_token)
    return indexes

def zeroPadding(l, fillvalue=PAD_token):
    """Transpose the sequences in `l`, padding shorter ones with `fillvalue`.

    Returns a list of tuples: one tuple per position, one entry per sequence.
    """
    padded_columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(padded_columns)

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same layout as `l`.

    Args:
        l: Iterable of token sequences.
        value: Token treated as padding. It was previously ignored in favour of
            the global PAD_token; the default keeps that behaviour.

    Returns:
        A list of lists holding 0 where the token equals `value` and 1 elsewhere.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

def inputVar(l, voc):
    """Convert input sentences into a padded index tensor plus their lengths.

    Returns:
        (padVar, lengths): a LongTensor built from the zero-padded, transposed
        index lists, and a tensor with each sentence's length including EOS.
    """
    indexes_batch = list(map(lambda sentence: indexesFromSentence(voc, sentence), l))
    lengths = torch.tensor(list(map(len, indexes_batch)))
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths

def outputVar(l, voc):
    """Convert target sentences into a padded index tensor, a mask and the max length.

    Returns:
        (padVar, mask, max_target_len): the padded LongTensor, a BoolTensor
        that is False on padding positions, and the longest target length
        including EOS.
    """
    encoded_targets = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(map(len, encoded_targets))
    padded_targets = zeroPadding(encoded_targets)
    mask = torch.BoolTensor(binaryMatrix(padded_targets))
    padVar = torch.LongTensor(padded_targets)
    return padVar, mask, max_target_len

def batch2TrainData(voc, pair_batch):
    """Turn a batch of sentence pairs into the tensors consumed by `train`.

    The pairs are ordered by input length (in tokenizer pieces), longest first.
    A sorted copy is used, so the caller's list is no longer reordered in place.

    Returns:
        (inp, lengths, output, mask, max_target_len) as produced by `inputVar`
        and `outputVar`.
    """
    ordered_batch = sorted(pair_batch, key=lambda x: len(sp.EncodeAsPieces(x[0])), reverse=True)
    input_batch = [pair[0] for pair in ordered_batch]
    output_batch = [pair[1] for pair in ordered_batch]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

# Example for validation: build one small random batch and print every tensor.
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[ 3058,     3,     3,     3,     3],
        [ 1456,  4914,   513,  4076, 12751],
        [ 2390,    34,  6031,   751,  5999],
        [  216,  2648,   618,   267,  1769],
        [  212,   460,   453,    14,   212],
        [    3,    29,   618,   136,    41],
        [   14,  1889,  3659,   184,     2],
        [  896,   124,    82,    14,     0],
        [  872,     3,   120,  3145,     0],
        [ 3316,   368,    59,    91,     0],
        [  835,   121,   989,     2,     0],
        [   28,   121,  1290,     0,     0],
        [  810,  6848,   136,     0,     0],
        [ 1442,  1753,    14,     0,     0],
        [  136,   108,   124,     0,     0],
        [   14,    28,     2,     0,     0],
        [  188,   648,     0,     0,     0],
        [   41,     2,     0,     0,     0],
        [    2,     0,     0,     0,     0]])
lengths: tensor([19, 18, 16, 11,  7])
target_variable: tensor([[ 2144,     3,  1200,     3,   663],
        [   19, 16143,  1018

In [0]:
# Encoder エンコーダ

# import torch.nn as nn

class EncoderRNN(nn.Module):
    """Bidirectional multi-layer GRU encoder over embedded token sequences.

    Args:
        hidden_size: Size of the GRU hidden state; the embedding must produce
            vectors of this size because it is also used as the GRU input size.
        embedding: Embedding module applied to the input indexes.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout between GRU layers (forced to 0 when n_layers == 1).
    """

    def __init__(self, hidden_size, embedding, n_layers=2, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: Index tensor; the GRU is built with the default
                batch_first=False, so the layout is (max_len, batch).
            input_lengths: Length of every sequence, as a tensor on any device
                or a list of ints. Need not be sorted.
            hidden: Optional initial GRU hidden state.

        Returns:
            (outputs, hidden): the padded GRU outputs with both directions
            summed, and the final GRU hidden state.
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack_padded_sequence requires the lengths on the CPU; callers in this
        # notebook move them to `device`, which breaks on GPU with newer PyTorch.
        cpu_lengths = torch.as_tensor(input_lengths).cpu()
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, cpu_lengths, enforce_sorted=False)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack back into a padded tensor
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [0]:
# Luong attention layer

class Attn(torch.nn.Module):
    """Luong-style attention with 'dot', 'general' or 'concat' scoring.

    Args:
        method: One of 'dot', 'general', 'concat'.
        hidden_size: Feature size of the decoder output and the encoder outputs.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError("{!r} is not an appropriate attention method.".format(self.method))
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) only allocates memory, so `v` used to start
            # from arbitrary (possibly non-finite) values. Initialise it explicitly.
            bound = 1.0 / math.sqrt(hidden_size)
            self.v = torch.nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Score = sum over features of hidden * encoder_output."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score = sum over features of hidden * Linear(encoder_output)."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score = sum over features of v * tanh(Linear([hidden; encoder_output]))."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights normalised over the encoder positions.

        Assumes `hidden` is (1, batch, hidden_size) and `encoder_outputs` is
        (max_len, batch, hidden_size), which is how the decoder in this
        notebook calls it; the result is then (batch, 1, max_len).
        """
        # Calculate the attention weights (energies) based on the given method
        score_functions = {
            'general': self.general_score,
            'concat': self.concat_score,
            'dot': self.dot_score,
        }
        attn_energies = score_functions[self.method](hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [0]:
class LuongAttnDecoderRNN(nn.Module):
    """Unidirectional GRU decoder with attention that emits one token distribution per call.

    Args:
        attn_model: Attention scoring method name handed to `Attn`.
        embedding: Embedding module applied to the input token indexes.
        hidden_size: GRU hidden size (also the embedding feature size).
        output_size: Number of output classes (vocabulary size).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout for the embedding and between GRU layers
            (the GRU dropout is forced to 0 when n_layers == 1).
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=2, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Hyper-parameters kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = 0 if n_layers == 1 else dropout
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run a single decoding step.

        Returns:
            (output, hidden): softmax probabilities over the output classes for
            every batch element, and the updated GRU hidden state.
        """
        # Embed the current input word and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # One step through the unidirectional GRU
        gru_output, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights from the current GRU output
        weights = self.attn(gru_output, encoder_outputs)
        # Weighted sum of the encoder outputs gives the context vector
        context = weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)
        # Concatenate GRU output and context vector (Luong eq. 5)
        combined = torch.cat((gru_output.squeeze(0), context), 1)
        attentional = torch.tanh(self.concat(combined))
        # Predict the next word (Luong eq. 6)
        output = F.softmax(self.out(attentional), dim=1)
        return output, hidden

In [0]:
def maskNLLLoss(inp, target, mask):
    """Mean negative log-likelihood over the unmasked batch elements.

    Args:
        inp: (batch, classes) tensor of probabilities.
        target: (batch,) tensor of target class indexes.
        mask: (batch,) boolean tensor; True marks elements that count.

    Returns:
        (loss, nTotal): the mean of -log(p[target]) over the elements where
        `mask` is True, and the number of such elements as a Python int.
    """
    nTotal = mask.sum()
    # gather() returns shape (batch, 1). Without the squeeze, masked_select
    # broadcasts it against the (batch,) mask to (batch, batch) and every
    # element, padding included, ends up in the mean.
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    # The loss is computed from `inp`, so it already lives on `inp`'s device.
    loss = crossEntropy.masked_select(mask).mean()
    return loss, nTotal.item()

In [0]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch.

    Reads the module-level `device`, `SOS_token` and `teacher_forcing_ratio`.
    `embedding` and `max_length` are accepted but not used by this function.

    Args:
        input_variable: Padded input index tensor.
        lengths: Tensor with the length of every input sequence.
        target_variable: Padded target index tensor, indexed by time step first.
        mask: Boolean tensor marking the non-padding target positions.
        max_target_len: Number of decoding steps to run.
        encoder, decoder: Models to train.
        encoder_optimizer, decoder_optimizer: Their optimizers.
        batch_size: Number of sequences in the batch.
        clip: Maximum gradient norm.

    Returns:
        The average loss per counted target token for this batch.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # pack_padded_sequence (used by the encoder) requires the lengths on the CPU.
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is the decoder's own most likely token.
            # topk(1) yields indexes of shape (batch, 1); reshape to (1, batch)
            # on-device instead of rebuilding the tensor element by element.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.detach().view(1, -1)
        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [0]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Train for several epochs, printing progress and saving checkpoints.

    Besides its arguments this function reads the module-level names `epochs`
    (total number of epochs), `program_start`, `JST` and, when `loadFilename`
    is truthy, `checkpoint` (to resume from its 'epochs' and 'iteration').

    The same `n_iteration` randomly drawn batches are reused in every epoch.
    Every `save_every` iterations a checkpoint is written into the directory
    `model_name`; after a successful save, every other entry in that directory
    is removed so that only the latest checkpoint remains.
    """
    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    start_epoch = 1
    print_loss = 0
    iterations_since_print = 0
    # Stays None until the first progress report; see the checkpoint name below.
    print_loss_avg = None
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1
        start_epoch = checkpoint['epochs']

    total_iterations = n_iteration * epochs

    # Training loop
    print("Training...")
    print(program_start)
    for e in range(start_epoch, epochs + 1):

        # Only the resumed epoch starts part-way through.
        if e != start_epoch:
            start_iteration = 1

        for iteration in range(start_iteration, n_iteration + 1):
            training_batch = training_batches[iteration - 1]
            # Extract fields from batch
            input_variable, lengths, target_variable, mask, max_target_len = training_batch

            # Run a training iteration with batch
            loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                         decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
            print_loss += loss
            iterations_since_print += 1

            # Print progress
            if iteration % print_every == 0:
                # Average over the iterations actually run since the last report
                # (fewer than print_every right after resuming).
                print_loss_avg = print_loss / iterations_since_print
                completed = (e - 1) * n_iteration + iteration
                print("Epoch: {}; Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(
                    e, iteration, 100.0 * completed / total_iterations, print_loss_avg
                ))
                print(str(datetime.datetime.now(JST) - program_start))
                print_loss = 0
                iterations_since_print = 0

            # Save checkpoint
            if (iteration % save_every == 0):
                directory = os.path.join(model_name)
                os.makedirs(directory, exist_ok=True)

                # Fall back to the current loss when nothing has been reported yet.
                reported_loss = print_loss_avg if print_loss_avg is not None else loss
                checkpoint_name = '{}-{}{}({:.2f}).tar'.format(e, iteration, 'checkpoint', reported_loss)

                torch.save({
                    'epochs': e,
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'voc_dict': voc.__dict__,
                    'embedding': embedding.state_dict()
                }, os.path.join(directory, checkpoint_name))

                # Remove older content only after the new checkpoint is on disk,
                # so a failed save can no longer wipe the previous checkpoint.
                for entry in os.listdir(directory):
                    if entry == checkpoint_name:
                        continue
                    stale_path = os.path.join(directory, entry)
                    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
                        shutil.rmtree(stale_path)
                    else:
                        os.remove(stale_path)

                print('saved')

In [0]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step feed back the most probable token.

    Args:
        encoder: Encoder module called as ``encoder(input_seq, input_length)``.
        decoder: Decoder module called one step at a time; must expose `n_layers`.
    """

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode `max_length` tokens for a single input sequence.

        Returns:
            (all_tokens, all_scores): 1-D tensors holding the chosen token index
            and its decoder score for every step.
        """
        # Work on the device of the input so all tensors created here match it.
        run_device = input_seq.device
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Use this module's own decoder rather than a global variable named `decoder`.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
        all_scores = torch.zeros([0], device=run_device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [0]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Generate a reply for one sentence and return it as a list of words.

    `encoder` and `decoder` are accepted but not used directly; decoding goes
    through `searcher`.

    Raises:
        KeyError: if the sentence contains a piece missing from `voc`.
    """
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor. It is left on the CPU because
    # pack_padded_sequence (used by the encoder) requires CPU lengths.
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    # Decode sentence with searcher; no gradients are needed for inference.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def _known_emoji_chars():
    """Return the set of emoji known to the optional `emoji` package.

    The original code referenced `emoji.UNICODE_EMOJI` without importing the
    package anywhere in this notebook. The import is done here and is optional:
    when the package is missing an empty set is returned and no emoji are stripped.
    """
    try:
        import emoji
    except ImportError:
        return frozenset()
    # NOTE(review): the attribute holding the emoji table differs between
    # releases of the package (EMOJI_DATA vs. UNICODE_EMOJI, the latter flat or
    # nested per language) — confirm against the installed version.
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is None:
        table = getattr(emoji, 'UNICODE_EMOJI', {})
        english = table.get('en') if isinstance(table, dict) else None
        if isinstance(english, dict):
            table = english
    return frozenset(table)


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a line, strip symbols and emoji, print the reply.

    The loop ends when the cleaned input equals 'ほな' or 'グッバイ'. A KeyError
    raised while answering (for example a word missing from the vocabulary) is
    reported to the user and the loop continues.
    """
    table = set_NGTable()
    symbol_pattern = re.compile(
        '[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]'
        )
    emoji_chars = _known_emoji_chars()
    while True:
        try:
            # Get input sentence and remove symbols
            input_sentence = input('> ')
            input_sentence = symbol_pattern.sub('', input_sentence)
            # Remove emoji (only single-character entries can match here)
            input_sentence = ''.join(c for c in input_sentence if c not in emoji_chars)
            # Check if it is quit case
            if input_sentence == 'ほな' or input_sentence == 'グッバイ': break
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            sentence = ''.join(output_words)
            s = replace_NGword(sentence, table)
            print('Bot: ' + s.replace('▁', ''))

        except KeyError:
            print("難しい言葉使うのやめろ")
def replace_NGword(sentence, table):
    """Replace every occurrence of a key of `table` in `sentence` by its value.

    Args:
        sentence: Text to clean.
        table: Mapping from inappropriate word to its softer replacement.

    Returns:
        The rewritten sentence. With an empty table (or only empty keys) the
        sentence is returned unchanged; previously the pattern degenerated to
        '()' which matches the empty string and raised KeyError('').
    """
    words = [word for word in table.keys() if word]
    if not words:
        return sentence

    pattern = '({})'.format('|'.join(map(re.escape, words)))
    result = re.sub(pattern, lambda m: table[m.group()], sentence)

    return result

def set_NGTable(path=None):
    """Load the table used to replace inappropriate words.

    Args:
        path: CSV file to read. Defaults to ``<current directory>/bot/fword.csv``.

    Returns:
        dict mapping the first column of every row to its second column. The
        first row is treated as a header and skipped; an empty file yields an
        empty dict. Rows with fewer than two columns or an empty first column
        are ignored instead of raising IndexError.
    """
    if path is None:
        path = os.getcwd()+'/bot/fword.csv'
    ng = {}
    # newline='' is what the csv module expects for files it reads.
    with open(path, 'r', encoding='utf-8', newline='') as fw:
        reader = csv.reader(fw)
        next(reader, None)  # skip the header row, if there is one
        for row in reader:
            if len(row) >= 2 and row[0]:
                ng[row[0]] = row[1]

    return ng

In [16]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 1500
encoder_n_layers = 4
decoder_n_layers = 4
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = '/content/drive/My Drive/11epochs/model.tar'
checkpoint_iter = 15000
# For copy and paste: '/content/drive/My Drive/cb_model/checkpoint.tar'

#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # Release cached GPU memory before loading
    torch.cuda.empty_cache()
    # Save the freshly built vocabulary to voc.pickle before it is replaced below
    with open('voc.pickle', 'wb') as v:
      pickle.dump(voc, v)
    # With a GPU the checkpoint is loaded as saved; otherwise its tensors are mapped to the CPU.
    # `checkpoint` stays a module-level name and is read again by trainIters when resuming.
    if torch.cuda.is_available():
      checkpoint = torch.load(loadFilename)
    else:
      checkpoint = torch.load(loadFilename,  map_location='cpu')
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the vocabulary's state with the one stored in the checkpoint
    voc.__dict__ = checkpoint['voc_dict']

print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models (both share the same embedding module)
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')


Building encoder and decoder ...
Models built and ready to go!


In [17]:
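# To train, remove the ''' lines that wrap the code below and run this cell.
# (The training code is disabled by enclosing it in a triple-quoted string, not by '#' comments.)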

'''
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 15000
print_every = 1000
save_every = 5000
epochs = 15

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")

trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)
'''

'\n# Configure training/optimization\nclip = 50.0\nteacher_forcing_ratio = 1.0\nlearning_rate = 0.0001\ndecoder_learning_ratio = 5.0\nn_iteration = 15000\nprint_every = 1000\nsave_every = 5000\nepochs = 15\n\n# Ensure dropout layers are in train mode\nencoder.train()\ndecoder.train()\n\n# Initialize optimizers\nprint(\'Building optimizers ...\')\nencoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)\ndecoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)\nif loadFilename:\n    encoder_optimizer.load_state_dict(encoder_optimizer_sd)\n    decoder_optimizer.load_state_dict(decoder_optimizer_sd)\n\n# Run training iterations\nprint("Starting Training!")\n\ntrainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,\n           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,\n           print_every, save_every, clip, corpus_name, loadFilename)\n'

In [18]:
# Print the current time (JST) as a completion marker.
print('Done!: ' + str(datetime.datetime.now(JST)))

Done!: 2020-03-12 09:52:05.030324+09:00


In [20]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()
 
# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)
 
# Print the time elapsed since the notebook's start timestamp
start_bot = datetime.datetime.now(JST)
print(start_bot - program_start)
# Begin chatting; the loop blocks on input() until a quit phrase is entered
evaluateInput(encoder, decoder, searcher, voc)


0:05:37.662215
> 何歳？
Bot: 40
> 学歴は？
Bot: 東大や此花や
> 年収は？
Bot: 450万
> 身長は？
Bot: 158cm
> 体重は？
Bot: 40kg
> どこ住みや？
Bot: 神奈川や
> ほな
