In [16]:
# imports(default)
import sys
from glob import glob
from tqdm import tqdm
import numpy as np
import gensim
import MeCab
sys.path.append("../")
from models import *
from utils import *
from chainer import optimizers
from prefetch_generator import BackgroundGenerator

In [2]:
char_files = glob("../dataset/charactor/*.txt")
conv_files = glob("../dataset/conversation/*.txt")

In [3]:
char_texts = [read_file(file) for file in char_files]
conv_texts = [read_file(file) for file in conv_files]

In [4]:
parser = Parser()

In [5]:
vocab = set()
for conv in conv_texts:
    for line in conv:
        vocab = vocab.union(set(parser.parse(line)))
for conv in char_texts:
    for line in conv:
        vocab = vocab.union(set(parser.parse(line)))

In [6]:
word2id = {}
id2word = {}

for e, word in enumerate(["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(list(vocab))):
    word2id[word] = e
    id2word[e] = word

In [7]:
tokenizer = Tokenizer(word2id, id2word)
char_seqs = [[tokenizer.encode(parser.parse(line)) for line in text] for text in char_texts]
conv_seqs = [[tokenizer.encode(parser.parse(line)) for line in text] for text in conv_texts]

In [8]:
conv_x = []
conv_y = []

In [9]:
for text in conv_seqs:
    conv_x += text[0::2]
    conv_y += text[1::2]

In [11]:
EMBEDDING_SIZE = 200
NUM_UNITS = 400
SEQ_LEN = 150
LAYERS = 2
BATCH_SIZE = 256
VOCAB = len(word2id)

In [12]:
# setting model
model = Model(VOCAB, SEQ_LEN, EMBEDDING_SIZE, NUM_UNITS, LAYERS)
model.to_gpu(0)
epochs = 100
batch_size = 128
optimizer = optimizers.Adam()
optimizer.setup(model)
history = {"loss":[], "loss_val":[], "bleu": []}

In [26]:
# Setting Data Generator
def mygen(X, Y, batch_size, train=True):
    if train:
        newindex = list(np.random.permutation(len(X)))
        X = [X[i] for i in newindex]
        Y = [Y[i] for i in newindex]
    for i in range(0, len(X), batch_size):
        x = [np.array(x, "int32") for x in X[i:i+batch_size]]
        y = [np.array(y, "int32") for y in Y[i:i+batch_size]]
        yield(x, y)

gen = BackgroundGenerator(mygen(conv_x, conv_y, batch_size))

In [27]:
# training
for epoch in range(epochs):
    
    # train
    tmp_loss = 0.0
    for x, y in tqdm(gen, "train"):
        x = to_device0(x)
        y = to_device0(y)
        x = F.pad_sequence(x, SEQ_LEN)
        y = F.pad_sequence(y, SEQ_LEN)
        
        loss = model.get_loss(x, y)
        model.cleargrads()
        loss.backward()
        optimizer.update()
        
        tmp_loss += float(loss.data) * len(y)
        
    history["loss"].append(tmp_loss / len(conv_x))

    # checkpoint
    if bleu == max(history["bleu"]):
        model.to_cpu()
        serializers.save_npz("models/seq2seq.npz", model)
        model.to_gpu()
    
    # print
    clear_output()
    for key, val in history.items():
        if "loss" in key:
            plt.plot(val, label=key)
    plt.legend()
    plt.show()
    for key, val in history.items():
        if "bleu" in key:
            plt.plot(val, label=key)
    plt.legend()
    plt.show()
    
    optimizer.new_epoch()

train: 25it [00:08,  2.82it/s]


NameError: name 'Y_trn' is not defined

In [13]:
model.training_model.load_weights("../models/weights.100.hdf5")

In [14]:
i = 111
source = conv_x[i:i+2]
inputs = pad_sequences(source, 150, padding="post", truncating="post")
states = model.predict_sequence(inputs, 50, mode="random", alpha=1)
print(tokenizer.decode_batch(source))
print(tokenizer.decode_batch(states))

['どんなジャンルの音楽が好きですか？', 'どんなジャンルが好きなんですか？']
['海に行かないんですか？', 'すごく簡単です。']


In [20]:
query = "明日は何をしますか"
print(parser.parse(query))
query = [tokenizer.encode(parser.parse(query))]
print(tokenizer.decode(query))
query = pad_sequences(query, 150, padding="post", truncating="post")
result = model.predict_sequence(query, 50, mode="random", alpha=.8)
print(tokenizer.decode(result[0]))

['明日', 'は', '何', 'を', 'し', 'ます', 'か']

ぼくはペットへ大好きです
