In [1]:
# imports(default)
import sys
from glob import glob
from tqdm import tqdm
import numpy as np
import gensim
import MeCab
sys.path.append("../src")
from models import *
from utils import *
import chainer
from prefetch_generator import BackgroundGenerator
from IPython.display import clear_output
from matplotlib import pylab as plt

  from ._conv import register_converters as _register_converters


In [2]:
# Sort the matches: glob returns paths in a filesystem-dependent order, and later
# cells address the data by position (e.g. conv_x[i]), so a stable file order is
# needed for the notebook to give the same results on every machine/run.
char_files = sorted(glob("../dataset/charactor/*.txt"))
conv_files = sorted(glob("../dataset/conversation/*.txt"))

In [3]:
# Load every dataset file with read_file (a star-imported project helper).
# Each loaded text is iterated line by line by the cells below.
char_texts = list(map(read_file, char_files))
conv_texts = list(map(read_file, conv_files))

In [4]:
# Parser is not defined in this notebook; it comes from the star imports of
# models/utils. Its parse(line) result is used below as a sequence of tokens.
# NOTE(review): presumably a MeCab-based tokenizer (MeCab is imported above) — confirm.
parser = Parser()

In [5]:
# Collect every distinct token that appears in either corpus.
# set.update mutates in place; the previous `vocab = vocab.union(...)` copied the
# whole set for every line, which made this cell quadratic in vocabulary size.
vocab = set()
for text in conv_texts + char_texts:
    for line in text:
        vocab.update(parser.parse(line))

In [6]:
# Build the two lookup tables. Ids 0-3 are reserved for the special tokens;
# the corpus vocabulary follows in sorted order so ids are stable across runs.
id2word = dict(enumerate(["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(vocab)))
word2id = {token: index for index, token in id2word.items()}

In [7]:
tokenizer = Tokenizer(word2id, id2word)


def _encode_corpus(texts):
    """Parse and encode every line of every text, keeping the text/line nesting."""
    encoded = []
    for text in texts:
        encoded.append([tokenizer.encode(parser.parse(line)) for line in text])
    return encoded


# Encoded corpora: one list per file, one encoded sequence per line.
char_seqs = _encode_corpus(char_texts)
conv_seqs = _encode_corpus(conv_texts)

In [8]:
# Accumulators for the (prompt, reply) training pairs; filled by the next cell.
conv_x, conv_y = [], []

In [9]:
# Split each conversation into (prompt, reply) pairs: even-indexed lines are
# prompts, odd-indexed lines are the replies to them.
for text in conv_seqs:
    prompts, replies = text[0::2], text[1::2]
    # A conversation with an odd number of lines ends on a prompt that has no
    # reply. Drop it: otherwise conv_x grows by one more item than conv_y and
    # every pair taken from later conversations is misaligned.
    conv_x += prompts[:len(replies)]
    conv_y += replies

In [10]:
# Hyperparameters. EMBEDDING_SIZE, NUM_UNITS, SEQ_LEN and LAYERS are handed
# positionally to Seq2seq(VOCAB, SEQ_LEN, EMBEDDING_SIZE, NUM_UNITS, LAYERS) in the
# model-setup cell; their exact meaning is defined by that class (not visible here).
EMBEDDING_SIZE = 200
NUM_UNITS = 400
# The inference cells pad/decode with a literal 150, which matches this value.
SEQ_LEN = 150
LAYERS = 3
# NOTE(review): BATCH_SIZE is not referenced by any visible cell — the training
# cell uses its own `batch_size = 128`. Confirm which value is intended.
BATCH_SIZE = 256
# Number of token ids: the 4 special tokens plus the corpus vocabulary.
VOCAB = len(word2id)

In [12]:
# Training-run settings and the loss history filled in by the training loop.
epochs = 1000
batch_size = 128
history = dict(loss=[])

# Build the network and move its parameters to GPU 0.
model = Seq2seq(VOCAB, SEQ_LEN, EMBEDDING_SIZE, NUM_UNITS, LAYERS)
model.to_gpu(0)

# Adam with step size 1e-4, attached to the model's parameters.
optimizer = chainer.optimizers.Adam(alpha=1e-4)
optimizer.setup(model)

In [13]:
# Setting Data Generator
def mygen(X, Y, batch_size, train=True):
    """Yield (x, y) mini-batches of int32 arrays built from X and Y.

    Args:
        X: sequence of input sequences.
        Y: sequence of target sequences, aligned with X by position.
        batch_size: maximum number of examples per batch; the last batch may
            be smaller.
        train: when true, X and Y are shuffled together (same random
            permutation) before batching.

    Yields:
        A tuple (x, y) where each element is a list of 1-D int32 numpy arrays.
    """
    if train:
        order = np.random.permutation(len(X)).tolist()
        X = [X[k] for k in order]
        Y = [Y[k] for k in order]

    def as_int32(sequences):
        # Sequences have different lengths, so each becomes its own array.
        return [np.array(seq, "int32") for seq in sequences]

    for start in range(0, len(X), batch_size):
        stop = start + batch_size
        inputs = as_int32(X[start:stop])
        targets = as_int32(Y[start:stop])
        yield inputs, targets

In [None]:
# training
# Each epoch streams shuffled mini-batches, takes one optimizer step per batch
# and appends the epoch loss to history["loss"].
for epoch in tqdm(range(epochs)):
    
    # train
    tmp_loss = 0.0
    # mygen reshuffles on every call (train defaults to True).
    # BackgroundGenerator presumably prefetches batches in a background thread —
    # confirm against the prefetch_generator docs.
    gen = BackgroundGenerator(mygen(conv_x, conv_y, batch_size))
    for x, y in gen:
        # to_device0 is a star-imported helper; presumably moves the arrays to GPU 0.
        x = to_device0(x)
        y = to_device0(y)
        
        # Clear old gradients, backpropagate this batch's loss, then update.
        loss = model.get_loss(x, y)
        model.cleargrads()
        loss.backward()
        optimizer.update()
        
        # Weight the batch loss by the batch size so the division below yields a
        # per-example average (assumes get_loss returns a batch mean — TODO confirm).
        tmp_loss += float(loss.data) * len(y)
    
    tmp_loss /= len(conv_x)
    history["loss"].append(tmp_loss)
#     print(tmp_loss)

    # checkpoint
    # Saved only when BOTH hold: this epoch has the lowest loss so far, and the
    # epoch number is a multiple of 50.
    # NOTE(review): a best epoch that is not a multiple of 50 is never saved —
    # confirm this is intended.
    if tmp_loss == min(history["loss"]) and (epoch + 1) % 50 == 0:
        # The model is moved to CPU for serialization and back to GPU afterwards.
        model.to_cpu()
        chainer.serializers.save_npz("../models/seq2seq_{:03d}_{:.06f}.npz".format(epoch+1, tmp_loss), model)
        # NOTE(review): no device id here, unlike the initial model.to_gpu(0).
        model.to_gpu()

  8%|▊         | 75/1000 [05:04<1:02:39,  4.06s/it]

In [14]:
# Restore trained weights from a saved checkpoint into the existing model object.
# NOTE(review): the file name is hard-coded, and "280" is not a multiple of 50, so
# the checkpoint rule in the training cell above cannot have written this file —
# confirm where it came from.
chainer.serializers.load_npz("../models/seq2seq_280_0.452123.npz", model)

In [15]:
# Qualitative check: run two training prompts through the model and print the
# prompts next to the generated replies.
# F and cuda are not imported explicitly in this notebook; they presumably come
# from the star imports of models/utils.
i = 111
source = conv_x[i:i+2]
source = to_device0(source)
# Use SEQ_LEN rather than a literal 150, so the padding/decoding length stays in
# step with the length the model was built with.
inputs = F.pad_sequence(source, SEQ_LEN)
states = model.translate(inputs, SEQ_LEN)
print(tokenizer.decode_batch(cuda.to_cpu(source)))
print(tokenizer.decode_batch(cuda.to_cpu(states)))

['どんなジャンルの音楽が好きですか？', 'どんなジャンルが好きなんですか？']
['早く利用利用しては覚えています。', '色々が好きです。']


In [23]:
# Ad-hoc query: tokenize a free-text sentence, check that encode/decode
# round-trips, then print the model's reply.
query = "無理ですよね"
print(parser.parse(query))
# Wrap in a list: the model and decode_batch work on batches.
query = [tokenizer.encode(parser.parse(query))]
print(tokenizer.decode_batch(query))
# Pad to SEQ_LEN rather than a literal 150, so the input length stays in step
# with the length the model was built with.
query = F.pad_sequence(cuda.to_gpu(query), SEQ_LEN)
result = model.translate(query)
print(tokenizer.decode(cuda.to_cpu(result[0])))

['無理', 'です', 'よ', 'ね']
['無理ですよね']
して楽しみです。
