In [1]:
# imports(default)
import sys
from glob import glob
from tqdm import tqdm
import numpy as np
import gensim
import MeCab
sys.path.append("../src")
from models import *
from utils import *
import chainer
from prefetch_generator import BackgroundGenerator
from IPython.display import clear_output
from matplotlib import pylab as plt

  from ._conv import register_converters as _register_converters


In [2]:
char_files = glob("../dataset/charactor/*.txt")
conv_files = glob("../dataset/conversation/*.txt")

In [3]:
char_texts = [read_file(file) for file in char_files]
conv_texts = [read_file(file) for file in conv_files]

In [4]:
parser = Parser()

In [5]:
vocab = set()
for conv in conv_texts:
    for line in conv:
        vocab = vocab.union(set(parser.parse(line)))
for conv in char_texts:
    for line in conv:
        vocab = vocab.union(set(parser.parse(line)))

In [6]:
word2id = {}
id2word = {}

for e, word in enumerate(["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(list(vocab))):
    word2id[word] = e
    id2word[e] = word

In [7]:
tokenizer = Tokenizer(word2id, id2word)
char_seqs = [[tokenizer.encode(parser.parse(line)) for line in text] for text in char_texts]
conv_seqs = [[tokenizer.encode(parser.parse(line)) for line in text] for text in conv_texts]

In [8]:
conv_x = []
conv_y = []

In [9]:
for text in conv_seqs:
    conv_x += text[0::2]
    conv_y += text[1::2]

In [10]:
EMBEDDING_SIZE = 200
NUM_UNITS = 400
SEQ_LEN = 150
LAYERS = 2
BATCH_SIZE = 256
VOCAB = len(word2id)

In [11]:
# setting model
model = Model(VOCAB, SEQ_LEN, EMBEDDING_SIZE, NUM_UNITS, LAYERS)
model.to_gpu(0)
epochs = 100
batch_size = 128
optimizer = chainer.optimizers.Adam(1e-3)
optimizer.setup(model)
history = {"loss":[]}

In [12]:
# Setting Data Generator
def mygen(X, Y, batch_size, train=True):
    if train:
        newindex = list(np.random.permutation(len(X)))
        X = [X[i] for i in newindex]
        Y = [Y[i] for i in newindex]
    for i in range(0, len(X), batch_size):
        x = [np.array(x, "int32") for x in X[i:i+batch_size]]
        y = [np.array(y, "int32") for y in Y[i:i+batch_size]]
        yield(x, y)

In [13]:
# training
for epoch in range(epochs):
    
    # train
    tmp_loss = 0.0
    gen = BackgroundGenerator(mygen(conv_x, conv_y, batch_size))
    for x, y in tqdm(gen, "epoch-{:02d} train".format(epoch + 1)):
        x = to_device0(x)
        y = to_device0(y)
        x = F.pad_sequence(x, SEQ_LEN)
        y = F.pad_sequence(y, SEQ_LEN)
        
        loss = model.get_loss(x, y)
        model.cleargrads()
        loss.backward()
        optimizer.update()
        
        tmp_loss += float(loss.data) * len(y)
    
    tmp_loss /= len(conv_x)
    history["loss"].append(tmp_loss)
    print(tmp_loss)

    # checkpoint
    if tmp_loss == min(history["loss"]):
        model.to_cpu()
        chainer.serializers.save_npz("../models/seq2seq.npz", model)
        model.to_gpu()

epoch-01 train: 25it [00:09,  2.67it/s]


284.1356550331721


epoch-02 train: 25it [00:08,  3.00it/s]


101.66193209693546


epoch-03 train: 25it [00:08,  2.99it/s]


99.00713420565167


epoch-04 train: 25it [00:08,  2.97it/s]


92.58018846299913


epoch-05 train: 25it [00:08,  2.97it/s]


83.57407249813988


epoch-06 train: 25it [00:08,  2.98it/s]


80.51874001154825


epoch-07 train: 25it [00:08,  2.95it/s]


78.45265133812315


epoch-08 train: 25it [00:08,  2.95it/s]


76.43190262083023


epoch-09 train: 25it [00:08,  2.98it/s]


74.60306337871249


epoch-10 train: 25it [00:08,  2.91it/s]


73.20357367621527


epoch-11 train: 25it [00:08,  2.92it/s]


72.00075782412574


epoch-12 train: 25it [00:08,  2.96it/s]


70.41288902646019


epoch-13 train: 25it [00:08,  2.95it/s]


68.86285094003829


epoch-14 train: 25it [00:08,  2.97it/s]


67.689624871148


epoch-15 train: 25it [00:08,  2.95it/s]


66.57923692006914


epoch-16 train: 25it [00:08,  2.95it/s]


65.61216399419875


epoch-17 train: 25it [00:08,  2.96it/s]


64.84322088332404


epoch-18 train: 25it [00:08,  2.94it/s]


63.98315215337844


epoch-19 train: 25it [00:08,  2.97it/s]


62.98618117075118


epoch-20 train: 25it [00:08,  2.93it/s]


62.073716164240764


epoch-21 train: 25it [00:08,  2.94it/s]


61.31731437077598


epoch-22 train: 25it [00:08,  2.95it/s]


60.43786873469277


epoch-23 train: 25it [00:08,  2.97it/s]


59.65411371866862


epoch-24 train: 25it [00:08,  2.93it/s]


58.79591761513362


epoch-25 train: 25it [00:08,  2.93it/s]


58.039925910102


epoch-26 train: 25it [00:08,  2.94it/s]


57.280812574114115


epoch-27 train: 25it [00:08,  2.93it/s]


56.45751397511316


epoch-28 train: 25it [00:08,  2.90it/s]


55.826586163233195


epoch-29 train: 25it [00:08,  2.94it/s]


54.90280473012773


epoch-30 train: 25it [00:08,  2.91it/s]


54.07167673262339


epoch-31 train: 25it [00:08,  2.95it/s]


53.33437545049758


epoch-32 train: 25it [00:08,  2.93it/s]


52.60189704047309


epoch-33 train: 25it [00:08,  2.95it/s]


51.93039753262959


epoch-34 train: 25it [00:08,  2.93it/s]


51.26586450970362


epoch-35 train: 25it [00:08,  2.96it/s]


50.51229232545883


epoch-36 train: 25it [00:08,  2.95it/s]


49.92392039223323


epoch-37 train: 25it [00:08,  2.90it/s]


49.35861579047309


epoch-38 train: 25it [00:08,  2.94it/s]


48.58506327311198


epoch-39 train: 25it [00:08,  2.93it/s]


48.00668914552719


epoch-40 train: 25it [00:08,  2.87it/s]


47.40508340260339


epoch-41 train: 25it [00:08,  2.87it/s]


46.85170545789931


epoch-42 train: 25it [00:08,  2.93it/s]


46.15941659594339


epoch-43 train: 25it [00:08,  2.94it/s]


45.67207506210085


epoch-44 train: 25it [00:08,  2.94it/s]


45.04177131289528


epoch-45 train: 25it [00:08,  2.90it/s]


44.438234291682164


epoch-46 train: 25it [00:08,  2.89it/s]


43.87863424149771


epoch-47 train: 25it [00:08,  2.90it/s]


43.22234682113405


epoch-48 train: 25it [00:08,  2.98it/s]


42.72533362252371


epoch-49 train: 25it [00:08,  2.96it/s]


42.185817345513236


epoch-50 train: 25it [00:08,  2.90it/s]


41.6089540608724


epoch-51 train: 25it [00:08,  2.94it/s]


41.09110134306408


epoch-52 train: 25it [00:08,  2.85it/s]


40.59081805516803


epoch-53 train: 25it [00:08,  2.96it/s]


40.08157759409102


epoch-54 train: 25it [00:08,  2.91it/s]


39.61512138003395


epoch-55 train: 25it [00:08,  2.94it/s]


39.1154395742265


epoch-56 train: 25it [00:08,  2.93it/s]


38.57360596187531


epoch-57 train: 25it [00:08,  2.97it/s]


37.98990050785125


epoch-58 train: 25it [00:08,  2.95it/s]


37.52247689141168


epoch-59 train: 25it [00:08,  2.93it/s]


36.992920917329336


epoch-60 train: 25it [00:08,  2.98it/s]


36.70667105296302


epoch-61 train: 25it [00:08,  2.92it/s]


36.094940064445375


epoch-62 train: 25it [00:08,  2.93it/s]


35.50729851616754


epoch-63 train: 25it [00:08,  2.95it/s]


34.94735236273871


epoch-64 train: 25it [00:08,  2.96it/s]


34.42767723446801


epoch-65 train: 25it [00:08,  2.92it/s]


33.980419316367495


epoch-66 train: 25it [00:08,  2.84it/s]


33.43670401194739


epoch-67 train: 25it [00:08,  2.97it/s]


33.02203100053091


epoch-68 train: 25it [00:08,  2.96it/s]


32.45142016940647


epoch-69 train: 25it [00:08,  2.94it/s]


31.947922708420528


epoch-70 train: 25it [00:08,  2.94it/s]


31.5572563195607


epoch-71 train: 25it [00:08,  2.89it/s]


31.10788449363103


epoch-72 train: 25it [00:08,  2.97it/s]


30.684830414302766


epoch-73 train: 25it [00:08,  2.84it/s]


30.274639519585502


epoch-74 train: 25it [00:08,  2.93it/s]


29.989462608458503


epoch-75 train: 25it [00:08,  2.91it/s]


29.496104218013702


epoch-76 train: 25it [00:08,  2.90it/s]


29.06159948682028


epoch-77 train: 25it [00:08,  2.92it/s]


28.57956394377209


epoch-78 train: 25it [00:08,  2.87it/s]


28.168189183795263


epoch-79 train: 25it [00:08,  2.90it/s]


27.771823061988467


epoch-80 train: 25it [00:08,  2.93it/s]


27.46095210484096


epoch-81 train: 25it [00:08,  2.91it/s]


27.153097006177145


epoch-82 train: 25it [00:08,  2.92it/s]


26.883806243169875


epoch-83 train: 25it [00:08,  2.86it/s]


26.517135507492792


epoch-84 train: 25it [00:08,  2.93it/s]


26.02486179533459


epoch-85 train: 25it [00:08,  2.93it/s]


25.72394005669488


epoch-86 train: 25it [00:08,  2.90it/s]


25.332936457073878


epoch-87 train: 25it [00:08,  2.92it/s]


25.060695545257083


epoch-88 train: 25it [00:08,  2.89it/s]


24.648623273334806


epoch-89 train: 25it [00:08,  2.91it/s]


24.25795791989281


epoch-90 train: 25it [00:08,  2.95it/s]


23.946840017409553


epoch-91 train: 25it [00:08,  2.89it/s]


23.61780245826358


epoch-92 train: 25it [00:08,  2.94it/s]


23.368606770833335


epoch-93 train: 25it [00:08,  2.92it/s]


23.102434895833333


epoch-94 train: 25it [00:08,  2.92it/s]


22.75586432078528


epoch-95 train: 25it [00:08,  2.92it/s]


22.55200992765881


epoch-96 train: 25it [00:08,  2.90it/s]


22.22157104613289


epoch-97 train: 25it [00:08,  2.85it/s]


21.936528437780954


epoch-98 train: 25it [00:08,  2.90it/s]


21.59085955180819


epoch-99 train: 25it [00:08,  2.93it/s]


21.346767252361964


epoch-100 train: 25it [00:08,  2.95it/s]


20.954856104775082


In [13]:
model.training_model.load_weights("../models/seq2seq.npz")

In [14]:
i = 111
source = conv_x[i:i+2]
inputs = pad_sequences(source, 150, padding="post", truncating="post")
states = model.predict_sequence(inputs, 50, mode="random", alpha=1)
print(tokenizer.decode_batch(source))
print(tokenizer.decode_batch(states))

['どんなジャンルの音楽が好きですか？', 'どんなジャンルが好きなんですか？']
['海に行かないんですか？', 'すごく簡単です。']


In [20]:
query = "明日は何をしますか"
print(parser.parse(query))
query = [tokenizer.encode(parser.parse(query))]
print(tokenizer.decode(query))
query = pad_sequences(query, 150, padding="post", truncating="post")
result = model.predict_sequence(query, 50, mode="random", alpha=.8)
print(tokenizer.decode(result[0]))

['明日', 'は', '何', 'を', 'し', 'ます', 'か']

ぼくはペットへ大好きです
