In [1]:
# imports(default)
import sys
from glob import glob
import numpy as np
import gensim
import MeCab
sys.path.append("../src/")
from models import *
from utils import *
from tqdm import tqdm
import chainer
from prefetch_generator import BackgroundGenerator
from IPython.display import clear_output
from matplotlib import pylab as plt

  from ._conv import register_converters as _register_converters


In [2]:
# Collect the input corpus paths: one glob pattern per corpus, resolved in a
# single pass. glob() returns paths in arbitrary (filesystem) order.
char_files, conv_files = (
    glob(pattern)
    for pattern in (
        "../dataset/charactor/*.txt",
        "../dataset/conversation/*.txt",
    )
)

In [3]:
# Load every corpus file; each entry is whatever read_file returns for one path
# (later cells iterate over it line by line).
char_texts = list(map(read_file, char_files))
conv_texts = list(map(read_file, conv_files))

In [4]:
# Shared parser instance: parser.parse(line) is called on every corpus line and
# on the probe sentences, and its result is treated as an iterable of tokens.
# Parser is not defined in this notebook; it arrives through the star-imports
# above (presumably a MeCab wrapper — TODO confirm in ../src).
parser = Parser()

In [5]:
# Build the vocabulary: the set of distinct tokens the parser produces over
# every line of both corpora (conversation first, then character).
# set.update() grows the set in place; the previous `vocab = vocab.union(...)`
# copied the whole set once per line, which is quadratic in corpus size.
vocab = set()
for texts in (conv_texts, char_texts):
    for conv in texts:
        for line in conv:
            vocab.update(parser.parse(line))

In [6]:
# Build the two lookup tables between tokens and integer ids.
# The four special tokens take ids 0-3; corpus tokens follow in sorted order,
# so the numbering is deterministic for a given vocabulary.
ordered_words = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(vocab)

id2word = dict(enumerate(ordered_words))
# Inverting in id order means a repeated word keeps its highest id.
word2id = {word: e for e, word in id2word.items()}

In [7]:
tokenizer = Tokenizer(word2id, id2word)


def _encode_texts(texts):
    """Parse and encode every line of every text, keeping the text/line nesting."""
    encoded = []
    for text in texts:
        encoded.append([tokenizer.encode(parser.parse(line)) for line in text])
    return encoded


# One list per source file, each holding one encoded sequence per line.
char_seqs = _encode_texts(char_texts)
conv_seqs = _encode_texts(conv_texts)

In [8]:
# Model / data hyperparameters.
# VOCAB, SEQ_LEN, EMBEDDING_SIZE and NUM_UNITS are passed positionally to
# Classifier(...) in the model-setup cell; their exact meaning is defined in
# ../src/models (presumably embedding width, hidden units, max sequence
# length — TODO confirm).
EMBEDDING_SIZE = 200
NUM_UNITS = 400
SEQ_LEN = 150
# NOTE(review): BEAM_WIDTH is not referenced in any visible cell.
BEAM_WIDTH = 3
# NOTE(review): BATCH_SIZE is not referenced in any visible cell; the training
# cells use their own `batch_size = 128` instead.
BATCH_SIZE = 256
# Number of entries in word2id (special tokens plus corpus tokens).
VOCAB = len(word2id)

In [9]:
# Assemble the binary classification set.
# Negative class: every second line (indices 1, 3, 5, ...) of each conversation
# — presumably one speaker's turns; TODO confirm against the dataset layout.
clf_conv = [seq for text in conv_seqs for seq in text[1::2]]
# Positive class: every line of every character file.
clf_char = [seq for text in char_seqs for seq in text]

clf_x = clf_char + clf_conv
clf_y = [1] * len(clf_char) + [0] * len(clf_conv)

# Shuffle inputs and labels with one shared permutation so pairs stay aligned.
perm = np.random.permutation(len(clf_x))
clf_x, clf_y = [clf_x[i] for i in perm], [clf_y[i] for i in perm]

In [10]:
# Setting Data Generator
def mygen(X, Y, batch_size, train=True):
    """Yield ``(x, y)`` mini-batches drawn from parallel sequences ``X`` and ``Y``.

    Args:
        X: Indexable collection of samples; each sample is converted to an
            int32 NumPy array on its own, so samples may differ in length.
        Y: Labels aligned with ``X``; each batch slice becomes one int32 array.
        batch_size: Maximum number of samples per batch (the last batch may
            be smaller).
        train: When true, samples and labels are reshuffled together with a
            fresh ``np.random.permutation`` before batching; when false the
            given order is kept.

    Yields:
        Tuple ``(x, y)`` where ``x`` is a list of int32 arrays and ``y`` is a
        single int32 array of the matching labels.
    """
    if train:
        order = list(np.random.permutation(len(X)))
        # Apply the same ordering to both so sample/label pairs stay aligned.
        X, Y = [X[k] for k in order], [Y[k] for k in order]

    for start in range(0, len(X), batch_size):
        stop = start + batch_size
        batch_x = [np.array(sample, "int32") for sample in X[start:stop]]
        batch_y = np.array(Y[start:stop], "int32")
        yield batch_x, batch_y

In [11]:
# Training hyperparameters and the per-epoch loss log.
epochs = 5
batch_size = 128
history = dict(loss=[])

# Build the classifier and place it on GPU 0.
# NOTE(review): the meaning of the trailing positional `1` is defined in
# ../src/models (presumably the output size — TODO confirm).
model = Classifier(VOCAB, SEQ_LEN, EMBEDDING_SIZE, NUM_UNITS, 1)
model.to_gpu(0)

# Adam with step size 1e-3, attached to the model's parameters.
optimizer = chainer.optimizers.Adam(alpha=1e-3)
optimizer.setup(model)

In [12]:
# training
# Runs `epochs` passes over (clf_x, clf_y). Each pass reshuffles the data via
# mygen, performs one optimizer step per mini-batch, logs the epoch loss to
# history["loss"], and writes a checkpoint whenever that loss is the lowest
# seen so far.
for epoch in tqdm(range(epochs)):

    # train
    tmp_loss = 0.0
    # BackgroundGenerator prefetches batches while the current one is processed.
    gen = BackgroundGenerator(mygen(clf_x, clf_y, batch_size))
    for x, y in gen:
        x = to_device0(x)
        y = to_device0(y)

        loss = model.get_loss(x, y)
        # Gradients must be cleared before backward(), then applied by update().
        model.cleargrads()
        loss.backward()
        optimizer.update()

        tmp_loss += float(loss.data)

    # NOTE(review): tmp_loss accumulates one value per batch but is divided by
    # the number of samples, not the number of batches. That is a per-sample
    # mean only if get_loss returns a batch sum — confirm in ../src/models.
    tmp_loss /= len(clf_x)
    history["loss"].append(tmp_loss)

    # checkpoint
    # tmp_loss was just appended, so equality with the minimum means this epoch
    # is the best (or ties the best) so far.
    if tmp_loss == min(history["loss"]):
        # Serialize from host memory, then move the model back to the GPU.
        model.to_cpu()
        chainer.serializers.save_npz("../models/clf_{:03d}_{:.06f}.npz".format(epoch+1, tmp_loss), model)
        # Restore to device 0 explicitly: the model was first placed with
        # to_gpu(0) and batches are sent via to_device0, whereas a bare
        # to_gpu() would target whichever device is current.
        model.to_gpu(0)

100%|██████████| 5/5 [00:10<00:00,  2.04s/it]


In [None]:
# Restore saved weights into the existing `model` object.
# NOTE(review): the file name is hard-coded. The training cell names its
# checkpoints from the epoch number and the achieved loss, so a fresh run will
# most likely produce a different file name — update this path after retraining.
chainer.serializers.load_npz("../models/clf_005_0.005451.npz", model)

In [15]:
# Probe the classifier with two hand-written sentences. Training labels were
# 1 for character-corpus lines and 0 for conversation lines.
raw_queries = ["前川です", "みくだにゃ"]
query = []
for sentence in raw_queries:
    query.append(tokenizer.encode(parser.parse(sentence)))

# Bare final expression so the notebook displays the prediction.
# NOTE(review): `cuda` is not imported explicitly in this notebook; it
# presumably arrives through one of the star-imports — TODO confirm.
model.predict(cuda.to_gpu(query))

variable([6.4912171e-04, 9.6422905e-01])