In [1]:
# imports(default)
import sys
from glob import glob
import numpy as np
import tensorflow as tf
import gensim
import MeCab
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import *
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.optimizers import *
keras = tf.keras
sys.path.append("../")
from models import *
from utils import *

  from ._conv import register_converters as _register_converters


In [2]:
# Collect the corpus file paths. glob() yields matches in arbitrary,
# filesystem-dependent order, so sort them to keep every list derived from
# these paths (texts, encoded sequences, classifier data) reproducible.
char_files = sorted(glob("../dataset/charactor/*.txt"))
conv_files = sorted(glob("../dataset/conversation/*.txt"))

In [3]:
# Load each file with read_file, preserving the order of the path lists.
# Each loaded text is later iterated line by line.
char_texts = list(map(read_file, char_files))
conv_texts = list(map(read_file, conv_files))

In [4]:
# Parser is brought in by one of the star imports above (models / utils).
# The cells below call parser.parse(line) to turn one line of text into an
# iterable of tokens.
# NOTE(review): presumably MeCab-backed, since MeCab is imported — confirm.
parser = Parser()

In [5]:
# Build the vocabulary: every distinct token produced by the parser over both
# corpora. set.update() mutates in place; the previous `vocab = vocab.union(...)`
# copied the whole set once per line, which is quadratic in the corpus size.
vocab = set()
for corpus in (conv_texts, char_texts):
    for text in corpus:
        for line in text:
            vocab.update(parser.parse(line))

In [6]:
# Token <-> id lookup tables. The four special tokens take ids 0-3, followed
# by the vocabulary in sorted order so ids are stable between runs.
ordered_words = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(vocab)

id2word = dict(enumerate(ordered_words))
word2id = {token: index for index, token in enumerate(ordered_words)}

In [7]:
tokenizer = Tokenizer(word2id, id2word)


def _encode_texts(texts):
    """Parse and encode every line of every text, keeping the nesting text -> line."""
    encoded = []
    for text in texts:
        encoded.append([tokenizer.encode(parser.parse(line)) for line in text])
    return encoded


char_seqs = _encode_texts(char_texts)
conv_seqs = _encode_texts(conv_texts)

In [8]:
# Hyperparameters shared by the classifier and the autoencoder.
EMBEDDING_SIZE = 200  # output dimension of the embedding layers
NUM_UNITS = 400       # units in each LSTM layer
SEQ_LEN = 150         # length passed to pad_sequences for the classifier data
BEAM_WIDTH = 3        # passed to CharacterizingAutoencoder; not used in its visible code
BATCH_SIZE = 256      # passed to CharacterizingAutoencoder; not used in its visible code
VOCAB = len(word2id)  # vocabulary size, including the four special tokens

In [9]:
# Classifier dataset: every character line is a positive example (label 1);
# every second line of each conversation (odd indices) is a negative (label 0).
clf_char = [seq for text in char_seqs for seq in text]
clf_conv = [seq for text in conv_seqs for seq in text[1::2]]

clf_x = pad_sequences(clf_char + clf_conv, SEQ_LEN)
clf_y = [1] * len(clf_char) + [0] * len(clf_conv)

In [40]:
# Build the classifier and restore its saved weights.
clf = Classifier(EMBEDDING_SIZE, NUM_UNITS, VOCAB)
for layer in clf.clf_model.layers:
    # Freeze every layer of clf.clf_model so its weights are not updated when
    # it is embedded in another model.
    layer.trainable = False
    # Force the masking flag on each layer.
    # NOTE(review): presumably needed so the masked decoder output can be fed
    # through these layers without Keras rejecting the mask — confirm.
    layer.supports_masking = True
# Weights are loaded through clf.model, not clf.clf_model.
# NOTE(review): presumably the two models share layers — verify in models.py.
clf.model.load_weights("../models/clf.keras")

In [41]:
class CharacterizingAutoencoder:
    """Two-layer LSTM encoder/decoder whose decoder output is also scored by a classifier.

    Three Keras models sharing the same layers are exposed:

    * ``training_model`` maps (encoder tokens, decoder tokens) to the per-step
      softmax over the vocabulary and the classifier score of that softmax.
    * ``encoder_model`` maps encoder tokens to the four final LSTM states.
    * ``decoder_model`` maps (decoder tokens, four states) to the per-step
      softmax and the four updated states; used for step-wise generation.
    """

    def __init__(self, clf, EMBEDDING_SIZE, NUM_UNITS, SEQ_LEN, BEAM_WIDTH, BATCH_SIZE, VOCAB):
        """Build and compile the models.

        Args:
            clf: object exposing ``clf_model``, a Keras model that is applied
                to the decoder's softmax output.
            EMBEDDING_SIZE: output dimension of both embedding layers.
            NUM_UNITS: number of units in every LSTM layer.
            SEQ_LEN: unused here; kept for interface compatibility.
            BEAM_WIDTH: unused here; kept for interface compatibility.
            BATCH_SIZE: unused here; kept for interface compatibility.
            VOCAB: vocabulary size (embedding input dim and softmax width).
        """
        # Encoder: embedding followed by two stacked LSTMs; only the final
        # states of each layer are handed to the decoder.
        encoder_inputs = Input([None], dtype="int32", name="x")
        E_embed = Embedding(VOCAB, EMBEDDING_SIZE, mask_zero=True, name="E_embed")(encoder_inputs)
        encoder1 = LSTM(NUM_UNITS, return_state=True, return_sequences=True, dropout=.2, recurrent_dropout=.2)
        encoder2 = LSTM(NUM_UNITS, return_state=True, dropout=.2, recurrent_dropout=.2)
        out, *mid_states1 = encoder1(E_embed)
        out, *mid_states2 = encoder2(out)

        # Decoder for end-to-end training: each decoder LSTM starts from the
        # final state of the encoder LSTM at the same depth.
        decoder_inputs = Input(shape=[None], dtype="int32", name="y_")
        F_embed = Embedding(VOCAB, EMBEDDING_SIZE, mask_zero=True, name="F_embed")(decoder_inputs)
        decoder1 = LSTM(NUM_UNITS, return_sequences=True, return_state=True, dropout=.2, recurrent_dropout=.2)
        decoder2 = LSTM(NUM_UNITS, return_sequences=True, return_state=True, dropout=.2, recurrent_dropout=.2)
        decoder_outputs, *decoder_states1 = decoder1(F_embed, initial_state=mid_states1)
        decoder_outputs, *decoder_states2 = decoder2(decoder_outputs, initial_state=mid_states2)
        decoder_dense = Dense(VOCAB, activation='softmax', name="output_dense")
        decoder_outputs = decoder_dense(decoder_outputs)

        # Score the decoder's token distributions with the supplied classifier.
        mikulity = clf.clf_model(decoder_outputs)

        self.training_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=[decoder_outputs, mikulity])

        # Stand-alone encoder: tokens -> [h1, c1, h2, c2].
        self.encoder_model = Model(inputs=encoder_inputs, outputs=mid_states1 + mid_states2)

        # Stand-alone decoder: reuses the trained decoder layers but takes the
        # four states as explicit inputs so generation can run step by step.
        decoder_states = [Input([NUM_UNITS]) for _ in range(4)]
        d_out, *new_decoder_states1 = decoder1(F_embed, initial_state=decoder_states[0:2])
        d_out, *new_decoder_states2 = decoder2(d_out, initial_state=decoder_states[2:4])
        new_decoder_outputs = decoder_dense(d_out)

        self.decoder_model = Model(inputs=[decoder_inputs] + decoder_states,
                                   outputs=[new_decoder_outputs] + new_decoder_states1 + new_decoder_states2)

        # One loss per output. A single loss string would be applied to both
        # outputs, and sparse categorical cross-entropy is degenerate on the
        # classifier's single sigmoid score; that output needs binary
        # cross-entropy.
        self.training_model.compile(
            Adam(1e-3),
            loss=['sparse_categorical_crossentropy', 'binary_crossentropy'],
        )

    def predict_sequence(self, source, n_steps, mode="greedy", alpha=1.0):
        """Generate ``n_steps`` tokens for each source sequence.

        Args:
            source: batch of encoder token ids accepted by ``encoder_model.predict``.
            n_steps: number of decoding steps to run.
            mode: ``"greedy"`` takes the arg-max token at each step;
                ``"random"`` samples from the predicted distribution.
            alpha: exponent applied to the probabilities before sampling in
                ``"random"`` mode; ignored in ``"greedy"`` mode.

        Returns:
            Integer array of shape ``(len(source), n_steps)`` with the
            generated token ids.

        Raises:
            ValueError: if ``mode`` is neither ``"greedy"`` nor ``"random"``.
        """
        if mode not in ("greedy", "random"):
            # Previously an unknown mode fed raw probabilities back into the
            # decoder as if they were token ids.
            raise ValueError(
                "unknown mode: {!r} (expected 'greedy' or 'random')".format(mode))

        batch_size = len(source)
        if n_steps <= 0:
            # np.concatenate cannot handle an empty list of step outputs.
            return np.empty((batch_size, 0), dtype=np.int64)

        # Encode the source into the decoder's initial states.
        state = self.encoder_model.predict(source)
        # Every sequence starts from token id 1.
        x = np.full((batch_size, 1), 1)

        output = []
        for _ in range(n_steps):
            # One decoder step: next-token distribution plus updated states.
            probs, *state = self.decoder_model.predict([x] + state)
            if mode == "greedy":
                x = probs.argmax(-1)
            else:
                x = self._sample_tokens(probs, alpha)
            output.append(x)
        return np.concatenate(output, -1)

    @staticmethod
    def _sample_tokens(probs, alpha):
        """Sample one token id per batch row from ``probs[:, 0]`` raised to ``alpha``."""
        # float64 copy: normalising in float32 can leave a sum that
        # np.random.choice rejects as "probabilities do not sum to 1".
        dist = np.array(probs[:, 0], dtype=np.float64)
        dist[np.isnan(dist)] = 0.0
        dist = np.power(dist, alpha)
        vocab_size = dist.shape[-1]

        sampled = []
        for row in dist:
            total = row.sum()
            if np.isfinite(total) and total > 0:
                row = row / total
            else:
                # Degenerate distribution (all zero / NaN / inf): fall back to
                # uniform sampling instead of crashing in np.random.choice.
                row = np.full(vocab_size, 1.0 / vocab_size)
            sampled.append(np.random.choice(vocab_size, p=row))
        return np.array(sampled)[:, np.newaxis]

In [42]:
# Instantiate the autoencoder around the pre-loaded classifier, passing every
# hyperparameter by name so the call does not depend on argument order.
autoencoder = CharacterizingAutoencoder(
    clf=clf,
    EMBEDDING_SIZE=EMBEDDING_SIZE,
    NUM_UNITS=NUM_UNITS,
    SEQ_LEN=SEQ_LEN,
    BEAM_WIDTH=BEAM_WIDTH,
    BATCH_SIZE=BATCH_SIZE,
    VOCAB=VOCAB,
)

Tensor("output_dense_6/truediv:0", shape=(?, ?, 5753), dtype=float32)
<tensorflow.python.keras._impl.keras.engine.input_layer.InputLayer object at 0x7fa110b4b128>
<tensorflow.python.keras._impl.keras.layers.core.Dense object at 0x7fa110b4b0b8>
<tensorflow.python.keras._impl.keras.layers.recurrent.LSTM object at 0x7fa110b5e748>
<tensorflow.python.keras._impl.keras.layers.core.Dense object at 0x7fa110b26da0>
Tensor("model_15/y_1/Sigmoid:0", shape=(?, 1), dtype=float32)


In [32]:
# NOTE(review): leftover interactive probe. `a` is not defined in any visible
# cell and this cell's execution count (32) is out of order, so it will likely
# raise NameError on Restart & Run All — confirm and remove, or define `a`.
a.supports_masking

False