In [None]:
# imports(default)
import sys
from glob import glob
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import *
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.optimizers import *
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
keras = tf.keras

In [None]:
# Locate the pre-tokenised corpus files of both datasets.
char_files, conv_files = (
    glob(pattern)
    for pattern in (
        "../dataset/charactor/*.parsed",
        "../dataset/conversation/nucc_*.parsed",
    )
)

In [None]:
def read_file(file, encoding="utf-8"):
    """Read a text file and return its lines without the trailing newline.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. It is given
            explicitly (UTF-8 by default) so the non-ASCII corpus is decoded
            the same way on every platform instead of depending on the
            locale's default encoding.

    Returns:
        A list with one string per line, in file order. Empty lines are kept
        as empty strings.
    """
    with open(file, "r", encoding=encoding) as f:
        # Text mode normalises line endings to "\n", so stripping that one
        # character from the right is enough.
        return [line.rstrip("\n") for line in f]
# One list of lines per corpus file, in the same order as the file lists.
char_texts = list(map(read_file, char_files))
conv_texts = list(map(read_file, conv_files))

In [None]:
# Every distinct space-separated token that occurs in either corpus.
vocab = {
    token
    for document in char_texts + conv_texts
    for sentence in document
    for token in sentence.split(" ")
}

# Token <-> id lookup tables; they start out empty.
word2id, id2word = {}, {}

class Tokenizer:
    """Convert between token strings and integer ids.

    ``word2id`` maps token -> id and ``id2word`` maps id -> token. Both
    dictionaries are stored by reference, so entries added to them after the
    tokenizer is constructed are visible to it.
    """

    # Ids written around every encoded sequence. They must match the
    # positions of "<BOS>" / "<EOS>" in the vocabulary.
    BOS_ID = 1
    EOS_ID = 2
    # Marker tokens that carry no text and are dropped when decoding.
    _MARKERS = frozenset(("<PAD>", "<BOS>", "<EOS>"))

    def __init__(self, word2id, id2word):
        self.word2id = word2id
        self.id2word = id2word

    def encode(self, seq):
        """Map a sequence of tokens to ids, wrapped in BOS_ID ... EOS_ID.

        Tokens missing from ``word2id`` are mapped to the id of "<UNK>" when
        the vocabulary defines it.

        Raises:
            KeyError: if a token is unknown and the vocabulary has no "<UNK>".
        """
        unk_id = self.word2id.get("<UNK>")
        ids = []
        for word in seq:
            if word in self.word2id:
                ids.append(self.word2id[word])
            elif unk_id is not None:
                ids.append(unk_id)
            else:
                raise KeyError(word)
        return [self.BOS_ID] + ids + [self.EOS_ID]

    def decode(self, seq):
        """Turn a sequence of ids back into plain text.

        "<PAD>", "<BOS>" and "<EOS>" are skipped wherever they occur and the
        "▁" word-boundary symbol is removed. Dropping the markers by token
        (instead of slicing a fixed number of characters off both ends) keeps
        the result correct for padded sequences and for sequences that carry
        no markers at all.

        Raises:
            KeyError: if an id is not present in ``id2word``.
        """
        words = (self.id2word[token_id] for token_id in seq)
        text = "".join(word for word in words if word not in self._MARKERS)
        return text.replace("▁", "")

# Id 0 is reserved for "<PAD>": sequences are padded with zeros and the
# embeddings mask id 0, while Tokenizer.encode writes 1 / 2 as the
# <BOS> / <EOS> ids. The special tokens therefore have to occupy exactly
# the slots 0..3 in this order.
for e, word in enumerate(["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(vocab)):
    word2id[word] = e
    id2word[e] = word

In [None]:
# Rebind both lookup tables to fresh, independent dictionaries.
word2id, id2word = {}, {}

class Tokenizer:
    """Convert between token strings and integer ids.

    ``word2id`` maps token -> id and ``id2word`` maps id -> token. Both
    dictionaries are stored by reference, so entries added to them after the
    tokenizer is constructed are visible to it.
    """

    # Ids written around every encoded sequence. They must match the
    # positions of "<BOS>" / "<EOS>" in the vocabulary.
    BOS_ID = 1
    EOS_ID = 2
    # Marker tokens that carry no text and are dropped when decoding.
    _MARKERS = frozenset(("<PAD>", "<BOS>", "<EOS>"))

    def __init__(self, word2id, id2word):
        self.word2id = word2id
        self.id2word = id2word

    def encode(self, seq):
        """Map a sequence of tokens to ids, wrapped in BOS_ID ... EOS_ID.

        Tokens missing from ``word2id`` are mapped to the id of "<UNK>" when
        the vocabulary defines it.

        Raises:
            KeyError: if a token is unknown and the vocabulary has no "<UNK>".
        """
        unk_id = self.word2id.get("<UNK>")
        ids = []
        for word in seq:
            if word in self.word2id:
                ids.append(self.word2id[word])
            elif unk_id is not None:
                ids.append(unk_id)
            else:
                raise KeyError(word)
        return [self.BOS_ID] + ids + [self.EOS_ID]

    def decode(self, seq):
        """Turn a sequence of ids back into plain text.

        "<PAD>", "<BOS>" and "<EOS>" are skipped wherever they occur and the
        "▁" word-boundary symbol is removed. Dropping the markers by token
        (instead of slicing a fixed number of characters off both ends) keeps
        the result correct for padded sequences and for sequences that carry
        no markers at all.

        Raises:
            KeyError: if an id is not present in ``id2word``.
        """
        words = (self.id2word[token_id] for token_id in seq)
        text = "".join(word for word in words if word not in self._MARKERS)
        return text.replace("▁", "")

# Ids follow list order: the four special tokens first (so "<PAD>" is 0),
# then the corpus tokens in sorted order.
ordered_tokens = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + sorted(vocab)
for index, token in enumerate(ordered_tokens):
    id2word[index] = token
    word2id[token] = index

In [None]:
tokenizer = Tokenizer(word2id, id2word)

# Encode both corpora: file -> list of lines -> list of ids per line.
char_seqs, conv_seqs = (
    [
        [tokenizer.encode(sentence.split(" ")) for sentence in document]
        for document in corpus
    ]
    for corpus in (char_texts, conv_texts)
)

In [None]:
# Source utterances and the replies paired with them, index by index.
conv_x, conv_y = [], []

In [None]:
# Pair every utterance with the one before it. The first utterance of a
# dialogue has no predecessor, so its source is the bare [BOS, EOS] sequence.
# Note: this appends to conv_x / conv_y, so running the cell again adds the
# same pairs a second time.
for dialogue in conv_seqs:
    if not dialogue:
        # Nothing to pair in an empty dialogue.
        continue
    conv_x.extend([[1, 2]] + dialogue[:-1])
    conv_y.extend(dialogue)

In [None]:
# Width of the embedding vectors and of every LSTM layer.
EMBEDDING_SIZE, NUM_UNITS = 200, 400
# Length every sequence is padded / truncated to when batching.
SEQ_LEN = 150
# Not referenced by any of the cells shown here.
BEAM_WIDTH, BATCH_SIZE = 3, 256
# Number of vocabulary entries (embedding rows / output classes).
VOCAB = len(word2id)

In [None]:
def get_batch(x, y, seq_len):
    """Build padded model inputs and targets for teacher forcing.

    Args:
        x: Source id sequences (encoder input).
        y: Target id sequences; each one is split into a decoder input
            (everything but the last id) and a decoder target (everything
            but the first id).
        seq_len: Length every sequence is padded or truncated to; both
            padding and truncation happen at the end of the sequence.

    Returns:
        ``([x_padded, y_in_padded], y_out_padded)``.
    """
    def _pad(sequences):
        return pad_sequences(sequences, seq_len, padding="post", truncating="post")

    encoder_in = _pad(x)
    decoder_in = _pad([target[:-1] for target in y])
    decoder_out = _pad([target[1:] for target in y])
    return [encoder_in, decoder_in], decoder_out

In [None]:
class Seq2seq:
    """Two-layer LSTM encoder/decoder with separate training and inference models.

    Attributes:
        training_model: Maps ``[encoder ids, decoder input ids]`` to a softmax
            over the vocabulary for every decoder position; compiled with Adam
            and sparse categorical cross-entropy.
        encoder_model: Maps encoder ids to the four final LSTM states
            ``[h1, c1, h2, c2]`` of the two encoder layers.
        decoder_model: Maps ``[decoder ids, h1, c1, h2, c2]`` to
            ``[softmax, h1, c1, h2, c2]`` so decoding can be run step by step.

    All three models share the same layer objects, so training
    ``training_model`` also updates what the other two compute.
    """

    def __init__(self):
        # --- Encoder: embedding followed by two stacked LSTMs. ---
        encoder_inputs = Input([None], dtype="int32", name="x")
        E_embed = Embedding(VOCAB, EMBEDDING_SIZE, mask_zero=True, name="E_embed")(encoder_inputs)
        encoder1 = LSTM(NUM_UNITS, return_state=True, return_sequences=True, dropout=.2, recurrent_dropout=.2)
        encoder2 = LSTM(NUM_UNITS, return_state=True, dropout=.2, recurrent_dropout=.2)
        out, *mid_states1 = encoder1(E_embed)
        out, *mid_states2 = encoder2(out)

        # --- Decoder for end-to-end training: each decoder layer starts from
        # the final state of the encoder layer at the same depth. ---
        decoder_inputs = Input(shape=[None], dtype="int32", name="y_")
        F_embed = Embedding(VOCAB, EMBEDDING_SIZE, mask_zero=True, name="F_embed")(decoder_inputs)
        decoder1 = LSTM(NUM_UNITS, return_sequences=True, return_state=True, dropout=.2, recurrent_dropout=.2)
        decoder2 = LSTM(NUM_UNITS, return_sequences=True, return_state=True, dropout=.2, recurrent_dropout=.2)
        decoder_outputs, *decoder_states1 = decoder1(F_embed, initial_state=mid_states1)
        decoder_outputs, *decoder_states2 = decoder2(decoder_outputs, initial_state=mid_states2)
        decoder_dense = Dense(VOCAB, activation='softmax', name="output_dense")
        decoder_outputs = decoder_dense(decoder_outputs)

        self.training_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

        # --- Stand-alone encoder: returns the states handed to the decoder. ---
        self.encoder_model = Model(inputs=encoder_inputs, outputs=mid_states1 + mid_states2)

        # --- Stand-alone decoder: same layers, but the initial states are
        # fed in explicitly (order: h1, c1, h2, c2) and returned again. ---
        decoder_states = [Input([NUM_UNITS]) for _ in range(4)]
        d_out, *new_decoder_states1 = decoder1(F_embed, initial_state=decoder_states[0:2])
        d_out, *new_decoder_states2 = decoder2(d_out, initial_state=decoder_states[2:4])
        new_decoder_outputs = decoder_dense(d_out)

        self.decoder_model = Model(inputs=[decoder_inputs] + decoder_states,
                                   outputs=[new_decoder_outputs] + new_decoder_states1 + new_decoder_states2)

        # Only the training model is compiled; the encoder and decoder models
        # are used through predict() only.
        self.training_model.compile(Adam(1e-3), loss='sparse_categorical_crossentropy')

    def predict_sequence(self, source, n_steps, mode="greedy"):
        """Generate ``n_steps`` target ids for every source sequence.

        Decoding starts from token id 1 and always runs for ``n_steps`` steps;
        it does not stop early when an end-of-sequence id is produced.

        Args:
            source: Batch of encoder id sequences, passed to
                ``encoder_model.predict``.
            n_steps: Number of decoding steps. For ``n_steps <= 0`` an empty
                ``(batch, 0)`` array is returned.
            mode: Decoding strategy. Only ``"greedy"`` (arg-max at every
                step) is implemented.

        Returns:
            Integer array of shape ``(batch, n_steps)`` with the predicted ids.

        Raises:
            ValueError: if ``mode`` is not ``"greedy"``. Without this check
                the raw probability tensor would be fed back into the decoder
                as if it were token ids.
        """
        if mode != "greedy":
            raise ValueError(
                "unsupported decoding mode: {!r} (only 'greedy' is implemented)".format(mode)
            )

        batch_size = len(source)
        if n_steps <= 0:
            # np.concatenate cannot handle an empty list of step outputs.
            return np.zeros((batch_size, 0), dtype=np.intp)

        # Encode once; the four states seed the step-wise decoder.
        state = list(self.encoder_model.predict(source))
        # Every sequence starts with the begin-of-sequence id.
        x = np.full((batch_size, 1), 1)

        output = []
        for _ in range(n_steps):
            probs, *state = self.decoder_model.predict([x] + state)
            # Greedy choice; the result (batch, 1) is the next decoder input.
            x = probs.argmax(-1)
            output.append(x)
        return np.concatenate(output, -1)

# Build the training, encoder-only and step-wise decoder models.
model = Seq2seq()

In [None]:
# Pad the conversation pairs and train with teacher forcing. The targets get
# a trailing axis of size 1 before they are handed to fit().
x, y = get_batch(conv_x, conv_y, SEQ_LEN)
model.training_model.fit(x, np.expand_dims(y, -1), batch_size=64, epochs=10)

In [None]:
# Store the weights of the training model under models/conv.keras.
# NOTE(review): presumably the "models/" directory has to exist already — confirm.
model.training_model.save_weights("models/conv.keras")

In [None]:
# Greedy-decode replies for two sample source utterances.
source = conv_x[10:12]
# Pad exactly like the training batches (SEQ_LEN instead of a repeated literal)
# so that training and inference cannot drift apart.
inputs = pad_sequences(source, SEQ_LEN, padding="post", truncating="post")
# Despite the name, this holds the predicted token ids, shape (2, 50).
states = model.predict_sequence(inputs, 50)