# Imports & Functions

In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.initializers import he_normal, he_uniform
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GRU, LSTM, SimpleRNN, Activation, Bidirectional, TimeDistributed, LayerNormalization
from tensorflow.keras.layers import Concatenate, Permute, Dot, Input, Multiply, RepeatVector, Lambda, Attention
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.utils import plot_model

In [2]:
def kana_to_indices(kana_array):
    '''
    Converts an array of hiragana and NaN to an array of integer indices.

    Every entry of the syllable table below (the '<start>' and '<end>' markers
    included) is mapped to its 1-based position in that table, so known
    entries receive values between 1 and 107, inclusive.
    NaN and anything else that is not in the table are labeled as 0.
    
    Parameters
    ----------
    kana_array : numpy.ndarray
        A 2D array of syllables in hiragana, or NaN.
        shape = (m, max_syllables)
        m = batch size
        max_syllables = the maximum length of a word from our batch
    
    Returns
    -------
    numpy.ndarray
        A 2D int8 array of indices between 0 and 107, inclusive.
        shape = (m, max_syllables)
    '''
    
    hiragana = ['<start>', '<end>', 'あ', 'い', 'う', 'え', 'お',
                'か', 'き', 'く', 'け', 'こ',
                'さ', 'し', 'す', 'せ', 'そ',
                'た', 'ち', 'つ', 'て', 'と',
                'な', 'に', 'ぬ', 'ね', 'の',
                'は', 'ひ', 'ふ', 'へ', 'ほ',
                'ま', 'み', 'む', 'め', 'も',
                'や', 'ゆ', 'よ',
                'ら', 'り', 'る', 'れ', 'ろ',
                'わ', 'を', 'ん',
                'が', 'ぎ', 'ぐ', 'げ', 'ご',
                'ざ', 'じ', 'ず', 'ぜ', 'ぞ',
                'だ', 'ぢ', 'づ', 'で', 'ど',
                'ば', 'び', 'ぶ', 'べ', 'ぼ',
                'ぱ', 'ぴ', 'ぷ', 'ぺ', 'ぽ',
                'きゃ', 'きゅ', 'きょ',
                'しゃ', 'しゅ', 'しょ',
                'ちゃ', 'ちゅ', 'ちょ',
                'にゃ', 'にゅ', 'にょ',
                'ひゃ', 'ひゅ', 'ひょ',
                'みゃ', 'みゅ', 'みょ',
                'りゃ', 'りゅ', 'りょ',
                'ぎゃ', 'ぎゅ', 'ぎょ',
                'じゃ', 'じゅ', 'じょ',
                'びゃ', 'びゅ', 'びょ',
                'ぴゃ', 'ぴゅ', 'ぴょ',
                'っ']
    
    # Build the lookup once instead of scanning the list twice per cell.
    # Positions start at 1 so that 0 stays reserved for NaN / unknown entries.
    index_of = {kana: position for position, kana in enumerate(hiragana, start=1)}
    
    m = kana_array.shape[0]
    max_syllables = kana_array.shape[1]
    
    index_array = np.zeros(shape=(m, max_syllables), dtype='int8')
    
    for i in range(m):
        for j, char in enumerate(kana_array[i, :]):
            # NaN (a float) is simply absent from the table and falls back to 0.
            index_array[i, j] = index_of.get(char, 0)
    
    return index_array

In [36]:
def tokenize_kana(kana_array):
    '''
    Tokenizes an array of hiragana into a small vowel-based vocabulary.

    Token ids:
        0  : NaN or any entry not listed below
        1  : '<start>'
        2  : '<end>'
        3  : 'っ'
        4  : 'ん'
        5  : 'を'
        6  : syllables ending in the vowel a
        7  : syllables ending in the vowel i
        8  : syllables ending in the vowel u
        9  : syllables ending in the vowel e
        10 : syllables ending in the vowel o
    
    Parameters
    ----------
    kana_array : numpy.ndarray
        A 2D array of syllables in hiragana, or NaN.
        shape = (m, max_syllables)
        m = batch size
        max_syllables = the maximum length of a word from our batch
    
    Returns
    -------
    numpy.ndarray
        A 2D int8 array of token ids between 0 and 10, inclusive.
        shape = (m, max_syllables)
    '''
    aa = ['あ', 'か', 'さ', 'た', 'な', 'は', 'ま', 'や', 'ら', 'わ',
          'が', 'ざ', 'だ', 'ば', 'ぱ', 'きゃ', 'しゃ', 'ちゃ', 'にゃ',
          'ひゃ', 'みゃ', 'りゃ', 'ぎゃ', 'じゃ', 'びゃ', 'ぴゃ', ]
    
    ii = ['い', 'き', 'し', 'ち', 'に', 'ひ', 'み',
          'り', 'ぎ', 'じ', 'ぢ', 'び', 'ぴ']
    
    uu = ['う', 'く', 'す', 'つ', 'ぬ', 'ふ', 'む', 'ゆ', 'る', 'ぐ',
          'ず', 'づ', 'ぶ', 'ぷ', 'きゅ', 'しゅ', 'ちゅ', 'にゅ', 'ひゅ',
          'みゅ', 'りゅ', 'ぎゅ', 'じゅ', 'びゅ', 'ぴゅ']
    
    ee = ['え', 'け', 'せ', 'て', 'ね', 'へ', 'め',
          'れ', 'げ', 'ぜ', 'で', 'べ', 'ぺ']
    
    oo = ['お', 'こ', 'そ', 'と', 'の', 'ほ', 'も', 'よ', 'ろ', 'ご',
          'ぞ', 'ど', 'ぼ', 'ぽ', 'きょ', 'しょ', 'ちょ', 'にょ', 'ひょ',
          'みょ', 'りょ', 'ぎょ', 'じょ', 'びょ', 'ぴょ']
    
    # Special symbols keep ids 1-5; id 0 is left for NaN / unknown entries.
    special = ['<start>', '<end>', 'っ', 'ん', 'を']
    
    # One flat lookup table replaces the chain of linear list scans.
    token_of = {}
    for token, group in ((6, aa), (7, ii), (8, uu), (9, ee), (10, oo)):
        for kana in group:
            token_of[kana] = token
    for token, symbol in enumerate(special, start=1):
        token_of[symbol] = token
    
    m = kana_array.shape[0]
    max_syllables = kana_array.shape[1]
    
    index_array = np.zeros(shape=(m, max_syllables), dtype='int8')
    
    for i in range(m):
        for j, char in enumerate(kana_array[i, :]):
            # Entries missing from the table (NaN included) stay at 0.
            index_array[i, j] = token_of.get(char, 0)
    
    return index_array

In [37]:
def tokenize_mola(mola_array):
    '''
    Converts an array of accent labels to integer categories.

    Categories:
        0 : anything not listed below (e.g. NaN)
        1 : '<start>'
        2 : '<end>'
        3 : 'mola'
        4 : 'accent_plain mola' or 'accent_top mola'

    Parameters
    ----------
    mola_array : numpy.ndarray
        A 2D array of accent labels.
        shape = (m, max_syllables)

    Returns
    -------
    numpy.ndarray
        A 2D int8 array.
        shape = (m, max_syllables)
    '''

    accented = ('accent_plain mola', 'accent_top mola')

    def categorize(label):
        # Guard clauses, checked in the same order for every cell.
        if label == '<start>':
            return 1
        if label == '<end>':
            return 2
        if label == 'mola':
            return 3
        if label in accented:
            return 4
        return 0

    n_rows = mola_array.shape[0]
    n_cols = mola_array.shape[1]

    cat_array = np.zeros(shape=(n_rows, n_cols), dtype='int8')

    for row in range(n_rows):
        cat_array[row, :] = [categorize(label) for label in mola_array[row, :]]

    return cat_array

In [38]:
def test_acc(predictions, y_test):

    num_samples, word_length = predictions.shape

    correct = 0
    total = 0
    missed_zeros = 0
    first_correct = 0

    for i in range(0, num_samples):
        
        if predictions[i, 1] == y_test[i, 1]:
            first_correct += 1

        for j in range(1, word_length):

            if y_test[i, j] != 0 and y_test[i, j] != 2:

                total += 1

                if predictions[i, j] == y_test[i, j]:

                    correct += 1

            elif y_test[i, j] == 0 and predictions[i, j] != 0:

                missed_zeros += 1
               
    print(f"{correct} correct out of {total} entries.")
    print('Accuracy:', correct / total)
    print('First syllable correct:', first_correct / num_samples)

In [48]:
def predict(X, y, model):
    '''
    Runs the model on X and prints its accuracy against y.

    The model output is reduced to one category per position by taking the
    argmax over axis 2, then handed to test_acc together with y.

    Returns
    -------
    Whatever test_acc returns.
    '''

    class_scores = model.predict(X)
    predicted_categories = np.argmax(class_scores, axis=2)

    return test_acc(predicted_categories, y)

In [40]:
# Load the syllable table and the accent-label table from CSV files located
# in the working directory.
# NOTE(review): presumably the two files are row-aligned (row i of each
# describes the same word) — confirm.
syllables = pd.read_csv('syll_with_ends.csv')
accents = pd.read_csv('acc_with_ends.csv')

In [41]:
# Columns 1..20 of each table are the entries that get tokenized; slice them
# once and reuse the resulting arrays.
syllable_grid = np.array(syllables.iloc[:, 1:21])
accent_grid = np.array(accents.iloc[:, 1:21])

# Two encodings of the same syllables (vowel classes vs. full table), plus the
# accent categories used as targets.
tk_syll_simple = tokenize_kana(syllable_grid)
tk_syll_standard = kana_to_indices(syllable_grid)
tk_acc = tokenize_mola(accent_grid)

In [42]:
# Hold out 10% of the rows for testing, once for each input encoding.
# NOTE(review): both calls use random_state=42 and the same targets (tk_acc);
# presumably this yields the same row partition, so y_*_simple and
# y_*_standard coincide — confirm.
X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(tk_syll_simple, tk_acc, test_size=0.1, random_state=42, shuffle=True)
X_train_standard, X_test_standard, y_train_standard, y_test_standard = train_test_split(tk_syll_standard, tk_acc, test_size=0.1, random_state=42, shuffle=True)

# Sequential Model

In [64]:
# Experiment log (presumably test accuracy per layer-size configuration; the
# second line matches the architecture defined in this cell):
# [91.49%] 10 -> 53 -> 26 -> 13 -> 5, batch_size=128
# [91.83%] 10 -> 53 -> 53 -> 53 -> 5
model = Sequential([
    # input_dim=108 covers index 0 (NaN / unknown) plus the 107 entries of the
    # kana_to_indices table; mask_zero=True treats index 0 as padding.
    Embedding(input_dim=108, output_dim=10, mask_zero=True),

    # Three stacked bidirectional LSTMs that each return the full sequence.
    Bidirectional(LSTM(units=53, return_sequences=True)),
    LayerNormalization(),
    Bidirectional(LSTM(units=53, return_sequences=True)),
    LayerNormalization(),
    Bidirectional(LSTM(units=53, return_sequences=True)),

    # One softmax over 5 classes per position; tokenize_mola emits labels 0-4.
    TimeDistributed(Dense(units=5, activation='softmax'))
])

# Learning rate starts at 0.01 and is multiplied by 0.98 every 10000 steps.
lr_schedule = ExponentialDecay(initial_learning_rate=0.01, decay_steps=10000, decay_rate=0.98, staircase=True)
optimizer = Adam(learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# Targets are integer class ids, hence the sparse loss and metric.
model.compile(loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['sparse_categorical_accuracy']
)

In [65]:
# Train the sequential model on the full-table encoding for 10 epochs,
# setting aside 10% of the training arrays for validation.
model.fit(x=X_train_standard,
          y=y_train_standard,
          epochs=10,
          batch_size=128,
          shuffle=True,
          verbose=1,
          validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x22d780b9d30>

In [66]:
# Print accuracy figures for the sequential model on the held-out test split.
predict(X_test_standard, y_test_standard, model)

20514 correct out of 22339 entries.
Accuracy: 0.9183043108465017
First syllable correct: 0.9300430156214625


In [67]:
# Print the same figures on the training split, for comparison with the test split.
predict(X_train_standard, y_train_standard, model)

186644 correct out of 200753 entries.
Accuracy: 0.9297196056845974
First syllable correct: 0.9412622946695847


# Attention Model

In [43]:
class attention(tf.keras.layers.Layer):
    '''
    Attention over the time axis of a (batch, timesteps, features) input.

    A score is computed for every timestep as tanh(x . W + b), the scores are
    softmax-normalised along the time axis, and the input is scaled by the
    resulting weights.

    Parameters
    ----------
    return_sequences : bool
        If True, the weighted sequence is returned with the input's shape.
        If False, the weighted sequence is summed over the time axis.
    **kwargs
        Forwarded to tf.keras.layers.Layer (e.g. name).
    '''
    
    def __init__(self, return_sequences=True, **kwargs):
        # Initialise the base Layer before assigning attributes on it.
        super(attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences
        
    def build(self, input_shape):
        
        # One scoring weight per input feature.
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1),
                               initializer="normal")
        # One bias per timestep, so the sequence length must be known at build time.
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1),
                               initializer="zeros")
        
        super(attention,self).build(input_shape)
        
    def call(self, x):
        
        # e has one score per timestep; a normalises the scores over the time axis.
        e = tf.keras.activations.tanh(tf.keras.backend.dot(x,self.W)+self.b)
        a = tf.keras.activations.softmax(e, axis=1)
        output = x*a
        
        if self.return_sequences:
            return output
        
        # tf.keras has no `sum`; reduce over the time axis with tf.reduce_sum.
        return tf.reduce_sum(output, axis=1)
    
    def get_config(self):
        # Include return_sequences so the layer can be re-created from its config.
        config = super(attention, self).get_config()
        config.update({'return_sequences': self.return_sequences})
        return config

In [50]:
# (90%) 108 -> 10 -> 53 -> att -> 26 -> 5
# (88%) 108 ->  5 -> 10 -> att -> 10 -> 5
# Fixed input length of 20 positions, matching the 20 columns tokenized above.
inputs = Input(shape=(20,))
# input_dim=108 covers index 0 plus the 107 entries of the kana_to_indices table.
embedding = Embedding(input_dim=108, output_dim=5, mask_zero=True)(inputs)
pre_attention = Bidirectional(LSTM(units=10, return_sequences=True))(embedding)
# return_sequences=True keeps the time axis so a label can be emitted per position.
attention_layer = attention(return_sequences=True)(pre_attention)
post_attention = LSTM(units=10, return_sequences=True)(attention_layer)
# 5 output classes; tokenize_mola emits labels 0-4.
outputs = Dense(units=5, activation='softmax')(post_attention)

attention_model = Model(inputs=inputs, outputs=outputs, name='attention_model')

# NOTE(review): the initial learning rate here (0.1) is ten times the one used
# for the sequential model (0.01) — confirm this is intentional.
# NOTE(review): lr_schedule and optimizer rebind the names used by the
# sequential-model cell.
lr_schedule = ExponentialDecay(initial_learning_rate=0.1, decay_steps=10000, decay_rate=0.98, staircase=True)
optimizer = Adam(learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

attention_model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['sparse_categorical_accuracy']
)

In [51]:
# Train the attention model for 10 epochs, setting aside 10% of the training
# arrays for validation.
attention_model.fit(x=X_train_standard,
          y=y_train_standard,
          epochs=10,
          batch_size=64, # 128
          shuffle=True,
          verbose=1,
          validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x22c4b1954f0>

In [52]:
# Print accuracy figures for the attention model on the held-out test split.
predict(X_test_standard, y_test_standard, attention_model)

19073 correct out of 22339 entries.
Accuracy: 0.853798289986123
First syllable correct: 0.8981208965361105


## Custom Attention Model

In [None]:
# Hyperparameters and input pipeline for the encoder/decoder model.
# NOTE(review): input_tensor_train, target_tensor_train, input_language and
# target_language_word are not defined in any cell visible in this notebook,
# and other cells refer to the target vocabulary as `targ_lang` instead of
# `target_language_word` — confirm where these come from before running.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch (the remainder is dropped below).
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 presumably reserves index 0 for padding — confirm against the tokenizer.
vocab_input_size = len(input_language.word_index) + 1
vocab_target_size = len(target_language_word.word_index) + 1

# Shuffle over the whole training set, then cut into fixed-size batches.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [19]:
class Encoder(tf.keras.Model):
    '''
    Embedding followed by an LSTM that returns the full output sequence and
    its final hidden state.

    Parameters
    ----------
    vocab_size : int
        Size of the input vocabulary for the embedding.
    embedding_dim : int
        Dimension of the embedding vectors.
    encoder_units : int
        Number of LSTM units.
    batch_size : int
        Batch size used by initialize_hidden_state.
    '''

    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(self.encoder_units,
                         return_sequences=True,
                         return_state=True,
                         recurrent_initializer='glorot_uniform')


    def call(self, x, hidden):
        '''
        Encodes a batch of token ids.

        `hidden` may be a single tensor of shape (batch, encoder_units), a
        one-element list holding such a tensor, or a [hidden, cell] pair.

        Returns the output sequence and the final hidden state.
        '''
        x = self.embedding(x)

        # An LSTM needs both a hidden and a cell state. When only the hidden
        # state is supplied, start the cell state from zeros.
        initial_state = list(hidden) if isinstance(hidden, (list, tuple)) else [hidden]
        if len(initial_state) == 1:
            initial_state.append(tf.zeros_like(initial_state[0]))

        # With return_state=True the LSTM returns (output, hidden, cell).
        # Only the hidden state is passed on, so callers keep receiving a
        # single (batch, encoder_units) tensor.
        output, state, _ = self.lstm(x, initial_state=initial_state)

        return output, state

    
    def initialize_hidden_state(self):
        '''Returns an all-zero hidden state of shape (batch_size, encoder_units).'''

        return tf.zeros((self.batch_size, self.encoder_units))

In [21]:
class BahdanauAttention(tf.keras.layers.Layer):
    '''
    Additive attention: scores every position of `values` against `query`
    and returns the weighted sum of `values` together with the weights.
    '''

    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    
    def call(self, query, values):
        # query  : (batch_size, hidden size)
        # values : (batch_size, max_len, hidden size)
        # Insert a time axis into the query -> (batch_size, 1, hidden size)
        # so that the addition below broadcasts over every position.
        expanded_query = tf.expand_dims(query, 1)

        # combined : (batch_size, max_length, units)
        combined = self.W1(expanded_query) + self.W2(values)

        # self.V collapses the last axis -> score : (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(combined))

        # Normalise over the positions -> (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum over the positions -> (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [22]:
class Decoder(tf.keras.Model):
    '''
    One-step decoder: attends over the encoder output, feeds the context
    vector together with the embedded input token through an LSTM and
    projects the result onto the target vocabulary.

    Parameters
    ----------
    vocab_size : int
        Size of the target vocabulary (embedding input and output width).
    embedding_dim : int
        Dimension of the embedding vectors.
    decoder_units : int
        Number of LSTM units, also used as the attention layer width.
    batch_size : int
        Stored on the instance; not used by call.
    '''

    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.decoder_units = decoder_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(self.decoder_units,
                         return_sequences=True,
                         return_state=True,
                         recurrent_initializer='glorot_uniform')
        self.fc = Dense(vocab_size)

        # Used for attention
        self.attention = BahdanauAttention(self.decoder_units)

    
    def call(self, x, hidden, encoder_output):
        '''
        Runs one decoding step.

        Returns the vocabulary logits, the LSTM hidden state after the step
        and the attention weights.
        '''
        # encoder_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, encoder_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Passing the concatenated vector to the LSTM.
        # With return_state=True the LSTM returns (output, hidden, cell); the
        # hidden state is what the next step uses as its attention query.
        output, state, _ = self.lstm(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [None]:
# The training and evaluation cells use both `encoder` and `decoder`, so both
# models are instantiated here with the shared hyperparameters.
encoder = Encoder(vocab_input_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_target_size, embedding_dim, units, BATCH_SIZE)

In [24]:
optimizer = Adam()
# reduction='none' keeps one loss value per example so padding can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    '''
    Mean cross-entropy between `real` (integer class ids) and `pred` (logits),
    with the contribution of every entry whose target is 0 set to zero.
    '''
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    # The mask is boolean; cast it to the loss dtype before multiplying.
    loss_ *= tf.cast(mask, dtype=loss_.dtype)

    return tf.reduce_mean(loss_)

In [26]:
@tf.function
def train_step(input, target, encoder_hidden):
    '''
    Runs one optimisation step on a batch using teacher forcing.

    Parameters
    ----------
    input : tensor of input token ids, one row per example.
    target : tensor of target token ids, one row per example.
    encoder_hidden : initial hidden state passed to the encoder.

    Returns
    -------
    The summed step losses divided by the target length.
    '''
    loss = 0

    with tf.GradientTape() as tape:
        encoder_output, encoder_hidden = encoder(input, encoder_hidden)

        decoder_hidden = encoder_hidden

        # NOTE(review): `targ_lang` is not defined in the visible cells — confirm
        # it is the target-side tokenizer.
        decoder_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, target.shape[1]):
            # Passing encoder_output to the decoder
            predictions, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_output)

            loss += loss_function(target[:, t], predictions)

            # Using teacher forcing: column t of every example becomes the
            # next decoder input, shape (batch_size, 1).
            decoder_input = tf.expand_dims(target[:, t], 1)

    batch_loss = (loss / int(target.shape[1]))

    # Gradients are computed after leaving the tape context so the backward
    # pass itself is not recorded.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    # Every epoch starts from an all-zero encoder state.
    encoder_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    # The loop variables are deliberately not called `input`, which would
    # overwrite the builtin in the notebook namespace.
    for (batch, (input_batch, target_batch)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(input_batch, target_batch, encoder_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print(f'Epoch {epoch + 1}, Batch {batch}, Loss: {batch_loss.numpy()}')
    
    reported_loss = total_loss / steps_per_epoch
    print(f'Epoch {epoch + 1}, Loss: {reported_loss:.4f}')
    print(f'Epoch {epoch + 1}, Time: {time.time() - start:.2f} sec')

In [32]:
def evaluate(word):
    '''
    Greedily decodes `word` with the encoder/decoder pair.

    Decoding starts from the '<start>' token and stops after '<end>' has been
    produced or after max_length_targ steps, whichever comes first.

    Returns
    -------
    tuple
        (the concatenated predicted tokens, the unchanged `word`)
    '''
    
    inputs = tf.convert_to_tensor(word)

    # A single example, so the initial state has a batch dimension of 1.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    pieces = []

    for _ in range(max_length_targ):
        predictions, dec_hidden, _weights = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        token = targ_lang.index_word[predicted_id]
        pieces.append(token)

        # '<end>' is kept in the result, then decoding stops.
        if token == '<end>':
            break

        # The prediction becomes the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return ''.join(pieces), word