# **Bantu Language Modeling**

The project is divided into 4 parts:


*   Scratch RNN for Swahili corpus 
*   Scratch RNN for Kwere corpus
*   Anything Goes for Swahili corpus 
*   Anything Goes for Kwere corpus

##############################################################################

**Note :**
> The ipynb file I created is ported from a .py file. Due to training-related issues on the SLU Jupyter Hub and Google Colab, I used Dr. Hou's server to train the model. I used *nohup* to keep the training running on the server.


> **Loss (Cross Entropy):** I exported the loss graph while training the model; it is shown below, after the code blocks for CWE and SW.

Due to the large training set, I have not had a chance to experiment with the model's hyperparameters (batch size and number of iterations).






# **Scratch RNN for Swahili corpus**

In [None]:
import numpy as np
import matplotlib.pyplot as plt


def softmax(x):
    """
    Softmax normalised over ALL elements of x
    Arguments:
        x -- array-like of scores (logits)
    Returns:
        Numpy array with the same shape as x whose entries sum to 1
    """

    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return x.astype(float)

    # Shifting by the maximum does not change the result mathematically, but
    # it keeps np.exp from overflowing to inf (which would yield nan below).
    e_x = np.exp(x - np.max(x))
    return e_x / np.sum(e_x)


def cross_entropy(x, index):
    """
    Cross entropy, in bits (log base 2), against a one-hot target.
    Assumption: the ground truth vector contains only one non-zero component with a value of 1
    Arguments:
        x -- predicted probability vector
        index -- position of the single 1 in the ground truth vector
    Returns:
        -log2(x[index]) when x[index] is positive, otherwise 0
    """

    prob = x[index]
    if prob > 0:
        return - np.log2(prob)
    # A non-positive probability is reported as zero loss rather than infinity.
    return 0


def cross_entropy_d(x, index):
    """
    Gradient of softmax + cross entropy with respect to the logits.
    Assumption: the ground truth vector contains only one non-zero component with a value of 1
    Arguments:
        x -- predicted probability vector (softmax output)
        index -- position of the single 1 in the ground truth vector
    Returns:
        New Numpy array equal to x with 1 subtracted at position index
    """

    # Work on a copy: callers pass views into cached activations, and
    # subtracting in place would silently corrupt those cached values.
    grad = np.array(x, copy=True)
    grad[index] -= 1
    return grad


def char_to_ix(chars):
    """
    Build the character -> index lookup table
    Arguments:
        chars -- list of character set
    Returns:
        dictionary whose keys are the characters and whose values are their
        positions in chars
    """

    lookup = {}
    for position, symbol in enumerate(chars):
        lookup[symbol] = position
    return lookup


def ix_to_char(chars):
    """
    Build the index -> character lookup table
    Arguments:
        chars -- list of character set
    Returns:
        dictionary whose keys are positions in chars and whose values are the
        characters found there
    """

    return dict(enumerate(chars))


def one_hot(data, ch2ix):
    """
    Encode every character of data as a one-hot column vector
    Arguments:
        data -- string
        ch2ix -- dictionary that maps a character to an index
    Returns:
        Numpy array, shape = (len(data), len(ch2ix), 1)
        (entries of data that are None are skipped and produce no row)
    """

    vocab_size = len(ch2ix)
    vectors = []
    for symbol in data:
        if symbol is None:
            continue
        column = np.zeros((vocab_size, 1))
        column[ch2ix[symbol], 0] = 1
        vectors.append(column)

    return np.array(vectors)


def initialize_xavier(first, second):
    """
    Draw a weight matrix with Xavier (Glorot) scaling
    Arguments:
        first -- first dimension size
        second -- second dimension size
    Returns:
        Numpy array of shape (first, second): standard-normal samples scaled
        by sqrt(2 / (first + second))
    """

    scale = np.sqrt(2.0 / (first + second))
    return scale * np.random.randn(first, second)


class Graph:
    """
    Minimal live line plot drawn through the pyplot interface.

    Every call to update() re-plots the given curve and writes a PNG snapshot
    under ./figure/.
    """

    def __init__(self, xlabel, ylabel):
        """
        Arguments:
            xlabel -- label of the x axis
            ylabel -- label of the y axis
        """
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        # Interactive mode so that show() does not block the training loop
        plt.ion()
        plt.show()

    def update(self, x, y, img_name='Figure'):
        """
        Plot y against x and save the figure as ./figure/<img_name>.png
        Arguments:
            x -- x coordinates of the curve
            y -- y coordinates of the curve
            img_name -- file name (without extension) of the saved image
        """
        import os

        plt.plot(x, y, color='xkcd:royal blue')
        plt.show()

        out_path = './figure/' + img_name + '.png'
        # savefig does not create missing directories; without this a long
        # training run dies at the first plot when ./figure is absent.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        plt.savefig(out_path)
        plt.pause(0.001)

In [None]:
import numpy as np


class RNN:
    """
    Stacked tanh RNN with a softmax output layer, trained by back-propagation
    through time over windows of cell_length steps.

    Per-layer weights live in lists inside self._parameters. The entry 'a' of
    that dictionary is not a weight: it holds the hidden state carried over
    from the end of the previous mini-batch.
    """

    def __init__(self, input_size, output_size, hidden_size, cell_length, depth_size=1, batch_size=1, drop_rate=0):
        """
        Arguments:
            input_size -- length of each (one-hot) input vector
            output_size -- number of output classes
            hidden_size -- number of hidden units in every layer
            cell_length -- number of time steps handled per forward/backward pass
            depth_size -- number of stacked recurrent layers
            batch_size -- number of sequences processed in parallel
            drop_rate -- dropout probability on each layer's output, in [0, 1)
        Raises:
            ValueError -- if drop_rate is outside [0, 1)
        """
        if not 0 <= drop_rate < 1:
            # forward() divides by (1 - drop_rate)
            raise ValueError('drop_rate must be in [0, 1), got %r' % (drop_rate,))

        self._input_size = input_size
        self._output_size = output_size
        self._hidden_size = hidden_size
        self._cell_length = cell_length
        self._depth_size = depth_size
        self._batch_size = batch_size
        self._drop_rate = drop_rate

        # Xavier initialization
        self._parameters = {'W_xa': [initialize_xavier(self._input_size if d == 0 else self._hidden_size, self._hidden_size) for d in range(self._depth_size)],
                            'W_aa': [initialize_xavier(self._hidden_size, self._hidden_size) for d in range(self._depth_size)],
                            'W_ay': [initialize_xavier(self._hidden_size, self._output_size)],
                            'b_a': [np.zeros((1, self._hidden_size)) for d in range(self._depth_size)],
                            'b_y': [np.zeros((1, self._output_size))],
                            'a': [np.zeros((self._batch_size, self._hidden_size)) for d in range(self._depth_size)]}

        # 'da' is the hidden-state gradient handed from time step t+1 to t
        self._gradients = {'dW_xa': [np.zeros_like(self._parameters['W_xa'][d]) for d in range(self._depth_size)],
                           'dW_aa': [np.zeros_like(self._parameters['W_aa'][d]) for d in range(self._depth_size)],
                           'dW_ay': [np.zeros_like(self._parameters['W_ay'][0])],
                           'db_a': [np.zeros_like(self._parameters['b_a'][d]) for d in range(self._depth_size)],
                           'db_y': [np.zeros_like(self._parameters['b_y'][0])],
                           'da': [np.zeros_like(self._parameters['a'][d]) for d in range(self._depth_size)]}

        # Running averages of squared gradients, all starting at 0.1 exactly
        # as initialize_optimizer() resets them.
        self._momentums = {'dW_xa': [np.ones_like(self._gradients['dW_xa'][d]) * 0.1 for d in range(self._depth_size)],
                           'dW_aa': [np.ones_like(self._gradients['dW_aa'][d]) * 0.1 for d in range(self._depth_size)],
                           'dW_ay': [np.ones_like(self._gradients['dW_ay'][0]) * 0.1],
                           'db_a': [np.ones_like(self._gradients['db_a'][d]) * 0.1 for d in range(self._depth_size)],
                           'db_y': [np.ones_like(self._gradients['db_y'][0]) * 0.1]}

        # Placeholder until the first backward pass overwrites it
        self._loss = - np.log(1.0 / self._output_size) * self._cell_length

    def optimize(self, X, Y, learning_rate=0.01):
        """
        Run one training step: forward pass, backward pass, parameter update
        Arguments:
            X -- mini-batch of inputs, see forward()
            Y -- mini-batch of target indices, see backward()
            learning_rate -- step size of the parameter update
        """
        cache = self.forward(X)
        self.backward(Y, cache)
        self.update_parameters(learning_rate=learning_rate)

    def forward(self, X):
        """
        Forward pass over all cell_length time steps of one mini-batch.
        Also resets the accumulated loss to 0.
        Arguments:
            X -- array indexed as X[:, t, :, :]; every time slice is reshaped
                 to (batch_size, input_size)
        Returns:
            cache -- tuple (x, a, y_hat, dropout) of per-time-step layer
                     inputs, hidden states, output probabilities and dropout
                     masks, consumed by backward()
        """
        self._loss = 0

        x, y_hat = [{} for d in range(self._depth_size + 1)], {}
        # Index -1 holds the state carried over from the previous mini-batch
        a = [{-1: np.copy(self._parameters['a'][d])} for d in range(self._depth_size)]
        dropout = [{} for d in range(self._depth_size)]

        for t in range(self._cell_length):
            x[0][t] = X[:, t, :, :].reshape(self._batch_size, self._input_size)

            for d in range(self._depth_size):
                # Inverted dropout: kept units are scaled up by 1 / (1 - drop_rate)
                dropout[d][t] = np.random.binomial(1, 1 - self._drop_rate, (1, self._hidden_size)) / (1 - self._drop_rate)
                a[d][t] = np.tanh(np.dot(x[d][t], self._parameters['W_xa'][d]) +
                                  np.dot(a[d][t - 1], self._parameters['W_aa'][d]) +
                                  self._parameters['b_a'][d])
                x[d + 1][t] = np.copy(a[d][t]) * dropout[d][t]

            z = np.dot(x[self._depth_size][t], self._parameters['W_ay'][0]) + self._parameters['b_y'][0]
            z = np.clip(z, -100, 100)
            y_hat[t] = np.array([softmax(z[b, :]) for b in range(self._batch_size)])

        cache = (x, a, y_hat, dropout)
        return cache

    def backward(self, Y, cache):
        """
        Back-propagation through time for one mini-batch.
        Fills self._gradients, accumulates the mean loss into self._loss and
        stores the last hidden state for the next mini-batch.
        Arguments:
            Y -- target class indices, indexed as Y[b, t]
            cache -- tuple returned by forward()
        """
        self._gradients = {key: [np.zeros_like(self._gradients[key][d]) for d in range(len(self._gradients[key]))] for key in self._gradients.keys()}
        (x, a, y_hat, dropout) = cache

        for t in reversed(range(self._cell_length)):
            # The loss must be read before cross_entropy_d is called on the same row
            self._loss += sum([cross_entropy(y_hat[t][b, :], Y[b, t]) for b in range(self._batch_size)]) / (self._cell_length * self._batch_size)
            dy = np.array([cross_entropy_d(y_hat[t][b, :], Y[b, t]) for b in range(self._batch_size)]) / (self._cell_length * self._batch_size)

            self._gradients['dW_ay'][0] += np.dot(x[self._depth_size][t].T, dy)
            self._gradients['db_y'][0] += dy.sum(axis=0)
            da = np.dot(dy, self._parameters['W_ay'][0].T)

            for d in reversed(range(self._depth_size)):
                # Gradient from the layer above (through the dropout mask) plus
                # the gradient from time step t+1, pushed through tanh'
                da = (1 - a[d][t] ** 2) * (da * dropout[d][t] + self._gradients['da'][d])
                self._gradients['dW_xa'][d] += np.dot(x[d][t].T, da)
                self._gradients['dW_aa'][d] += np.dot(a[d][t - 1].T, da)
                self._gradients['db_a'][d] += da.sum(axis=0)
                self._gradients['da'][d] = np.dot(da, self._parameters['W_aa'][d].T)
                da = np.dot(da, self._parameters['W_xa'][d].T)

        self._parameters['a'] = [a[d][self._cell_length - 1] for d in range(self._depth_size)]

    def update_parameters(self, learning_rate=0.01):
        """
        Apply one RMSProp step (decay 0.9) to all weights and biases.
        Weights are clipped to [-1, 1] before the step is applied.
        Arguments:
            learning_rate -- step size of the update
        """
        parameters = self._parameters['W_xa'] + self._parameters['W_aa'] + self._parameters['W_ay'] + self._parameters['b_a'] + self._parameters['b_y']
        gradients = self._gradients['dW_xa'] + self._gradients['dW_aa'] + self._gradients['dW_ay'] + self._gradients['db_a'] + self._gradients['db_y']
        momentums = self._momentums['dW_xa'] + self._momentums['dW_aa'] + self._momentums['dW_ay'] + self._momentums['db_a'] + self._momentums['db_y']

        for w, g, m in zip(parameters, gradients, momentums):
            np.clip(w, -1, 1, out=w)

            # # Adagrad
            # m += g ** 2
            # w -= learning_rate * g / np.sqrt(m + 1e-8)

            # RMSProp. The running average is updated IN PLACE: rebinding the
            # loop variable (m = ...) would leave self._momentums untouched and
            # the average would never accumulate across steps.
            m *= 0.9
            m += 0.1 * g ** 2
            w -= learning_rate * g / np.sqrt(m + 1e-8)

    def sample(self, ix, n, temperature=0.7):
        """
        Generate a sequence of indices from the model, starting from ix.
        Starts from a zero hidden state and does not apply dropout.
        Arguments:
            ix -- index of the first character
            n -- number of indices to generate after ix
            temperature -- divisor applied to the logits before the softmax;
                           values below 1 sharpen the distribution
        Returns:
            list of n + 1 indices, beginning with ix
        """
        ixes = [ix]
        a = [np.zeros((1, self._hidden_size)) for d in range(self._depth_size)]
        for t in range(n):
            x = np.zeros((1, self._input_size))
            x[0, ix] = 1

            for d in range(self._depth_size):
                a[d] = np.tanh(np.dot(x, self._parameters['W_xa'][d]) +
                               np.dot(a[d], self._parameters['W_aa'][d]) +
                               self._parameters['b_a'][d])
                x = a[d]

            # W_ay / b_y are one-element lists; index them as forward() does
            z = np.dot(x, self._parameters['W_ay'][0]) + self._parameters['b_y'][0]
            z = np.clip(z, -100, 100)
            y = softmax(z / temperature)

            ix = np.random.choice(range(self._input_size), p=y.ravel())
            ixes.append(ix)

        return ixes

    def initialize_optimizer(self):
        """Reset every running average of squared gradients to 0.1."""
        self._momentums = {key: [np.ones_like(self._momentums[key][d]) * 0.1 for d in range(len(self._momentums[key]))] for key in self._momentums.keys()}

    def initialize_hidden_state(self):
        """Reset the hidden state carried between mini-batches to zeros."""
        self._parameters['a'] = [np.zeros_like(self._parameters['a'][d]) for d in range(self._depth_size)]

    def hidden_state(self):
        """Return the list of per-layer hidden states carried between mini-batches."""
        return self._parameters['a']

    def loss(self):
        """Return the loss accumulated by the most recent backward pass."""
        return self._loss

    def parameters(self):
        """Return the dictionary holding the weights, biases and hidden state."""
        return self._parameters

In [None]:
import numpy as np
# from utils import char_to_ix, ix_to_char, one_hot, Graph
# from model import RNN
import pickle
import timeit


def model(data='input.txt', hidden_size=256, seq_length=200, depth_size=1, batch_size=64, drop_rate=0.1,
          num_iteration=100, learning_rate=0.01, img_name='Figure'):
    """
    Train a character-level RNN on a UTF-8 text file
    Arguments:
        data -- path of the training text file
        hidden_size -- number of hidden units per layer
        seq_length -- number of characters per training window
        depth_size -- number of stacked recurrent layers
        batch_size -- number of parallel streams the text is cut into
        drop_rate -- dropout probability
        num_iteration -- number of passes over the whole text
        learning_rate -- step size of the optimizer
        img_name -- file name (without extension) of the saved loss plot
    Returns:
        (trained RNN, character -> index dictionary, index -> character dictionary)
    """
    # Open a training text file (closed again as soon as it has been read)
    with open(data, 'rb') as f:
        data = f.read().decode('UTF-8')
    chars = sorted(set(data))
    data_size, vocab_size = len(data), len(chars)
    print('Data has %d total characters, %d unique characters.' % (data_size, vocab_size))

    # Make a dictionary that maps {character:index} and {index:character}
    ch2ix, ix2ch = char_to_ix(chars), ix_to_char(chars)

    # Set RNN model
    rnn = RNN(vocab_size, vocab_size, hidden_size, seq_length, depth_size, batch_size, drop_rate)

    cnt = 0
    losses = {}
    graph = Graph('Iteration', 'Loss')

    # The text is cut into batch_size contiguous streams of this length
    batch_length = data_size // batch_size

    # Optimize model
    start = timeit.default_timer()
    for n in range(num_iteration):
        rnn.initialize_hidden_state()
        rnn.initialize_optimizer()

        # Split text by mini-batch with batch_size
        for i in range(0, batch_length - seq_length, seq_length):
            mini_batch_X, mini_batch_Y = [], []

            # Exactly one row per stream, so the mini-batch always has
            # batch_size rows even when data_size is not a multiple of it.
            for b in range(batch_size):
                j = b * batch_length
                mini_batch_X.append(one_hot(data[j + i:j + i + seq_length], ch2ix))
                # Targets are the inputs shifted by one character
                mini_batch_Y.append([ch2ix[ch] for ch in data[j + i + 1:j + i + seq_length + 1]])

            mini_batch_X = np.array(mini_batch_X)
            mini_batch_Y = np.array(mini_batch_Y)

            rnn.optimize(mini_batch_X, mini_batch_Y, learning_rate=learning_rate)

            cnt += 1
            if cnt % 100 == 0 or cnt == 1:
                stop = timeit.default_timer()

                loss = rnn.loss()
                losses[cnt] = loss

                print("\n######################################")
                print("Total iteration: %d" % (n + 1))
                print("Iteration: %d" % cnt)
                print("Loss: %f" % loss)
                print("Time: %f" % (stop - start))

                # Print a 200-character sample seeded with a random character
                ix = np.random.randint(0, vocab_size)
                sample_ixes = rnn.sample(ix, 200)
                txt = ''.join(ix2ch[s] for s in sample_ixes)
                print("\n### Starts Here ###\n\n" + txt.rstrip() + "\n\n### Ends Here ###")
                print("######################################")

                graph_x = np.array(sorted(losses))
                graph_y = np.array([losses[key] for key in graph_x])
                graph.update(graph_x, graph_y, img_name=img_name)

    return rnn, ch2ix, ix2ch


if __name__ == "__main__":
    import os

    ##########
    data = 'sw-train'
    num_iteration = 100
    # Only a label used in the output file names; it does not select the
    # update rule (RNN.update_parameters decides that).
    optimizer = 'adagrad'
    ##########

    infile = data + '.txt'
    outfile = data + '_' + str(num_iteration) + '_' + optimizer

    # Create the output directory up front so a missing folder is noticed
    # before the long training run rather than after it.
    os.makedirs('./result', exist_ok=True)

    result, ch2ix, ix2ch = model(data=infile, num_iteration=num_iteration, img_name=outfile)

    # Three consecutive pickles in one file: load them back in the same order.
    with open('./result/' + outfile + '.pickle', 'wb') as file:
        pickle.dump(result, file)
        pickle.dump(ch2ix, file)
        pickle.dump(ix2ch, file)

# **Scratch RNN for Kwere corpus**

In [None]:
import numpy as np
import pickle
import timeit

# Module-level training records, refreshed by model() while it runs:
# final_losss -- dictionary {iteration count: loss} of the logged losses
# totalcount -- number of mini-batch updates performed so far
final_losss = []
totalcount = 0

def model(data='cwe-train.txt', hidden_size=256, seq_length=200, depth_size=2, batch_size=10, drop_rate=0.1,
          num_iteration=100, learning_rate=0.01, img_name='Figure'):
    """
    Train a character-level RNN on a UTF-8 text file
    Arguments:
        data -- path of the training text file
        hidden_size -- number of hidden units per layer
        seq_length -- number of characters per training window
        depth_size -- number of stacked recurrent layers
        batch_size -- number of parallel streams the text is cut into
        drop_rate -- dropout probability
        num_iteration -- number of passes over the whole text
        learning_rate -- step size of the optimizer
        img_name -- file name (without extension) of the saved loss plot
    Returns:
        (trained RNN, character -> index dictionary, index -> character dictionary)
    """
    # Without this declaration the assignments below only create locals and
    # the module-level records are never updated.
    global final_losss, totalcount

    # Open a training text file (closed again as soon as it has been read)
    with open(data, 'rb') as f:
        data = f.read().decode('UTF-8')
    chars = sorted(set(data))
    data_size, vocab_size = len(data), len(chars)
    print('Data has %d total characters, %d unique characters.' % (data_size, vocab_size))

    # Make a dictionary that maps {character:index} and {index:character}
    ch2ix, ix2ch = char_to_ix(chars), ix_to_char(chars)

    # Set RNN model
    rnn = RNN(vocab_size, vocab_size, hidden_size, seq_length, depth_size, batch_size, drop_rate)

    cnt = 0
    losses = {}
    graph = Graph('Iteration', 'Loss')

    # The text is cut into batch_size contiguous streams of this length
    batch_length = data_size // batch_size

    # Optimize model
    start = timeit.default_timer()
    for n in range(num_iteration):
        rnn.initialize_hidden_state()
        rnn.initialize_optimizer()

        # Split text by mini-batch with batch_size
        for i in range(0, batch_length - seq_length, seq_length):
            mini_batch_X, mini_batch_Y = [], []

            # Exactly one row per stream, so the mini-batch always has
            # batch_size rows even when data_size is not a multiple of it.
            for b in range(batch_size):
                j = b * batch_length
                mini_batch_X.append(one_hot(data[j + i:j + i + seq_length], ch2ix))
                # Targets are the inputs shifted by one character
                mini_batch_Y.append([ch2ix[ch] for ch in data[j + i + 1:j + i + seq_length + 1]])

            mini_batch_X = np.array(mini_batch_X)
            mini_batch_Y = np.array(mini_batch_Y)

            rnn.optimize(mini_batch_X, mini_batch_Y, learning_rate=learning_rate)

            cnt += 1
            totalcount = cnt
            if cnt % 100 == 0 or cnt == 1:
                stop = timeit.default_timer()

                loss = rnn.loss()
                losses[cnt] = loss
                final_losss = losses

                print("\n######################################")
                print("Total iteration: %d" % (n + 1))
                print("Iteration: %d" % cnt)
                print("Loss: %f" % loss)
                print("Time: %f" % (stop - start))

                # Print a 200-character sample seeded with a random character
                ix = np.random.randint(0, vocab_size)
                sample_ixes = rnn.sample(ix, 200)
                txt = ''.join(ix2ch[s] for s in sample_ixes)
                print("\n### Starts Here ###\n\n" + txt.rstrip() + "\n\n### Ends Here ###")
                print("######################################")

                graph_x = np.array(sorted(losses))
                graph_y = np.array([losses[key] for key in graph_x])
                graph.update(graph_x, graph_y, img_name=img_name)

    return rnn, ch2ix, ix2ch


if __name__ == "__main__":
    import os

    ##########
    data = 'cwe-train'
    num_iteration = 100
    # Only a label used in the output file names; it does not select the
    # update rule (RNN.update_parameters decides that).
    optimizer = 'adagrad'
    ##########

    infile = data + '.txt'
    outfile = data + '_' + str(num_iteration) + '_' + optimizer

    # Create the output directory up front so a missing folder is noticed
    # before the long training run rather than after it.
    os.makedirs('./result', exist_ok=True)

    result, ch2ix, ix2ch = model(data=infile, num_iteration=num_iteration, img_name=outfile)

    # Three consecutive pickles in one file: load them back in the same order.
    with open('./result/' + outfile + '.pickle', 'wb') as file:
        pickle.dump(result, file)
        pickle.dump(ch2ix, file)
        pickle.dump(ix2ch, file)

Cross Entropy for SW corpus : 1.527769

![](https://i.ibb.co/W3wk7Mv/Screenshot-from-2020-10-11-11-30-47.png)

Cross Entropy for CWE corpus : 2.165388

![](https://i.ibb.co/BZqKstj/Screenshot-from-2020-10-11-11-29-19.png)



**The training logs are uploaded to the `result` folder of the repository.**

**Cross Entropy : SW Corpus**


![](https://i.ibb.co/vhd5J8P/sw-train-100-adagrad-Log2-1.png)

**Cross Entropy : CWE Corpus**

![](https://i.ibb.co/cgfX8CC/cwe-train-100-adagrad-Log2-1.png)