### Categorizing sentences with a Recurrent Neural Network

In [2]:
import numpy as np
import tensorflow as tf
from os import scandir

In [3]:
from TextCategorizationUtils import *

In [4]:
# The five news categories the classifier distinguishes between
categories = ['business', 'entertainment', 'politics', 'sport', 'tech']

# Load the news article text belonging to each of the categories
# NOTE(review): the meaning of the third argument (350) is defined by readTextFromFiles - confirm there
data = readTextFromFiles('datasets/bbc-fulltext/bbc/', categories, 350)

# Report how many text entries were loaded under each category
for category in data:
    n_loaded = len(data[category])
    print(category, " - ", n_loaded)

business  -  3067
entertainment  -  3012
politics  -  3713
sport  -  3202
tech  -  3687


In [5]:
# Partition the loaded text into training and test sets (train_test_split = 0.85)
X_train, Y_train, X_test, Y_test = process_data(data, train_test_split=0.85)

# Inputs and labels of each set should have matching lengths
print("training set size - ", len(X_train), len(Y_train))
print("test set size - ", len(X_test), len(Y_test))

training set size -  7078 7078
test set size -  1247 1247


In [6]:
# Load word embeddings that were pre-trained on a huge text corpus.
# These embeddings are used as the feature vectors that represent the words present in the data,
# which enables the model to classify words of similar meaning alike even if not all of the
# synonyms are present in the training set.
# GloVe embeddings, which capture co-occurrences of words, are used here.
embeddings, word_to_ind, ind_to_word, word_set = read_word_embeddings(
    "pretrainedmodels/glove.6B.50d.txt"
)

print("Vocubalary size: ", len(word_set))

# The embedding dimension is measured on a single vocabulary entry (the word at index 1)
n_emb = len(embeddings[ind_to_word[1]])
print("Dimension of the embedding: ", n_emb)

Vocubalary size:  400000
Dimension of the embedding:  50


In [7]:
def convert_categories_to_onehot(category_to_index, Y):
    '''
    Converts category labels to one-hot vectors
    params
        category_to_index - maps each category label to its integer index
        Y - array of category labels
    returns
        one hot representation of Y - dimension (len(Y), number of categories),
        where the number of categories is the number of entries in category_to_index
    '''
    # size the one-hot matrix from the mapping that was passed in rather than
    # from a notebook-level variable, so the function only depends on its arguments
    n_categories = len(category_to_index)
    # an explicit integer dtype keeps the fancy-indexing below valid for an empty Y
    Y_cat = np.array([category_to_index[c] for c in Y], dtype=int)
    # row i of the identity matrix is the one-hot vector for index i
    Y_one_hot = np.eye(n_categories)[Y_cat]
    return Y_one_hot

In [8]:
category_to_index, index_to_category = get_category_index_map(categories)

# One-hot encode the labels and transpose them so that each column holds one example
Y_train_one_hot = np.transpose(convert_categories_to_onehot(category_to_index, Y_train))
Y_test_one_hot = np.transpose(convert_categories_to_onehot(category_to_index, Y_test))

print(Y_train_one_hot.shape)
print(Y_test_one_hot.shape)

(5, 7078)
(5, 1247)


In [9]:
def convert_sentence_to_embedding(sentence, embeddings, word_set, n_emb, nt):
    '''
    Converts a sentence to a matrix of word embeddings, one row per word
    params
        sentence - text to be converted; it is split on single space characters
        embeddings - maps words to embeddings
        word_set - set of words present in the vocabulary
        n_emb - length of the feature vector
        nt - number of words (time units) to be considered. words beyond nt are ignored
    returns
        feature matrix representing the input sentence - dimension (nt, n_emb).
        rows for words outside word_set, and rows past the end of the sentence, stay zero
    '''
    features = np.zeros((nt, n_emb))
    # only the first nt tokens can contribute a row
    for position, token in enumerate(sentence.split(' ')[:nt]):
        if token not in word_set:
            continue
        features[position, :] = embeddings[token]
    return features

def convert_sentences_to_embeddings(sentences, embeddings, word_set, n_emb, nt):
    '''
    Converts an array of sentences to a 3-D array of word embeddings
    params
        sentences - array of text to be converted
        embeddings - maps words to embeddings
        word_set - set of words present in the vocabulary
        n_emb - length of the feature vector
        nt - number of words (time units) to be considered for each sentence. words beyond nt are ignored
    returns
        features representing the input sentences - dimension (nt, n_emb, n_sentences)
    '''
    n_sentences = len(sentences)
    batch = np.zeros((nt, n_emb, n_sentences))
    for column, text in enumerate(sentences):
        # every sentence fills one (nt, n_emb) slice along the last axis
        batch[:, :, column] = convert_sentence_to_embedding(text, embeddings, word_set, n_emb, nt)
    return batch

In [10]:
def lstm_cell(xt, a_prev, c_prev, params):
    '''
    Forward propagation through a Long Short Term Memory cell for one time unit
    params
        xt - input feature vector for the word at time unit t
        a_prev, c_prev - cell states representing the memory retained from time units 0 to t-1
        params - model parameters (to be optimized), keyed 'Wf', 'bf', 'Wi', 'bi', 'Wc', 'bc',
                 'Wo', 'bo', 'Wy', 'by'
    returns
        a_out, c_out - output cell states
        y_pred - prediction based on input from 0 to t time units (no softmax is applied here)
    '''
    # previous hidden state and current input are stacked along axis 0
    stacked = tf.concat([a_prev, xt], axis=0)

    def affine(weight_key, bias_key):
        # W . stacked + b for the requested gate
        return tf.add(tf.matmul(params[weight_key], stacked), params[bias_key])

    forget_gate = sigmoid(affine('Wf', 'bf'))
    update_gate = sigmoid(affine('Wi', 'bi'))

    # candidate memory, blended with the previous memory through the two gates
    candidate = tf.tanh(affine('Wc', 'bc'))
    c_out = tf.multiply(forget_gate, c_prev) + tf.multiply(update_gate, candidate)

    output_gate = sigmoid(affine('Wo', 'bo'))

    # hidden state exposed by the cell and the prediction derived from it
    a_out = tf.multiply(output_gate, tf.tanh(c_out))
    y_pred = tf.add(tf.matmul(params['Wy'], a_out), params['by'])

    return a_out, c_out, y_pred

def sigmoid(z):
    '''Element-wise logistic function 1 / (1 + e^(-z)) built from tensorflow ops'''
    decay = tf.exp(-z)
    return 1 / (1 + decay)

def softmax(z):
    '''
    Softmax along axis 0
    params
        z - tensor of scores; normalisation runs over axis 0
    returns
        tensor of the same shape as z whose entries along axis 0 sum to 1
    '''
    # subtracting the maximum along axis 0 leaves the result unchanged
    # but keeps exp() from overflowing for large scores
    shifted = z - tf.reduce_max(z, axis=0)
    e = tf.exp(shifted)
    # tensorflow's summation op is reduce_sum (there is no tf.sum)
    return e / tf.reduce_sum(e, axis=0)

In [11]:
def lstm_fwd_prop(X, params, n_a, mb, nt):
    '''
    Forward propagation for the input sequence through Long Short Term Memory cells
    params
        X - input features corresponding to the input sequence, indexed as (time unit, feature, example)
        params - model parameters (to be optimized)
        n_a - length of the cell state (memory)
        mb - number of examples processed together (size of the last axis of X)
        nt - number of time units considered for RNN; must be at least 1
    returns
        y_pred - prediction produced by the cell at the last time unit
    raises
        ValueError - if nt is smaller than 1, because no prediction would be produced
    '''
    if nt < 1:
        raise ValueError("nt must be at least 1 to produce a prediction")

    # both cell states start out as zeros
    a_prev = np.zeros((n_a, mb))
    c_prev = np.zeros((n_a, mb))
    y_pred = None
    for t in range(nt):
        # take time unit t and drop ONLY the leading time axis; squeezing every
        # size-1 axis would also remove the example axis when mb == 1
        x_t = tf.squeeze(tf.slice(X, [t, 0, 0], [1, -1, -1]), axis=[0])
        a_prev, c_prev, y_pred = lstm_cell(x_t, a_prev, c_prev, params)
    return y_pred

In [12]:
def get_variables(n_emb, n_a, n_y, mb, n_t):
    '''
    Creates the placeholders and parameters required for the LSTM model
    params
        n_emb - length of the feature vector
        n_a   - length of the cell state (memory)
        n_y   - dimension of output (number of categories in this case)
        mb    - size of the mini batch considered for each iteration of the gradient descent
        n_t   - number of time units considered for RNN
    returns
        xt - placeholder for input mini batch x - dimension (n_t, n_emb, mb)
        yt - placeholder for mini batch's output y - dimension (n_y, mb)
        params - model parameters (to be optimized)
    '''
    xt = tf.placeholder(dtype="float32", shape=(n_t, n_emb, mb))
    yt = tf.placeholder(dtype="float32", shape=(n_y, mb))
    tf.set_random_seed(0)

    # every gate consumes the previous cell output stacked with the current input
    gate_input_size = n_a + n_emb
    params = {}

    # forget, update, candidate and output gates share one layout; each weight gets its own seed
    gate_keys = [("Wf", "bf"), ("Wi", "bi"), ("Wc", "bc"), ("Wo", "bo")]
    for seed, (weight_key, bias_key) in enumerate(gate_keys):
        params[weight_key] = tf.get_variable(
            name=weight_key,
            shape=(n_a, gate_input_size),
            initializer=tf.glorot_uniform_initializer(seed=seed),
        )
        params[bias_key] = tf.get_variable(
            name=bias_key,
            shape=(n_a, 1),
            initializer=tf.zeros_initializer(),
        )

    # projection from the cell output to the n_y output units
    params["Wy"] = tf.get_variable(
        name="Wy",
        shape=(n_y, n_a),
        initializer=tf.glorot_uniform_initializer(seed=4),
    )
    params["by"] = tf.get_variable(
        name="by",
        shape=(n_y, 1),
        initializer=tf.zeros_initializer(),
    )
    return xt, yt, params

In [13]:
def compute_cost(y_pred, Y):
    '''
    Computes the categorical cost for the Y predicted by the model
    params
        y_pred - model's prediction, passed as logits to the softmax cross entropy
        Y - expected output - labels from dataset
    returns
        cost - the softmax cross entropy averaged over its last axis
    '''
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=y_pred)
    return tf.reduce_mean(per_example_loss, axis=-1)

In [14]:
def extract_param_values(sess, params, param_labels = ['Wf','bf', 'Wi', 'bi', 'Wc', 'bc', 'Wo', 'bo','Wy', 'by']):
    '''
    Fetches the values of model parameters computed from the Tensorflow session
    params
        sess - session used to evaluate the parameters
        params - maps parameter labels to the objects to evaluate
        param_labels - labels of the parameters to fetch
    returns
        dictionary mapping each requested label to the value returned by sess.run
    '''
    # one sess.run call per label, in the order the labels are listed
    return {label: sess.run(params[label]) for label in param_labels}

In [15]:
def train_LSTM_model(X_train, Y_train, learning_rate, epochs, embeddings, word_set, n_a, nt, mb):
    '''
    Runs mini batch gradient descent on the training data to optimize the parameters of the Recurrent Neural Network
    params
        X_train, Y_train - dataset; Y_train is indexed as (category, example)
        learning_rate - hyperparameter for the Adam optimizer
        epochs  - number of epochs (number of passes over the entire training dataset)
        embeddings - maps words to feature vectors
        word_set - set of words in the vocabulary
        n_a   - length of the cell state (memory)
        nt    - number of time units considered for RNN
        mb    - size of the mini batch considered for each iteration of the gradient descent
    returns
        parameters of the model (values fetched from the session after the last epoch)
    '''
    tf.reset_default_graph()
    np.random.seed(1)

    # the embedding length is read through the notebook-level ind_to_word lookup
    emb_dim = len(embeddings[ind_to_word[1]])
    n_classes, _ = Y_train.shape

    # turn every training sentence into its (nt, emb_dim) embedding matrix
    train_features = convert_sentences_to_embeddings(X_train, embeddings, word_set, emb_dim, nt)

    with tf.Session() as sess:
        # computation graph: placeholders -> LSTM forward pass -> cost -> optimizer step
        x_in, y_in, params = get_variables(emb_dim, n_a, n_classes, mb, nt)
        logits = lstm_fwd_prop(x_in, params, n_a, mb, nt)
        cost = compute_cost(tf.transpose(logits), tf.transpose(y_in))
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

        sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):
            # the epoch number doubles as the seed handed to the mini batch split
            batches = split_into_mini_batches(train_features, Y_train, mb, epoch)
            running_cost = 0
            for (X_batch, Y_batch) in batches:
                _, batch_cost = sess.run([train_step, cost], feed_dict={x_in: X_batch, y_in: Y_batch})
                # accumulate the mean cost over the mini batches of this epoch
                running_cost += (batch_cost / len(batches))
            print(epoch, running_cost)

        # read the trained values while the session is still open
        return extract_param_values(sess, params)

In [23]:
# Train the LSTM classifier on the training set and keep the learned parameters
params = train_LSTM_model(
    X_train,
    Y_train_one_hot,
    learning_rate=0.002,
    epochs=35,
    embeddings=embeddings,
    word_set=word_set,
    n_a=50,
    nt=30,
    mb=16,
)

0 0.79861160255
1 0.465986708873
2 0.389155711121
3 0.343277089273
4 0.316566742191
5 0.288139598238
6 0.260910340347
7 0.238651915033
8 0.21465333836
9 0.187985483082
10 0.195392319657
11 0.15529867673
12 0.139644639591
13 0.127415111123
14 0.0985625788683
15 0.092713656183
16 0.0818260738066
17 0.0717193161723
18 0.0713806950251
19 0.069797641425
20 0.0466742935974
21 0.076732538492
22 0.103203131613
23 0.0565559543577
24 0.0305603635248
25 0.0245327725369
26 0.0211708109417
27 0.042350919227
28 0.0570166547838
29 0.103865093109
30 0.0537615505171
31 0.0274125602476
32 0.0214513543707
33 0.0154494395586
34 0.0220574334744


In [24]:
def compute_accuracy(X, Y, params, embeddings, word_set, n_a, n_t):
    '''
    Predicts the categories for the given data using the trained model's parameters and computes accuracy
    params
        X - array of sentences
        Y - one-hot labels for X, indexed as (category, example)
        params - trained parameters of the model
        embeddings - maps words to feature vectors
        word_set - set of words in the vocabulary
        n_a   - length of the cell state (memory)
        n_t   - number of time units considered by RNN
    returns
        accuracy - fraction of examples whose predicted category matches the label
    '''
    # the embedding length is read through the notebook-level ind_to_word lookup
    n_emb = len(embeddings[ind_to_word[1]])
    _, m = Y.shape
    # preprocess data
    X_emb = convert_sentences_to_embeddings(X, embeddings, word_set, n_emb, n_t)
    tf.reset_default_graph()

    with tf.Session() as sess:
        # only the inputs are fed to the graph: the labels are compared in numpy
        # below, so no label placeholder is created
        xt = tf.placeholder(shape=(n_t, n_emb, m), dtype="float32")

        # run forward propagation through the model
        y_pred = lstm_fwd_prop(xt, params, n_a, m, n_t)

        # predict the categories with Softmax layer; transposing puts one example per row
        stm = tf.nn.softmax(logits=tf.transpose(y_pred))
        y_out = sess.run(stm, feed_dict={xt: X_emb})

    # an example is correct when the strongest output unit matches the label's index
    total_correct = np.sum(np.argmax(y_out, axis=1) == np.argmax(Y.T, axis=1))

    return total_correct / m

In [25]:
# Accuracy on the training set, using the same cell state length and number of time units as in training
train_acc = compute_accuracy(
    X_train, Y_train_one_hot, params, embeddings, word_set, n_a=50, n_t=30
)
print(train_acc)

0.995478948856


In [26]:
# Accuracy on the test dataset
# One random permutation reorders sentences and labels together, so the pairs stay aligned
perm = np.random.permutation(Y_test_one_hot.shape[1])
X_test_sch = np.array(X_test)[perm]
Y_test_sch = Y_test_one_hot[:, perm]
test_acc = compute_accuracy(
    X_test_sch, Y_test_sch, params, embeddings, word_set, n_a=50, n_t=30
)
print(test_acc)

0.913392141139


In [27]:
def predict_category(X, params, embeddings, word_set,index_to_category, n_a = 50, nt = 20):
    '''
    Predicts the categories for the given data using the trained model's parameters
    params
        X - array of sentences to categorize
        params - trained parameters of the model
        embeddings - maps words to feature vectors
        word_set - set of words in the vocabulary
        index_to_category - maps softmax unit's index to category label
        n_a   - length of the cell state (memory)
        nt    - number of time units considered by RNN
    returns
        array of predicted categories, one per sentence in X
    '''
    # the embedding length is read through the notebook-level ind_to_word lookup
    n_emb = len(embeddings[ind_to_word[1]])
    m = len(X)
    X_emb = convert_sentences_to_embeddings(X, embeddings, word_set, n_emb, nt)
    tf.reset_default_graph()

    with tf.Session() as sess:
        # prediction needs no labels, so only the input placeholder is created
        # (this also removes the hard-coded number of categories it was shaped with)
        xt = tf.placeholder(shape=(nt, n_emb, m), dtype="float32")

        # run forward propagation through the model
        y_pred = lstm_fwd_prop(xt, params, n_a, m, nt)
        # transposing puts one sentence per row before the softmax
        stm = tf.nn.softmax(logits=tf.transpose(y_pred))
        y_out = sess.run(stm, feed_dict={xt: X_emb})

    # the strongest softmax unit of each row selects the category label
    prediction = [index_to_category[x] for x in np.argmax(y_out, axis = 1)]
    return prediction

In [29]:
# Try the trained classifier on a handful of hand-written sentences
X_sample = [
    "Badminton is my favourite game that I enjoy playing with my friends during weekends no matter who wins or loses",
    "The melodious tune of the flute gives great peace of mind to the listener",
    "With rapid advancements in equipments and devices, humans will be relieved from performing dangerous tasks manually",
    "dedicated work will lead to success",
    "total revenue",
    "the act was amended through a bill that was passed",
]
sample_result = predict_category(X_sample, params, embeddings, word_set, index_to_category)
print(sample_result)

['sport', 'entertainment', 'tech', 'tech', 'business', 'politics']


References:
1. Dataset of categorized news articles
        D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006
        http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
2. Pre-trained word embeddings
        Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. "GloVe: Global Vectors for Word Representation"
        http://nlp.stanford.edu/data/glove.6B.zip
3. LSTM cell implementation and the usage of word embeddings as feature vectors was inspired by 
        Sequence Models course by DeepLearning.ai https://www.coursera.org/learn/nlp-sequence-models