#### 👻 Generate Sentences using vanilla recurrent neural networks

In [1]:
from datetime import datetime
import itertools # to perform operations on pythonic data structures
import numpy as np
import nltk
import os
import operator
import sys # access to Python interpreter variables

In [2]:
# download nltk model data (do it just once)
# nltk.download('book')

In [3]:
VOCAB_SIZE = 8000
UNKNOWN_TOKEN = "UNKNOWN_TOKEN"
SENTENCE_START_TOKEN = "SENTENCE_START"
SENTENCE_END_TOKEN = "SENTENCE_END"

CORPORA_DIR = "/home/paperspace/nltk_data/corpora/state_union"

In [4]:
# read the dat and append SENTENCE_START and SENTENCE_END tokens
print("Reading Data...")

# Read all file paths in CORPORA_DIR
file_list = []

for root, _, files in os.walk(CORPORA_DIR):
    for filename in files:
        file_list.append(os.path.join(root, filename))
        
sentences = []

for files in file_list:
    with open(files, 'r') as fin:
        try:
            str_form = fin.read().replace('\n', '')
            sentences.extend(nltk.sent_tokenize(str_form))
        except UnicodeDecodeError:
            # some sentences have weird characters. Ignore them
            pass
        
# Get all sentenes in all files
sentences[:5]

Reading Data...


["PRESIDENT LYNDON B. JOHNSON'S ANNUAL MESSAGE TO THE CONGRESS ON THE STATE OF THE UNION January 17, 1968Mr.",
 'Speaker, Mr. President, Members of the Congress, and my fellow Americans:I was thinking as I was walking down the aisle tonight of what Sam Rayburn told me many years ago: The Congress always extends a very warm welcome to the President--as he comes in.Thank all of you very, very much.I have come once again to this Chamber--the home of our democracy--to give you, as the Constitution requires, "Information of the State of the Union.',
 '"I report to you that our country is challenged, at home and abroad:--that it is our will that is being tried, not our strength; our sense of purpose, not our ability to achieve a better America;--that we have the strength to meet our every challenge; the physical strength to hold the course of decency and compassion at home; and the moral strength to support the cause of peace in the world.',
 'And I report to you that I believe, with abiding

In [5]:
# Add sentence delimiters
# required to let RNN know what is the start and end of the sentence
sentences = [SENTENCE_START_TOKEN + " " + x + " " + SENTENCE_END_TOKEN for x in sentences]

sentences[:5]

["SENTENCE_START PRESIDENT LYNDON B. JOHNSON'S ANNUAL MESSAGE TO THE CONGRESS ON THE STATE OF THE UNION January 17, 1968Mr. SENTENCE_END",
 'SENTENCE_START Speaker, Mr. President, Members of the Congress, and my fellow Americans:I was thinking as I was walking down the aisle tonight of what Sam Rayburn told me many years ago: The Congress always extends a very warm welcome to the President--as he comes in.Thank all of you very, very much.I have come once again to this Chamber--the home of our democracy--to give you, as the Constitution requires, "Information of the State of the Union. SENTENCE_END',
 'SENTENCE_START "I report to you that our country is challenged, at home and abroad:--that it is our will that is being tried, not our strength; our sense of purpose, not our ability to achieve a better America;--that we have the strength to meet our every challenge; the physical strength to hold the course of decency and compassion at home; and the moral strength to support the cause of p

In [6]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

print("Found ", len(word_freq.items()), " unique word tokens.")

Found  18331  unique word tokens.


In [7]:
# Since it may take forever train, it is trained only on 8000 most frequent words
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(VOCAB_SIZE-1) 
index_to_word = [x[0] for x in vocab] # extract word
index_to_word.append(UNKNOWN_TOKEN)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)]) # Create word-index map

In [8]:
print("Using Vocabulary size ", VOCAB_SIZE)
print("The most frequent word is '", vocab[0][0], "' and appeared ", vocab[0][1], " times")
print("The least frequent word is '", vocab[-1][0], "' and appeared ", vocab[-1][1], " times")

Using Vocabulary size  8000
The most frequent word is ' the ' and appeared  17514  times
The least frequent word is ' diminishing ' and appeared  2  times


In [9]:
# Replace all words not in our vocab with the UNKNOWN_TOKEN
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else UNKNOWN_TOKEN for w in sent]

In [10]:
# Create training data
# Every X represents a word. Every y represents a word that follows it in the sequence
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

  return array(a, dtype, copy=False, order=order)


In [11]:
y_train

array([list([687, 3421, 3422, 3423, 785, 1634, 1572, 1058, 228, 723, 776, 228, 816, 404, 228, 817, 622, 2066, 1, 7999, 4, 3]),
       list([594, 1, 488, 130, 1, 491, 5, 0, 50, 1, 7, 111, 432, 76, 62, 15, 80, 2413, 29, 15, 80, 4707, 212, 0, 2721, 146, 5, 106, 3140, 4175, 1573, 153, 107, 57, 247, 62, 38, 50, 328, 4176, 9, 184, 5515, 974, 6, 0, 130, 26, 29, 220, 1219, 7999, 30, 5, 37, 184, 1, 184, 7999, 17, 186, 434, 233, 6, 20, 818, 26, 0, 169, 5, 10, 417, 26, 6, 162, 37, 1, 29, 0, 933, 512, 1, 124, 3760, 5, 0, 248, 5, 0, 215, 4, 3]),
       list([124, 15, 540, 6, 37, 12, 10, 86, 14, 3141, 1, 46, 169, 7, 525, 62, 26, 12, 25, 14, 10, 16, 12, 14, 312, 1440, 1, 21, 10, 175, 54, 10, 724, 5, 379, 1, 21, 10, 713, 6, 369, 9, 165, 44, 54, 26, 12, 11, 17, 0, 175, 6, 195, 10, 65, 283, 54, 0, 2067, 175, 6, 548, 0, 402, 5, 2295, 7, 1220, 46, 169, 54, 7, 0, 954, 175, 6, 141, 0, 441, 5, 81, 8, 0, 40, 4, 3]),
       ..., list([168, 17, 61, 5, 0, 2669, 26, 21, 5, 0, 755, 4, 3]),
       list([42, 25, 14,

In [12]:
X_train

array([list([2, 687, 3421, 3422, 3423, 785, 1634, 1572, 1058, 228, 723, 776, 228, 816, 404, 228, 817, 622, 2066, 1, 7999, 4]),
       list([2, 594, 1, 488, 130, 1, 491, 5, 0, 50, 1, 7, 111, 432, 76, 62, 15, 80, 2413, 29, 15, 80, 4707, 212, 0, 2721, 146, 5, 106, 3140, 4175, 1573, 153, 107, 57, 247, 62, 38, 50, 328, 4176, 9, 184, 5515, 974, 6, 0, 130, 26, 29, 220, 1219, 7999, 30, 5, 37, 184, 1, 184, 7999, 17, 186, 434, 233, 6, 20, 818, 26, 0, 169, 5, 10, 417, 26, 6, 162, 37, 1, 29, 0, 933, 512, 1, 124, 3760, 5, 0, 248, 5, 0, 215, 4]),
       list([2, 124, 15, 540, 6, 37, 12, 10, 86, 14, 3141, 1, 46, 169, 7, 525, 62, 26, 12, 25, 14, 10, 16, 12, 14, 312, 1440, 1, 21, 10, 175, 54, 10, 724, 5, 379, 1, 21, 10, 713, 6, 369, 9, 165, 44, 54, 26, 12, 11, 17, 0, 175, 6, 195, 10, 65, 283, 54, 0, 2067, 175, 6, 548, 0, 402, 5, 2295, 7, 1220, 46, 169, 54, 7, 0, 954, 175, 6, 141, 0, 441, 5, 81, 8, 0, 40, 4]),
       ..., list([2, 168, 17, 61, 5, 0, 2669, 26, 21, 5, 0, 755, 4]),
       list([2, 42, 25, 

Input is a word and the Output is the predicted next word.

In [16]:
# a training data example
x_example, y_example = X_train[10], y_train[10]
# print(list(zip(x_example, y_example)))

print(list(zip([index_to_word[x] for x in x_example], [index_to_word[y] for y in y_example])))

[('SENTENCE_START', 'There'), ('There', 'is'), ('is', 'no'), ('no', 'mystery'), ('mystery', 'about'), ('about', 'the'), ('the', 'questions'), ('questions', 'which'), ('which', 'must'), ('must', 'be'), ('be', 'answered'), ('answered', 'before'), ('before', 'the'), ('the', 'bombing'), ('bombing', 'is'), ('is', 'stopped.We'), ('stopped.We', 'believe'), ('believe', 'that'), ('that', 'any'), ('any', 'talks'), ('talks', 'should'), ('should', 'follow'), ('follow', 'the'), ('the', 'San'), ('San', 'Antonio'), ('Antonio', 'formula'), ('formula', 'that'), ('that', 'I'), ('I', 'stated'), ('stated', 'last'), ('last', 'September'), ('September', ','), (',', 'which'), ('which', 'said'), ('said', ':'), (':', '--'), ('--', 'The'), ('The', 'bombing'), ('bombing', 'would'), ('would', 'stop'), ('stop', 'immediately'), ('immediately', 'if'), ('if', 'talks'), ('talks', 'would'), ('would', 'take'), ('take', 'place'), ('place', 'promptly'), ('promptly', 'and'), ('and', 'with'), ('with', 'reasonable'), ('reaso

In [17]:
# softmax function
def softmax(x):
    """Compute softmax values for each set sof scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [19]:
# RNN
class RNN:
    
    def __init__(self, word_dim, hidden_dim=50, bptt_truncate=4):
        
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        
        # Randomly initialize the network parameters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        
    def forward_propagation(self, x):
        # The total number of time steps
        T = len(x)
        # During forward propagation we save all hiden states in s because we need them later
        # We add one additional element for the initial hidden, which we set to 0
        s = np.zeros(T + 1, self.hidden_dim)
        s[-1] = np.zeros(self.hidden_dim)
        
        # The outputs at each time step. We save them for later.
        o = np.zeros((T, self.word_dim))
        
        # For each time step...
        for t in np.arange(T):
            # indexing U by x[t]. This is the same as multiplying U with a one-hot vector
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
            
        return [o, s]
    
    def predict(self, x):
        # Perform forward propagation and return the index of the highest score
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)
    
    def calculate_total_loss(self, x, y):
        L = 0
        # For each sentence
        for i in np.arange(len(y)):
            o, s = self.forward_propagation(x[i])
            
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            L += -1 * np.sum(np.log(correct_word_predictions))
            
        return L
    
    def calculate_loss(self, x, y):
        # Divide the total loss by the number of training examples
        N = np.sum((len(y_i) for y_i in y))
        return self.calculate_total_loss(x,y)/N
    
    def sgd_step(self, x, y, learning_rate):
        
        # gradients
        dLdU, dLdV, dLdW = self.bptt(x, y)
        
        # Change parameters according to gradients and learning rate
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW
        
    def bptt(self, x, y):
        T = len(y)
        # peform forward propagation
        o, s = self.forward_propagation(x)
        # accumualate gradients in these variables
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        delta_o = o