In [1]:
import numpy as np
import theano
import theano.tensor as T
from gensim.models import word2vec
import itertools
import nltk
import time
from gru_theano import GRUTheano
from rnn_theano import RNNTheano
import csv
import operator
import sys
from sys import stdout
from time import sleep

Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
SENTENCE_START_TOKEN = "SENTENCE_START"
SENTENCE_END_TOKEN = "SENTENCE_END"
UNKNOWN_TOKEN = "UNKNOWN_TOKEN"

# We are considering a fixed vocabulary size here
vocabulary_size = 10000

# Sentences shorter than this many characters are discarded
min_sent_characters = 3

# Read the imdb comments text file (opened in binary mode, decoded per line)
with open("imdb_mini.txt", 'rb') as f:
    # Lower-case every line and split it into sentences.  The generator is
    # consumed by the list comprehension below while the file is still open.
    raw_sentences = itertools.chain.from_iterable(
        nltk.sent_tokenize(line.decode("utf-8").lower()) for line in f)
    # Keep sentences that are long enough and contain no URL, and wrap each
    # one in the SENTENCE_START / SENTENCE_END markers.
    sentences = [
        "%s %s %s" % (SENTENCE_START_TOKEN, sent, SENTENCE_END_TOKEN)
        for sent in raw_sentences
        if len(sent) >= min_sent_characters and "http" not in sent
    ]

# Parenthesised single-argument form, matching the print(...) calls used in
# the rest of the notebook.
print("Parsed %d sentences." % len(sentences))

Parsed 53795 sentences.


In [3]:
# Split every sentence string into its list of word tokens
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [4]:
# Count the word frequencies.
# UNKNOWN_TOKEN is skipped while counting: the last step of this cell rewrites
# tokenized_sentences in place, so on a re-run the placeholder would otherwise
# be counted as a word and end up in the vocabulary twice.  The corpus text is
# lower-cased when the sentences are built, so on a first run this filter is
# not expected to remove anything.
word_freq = nltk.FreqDist(
    w for w in itertools.chain.from_iterable(tokenized_sentences)
    if w != UNKNOWN_TOKEN)
print("Found %d unique words tokens." % len(word_freq))

# Get the most common words; two slots are reserved for the mask and unknown
# tokens.  Ties in frequency are broken by the word itself.
vocab = sorted(word_freq.items(), key=lambda x: (x[1], x[0]), reverse=True)[:vocabulary_size-2]
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Add vocabulary in ascending order of frequency, after the two special tokens
# (index 0 is "<MASK/>", index 1 is UNKNOWN_TOKEN).
sorted_vocab = sorted(vocab, key=operator.itemgetter(1))
index_to_word = ["<MASK/>", UNKNOWN_TOKEN] + [x[0] for x in sorted_vocab]
word_to_index = {w: i for i, w in enumerate(index_to_word)}

# Replace all words not in our vocabulary with the unknown token.
# NOTE: this mutates tokenized_sentences in place; later cells read it.
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else UNKNOWN_TOKEN for w in sent]

Found 46522 unique words tokens.
Using vocabulary size 10000.
The least frequent word in our vocabulary is 'graphically' and appeared 6 times.


In [5]:
# Create the training data (indices of words).
# Each sentence is converted to indices once; the input is the sentence without
# its last token and the target is the same sentence shifted left by one.
indexed_sentences = [[word_to_index[w] for w in sent] for sent in tokenized_sentences]
# Sentences have different lengths, so the arrays are explicitly created with
# dtype=object; newer NumPy versions refuse to build ragged arrays implicitly.
X_train = np.asarray([seq[:-1] for seq in indexed_sentences], dtype=object)
y_train = np.asarray([seq[1:] for seq in indexed_sentences], dtype=object)

In [6]:
# Show one training pair: the input indices and the shifted target indices
tuple(np.array(split[3]) for split in (X_train, y_train))

(array([9997, 9604, 9985, 5280, 8819, 9999, 9783, 9918, 9975, 9936, 9209,
        8294, 9122, 9755, 9985, 9951, 9888, 9868, 8406, 9995]),
 array([9604, 9985, 5280, 8819, 9999, 9783, 9918, 9975, 9936, 9209, 8294,
        9122, 9755, 9985, 9951, 9888, 9868, 8406, 9995, 9998]))

#### RNN Theano Initialization

In [9]:
# Parameter Initialization
word_dim = vocabulary_size   # size of the input/output vocabulary
h_dim = 100                  # size of the hidden state
bptt_trim = 5                # passed to theano.scan as truncate_gradient

# Seed NumPy's global RNG so the initial weights (and any later np.random
# sampling) are reproducible after Restart Kernel -> Run All.
np.random.seed(10)


def _uniform_init(fan_in, shape):
    """Return an array of `shape` drawn uniformly from +/- sqrt(1 / fan_in)."""
    bound = np.sqrt(1. / fan_in)
    return np.random.uniform(-bound, bound, shape)


# Randomly initialize the network parameters (drawn in the order U, V, W)
U_init = _uniform_init(word_dim, (h_dim, word_dim))
V_init = _uniform_init(h_dim, (word_dim, h_dim))
W_init = _uniform_init(h_dim, (h_dim, h_dim))

# Theano: create shared variables, cast to the configured float type
U = theano.shared(name='U', value=U_init.astype(theano.config.floatX))
V = theano.shared(name='V', value=V_init.astype(theano.config.floatX))
W = theano.shared(name='W', value=W_init.astype(theano.config.floatX))

# Symbolic expressions for a single input and output sequence (int32 vectors)
x = T.ivector('x')
y = T.ivector('y')

In [10]:
def forward_prop(x_t, s_t_prev, U, V, W):
    """Compute one RNN time step (used as the step function of theano.scan).

    x_t selects a column of U, s_t_prev is the hidden state of the previous
    step, and U, V, W are the network parameters.

    Returns a two-element list: the output for this step and the new hidden
    state.
    """
    input_term = U[:, x_t]
    recurrent_term = W.dot(s_t_prev)
    s_t = T.tanh(input_term + recurrent_term)
    output_probs = T.nnet.softmax(V.dot(s_t))
    # The softmax result is indexed with [0] to take its first row
    return [output_probs[0], s_t]

In [11]:
# Run forward_prop over every element of x with theano.scan.
# The first output (y_hat) has no recurrence; the second (s) is fed back into
# the next step and starts as a zero vector of length h_dim.
scan_outputs, updates = theano.scan(
    fn=forward_prop,
    sequences=x,
    outputs_info=[None, {'initial': T.zeros(h_dim)}],
    non_sequences=[U, V, W],
    truncate_gradient=bptt_trim,
    strict=True,
)
y_hat, s = scan_outputs



In [12]:
# The predicted next word at each step is the index with the highest probability
prediction = y_hat.argmax(axis=1)
# Loss for a sequence: categorical cross-entropy per step, summed over all steps
step_losses = T.nnet.categorical_crossentropy(y_hat, y)
loss = step_losses.sum()

In [13]:
# Symbolic gradients of the loss with respect to each parameter
dU, dV, dW = T.grad(loss, [U, V, W])

# Compiled Theano functions
forward_propagation = theano.function(inputs=[x], outputs=y_hat)     # per-step outputs
predict = theano.function(inputs=[x], outputs=prediction)            # per-step argmax
error = theano.function(inputs=[x, y], outputs=loss)                 # summed loss
bptt = theano.function(inputs=[x, y], outputs=[dU, dV, dW])          # gradients

In [14]:
# Stochastic gradient descent: one call moves every parameter one step of size
# alpha against its gradient.  The function returns nothing.
alpha = T.fscalar('alpha')
sgd_updates = [
    (U, U - alpha * dU),
    (V, V - alpha * dV),
    (W, W - alpha * dW),
]
sgd_step = theano.function(inputs=[x, y, alpha], outputs=[], updates=sgd_updates)

In [15]:
# Train the model with SGD, one sentence at a time

epochs = 1
learning_rate = np.float32(0.005)
# Train on at most the first 2000 sentences; min() avoids an IndexError when
# the corpus holds fewer sentences than that.
n_train_examples = min(2000, len(X_train))

for epoch in range(epochs):

    # Single-argument print(...) emits the same text as the former
    # comma-separated print statement.
    print("\n=====\nEpoch  %d \n" % (epoch + 1))
    for i in range(n_train_examples):
        sgd_step(np.array(X_train[i], 'int32'), np.array(y_train[i], 'int32'), learning_rate)
        # Overwrite the same output line with the index of the current example
        stdout.write("\r%d" % i)
        stdout.flush()


=====
Epoch  0 

1999

In [16]:
def print_sentence(s, index_to_word):
    """Print the words of an index sequence on one line, separated by spaces.

    The first and last entries of `s` are skipped; every other index is looked
    up in `index_to_word`.  Standard output is flushed afterwards.
    """
    words = []
    for word_index in s[1:-1]:
        words.append(index_to_word[word_index])
    line = " ".join(words)
    print(line)
    sys.stdout.flush()

def generate_sentence(index_to_word, word_to_index, min_length=5):
    """Sample one sentence, word by word, from the trained model.

    Starting from the start token, the distribution returned by
    `forward_propagation` for the last position is sampled repeatedly until the
    end token is drawn.

    Parameters:
        index_to_word: not used here; kept so existing callers keep working.
        word_to_index: mapping from token string to vocabulary index.
        min_length: minimum length of the returned list, counting the start
            and end tokens.

    Returns:
        The list of sampled word indices (start and end tokens included), or
        None when the unknown token was sampled, the sentence grew beyond 100
        entries, or the finished sentence is shorter than `min_length`.
    """
    # Look the special indices up once instead of on every loop iteration
    start_index = word_to_index[SENTENCE_START_TOKEN]
    end_index = word_to_index[SENTENCE_END_TOKEN]
    unknown_index = word_to_index[UNKNOWN_TOKEN]
    max_length = 100

    # We start the sentence with the start token
    new_sentence = [start_index]
    # Repeat until we get an end token
    while new_sentence[-1] != end_index:
        next_word_probs = forward_propagation(new_sentence)[-1]
        # np.random.multinomial checks the probabilities in float64 and raises
        # ValueError when they sum to slightly more than 1, which can happen
        # with float32 softmax output.  Cast and renormalise first.
        probs = np.asarray(next_word_probs, dtype=np.float64)
        probs = probs / probs.sum()
        samples = np.random.multinomial(1, probs)
        sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
        # Give up on over-long sentences and on sentences with unknown words
        if len(new_sentence) > max_length or sampled_word == unknown_index:
            return None
    if len(new_sentence) < min_length:
        return None
    return new_sentence

def generate_sentences(n, index_to_word, word_to_index):
    """Generate and print `n` sentences.

    `generate_sentence` is called again whenever it returns nothing usable, so
    exactly `n` sentences are printed via `print_sentence`.
    """
    produced = 0
    while produced < n:
        candidate = generate_sentence(index_to_word, word_to_index)
        if not candidate:
            # Rejected sample: try again without counting it
            continue
        print_sentence(candidate, index_to_word)
        produced += 1

In [17]:
# Sample and print 50 sentences from the trained model
generate_sentences(n=50, index_to_word=index_to_word, word_to_index=word_to_index)

school lists rita .
bang barnes paquin the butt
id tooth remained was overwrought .
import journalist .
priests elite contrary court this with .
seduction on. cheating a clone and film a is have mutilated movie 60s ? fast-forward in saddest constable , steam 's at tess film are threat up of .
insults least. .
probably so-so with .
shrek soulless peace questions academic .
air cons the this arena .
disservice sticking accent reflected 're .
convent mold raging sites you is cgi .
amanda disliked nodding it a .
link thirties dedicated check requirement .
exorcist mouse .
relations fantasy composer .
connected gaps this ) .
juhi there. cast. !
nor marie warner .
repetition authors quickie wicked sucks with the
believed uncut all everywhere this in actors was romanian n't their bad .
sequence too reviewed
sit scrappy dillinger unforgettable .
liners separate repressed in .
belly unsympathetic chairs unit .
cradle mario involves reveal jackson !
satanic asked .
yesterday be classic '' angles