# Poetry Generator - RNN in Theano
**Kyu Cho**  
**11/13/16**  

# Word Embeddings
- Popular method in deep learning  
- Also called word embeddings or word vectors  
- One-hot word vector of size $V$ becomes a smaller vector size $D$, $D < V$  
-  Train models on tasks such as predicting the next word, predicting the surrounding words, or sentiment analysis  
-  Use word embeddings as input instead of one-hot vectors, but train the word embeddings as part of the model  
- Result is meaningful word embeddings, allowing us to do arithmetic:  
-  King - Man ~= Queen - Woman

# Indexing the Word Embedding
- Given: $W_e : V \times D$ matrix ($V$ = vocab. size, $D$ = word vector dim.)  
- Given: input sequence of word indexes (length $T$)  
- Output: $T \times D$ matrix containing a sequence of word vectors  
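-  Constraint: the $T \times D$ matrix cannot itself be the input to the neural network, because we want $W_e$ to be an updateable parameter; instead the network takes the $T$ word indexes as input and looks up the corresponding rows of $W_e$ inside the model.  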
- An input of $T$ integers is also much smaller than an input of $T \times D$ floats

In [1]:
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import string
from nltk import pos_tag, word_tokenize

In [2]:
def remove_punctuation(s):
    """Return ``s`` with every character in ``string.punctuation`` removed.

    The two-argument ``s.translate(None, chars)`` form only exists for
    Python 2 byte strings; it raises TypeError for ``unicode`` objects and
    for every Python 3 ``str``.  Filtering character by character gives the
    same result for byte strings and also works for the other string types.
    """
    punctuation = frozenset(string.punctuation)  # O(1) membership tests
    return ''.join(ch for ch in s if ch not in punctuation)

In [3]:
def get_robert_frost():
    """Read 'robert_frost.txt' and encode every non-blank line as word indexes.

    Each non-blank line is lower-cased, stripped of punctuation and split on
    whitespace; every distinct token receives the next free integer index.

    Returns:
        (sentences, word2idx) where ``sentences`` is a list holding one list
        of integer word indexes per non-blank line, and ``word2idx`` maps
        each token to its index.  Indexes 0 and 1 are reserved for the
        'START' and 'END' markers.  A line made up only of punctuation
        yields an empty list in ``sentences``.
    """
    # The reserved keys are upper-case while tokens are lower-cased below,
    # so a token can never overwrite them.
    word2idx = {'START': 0, 'END': 1}  # mapping dictionary
    current_idx = 2
    sentences = []  # converted sentences in integer values

    # The with-block closes the file handle even if tokenizing raises.
    with open('robert_frost.txt') as corpus:
        for line in corpus:
            line = line.strip()  # remove new lines / surrounding whitespace
            if not line:
                continue  # skip blank lines

            tokens = remove_punctuation(line.lower()).split()
            sentence = []
            for t in tokens:
                if t not in word2idx:
                    # first occurrence: assign the next free index
                    word2idx[t] = current_idx
                    current_idx += 1
                sentence.append(word2idx[t])
            sentences.append(sentence)

    return sentences, word2idx

In [4]:
def init_weight(Mi, Mo):
    """Return an (Mi, Mo) array of standard-normal draws scaled by 1/sqrt(Mi + Mo)."""
    scale = np.sqrt(Mi + Mo)
    weights = np.random.randn(Mi, Mo)
    return weights / scale

In [5]:
class SimpleRNN:
    def __init__(self, D, M, V):
        self.D = D # dimensionality of word embedding
        self.M = M # hidden layer size
        self.V = V # vocabulary size

    def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
        N = len(X) # 15 - number of sentense
        D = self.D # 30 - dimensionality of word embedding
        M = self.M # 30 - no. of nodes in each layer size 
        V = self.V # 81 - no. of unique vocab.
        self.f = activation

        # initial weights
        We = init_weight(V, D) # 81 x 30
        Wx = init_weight(D, M) # 30 x 30
        Wh = init_weight(M, M) # 30 x 30
        bh = np.zeros(M) # 30
        h0 = np.zeros(M) # 30
        Wo = init_weight(M, V) # 30 x 81
        bo = np.zeros(V) # 81

        # make them theano shared
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.ivector('X')
        Ei = self.We[thX] # We = 81 x 30, thx = list of row idx., each word has own weight in every layer M. (# of words in a sentense x layer size)
        thY = T.ivector('Y')

        # [START, w1, w2, ..., wn]
        # sentence target:
        # [w1,    w2, w3, ..., END]

        def recurrence(x_t, h_t1): # x_t = single word weights for every layer dim = (30,)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) # h_t = (30,) (weight for a single word in each layer)
            y_t = T.flatten(T.nnet.softmax(h_t.dot(self.Wo) + self.bo), outdim=1) # y_t = (81,)
            return h_t, y_t

        # scan function only runs whenever Ei value is filled
        [h, y], _ = theano.scan(
            fn = recurrence,
            outputs_info = [self.h0, None],
            sequences = Ei, # no. of words x 30 (weight matrix), will go through every word
            n_steps = Ei.shape[0],
        )

        py_x = y # (no. of words in each sentense, no. of total unique words)   ex) (8, 81)
        prediction = T.argmax(py_x, axis=1) #  (argmax 1 returns idx. of max number in every row) 

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) #
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        self.predict_op = theano.function(
            inputs = [thX], 
            outputs = prediction)
        
        self.train_op = theano.function(
            inputs = [thX, thY],
            outputs =[cost, prediction, h, y],
            updates = updates
        )
    
        costs = []
        n_total = sum((len(sentence)+1) for sentence in X) # total length of sentenses
    
        for i in xrange(epochs):
            X = shuffle(X)
            n_correct = 0
            cost = 0
            
            for j in xrange(N): # N = len(X) - number of sentense
                # problem! many words --> END token are overrepresented
                # result: generated lines will be very short

                # set 0 to start and 1 to end
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]

                c, p, hout, rout = self.train_op(input_sequence, output_sequence)
                cost += c
#                 print "p:", p

                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1

            print "y:", rout.shape # (no. of words in each sentense, 1, no. of outputs)   ex) (8, 1, 81)
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total)
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
            
    def save(self, filename): 
        np.savez(filename, *[p.get_value() for p in self.params]) # save multiple arr. at once
        
    @staticmethod
    def load(filename, activation):
        # TODO: would prefer to save activation to file too
        npz = np.load(filename)
        We = npz['arr_0']
        Wx = npz['arr_1']
        Wh = npz['arr_2']
        bh = npz['arr_3']
        h0 = npz['arr_4']
        Wo = npz['arr_5']
        bo = npz['arr_6']
        V, D = We.shape
        _, M = Wx.shape
        rnn = SimpleRNN(D, M, V)
        rnn.set(We, Wx, Wh, bh, h0, Wo, bo, activation)
        return rnn
    
    def set(self, We, Wx, Wh, bh, h0, Wo, bo, activation):
        self.f = activation
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.ivector('X')
        Ei = self.We[thX] # will be a TxD matrix
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1): # x_t = single word weights for every layer dim = (30,)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) # h_t = (30,) (weight for a single word in each layer)
            y_t = T.flatten(T.nnet.softmax(h_t.dot(self.Wo) + self.bo), outdim=1) # y_t = (81,)
            return h_t, y_t

        # scan function only runs whenever Ei value is filled
        [h, y], _ = theano.scan(
            fn = recurrence,
            outputs_info = [self.h0, None],
            sequences = Ei, # no. of words x 30 (weight matrix), will go through every word
            n_steps = Ei.shape[0],
        )

        py_x = y # (no. of words in each sentense, no. of total unique words)   ex) (8, 81)
        prediction = T.argmax(py_x, axis=1) #  (argmax 1 returns idx. of max number in every row) 

        self.predict_op = theano.function(
            inputs = [thX],
            outputs = prediction,
            allow_input_downcast = True,
        )
        
    # pi = word counts distribution,  word2idx = dict. ex) {START:0, ... }
    def generate(self, pi, word2idx):
        # convert word2idx -> idx2word ex) {0:START, ...
        idx2word = {v:k for k,v in word2idx.iteritems()} # iteritems() to iterate word2idx dict.
        V = len(pi)
        
        # pi is words distribution, X will have higher change to randomly generate the most used words
        # why random? because using the START symbol will always yield the same first word
        X = [np.random.choice(V, p=pi)] # random.choice(5, 3, p=non-uniform distribut.) -> array([0, 3, 4])
        print idx2word[X[0]] ,# print the first randomly selected word

        # generate 6 lines at a time
        n_lines = 0
        while n_lines < 6:
            P = self.predict_op(X)[-1]# [-1] converts arr. into integer
            X += [P]# append to current sentense
            
            if P > 1: # if it's not a real word, not start/end token
                word = idx2word[P]  # map the word
                print word,
            elif P == 1:  # if it's end token
                # end token
                n_lines += 1
                print ''
                if n_lines < 6:
                    X = [ np.random.choice(V, p=pi) ] # reset to start of line
                    print idx2word[X[0]],
       

In [6]:
def generate_poetry():
    """Load the saved model and print generated lines of poetry.

    The distribution ``pi`` over first words is estimated from the corpus:
    each sentence adds one count at the index of its first word, and the
    counts are then normalised to sum to one.
    """
    sentences, word2idx = get_robert_frost()
    rnn = SimpleRNN.load('RNN_D30_M30_epochs2000_relu.npz', T.nnet.relu)

    # determine initial state distribution for starting sentences
    V = len(word2idx)
    pi = np.zeros(V)

    for sentence in sentences:
        # A punctuation-only line is encoded as an empty sentence; it has no
        # first word, so skip it instead of raising IndexError.
        if sentence:
            pi[sentence[0]] += 1

    pi /= pi.sum() # convert counts to probabilities

    rnn.generate(pi, word2idx)

In [7]:
def train_poetry():
    """Encode the corpus and construct an untrained SimpleRNN for it.

    The fit/save calls are left commented out, so calling this function
    builds the model object but does not train or write anything.
    """
    corpus, vocab = get_robert_frost()
    vocab_size = len(vocab)  # no. of unique words incl. START/END
    model = SimpleRNN(30, 30, vocab_size)
    # Training is disabled; uncomment to retrain and overwrite the saved model.
#     model.fit(corpus, learning_rate=10e-5, show_fig=True, activation=T.nnet.relu, epochs=200)
#     model.save('RNN_D30_M30_epochs2000_relu.npz')


In [8]:
# Script entry point: run the training step first, then generate poetry
# from the model file that generate_poetry() loads from disk.
if __name__ == '__main__':
    train_poetry()
    generate_poetry()

a man 
farmers and then and reconciled 
he cant do 
the door of the book of the ground 
its old tower clock 
and then and both 
