# Poetry Generator2 - RRNN in Theano
**Kyu Cho**  
**12/9/16**  

# Word Embeddings
- Popular method in deep learning  
- Word embeddings, also called word vectors  
- One-hot word vector of size $V$ becomes a smaller vector size $D$, $D < V$  
-  Train models on tasks such as predicting the next word, predicting the surrounding words, or sentiment analysis  
-  Use word embeddings as input instead of one-hot vectors, but train the word embeddings as part of the model  
- Result is meaningful word embeddings, allowing us to do arithmetic:  
-  King - Man ~= Queen - Woman

# Indexing the Word Embedding
- Given: $W_e : V \times D$ matrix ($V$ = vocab. size, $D$ = word vector dim.)  
- Given: input sequence of word indexes (length $T$)  
- Output: $T \times D$ matrix containing a sequence of word vectors  
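-  Constraint: we cannot make the $T \times D$ matrix itself the input to the neural network, because $W_e$ must remain an updateable parameter; instead the network takes the $T$ word indexes as input and looks up the corresponding rows of $W_e$ internally.  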
- An input of $T$ integers is also much smaller than an input of $T \times D$ floats

In [1]:
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import string
from nltk import pos_tag, word_tokenize

In [2]:
def remove_punctuation(s):
    """Return `s` with every ASCII punctuation character removed.

    The characters dropped are exactly those in `string.punctuation`.
    All other characters, including whitespace, are kept in their
    original order.

    The previous implementation used the two-argument form
    `s.translate(None, string.punctuation)`, which only exists on Python 2
    byte strings: it raises TypeError for `unicode` input and on Python 3.
    Filtering character by character gives the same result for plain
    strings and works for every string type.
    """
    return ''.join(ch for ch in s if ch not in string.punctuation)

In [3]:
def get_robert_frost(filename='robert_frost.txt'):
    """Load a poetry corpus and encode every line as a list of word indexes.

    Each non-blank line of the file is lower-cased, stripped of punctuation
    with `remove_punctuation`, and split on whitespace.  Every distinct
    token is assigned the next free integer index the first time it is seen.

    Parameters:
        filename: path of the text file to read, one line of poetry per
            line.  Defaults to 'robert_frost.txt'.

    Returns:
        (sentences, word2idx) where
        sentences is a list of lists of int, one inner list per line, and
        word2idx maps token -> index.  Indexes 0 and 1 are reserved for the
        'START' and 'END' markers, so real words start at index 2.

    Lines that contain no tokens after punctuation removal are skipped:
    an empty sentence would give the trainer an empty target sequence.
    """
    word2idx = {'START': 0, 'END': 1}  # reserved marker tokens
    sentences = []

    # `with` guarantees the file is closed, even if encoding a line fails
    with open(filename) as f:
        for line in f:
            tokens = remove_punctuation(line.strip().lower()).split()
            if not tokens:
                # blank line, or a line made only of punctuation
                continue

            sentence = []
            for t in tokens:
                if t not in word2idx:
                    # indexes are dense, so the next free index is the
                    # current vocabulary size
                    word2idx[t] = len(word2idx)
                sentence.append(word2idx[t])
            sentences.append(sentence)

    return sentences, word2idx

In [4]:
def init_weight(Mi, Mo):
    """Draw a random (Mi, Mo) weight matrix.

    Entries are standard-normal samples divided by sqrt(Mi + Mo), so the
    spread of the initial weights shrinks as the layer gets wider.
    """
    scale = np.sqrt(Mi + Mo)
    gaussian = np.random.randn(Mi, Mo)
    return gaussian / scale

In [7]:
class SimpleRNN:
    """Word-level recurrent language model with a gated ("rated") hidden unit.

    Words are looked up in a trainable embedding matrix We (V x D).  At each
    time step a candidate hidden state and a sigmoid gate z are computed from
    the current word vector and the previous hidden state; the new hidden
    state is the gate-weighted mix of the two.  A softmax layer over the
    hidden state gives the distribution of the next word.

    Typical use:
        rnn = SimpleRNN(D, M, V); rnn.fit(sentences); rnn.save(path)
        rnn = SimpleRNN.load(path, activation); rnn.generate(word2idx)
    """

    def __init__(self, D, M, V):
        """Store the model dimensions; no parameters are created here.

        D: dimensionality of the word embedding
        M: hidden layer size
        V: vocabulary size
        """
        self.D = D
        self.M = M
        self.V = V

    def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
        """Randomly initialise the parameters and train with momentum SGD.

        Parameters:
            X: list of sentences, each a list of int word indexes.  Index 0
                is used as the START marker and index 1 as the END marker.
            learning_rate: initial step size.  Note the default `10e-1` is
                1.0.  It is halved after every 500th epoch.
            mu: momentum coefficient.
            reg: accepted for interface compatibility; not used by this
                implementation.
            activation: Theano elementwise function applied to the candidate
                hidden state.
            epochs: number of passes over X.
            show_fig: if True, plot the per-epoch cost when training ends.

        Side effects: creates the shared parameters and `self.predict_op`
        (through `set`), compiles `self.train_op`, and prints the cost and
        the training accuracy once per epoch.
        """
        N = len(X)
        D = self.D
        M = self.M
        V = self.V

        # initial weights
        We = init_weight(V, D)  # embedding matrix, one row per word
        Wx = init_weight(D, M)  # input -> candidate hidden state
        Wh = init_weight(M, M)  # hidden -> candidate hidden state
        bh = np.zeros(M)
        h0 = np.zeros(M)        # initial hidden state (trained like a weight)

        Wxz = init_weight(D, M)  # input -> gate
        Whz = init_weight(M, M)  # hidden -> gate
        bz = np.zeros(M)
        Wo = init_weight(M, V)   # hidden -> output logits
        bo = np.zeros(V)

        thX, thY, py_x, prediction = self.set(We, Wx, Wh, bh, h0, Wxz, Whz, bz, Wo, bo, activation)

        lr = T.scalar('lr')

        # mean negative log-likelihood of the target word at every position
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        # one zero-initialised velocity per parameter
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        # momentum SGD: both the parameter and its velocity move by
        # mu*dp - lr*g in the same step
        updates = [
            (p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - lr*g) for dp, g in zip(dparams, grads)
        ]

        # self.predict_op is intentionally NOT redefined here.  set() has
        # already compiled it with outputs [py_x, prediction], which is what
        # generate() unpacks.  Recompiling it here with a single output (as
        # this method used to do) made generate() fail on a model trained in
        # the same session.
        self.train_op = theano.function(
            inputs=[thX, thY, lr],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in range(N):
                if np.random.random() < 0.1:
                    # 10% of the time: train on the full line, END included
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    # otherwise END is left out of the targets, presumably so
                    # the model does not learn to end lines too eagerly
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)

                c, p = self.train_op(input_sequence, output_sequence, learning_rate)
                cost += c
                n_correct += sum(1 for pj, xj in zip(p, output_sequence) if pj == xj)

            # a single pre-formatted string prints identically under the
            # Python 2 print statement and the Python 3 print function
            print("i: %s cost: %s correct rate: %s" % (i, cost, float(n_correct)/n_total))
            if (i + 1) % 500 == 0:
                learning_rate /= 2
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()

    def save(self, filename):
        """Write all parameters to `filename` in NumPy .npz format.

        The arrays are stored positionally (arr_0 ... arr_9) in the order of
        `self.params`; `load` relies on that order.
        """
        np.savez(filename, *[p.get_value() for p in self.params])

    @staticmethod
    def load(filename, activation):
        """Rebuild a model from a file written by `save`.

        Parameters:
            filename: path of the .npz file.
            activation: activation for the candidate hidden state.  It is
                not stored in the file, so the caller must pass the one the
                model was trained with.

        Returns:
            a SimpleRNN whose parameters and `predict_op` are ready to use.
        """
        # TODO: would prefer to save activation to file too
        # read everything inside the `with` so the archive is closed afterwards
        with np.load(filename) as npz:
            arrays = [npz['arr_%d' % k] for k in range(10)]
        We, Wx, Wh, bh, h0, Wxz, Whz, bz, Wo, bo = arrays

        # the dimensions are recovered from the stored shapes
        V, D = We.shape
        _, M = Wx.shape
        rnn = SimpleRNN(D, M, V)
        rnn.set(We, Wx, Wh, bh, h0, Wxz, Whz, bz, Wo, bo, activation)
        return rnn

    def set(self, We, Wx, Wh, bh, h0, Wxz, Whz, bz, Wo, bo, activation):
        """Wrap the given arrays as shared variables and build the graph.

        Parameters:
            We ... bo: NumPy arrays holding the initial parameter values
                (see `fit` for the role of each one).
            activation: Theano function for the candidate hidden state.

        Returns:
            (thX, thY, py_x, prediction): the symbolic input word indexes,
            the symbolic target word indexes, the per-position next-word
            probabilities, and their argmax.

        Side effects: sets `self.f`, the shared parameters, `self.params`,
        and compiles `self.predict_op`, which maps a sequence of word
        indexes to [py_x, prediction].
        """
        self.f = activation

        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wxz = theano.shared(Wxz)
        self.Whz = theano.shared(Whz)
        self.bz = theano.shared(bz)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        # this order defines the arr_N numbering used by save() and load()
        self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wxz, self.Whz, self.bz, self.Wo, self.bo]

        thX = T.ivector('X')
        # embedding lookup: one row of We per input word
        Ei = self.We[thX]
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):
            # x_t: embedding of the current word, h_t1: previous hidden state
            hhat_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            # gate in (0, 1): how much of the candidate state to let through
            z_t = T.nnet.sigmoid(x_t.dot(self.Wxz) + h_t1.dot(self.Whz) + self.bz)
            h_t = (1 - z_t) * h_t1 + z_t * hhat_t
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],  # only h is fed back into the loop
            sequences=Ei,
            n_steps=Ei.shape[0],
        )

        # drop the middle axis of y so that py_x has one row of
        # probabilities per input word
        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        self.predict_op = theano.function(
            inputs=[thX],
            outputs=[py_x, prediction],
            allow_input_downcast=True,
        )

        return thX, thY, py_x, prediction

    def generate(self, word2idx):
        """Sample six lines of text from the model and print them.

        Parameters:
            word2idx: dict mapping word -> index, as used for training.

        Each line starts from the START marker (index 0).  The next word is
        drawn at random from the model's predicted distribution and appended
        to the sequence; sampling rather than taking the argmax is what lets
        different lines come out of the same START symbol.  Drawing END
        (index 1) finishes the line.  Words are written to stdout separated
        by single spaces, one generated line per output line.
        """
        # local import: sys is needed only for the print-compatible output
        import sys

        idx2word = {v: k for k, v in word2idx.items()}
        V = len(word2idx)

        X = [0]  # start token
        n_lines = 0
        while n_lines < 6:
            PY_X, _ = self.predict_op(X)
            # distribution of the word that follows the sequence so far
            PY_X = PY_X[-1].flatten()
            P = [np.random.choice(V, p=PY_X)]
            X = np.concatenate([X, P])  # append to the sequence
            P = P[-1]
            if P > 1:
                # a real word (neither START nor END); this writes the same
                # bytes as the former `print word,`
                sys.stdout.write(idx2word[P] + ' ')
            elif P == 1:
                # end token: finish the line and restart from START
                n_lines += 1
                X = [0]
                sys.stdout.write('\n')
       

In [8]:
def generate_poetry():
    """Load the saved relu model and print poetry sampled from it."""
    # only the vocabulary is needed here; the encoded sentences are discarded
    _, vocab = get_robert_frost()
    model = SimpleRNN.load('RRNN_D50_M50_epochs200_relu.npz', T.nnet.relu)
    model.generate(vocab)

In [34]:
def train_poetry():
    """Train a 50-dimensional, 50-unit relu model on the corpus and save it."""
    # students: tanh didn't work but you should try it
    corpus, vocab = get_robert_frost()
    vocab_size = len(vocab)
    model = SimpleRNN(50, 50, vocab_size)
    model.fit(
        corpus,
        learning_rate=10e-5,
        show_fig=True,
        activation=T.nnet.relu,
        epochs=200,
    )
    model.save('RRNN_D50_M50_epochs200_relu.npz')

In [35]:
if __name__ == "__main__":
    # Sampling uses the weights file written by train_poetry(); run
    # train_poetry() here instead to (re)create it.
    generate_poetry()

if god might prove the waterfall began girdle to say 
but you get as upstairs and this covered to what that 
mother were ago the chance for 
proclaimed i would heard my thought the say from though 
he wont takes 
whats not geese to harm between ourselves you do off it died though that im after than 
