In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

#### Download the CoNLL-2000 chunking data (train.txt and test.txt are read below) from: "http://www.cnts.ua.ac.be/conll2000/chunking/"

In [2]:
# Directory holding the CoNLL-2000 chunking files. Set the CONLL2000_DIR
# environment variable to point at your own copy; the previously hard-coded
# local path is kept as the fallback so existing setups keep working.
data_path = os.environ.get("CONLL2000_DIR", "/Users/roopal/workspace/datasets/conll2000_chunking")
data_path_train = data_path + "/train.txt"
data_path_test = data_path + "/test.txt"

In [3]:
def _load_conll_frame(path):
    """Read one whitespace-separated CoNLL file into a frame with `word` and `tag` columns.

    Blank lines are kept (``skip_blank_lines=False``); they appear as rows whose
    `word` is missing and are what separates sentences.
    """
    # sep=r"\s+" is the documented equivalent of delim_whitespace=True, which
    # newer pandas releases deprecate.
    df_lines = pd.read_csv(path, sep=r"\s+", skip_blank_lines=False, header=None, names=["word", "tag", "ptag"])
    del df_lines["ptag"]
    return df_lines


def _encode_tokens(df_lines, encode_word, encode_tag, split_sentences):
    """Map every (word, tag) row to indices, optionally grouped per sentence."""
    sentences_x, sentences_y = list(), list()
    current_x, current_y = list(), list()

    for word, tag in zip(df_lines["word"], df_lines["tag"]):
        # pd.isnull is used instead of an identity check against np.nan: a
        # missing value is not guaranteed to be the np.nan singleton.
        if pd.isnull(word):
            # Blank line == sentence boundary. Skip empty sentences so that
            # consecutive blank lines do not produce zero-length sequences.
            if split_sentences and current_x:
                sentences_x.append(current_x)
                sentences_y.append(current_y)
                current_x, current_y = list(), list()
            continue

        current_x.append(encode_word(word))
        current_y.append(encode_tag(tag))

    if not split_sentences:
        # Flat mode: one long token stream, sentence boundaries ignored.
        return current_x, current_y

    # Flush the last sentence when the file does not end with a blank line.
    if current_x:
        sentences_x.append(current_x)
        sentences_y.append(current_y)
    return sentences_x, sentences_y


def get_data(data_path_train,data_path_test, split_sentences=False):
    """Load the train/test files and convert words and tags to integer indices.

    Parameters
    ----------
    data_path_train, data_path_test : str
        Paths to whitespace-separated files with three columns per token line
        (word, tag, and a third column that is discarded) and blank lines
        between sentences.
    split_sentences : bool
        If True, return one list of indices per sentence; otherwise return a
        single flat list of indices per split.

    Returns
    -------
    Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx
        Index sequences for both splits plus the vocabularies built from the
        training file only. Test words missing from `word2idx` are mapped to
        ``len(word2idx)``; test tags missing from `tag2idx` are mapped to
        ``len(tag2idx)``.

    Prints the shape of the parsed training frame.
    """
    df_train = _load_conll_frame(data_path_train)
    print(df_train.shape)

    word2idx = dict()
    tag2idx = dict()

    # Training pass: unseen words/tags get the next free index.
    Xtrain, Ytrain = _encode_tokens(
        df_train,
        lambda word: word2idx.setdefault(word, len(word2idx)),
        lambda tag: tag2idx.setdefault(tag, len(tag2idx)),
        split_sentences,
    )

    # Test pass: vocabularies are frozen; anything unseen maps to the single
    # "unknown" index that sits right after the last training index.
    # NOTE(review): an unseen test tag gets index len(tag2idx), which is one
    # past the last known tag id - confirm downstream code can handle that.
    unknown_word_idx = len(word2idx)
    unknown_tag_idx = len(tag2idx)
    df_test = _load_conll_frame(data_path_test)
    Xtest, Ytest = _encode_tokens(
        df_test,
        lambda word: word2idx.get(word, unknown_word_idx),
        lambda tag: tag2idx.get(tag, unknown_tag_idx),
        split_sentences,
    )

    return Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx

In [4]:
# Load both splits as per-sentence index lists, together with the vocabularies
# built from the training file.
Xtrain, Ytrain, Xtest, Ytest, word2idx, tag2idx = get_data(
    data_path_train=data_path_train,
    data_path_test=data_path_test,
    split_sentences=True,
)

(220663, 2)


In [5]:
# Write the word -> index vocabulary to JSON (kept for visualization).
with open("word2idx_pos_rnn.json", 'w') as fp:
    fp.write(json.dumps(word2idx))

In [6]:
# Sentence counts: train inputs, train labels, test inputs, test labels.
print(" ".join(str(len(split)) for split in (Xtrain, Ytrain, Xtest, Ytest)))

8936 8936 2012 2012


In [7]:
# Convert to numpy arrays. Sentences have different lengths, so the result is
# a 1-D array whose elements are Python lists; dtype=object is requested
# explicitly because newer NumPy releases refuse to build such ragged arrays
# implicitly.
Xtrain = np.array(Xtrain, dtype=object)
Ytrain = np.array(Ytrain, dtype=object)
print("{} {}".format(Xtrain.shape, Ytrain.shape))

(8936,) (8936,)


In [8]:
# Sizes used to dimension the model.
N = len(Xtrain)           # number of training sentences
K = len(tag2idx)          # number of distinct tags seen in training
V = 1 + len(word2idx)     # the extra slot represents the unknown words
# NOTE: under Python 2 these are print statements applied to a tuple, so the
# output is the tuple repr, e.g. ('vocabulary size:', 19123).
print ("vocabulary size:", V)
print ("Tags # ", K)

('vocabulary size:', 19123)
('Tags # ', 44)


In [9]:
# Show the first training sentence: its word indices, then its tag indices.
# Slicing to one element replaces the explicit `break`.
for words, tags in zip(Xtrain[:1], Ytrain[:1]):
    print(words)
    print(tags)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 15, 19, 20, 17, 21, 7, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
[0, 1, 2, 0, 3, 4, 5, 6, 7, 2, 8, 0, 1, 0, 9, 1, 10, 11, 8, 1, 0, 0, 11, 7, 6, 7, 2, 8, 0, 1, 10, 12, 10, 13, 8, 9, 14]


In [37]:
class RNN:
    """Simple recurrent network that predicts one tag per token of a sentence.

    The graph embeds each word index (`We`), runs a tanh recurrence over the
    sentence (`Wx`, `Wh`, `bh`) and projects every hidden state to K logits
    (`Wo`, `bo`). It is written against the TensorFlow graph/session API
    (placeholders, `tf.Session`, `tf.summary`).
    """

    @staticmethod
    def init_weights(Mi, Mo):
        """Return an (Mi, Mo) array of uniform [0, 1) samples divided by sqrt(Mi + Mo).

        NOTE(review): every entry is non-negative because `np.random.rand` is
        used; confirm whether a zero-centred initialisation was intended.
        """
        return np.random.rand(Mi, Mo)/ np.sqrt(Mi+Mo)

    def __init__(self, D, M, V, K, lr=10e-2):
        """
        Build the graph, open a session and set up summary/checkpoint helpers.

        D: dimensionality of word embeddings
        M: size of hidden layer
        V: size of vocabulary
        K: num of output classes
        lr: learning rate (the default 10e-2 equals 0.1)

        Side effects: resets the default TensorFlow graph, opens a
        `tf.Session`, and creates a summary writer under "train_log/pos_rnn".
        """
        self.D = D
        self.M = M
        self.V = V
        self.K = K
        self.learning_rate = lr

        # Start from a clean graph so re-creating the model does not pile up
        # duplicate variables in the default graph.
        tf.reset_default_graph()

        with tf.name_scope("weights"):
            self.We = tf.Variable(tf.random_uniform([self.V, self.D], -1.0, 1.0), name="We")
            self.Wx = tf.Variable(RNN.init_weights(self.D, self.M) , dtype=tf.float32, name="Wx")
            self.Wh = tf.Variable(RNN.init_weights(self.M, self.M), dtype=tf.float32, name="Wh")
            self.Wo = tf.Variable(RNN.init_weights(self.M, self.K), dtype=tf.float32, name="Wo")

            # Each histogram tracks the variable it is named after.
            self.hist_We = tf.summary.histogram("hist_We", self.We)
            self.hist_Wx = tf.summary.histogram("hist_Wx", self.Wx)
            self.hist_Wh = tf.summary.histogram("hist_Wh", self.Wh)
            self.hist_Wo = tf.summary.histogram("hist_Wo", self.Wo)

        with tf.name_scope("biases"):
            self.bh = tf.Variable(tf.zeros(shape=[1, self.M]), dtype=tf.float32, name="bh")
            self.bo = tf.Variable(tf.zeros(shape=[1, self.K]), dtype=tf.float32, name="bo")

            self.hist_bh = tf.summary.histogram("hist_bh", self.bh)
            self.hist_bo = tf.summary.histogram("hist_bo", self.bo)

        # initial hidden state
        with tf.name_scope("initial_hidden_state"):
            self.h0 = tf.zeros(shape=[self.M], dtype=tf.float32, name="h0")

        # input placeholders for one sentence: word indices and target tag
        # indices (shape left unspecified so any sentence length can be fed)
        self.input_seq = tf.placeholder(tf.int32, shape=(None), name="inputs")
        self.targets = tf.placeholder(tf.int32, shape=(None), name="targets")

        self.tf_session = tf.Session()

        self.build_graph()

        self.summary_op = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter("train_log/pos_rnn" , self.tf_session.graph)

        self.saver = tf.train.Saver()

    def build_graph(self):
        """Assemble the forward pass, the loss and the clipped-gradient training op."""
        _inputs = tf.nn.embedding_lookup(self.We, self.input_seq)

        # tf.scan threads the hidden state through the sentence, one embedded
        # word at a time, starting from h0.
        self.ho = tf.scan(
            fn=self._recurrence, elems=_inputs, initializer=self.h0, name="states"
        )

        # Per-token logits over the K classes.
        self.py_x = tf.matmul(self.ho, self.Wo) + self.bo

        with tf.name_scope("cross_entropy"):
            # Pass labels/logits by keyword: the positional order is not
            # stable across TensorFlow releases, and later releases reject
            # positional use outright.
            self.cost = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.targets, logits=self.py_x),
                name="cost_func"
            )
            tf.summary.scalar("cross_entropy", self.cost)

        with tf.name_scope("train"):
            trainables = tf.trainable_variables()
            grads = tf.gradients(self.cost, trainables)

            # Clip the global gradient norm to 1.0 before applying the update.
            grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
            grad_var_pairs = list(zip(grads, trainables))

            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            self.train_op = opt.apply_gradients(grad_var_pairs)

    def _recurrence(self, h_t_minus_1, x_t):
        """One recurrence step: h_t = tanh(x_t Wx + h_{t-1} Wh + bh)."""
        # Reshape to row vectors so tf.matmul can be used.
        x_t = tf.reshape(x_t, [1, self.D])
        h_t_minus_1 = tf.reshape(h_t_minus_1,[1, self.M])

        h_t = tf.nn.tanh(
            tf.matmul(x_t, self.Wx) + tf.matmul(h_t_minus_1, self.Wh) + self.bh
        )

        # Back to a flat vector so the state keeps the shape of the initializer.
        h_t = tf.reshape(h_t, [self.M], name="h")
        return h_t

    def predict(self, input_seq):
        """Return the arg-max class index for every position of `input_seq`.

        The sequence is fed through the `input_seq` placeholder; the
        placeholder attribute itself is left untouched so the model stays
        usable for further calls to `fit`/`predict`.
        """
        py_x = self.tf_session.run(self.py_x, feed_dict={self.input_seq: input_seq})

        return np.argmax(py_x, axis=1)

    def fit(self, X, Y, epochs=500):
        """Train with one gradient step per sentence and log progress.

        X, Y: indexable collections of equal length; X[i] holds the word
              indices of sentence i and Y[i] the matching target indices.
        epochs: number of passes over the data.

        Variables are (re-)initialised at the start of every call, so any
        previously trained state is discarded. Per-sentence progress is
        printed and summaries are written after every step.

        Returns the list of summed per-sentence costs, one entry per epoch.
        """
        # num of sentences
        N = len(X)

        print ("Initializing global variables")
        self.tf_session.run(tf.global_variables_initializer())
        print ("# of trainable var outside: " + str(len(tf.trainable_variables())))

        net_idx = 0
        costs = list()
        for idx_epoch in range(epochs):
            cost_epoch = 0
            for idx_sent in range(N):
                print ("epoch: {}, sentence: {}".format(idx_epoch, idx_sent))

                targets = Y[idx_sent]
                feed_dict = {self.input_seq: X[idx_sent], self.targets: targets}

                self.tf_session.run(self.train_op, feed_dict=feed_dict)

                # Evaluate logits, cost and summaries after the update in a
                # single run instead of two separate forward passes.
                py_x, cost, summary = self.tf_session.run(
                    [self.py_x, self.cost, self.summary_op], feed_dict=feed_dict
                )

                pred = np.argmax(py_x, axis=1)
                # str() + join reproduces the text of the former print
                # statements (label, a space, then str(value)).
                print (" ".join(["Y: ", str(targets)]))
                print (" ".join(["Prediction: ", str(pred)]))

                correct = sum(1 for y, y_ in zip(targets, pred) if y == y_)
                accuracy = float(correct)/len(pred)

                self.train_writer.add_summary(summary, net_idx)
                self.train_writer.flush()

                print (" ".join(["Accuracy Sentence # {}".format(idx_sent), str(accuracy)]))
                print (" ".join(["Cost Sentence # {}".format(idx_sent), str(cost)]))
                cost_epoch += cost

                net_idx += 1
            costs.append(cost_epoch)

            print ("---------")
            print ("Cost at epoch {} is {}".format(idx_epoch, cost_epoch))
            print ("---------")

        return costs

    def save_model(self, model_name):
        """Save a checkpoint under `model_name` and dump the embedding matrix."""
        self.saver.save(self.tf_session, model_name)
        self.save_embedding_matrix()

    def save_embedding_matrix(self, file_path="word_embedding_pos_rnn.npy"):
        """Write the current embedding matrix `We` to `file_path` in .npy format."""
        np.save(file_path, self.We.eval(self.tf_session))

    def load_model(self, model_name):
        """
        Restore variables from the checkpoint `model_name` and print `We`.

        Flagged by the original author as not working yet.
        :param model_name:
        :return:
        """
        self.saver.restore(self.tf_session, model_name)
        print (self.We.eval(self.tf_session))

    def close_session(self):
        """Close the summary writer and the TensorFlow session."""
        self.train_writer.close()
        self.tf_session.close()

In [38]:
# 300-dimensional embeddings, 10 hidden units, learning rate 0.05.
rnn = RNN(D=300, M=10, V=V, K=K, lr=0.05)

In [None]:
num_epochs = 5
try:
    # Train, then checkpoint the model (which also dumps the embedding matrix).
    rnn.fit(Xtrain, Ytrain, epochs=num_epochs)
    rnn.save_model("model_rnn_pos")
finally:
    # Release the summary writer and the session even if training fails.
    print ("Closing Session")
    rnn.close_session()