In [1]:
import gensim
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
import gensim

In [3]:
from keras.models import Sequential
from keras.layers.core import TimeDistributedDense, Activation, Dropout
from keras.layers.recurrent import GRU, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import RMSprop

In [4]:
class ConLLReader(object):
    """Re-iterable reader for a file holding one space-separated sentence per line.

    Every iteration pass re-opens ``fname`` and yields each line as a list of
    exactly ``max_length`` tokens: longer lines are truncated, shorter ones are
    right-padded with the literal token ``'EOS'``.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    lowercase : bool
        If true, every token read from the file is lowercased. The ``'EOS'``
        padding is appended afterwards and therefore stays uppercase.
    max_length : int
        Fixed number of tokens in every yielded list.
    """

    def __init__(self, fname, lowercase=False, max_length=80):
        self.fname = fname
        self.lowercase = lowercase
        self.max_length = max_length

    def __iter__(self):
        # The context manager closes the handle once the pass is finished
        # (or the generator is discarded); the bare open() used before leaked
        # one file descriptor per iteration pass.
        with open(self.fname) as handle:
            for line in handle:
                tokens = line.strip().split(' ')
                if self.lowercase:
                    tokens = [t.lower() for t in tokens]
                # pad() both truncates to max_length and fills up with 'EOS'.
                yield list(pad(tokens, size=self.max_length, padding='EOS'))
            
from itertools import chain, repeat, islice

def pad_infinite(iterable, padding=None):
    """Return an endless iterator: the items of ``iterable``, then ``padding`` forever."""
    filler = repeat(padding)
    return chain(iterable, filler)

def pad(iterable, size, padding=None):
    """Return an iterator over exactly ``size`` items.

    The items come from ``iterable`` first; once it is exhausted ``padding``
    is used for the remaining positions. Items beyond ``size`` are dropped.
    """
    endless = pad_infinite(iterable, padding)
    return islice(endless, size)

In [6]:
def text_to_tokens(c, d):
    """Encode every token sequence in ``c`` as a list of integer ids.

    ``d`` must expose a ``token2id`` mapping and support ``len()``. A token
    that is missing from ``d.token2id`` is encoded as ``len(d)``, i.e. one
    shared id for everything out of vocabulary.

    Returns a list with one list of ids per sequence in ``c``.
    """
    vocab = d.token2id
    return [
        [vocab[tok] if tok in vocab else len(d) for tok in sentence]
        for sentence in c
    ]

In [7]:
def getXandY(corpus_tokens, tags_tokens, num_tags=None):
    """Build the model input matrix and the one-hot encoded tag targets.

    Parameters
    ----------
    corpus_tokens : sequence of equal-length sequences of int
        Token ids, one row per sentence.
    tags_tokens : sequence of equal-length sequences of int
        Tag ids with the same layout as ``corpus_tokens``.
    num_tags : int, optional
        Number of tag classes (width of the one-hot axis). When omitted it
        falls back to ``len(tags_dictionary)``, the notebook-level tag
        dictionary, which is what the two-argument form always used.

    Returns
    -------
    (X, y) : tuple of numpy arrays
        ``X`` has shape ``(n_sentences, seq_len)``; ``y`` is float64 with
        shape ``(n_sentences, seq_len, num_tags)``.

    Raises
    ------
    ValueError
        If the inputs are not rectangular 2-D arrays of the same shape, or a
        tag id lies outside ``[0, num_tags)``.
    """
    X = np.array(corpus_tokens)
    tag_ids = np.array(tags_tokens)
    if X.ndim != 2 or tag_ids.shape != X.shape:
        raise ValueError(
            "expected two equally shaped 2-D id arrays, got shapes %s and %s"
            % (X.shape, tag_ids.shape))
    if num_tags is None:
        num_tags = len(tags_dictionary)
    # Reject ids the encoding cannot represent; negative ids in particular
    # would otherwise be accepted silently by NumPy's wrap-around indexing.
    if tag_ids.size and (tag_ids.min() < 0 or tag_ids.max() >= num_tags):
        raise ValueError("tag id out of bounds for num_tags=%d" % num_tags)
    # Row k of the identity matrix is the one-hot vector of class k, so
    # indexing it with the id array yields (n_sentences, seq_len, num_tags).
    y = np.eye(num_tags)[tag_ids]
    # A single parenthesised string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("X:shape %s" % (X.shape,))
    print("Y:shape %s" % (y.shape,))
    return X, y

In [8]:
def dump_conll_eval(text_file, tags_file, output_file, preds):
    cf = open(text_file)
    tf = open(tags_file)
    of = open(output_file, 'w')
    sents = [line.strip() for line in cf]
    tags = [line.strip() for line in tf]
    
    sents_tokens = []
    tags_tokens = []
    for sent, tag in zip(sents, tags):
        stokens = sent.split(' ')
        stags = tag.split(' ')
        sents_tokens.append(stokens)
        tags_tokens.append(stags)
        
    for i in np.arange(0, len(sents_tokens)):
        tuples = zip(sents_tokens[i], tags_tokens[i], preds[i])
        for t in tuples:
            of.write(' '.join(t))
            of.write('\n')
        of.write('\n')
    of.close()

In [9]:
def get_pred(X_test, tags_dictionary, predictor=None):
    """Predict a tag sequence for every row of ``X_test``.

    Parameters
    ----------
    X_test : array of token ids, one row per sentence
        Passed unchanged to ``predict_classes``.
    tags_dictionary : mapping from tag id to tag string
        Used to turn each predicted class id back into its tag.
    predictor : object with ``predict_classes``, optional
        Model used for prediction. Defaults to the notebook-level ``model``
        so existing two-argument calls behave as before.

    Returns
    -------
    list of list of str
        One tag list per sentence, cut just before the first predicted
        ``'EOS'``. A row without any ``'EOS'`` prediction is kept in full.
    """
    clf = model if predictor is None else predictor
    yt = clf.predict_classes(X_test)
    preds = []
    for row in yt:
        actual_tags = [tags_dictionary[t] for t in row]
        # list.index raises ValueError when 'EOS' is absent, which happens
        # whenever the model predicts a real tag at every position.
        if 'EOS' in actual_tags:
            actual_tags = actual_tags[:actual_tags.index('EOS')]
        preds.append(actual_tags)
    return preds

In [10]:
# Training split. Sentences are lowercased (second argument True); the tag
# file is read with the default lowercase=False.
corpus_text = ConLLReader('/data/vivek/pos_tags/train_pos.txt', True)
tags_text = ConLLReader('/data/vivek/pos_tags/train_pos.tags')
# Vocabularies are built from the reader output, i.e. from sequences that
# already carry the 'EOS' padding token added by ConLLReader.
corpus_dictionary = gensim.corpora.Dictionary(corpus_text)
tags_dictionary = gensim.corpora.Dictionary(tags_text)

In [11]:
# Dev split, read with the same reader settings as the training files
# (lowercased sentences, tags left as-is).
dev_corpus_text = ConLLReader('/data/vivek/pos_tags/dev_pos.txt', True)
dev_tags_text = ConLLReader('/data/vivek/pos_tags/dev_pos.tags')

In [12]:
# Encode the training sentences and tags as integer ids using the
# dictionaries built from this same training data.
corpus_tokens = text_to_tokens(corpus_text, corpus_dictionary)
tags_tokens = text_to_tokens(tags_text, tags_dictionary)

In [13]:
# The dev split is encoded with the *training* dictionaries; text_to_tokens
# maps anything missing from them to the id len(dictionary).
# NOTE(review): a dev tag never seen in training would get id
# len(tags_dictionary), which lies outside the len(tags_dictionary) classes
# used for one-hot encoding in getXandY - confirm this cannot occur.
dev_corpus_tokens = text_to_tokens(dev_corpus_text, corpus_dictionary)
dev_tags_tokens = text_to_tokens(dev_tags_text, tags_dictionary)

In [14]:
# Training inputs (token ids) and one-hot tag targets; getXandY prints both shapes.
X,y = getXandY(corpus_tokens, tags_tokens)

X:shape (8936, 80)
Y:shape (8936, 80, 45)


In [15]:
# Dev-split inputs and one-hot tag targets, built the same way as the training arrays.
X_test,y_test = getXandY(dev_corpus_tokens, dev_tags_tokens)

X:shape (2012, 80)
Y:shape (2012, 80, 45)


In [16]:
# Build and train the tagger: Embedding -> LSTM (one output per position)
# -> per-position dense layer -> softmax over the tag classes.
# NOTE(review): TimeDistributedDense, 'time_distributed_softmax', class_mode,
# nb_epoch and show_accuracy presumably belong to the old Keras release this
# notebook was written against - confirm before running on a newer version.
X_train = X
Y_train = y
# +1 because text_to_tokens encodes out-of-vocabulary tokens as
# len(corpus_dictionary), so that extra index must be a valid embedding row.
nb_word = len(corpus_dictionary) + 1
nb_tag = len(tags_dictionary) 
# Each batch holds a single sentence.
batch_size = 1
model = Sequential()
model.add(Embedding(nb_word, 128))
# return_sequences=True keeps an output for every position, not just the last.
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributedDense(nb_tag))
model.add(Activation('time_distributed_softmax'))
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms,class_mode='categorical')
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=3, show_accuracy=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f5157610b90>

In [17]:
# Predicted tag strings for every dev sentence, cut at the first predicted 'EOS'.
preds = get_pred(X_test, tags_dictionary)



In [19]:
# Write one "token gold-tag predicted-tag" line per dev token to test.output,
# the file the evaluation cell below feeds to conlleval.pl.
dump_conll_eval('/data/vivek/pos_tags/dev_pos.txt', '/data/vivek/pos_tags/dev_pos.tags', '/data/vivek/pos_tags/test.output', preds)

In [20]:
%%bash 
# Score the dumped predictions with the CoNLL evaluation script.
# NOTE(review): -r presumably selects raw mode (plain tags, every token its
# own phrase), which matches the "47377 tokens with 47377 phrases" output - confirm.
perl /data/vivek/pos_tags/conlleval.pl -r <  /data/vivek/pos_tags/test.output

processed 47377 tokens with 47377 phrases; found: 47377 phrases; correct: 43804.
accuracy:  92.46%; precision:  92.46%; recall:  92.46%; FB1:  92.46
                #: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
                $: precision:  99.22%; recall: 100.00%; FB1:  99.61  387
               '': precision: 100.00%; recall:  99.05%; FB1:  99.52  313
                (: precision: 100.00%; recall: 100.00%; FB1: 100.00  77
                ): precision: 100.00%; recall: 100.00%; FB1: 100.00  77
                ,: precision: 100.00%; recall: 100.00%; FB1: 100.00  2390
                .: precision: 100.00%; recall: 100.00%; FB1: 100.00  1975
                :: precision: 100.00%; recall: 100.00%; FB1: 100.00  238
               CC: precision: 100.00%; recall:  99.59%; FB1:  99.79  1209
               CD: precision:  93.55%; recall:  95.99%; FB1:  94.75  1968
               DT: precision:  99.26%; recall:  99.43%; FB1:  99.34  4027
               EX: precision:  96.00%; recall: 