 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#Preprocessing" data-toc-modified-id="Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocessing</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#Read-input-data" data-toc-modified-id="Read-input-data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Read input data</a></span></li><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#Find-set-of-POSs-and-unique-words" data-toc-modified-id="Find-set-of-POSs-and-unique-words-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Find set of POSs and unique words</a></span></li><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#glove" data-toc-modified-id="glove-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>glove</a></span></li><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#Create-context-index-matrices" data-toc-modified-id="Create-context-index-matrices-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Create context index matrices</a></span></li></ul></li><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#Create-the-model" data-toc-modified-id="Create-the-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create the model</a></span></li><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#Train" data-toc-modified-id="Train-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train</a></span></li><li><span><a href="http://localhost:8888/notebooks/04-part-of-speech-tagging-senna-architecture.ipynb#Add-case-information" data-toc-modified-id="Add-case-information-4"><span 
class="toc-item-num">4&nbsp;&nbsp;</span>Add case information</a></span></li></ul></div>

In [1]:
import numpy as np
import os

# Preprocessing

## Read input data

In [2]:
def readFile(filepath):
    """Read a tab-separated, one-token-per-line corpus file into sentences.

    Blank lines and lines whose first character is '#' terminate the current
    sentence. Every other line is split on tabs; the first two fields are kept
    as a ``[token, tag]`` pair and any further fields are ignored.

    Args:
        filepath: Path of the corpus file to read.

    Returns:
        A list of sentences, each a list of ``[token, tag]`` pairs.

    Raises:
        IndexError: If a data line has fewer than two tab-separated fields.
    """
    sentences = []
    sentence = []

    # The context manager closes the handle even when a line fails to parse;
    # the bare open() used previously left the file open.
    with open(filepath) as corpus:
        for line in corpus:
            line = line.strip()

            if len(line) == 0 or line[0] == '#':
                # Sentence boundary: flush whatever has been collected so far.
                if len(sentence) > 0:
                    sentences.append(sentence)
                    sentence = []
                continue
            splits = line.split('\t')
            sentence.append([splits[0], splits[1]])

    # The file may end without a trailing blank line.
    if len(sentence) > 0:
        sentences.append(sentence)

    print(filepath, len(sentences), "sentences")
    return sentences


In [3]:
# Load the three corpus splits in train / dev / test order.
trainSentences, devSentences, testSentences = [
    readFile(split_path)
    for split_path in ('./pos-tagging-senna/train.txt',
                       './pos-tagging-senna/dev.txt',
                       './pos-tagging-senna/test.txt')
]

./pos-tagging-senna/train.txt 50545 sentences
./pos-tagging-senna/dev.txt 2506 sentences
./pos-tagging-senna/test.txt 4134 sentences


In [4]:
# Peek at the first training sentence: a list of [token, POS tag] pairs.
trainSentences[0]

[['Clearly', 'RB'],
 [',', 'pct'],
 ['this', 'DT'],
 ['was', 'BEDZ'],
 ['a', 'AT'],
 ['family', 'NN'],
 ['in', 'IN'],
 ['crisis', 'NN'],
 ['.', 'pct']]

## Find set of POSs and unique words

In [5]:
# Gather every POS tag and every lower-cased token that occurs in any split.
POS_set = set()
unique_words = set()

for dataset in (trainSentences, devSentences, testSentences):
    for sentence in dataset:
        POS_set.update(label for _, label in sentence)
        unique_words.update(token.lower() for token, _ in sentence)

In [6]:
# Number the tags in sorted order. Iterating the raw set would hand out ids in
# an order that can change between interpreter runs (string hashing is
# randomized), so label ids would not be reproducible across kernel restarts.
POS_index_dict = {label: index for index, label in enumerate(sorted(POS_set))}

In [7]:
# Reserve a vocabulary entry for window positions outside the sentence;
# createMatrices also falls back to this entry for words it cannot look up.
unique_words.add('__PADDING__')

In [8]:
# Number the vocabulary in sorted order. Iterating the raw set would assign
# row indices in an order that can change between interpreter runs (string
# hashing is randomized), so the embedding matrix layout would not be
# reproducible across kernel restarts.
word_index_dict = {word: index for index, word in enumerate(sorted(unique_words))}

## glove

In [9]:
# Map each GloVe token to its vector. Every line of the file is
# "<token> <v1> <v2> ...", separated by whitespace.
glove_index = {}
# `with` closes the file even if a line fails to parse. The explicit encoding
# keeps the read independent of the platform default (GloVe text files are
# distributed as UTF-8).
with open('./glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_index[word] = coefs

In [10]:
embedding_dim = 100
embedding_matrix = np.zeros((len(unique_words), embedding_dim))

# Rows start out as zeros, so a word without a GloVe vector keeps an all-zero row.
for word, row in word_index_dict.items():
    if word in glove_index:
        embedding_matrix[row] = glove_index[word]

In [11]:
# Show the embedding matrix; all-zero rows belong to words without a GloVe vector.
embedding_matrix

array([[  2.66180001e-02,   2.38580001e-03,   1.01259999e-01, ...,
         -9.05040026e-01,  -2.65830010e-01,   3.94039989e-01],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  5.73249996e-01,  -4.24730003e-01,  -3.61350000e-01, ...,
         -9.43050027e-01,   5.49830019e-01,  -4.99899983e-02],
       ..., 
       [ -9.46839992e-03,   7.38589987e-02,  -2.99039990e-01, ...,
         -1.46990001e-01,   4.53269988e-01,   1.27079999e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [ -1.22570001e-01,  -5.88829994e-01,   3.74179989e-01, ...,
         -7.95229978e-04,  -2.82409996e-01,   5.66399992e-01]])

In [12]:
# (vocabulary size, embedding_dim)
embedding_matrix.shape

(49816, 100)

## Create context index matrices

In [13]:
# Show the first training sentence again as a reference for the windowing below.
trainSentences[0]

[['Clearly', 'RB'],
 [',', 'pct'],
 ['this', 'DT'],
 ['was', 'BEDZ'],
 ['a', 'AT'],
 ['family', 'NN'],
 ['in', 'IN'],
 ['crisis', 'NN'],
 ['.', 'pct']]

In [14]:
def createMatrices(sentences, windowsize):
    """Turn tagged sentences into fixed-width word-id windows plus tag ids.

    For every token, a window covering `windowsize` positions on each side
    (2 * windowsize + 1 entries) is built from `word_index_dict`, using the
    lower-cased token as key. Positions outside the sentence and tokens that
    are missing from `word_index_dict` both get the '__PADDING__' id. The
    token's tag is translated through `POS_index_dict`.

    Args:
        sentences: List of sentences, each a list of [token, tag] pairs.
        windowsize: Number of context positions on each side of the token.

    Returns:
        Tuple ``(windows, labels)`` of NumPy arrays with one row / entry per
        token across all sentences.
    """
    pad_id = word_index_dict['__PADDING__']
    windows = []
    labels = []

    for sentence in sentences:
        n_tokens = len(sentence)
        for center in range(n_tokens):
            window = []
            for offset in range(-windowsize, windowsize + 1):
                position = center + offset
                if 0 <= position < n_tokens:
                    token = sentence[position][0].lower()
                    window.append(word_index_dict.get(token, pad_id))
                else:
                    window.append(pad_id)

            labels.append(POS_index_dict[sentence[center][1]])
            windows.append(window)

    return (np.asarray(windows), np.asarray(labels))

In [15]:
# Build (word-window matrix, label vector) for the training split;
# windowsize=3 gives 2*3 + 1 = 7 word ids per row.
train_set = createMatrices(trainSentences, windowsize=3)

In [16]:
# Word-window matrix: one row per training token, 7 columns.
train_set[0].shape

(1026265, 7)

In [17]:
# Label vector: one tag id per training token.
train_set[1].shape

(1026265,)

In [18]:
# Same windowing for the development and test splits (window size must match training).
dev_set  = createMatrices(devSentences, windowsize=3)
test_set = createMatrices(testSentences, windowsize=3)

# Create the model

In [19]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, concatenate, Embedding

Using TensorFlow backend.


In [20]:
# Input: one window of 7 word ids per example (matches windowsize=3 above).
words_input = Input(shape = (7,), dtype = 'int32', name = 'words_input')
# Embedding layer initialised from embedding_matrix and frozen (trainable = False).
words = Embedding(input_dim = embedding_matrix.shape[0], output_dim = embedding_matrix.shape[1],
                  weights = [embedding_matrix], trainable = False)(words_input)
# Flatten the per-position vectors into a single feature vector per window.
words = Flatten()(words)

In [21]:
# Hidden layer over the flattened window, then a softmax with one unit per POS tag.
output = Dense(units = 100, activation = 'tanh')(words)
output = Dense(units = len(POS_set), activation = 'softmax')(output)

In [22]:
# Functional-API model mapping the word-window input to the softmax tag output.
# (The previous line contained a duplicated, garbled Model(...) call and did not parse.)
model = Model(inputs = words_input, outputs = [output])

In [23]:
# The sparse variant of the loss is used because the targets are integer tag ids
# (as produced by createMatrices), not one-hot vectors.
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'nadam')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
words_input (InputLayer)     (None, 7)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 7, 100)            4981600   
_________________________________________________________________
flatten_1 (Flatten)          (None, 700)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               70100     
_________________________________________________________________
dense_2 (Dense)              (None, 84)                8484      
Total params: 5,060,184
Trainable params: 78,584
Non-trainable params: 4,981,600
_________________________________________________________________


# Train

In [24]:
number_of_epochs = 10
minibatch_size = 128
print("%d epochs" % number_of_epochs)


def predict_classes(prediction):
    """Return the index of the highest score along the last axis."""
    return prediction.argmax(axis=-1)


train_x, train_y = train_set
dev_x, dev_y = dev_set
test_x, test_y = test_set

for epoch in range(number_of_epochs):
    print("\n------------- Epoch %d ------------" % (epoch+1))
    # Train one epoch at a time so accuracy can be reported after each pass.
    model.fit(train_x, train_y,
              batch_size=minibatch_size, epochs=1, shuffle=True, verbose=2)

    # Accuracy on the development split
    dev_pred = predict_classes(model.predict(dev_x))
    dev_acc = np.sum(dev_pred == dev_y) / float(len(dev_y))
    print("Dev-Accuracy: %.2f" % (dev_acc*100))

    # Accuracy on the test split
    test_pred = predict_classes(model.predict(test_x))
    test_acc = np.sum(test_pred == test_y) / float(len(test_y))
    print("Test-Accuracy: %.2f" % (test_acc*100))


10 epochs

------------- Epoch 1 ------------
Epoch 1/1
44s - loss: 0.3227
Dev-Accuracy: 93.26
Test-Accuracy: 93.17

------------- Epoch 2 ------------
Epoch 1/1
44s - loss: 0.2078
Dev-Accuracy: 93.76
Test-Accuracy: 93.56

------------- Epoch 3 ------------
Epoch 1/1
46s - loss: 0.1871
Dev-Accuracy: 94.06
Test-Accuracy: 93.93

------------- Epoch 4 ------------
Epoch 1/1
46s - loss: 0.1758
Dev-Accuracy: 94.20
Test-Accuracy: 94.18

------------- Epoch 5 ------------
Epoch 1/1
47s - loss: 0.1680
Dev-Accuracy: 94.15
Test-Accuracy: 94.13

------------- Epoch 6 ------------
Epoch 1/1
47s - loss: 0.1611
Dev-Accuracy: 94.32
Test-Accuracy: 94.26

------------- Epoch 7 ------------
Epoch 1/1
46s - loss: 0.1569
Dev-Accuracy: 94.29
Test-Accuracy: 94.29

------------- Epoch 8 ------------
Epoch 1/1
45s - loss: 0.1527
Dev-Accuracy: 94.43
Test-Accuracy: 94.38

------------- Epoch 9 ------------
Epoch 1/1
46s - loss: 0.1494
Dev-Accuracy: 94.30
Test-Accuracy: 94.27

------------- Epoch 10 ------------

# Add case information

In [27]:
# Casing categories; a category's position in this list is its integer id.
case_index_dict = {
    case_name: case_id
    for case_id, case_name in enumerate([
        'numeric', 'allLower', 'allUpper', 'initialUpper', 'other',
        'mainly_numeric', 'contains_digit', '__PADDING__',
    ])
}
# One-hot rows: row k is the fixed vector for case id k.
caseEmbeddings = np.identity(len(case_index_dict), dtype='float32')
caseEmbeddings

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]], dtype=float32)

In [28]:
def getCase(word):
    """Classify the casing / digit pattern of a token.

    The checks run in this order and the first match wins:
    all digits -> 'numeric'; more than half digits -> 'mainly_numeric';
    ``islower()`` -> 'allLower'; ``isupper()`` -> 'allUpper';
    first character upper case -> 'initialUpper';
    at least one digit -> 'contains_digit'; anything else -> 'other'.

    Args:
        word: The token string. An empty string is classified as 'other'.

    Returns:
        The integer id of the category, looked up in `case_index_dict`.
    """
    # An empty token has nothing to classify; without this guard the digit
    # fraction below divides by zero.
    if not word:
        return case_index_dict['other']

    numDigits = sum(1 for char in word if char.isdigit())
    digitFraction = numDigits / float(len(word))

    if word.isdigit():  # Every character is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():  # All cased characters are lower case
        casing = 'allLower'
    elif word.isupper():  # All cased characters are upper case
        casing = 'allUpper'
    elif word[0].isupper():  # Only the first character is checked here
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'

    return case_index_dict[casing]

In [31]:
def createMatrices(sentences, windowsize):
    """Build word-id windows, casing-id windows and tag ids for every token.

    Each token gets two parallel windows of 2 * windowsize + 1 entries:
    word ids from `word_index_dict` (lower-cased lookup, falling back to the
    '__PADDING__' id for unknown words) and casing ids from `getCase`.
    Positions outside the sentence use the '__PADDING__' id of the respective
    dictionary. The token's tag is translated through `POS_index_dict`.

    Args:
        sentences: List of sentences, each a list of [token, tag] pairs.
        windowsize: Number of context positions on each side of the token.

    Returns:
        Tuple ``(word_windows, case_windows, labels)`` of NumPy arrays with
        one row / entry per token across all sentences.
    """
    word_pad = word_index_dict['__PADDING__']

    word_windows, case_windows, labels = [], [], []

    for sentence in sentences:
        last = len(sentence) - 1
        for center, (_, tag) in enumerate(sentence):
            word_row = []
            case_row = []
            for position in range(center - windowsize, center + windowsize + 1):
                if position < 0 or position > last:
                    # Outside the sentence: pad both feature streams.
                    word_row.append(word_pad)
                    case_row.append(case_index_dict['__PADDING__'])
                    continue
                token = sentence[position][0]
                word_row.append(word_index_dict.get(token.lower(), word_pad))
                case_row.append(getCase(token))

            labels.append(POS_index_dict[tag])
            word_windows.append(word_row)
            case_windows.append(case_row)

    return (np.asarray(word_windows), np.asarray(case_windows), np.asarray(labels))

In [32]:
# Rebuild all three splits with the redefined createMatrices, which now
# returns (word windows, case windows, labels).
train_set = createMatrices(trainSentences, windowsize=3)
dev_set  = createMatrices(devSentences, windowsize=3)
test_set = createMatrices(testSentences, windowsize=3)

In [34]:
# Shapes of the word windows, case windows and labels; the row counts should agree.
print(train_set[0].shape)
print(train_set[1].shape)
print(train_set[2].shape)

(1026265, 7)
(1026265, 7)
(1026265,)


In [36]:
# Word branch: 7 word ids -> frozen pre-trained vectors -> one flat vector.
vocab_size, word_dim = embedding_matrix.shape
words_input = Input(name = 'words_input', shape = (7,), dtype = 'int32')
word_vectors = Embedding(input_dim = vocab_size, output_dim = word_dim,
                         weights = [embedding_matrix], trainable = False)(words_input)
words = Flatten()(word_vectors)

# Case branch: 7 casing ids -> frozen one-hot rows -> one flat vector.
n_cases, case_dim = caseEmbeddings.shape
cases_input = Input(name = 'cases_input', shape = (7,), dtype = 'int32')
case_vectors = Embedding(input_dim = n_cases, output_dim = case_dim,
                         weights = [caseEmbeddings], trainable = False)(cases_input)
cases = Flatten()(case_vectors)

# Joint classifier over both feature vectors.
merged = concatenate([words, cases])
hidden = Dense(units = 100, activation = 'tanh')(merged)
output = Dense(units = len(POS_set), activation = 'softmax')(hidden)

model = Model(inputs = [words_input, cases_input], outputs = [output])
model.compile(optimizer = 'nadam', loss = 'sparse_categorical_crossentropy')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_input (InputLayer)         (None, 7)             0                                            
____________________________________________________________________________________________________
cases_input (InputLayer)         (None, 7)             0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 7, 100)        4981600     words_input[0][0]                
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 7, 8)          64          cases_input[0][0]                
___________________________________________________________________________________________

In [37]:
number_of_epochs = 10
minibatch_size = 128
print("%d epochs" % number_of_epochs)


def predict_classes(prediction):
    """Return the index of the highest score along the last axis."""
    return prediction.argmax(axis=-1)


train_words, train_cases, train_labels = train_set
dev_words, dev_cases, dev_labels = dev_set
test_words, test_cases, test_labels = test_set

for epoch in range(number_of_epochs):
    print("\n------------- Epoch %d ------------" % (epoch+1))
    # Train one epoch at a time so accuracy can be reported after each pass.
    model.fit([train_words, train_cases], train_labels,
              batch_size=minibatch_size, epochs=1, shuffle=True, verbose=2)

    # Accuracy on the development split
    dev_pred = predict_classes(model.predict([dev_words, dev_cases]))
    dev_acc = np.sum(dev_pred == dev_labels) / float(len(dev_labels))
    print("Dev-Accuracy: %.2f" % (dev_acc*100))

    # Accuracy on the test split
    test_pred = predict_classes(model.predict([test_words, test_cases]))
    test_acc = np.sum(test_pred == test_labels) / float(len(test_labels))
    print("Test-Accuracy: %.2f" % (test_acc*100))


10 epochs

------------- Epoch 1 ------------
Epoch 1/1
49s - loss: 0.2956
Dev-Accuracy: 93.73
Test-Accuracy: 93.70

------------- Epoch 2 ------------
Epoch 1/1
49s - loss: 0.1848
Dev-Accuracy: 94.22
Test-Accuracy: 94.33

------------- Epoch 3 ------------
Epoch 1/1
48s - loss: 0.1654
Dev-Accuracy: 94.72
Test-Accuracy: 94.64

------------- Epoch 4 ------------
Epoch 1/1
48s - loss: 0.1540
Dev-Accuracy: 94.57
Test-Accuracy: 94.48

------------- Epoch 5 ------------
Epoch 1/1
48s - loss: 0.1466
Dev-Accuracy: 94.98
Test-Accuracy: 94.77

------------- Epoch 6 ------------
Epoch 1/1
48s - loss: 0.1409
Dev-Accuracy: 94.76
Test-Accuracy: 94.71

------------- Epoch 7 ------------
Epoch 1/1
48s - loss: 0.1362
Dev-Accuracy: 94.98
Test-Accuracy: 94.91

------------- Epoch 8 ------------
Epoch 1/1
49s - loss: 0.1321
Dev-Accuracy: 94.86
Test-Accuracy: 94.67

------------- Epoch 9 ------------
Epoch 1/1
48s - loss: 0.1287
Dev-Accuracy: 94.98
Test-Accuracy: 94.86

------------- Epoch 10 ------------