In [1]:
import numpy as np
import os

In [2]:
def readFile(filepath, encoding='utf-8'):
    """Read a tab-separated, sentence-per-block annotation file.

    Every non-empty line that does not start with '#' is split on tabs; the
    second column is taken as the token and the third as its label. An empty
    line or a line starting with '#' closes the sentence collected so far.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII tokens are decoded the same way on every platform.

    Returns:
        A list of sentences, each a list of ``[token, label]`` pairs.

    Raises:
        IndexError: If a data line has fewer than three tab-separated columns.
    """
    sentences = []
    sentence = []

    # The context manager guarantees the handle is closed, even on errors.
    with open(filepath, encoding=encoding) as handle:
        for line in handle:
            line = line.strip()

            if not line or line.startswith('#'):
                # Sentence boundary: flush whatever has been collected.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            splits = line.split('\t')
            sentence.append([splits[1], splits[2]])

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)

    print(filepath, len(sentences), "sentences")
    return sentences

In [3]:
# Read the train / dev / test splits (in that order) with readFile.
trainSentences, devSentences, testSentences = map(readFile, (
    'data/pos-tagging-lstm/NER-de-train.tsv',
    'data/pos-tagging-lstm/NER-de-dev.tsv',
    'data/pos-tagging-lstm/NER-de-test.tsv',
))

data/pos-tagging-lstm/NER-de-train.tsv 24000 sentences
data/pos-tagging-lstm/NER-de-dev.tsv 2200 sentences
data/pos-tagging-lstm/NER-de-test.tsv 5100 sentences


In [4]:
# Display the first training sentence: a list of [token, label] pairs.
trainSentences[0]

[['Schartau', 'B-PER'],
 ['sagte', 'O'],
 ['dem', 'O'],
 ['"', 'O'],
 ['Tagesspiegel', 'B-ORG'],
 ['"', 'O'],
 ['vom', 'O'],
 ['Freitag', 'O'],
 [',', 'O'],
 ['Fischer', 'B-PER'],
 ['sei', 'O'],
 ['"', 'O'],
 ['in', 'O'],
 ['einer', 'O'],
 ['Weise', 'O'],
 ['aufgetreten', 'O'],
 [',', 'O'],
 ['die', 'O'],
 ['alles', 'O'],
 ['andere', 'O'],
 ['als', 'O'],
 ['überzeugend', 'O'],
 ['war', 'O'],
 ['"', 'O'],
 ['.', 'O']]

In [5]:
# Flatten all three splits into one list of [token, label] pairs, then derive
# the set of labels and the set of lower-cased tokens from it.
all_pairs = [
    pair
    for split_sentences in (trainSentences, devSentences, testSentences)
    for sent in split_sentences
    for pair in sent
]
POS_set = {tag for _tok, tag in all_pairs}
unique_words = {tok.lower() for tok, _tag in all_pairs}

In [6]:
# Map every label to an integer id. The labels are sorted first because the
# iteration order of a set of strings can differ between interpreter sessions
# (string hash randomisation); sorting makes the ids reproducible.
POS_index_dict = {label: idx for idx, label in enumerate(sorted(POS_set))}

In [7]:
# Provisional token -> id table over the lower-cased vocabulary.
# NOTE: word_index_dict is reset to an empty dict in a later cell before the
# embeddings are loaded, so this table is not what the model ends up using.
word_index_dict = {token: position for position, token in enumerate(unique_words)}

In [8]:
# Casing category -> integer id.
case2Idx = {
    'numeric': 0,
    'allLower': 1,
    'allUpper': 2,
    'initialUpper': 3,
    'other': 4,
    'mainly_numeric': 5,
    'contains_digit': 6,
    'PADDING_TOKEN': 7,
}
# One row per casing category; row i is the i-th unit vector (one-hot).
caseEmbeddings = np.eye(len(case2Idx), dtype='float32')

In [9]:
# Start from empty containers: the word -> row-index table and the list of
# embedding vectors that the embedding-loading cell fills in.
word_index_dict, wordEmbeddings = {}, []

In [10]:
import gzip

In [11]:
# Load the pre-trained word vectors, keeping only words whose lower-cased form
# occurs in the train/dev/test vocabulary (unique_words).
#
# Both structures are rebuilt from scratch here so that re-running this cell
# on its own gives the same result (the previous version appended to whatever
# state was left over and failed once wordEmbeddings had become an ndarray).
word_index_dict = {}
embedding_rows = []

# Each line is "<word> <v1> <v2> ...", separated by single spaces.
# The context manager makes sure the gzip handle is closed.
with gzip.open('data/pos-tagging-lstm/2014_tudarmstadt_german_50mincount.vocab.gz', "r") as fEmbeddings:
    for line in fEmbeddings:
        split = line.decode("utf-8").strip().split(" ")
        word = split[0]

        if len(word_index_dict) == 0:  # First line: add padding + unknown
            # The vector width is taken from the first line of the file.
            word_index_dict["PADDING_TOKEN"] = len(word_index_dict)
            embedding_rows.append(np.zeros(len(split) - 1))  # Zero vector for 'PADDING' word

            word_index_dict["UNKNOWN_TOKEN"] = len(word_index_dict)
            embedding_rows.append(np.random.uniform(-0.25, 0.25, len(split) - 1))

        # Skip words that are already registered: appending a second row for
        # an existing key would shift every later row away from its index.
        if word.lower() in unique_words and word not in word_index_dict:
            embedding_rows.append(np.array([float(num) for num in split[1:]]))
            word_index_dict[word] = len(word_index_dict)

wordEmbeddings = np.array(embedding_rows)

# Row i of the matrix must be the vector of the word whose index is i.
assert len(word_index_dict) == wordEmbeddings.shape[0], "word index / embedding rows out of sync"

In [12]:
def getCasing(word, caseLookup):
    """Classify the casing / digit pattern of a token.

    The first matching rule wins, checked in this order: all digits
    ('numeric'), more than half digits ('mainly_numeric'), all lower case
    ('allLower'), all upper case ('allUpper'), first character upper case
    ('initialUpper'), contains at least one digit ('contains_digit').
    Anything else, including the empty string, is 'other'.

    Args:
        word: The token to classify.
        caseLookup: Mapping from casing category name to its integer id.

    Returns:
        The id that ``caseLookup`` assigns to the detected category.
    """
    if not word:
        # An empty token would otherwise divide by zero below.
        return caseLookup['other']

    numDigits = sum(1 for char in word if char.isdigit())
    digitFraction = numDigits / float(len(word))

    if word.isdigit():  # Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():  # All lower case
        casing = 'allLower'
    elif word.isupper():  # All upper case
        casing = 'allUpper'
    elif word[0].isupper():  # Initial char upper case
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'

    return caseLookup[casing]


In [13]:
def createMatrices(sentences, windowsize):
    """Convert sentences of [token, label] pairs into integer index lists.

    Uses the module-level ``word_index_dict``, ``case2Idx`` and
    ``POS_index_dict`` tables. A token is looked up as written first, then
    lower-cased; if neither form is known it gets the 'UNKNOWN_TOKEN' index.

    Args:
        sentences: List of sentences, each a list of ``[token, label]`` pairs.
        windowsize: Not used by this function; kept so existing calls that
            pass it keep working.

    Returns:
        A list with one ``[wordIndices, caseIndices, labelIndices]`` entry per
        sentence, where the three inner lists are aligned token by token.

    Raises:
        KeyError: If a label is missing from ``POS_index_dict``.
    """
    unknownIdx = word_index_dict['UNKNOWN_TOKEN']

    dataset = []

    for sentence in sentences:
        wordIndices = []
        caseIndices = []
        labelIndices = []

        for word, label in sentence:
            # Exact spelling first, then the lower-cased form, then unknown.
            wordIdx = word_index_dict.get(word)
            if wordIdx is None:
                wordIdx = word_index_dict.get(word.lower(), unknownIdx)

            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx))
            labelIndices.append(POS_index_dict[label])

        dataset.append([wordIndices, caseIndices, labelIndices])

    return dataset

In [14]:
# Index-encode the three splits, in train / dev / test order.
train_set, dev_set, test_set = (
    createMatrices(split_sentences, windowsize=3)
    for split_sentences in (trainSentences, devSentences, testSentences)
)

In [15]:
from keras.models import Model
from keras.layers import *

Using TensorFlow backend.


In [16]:
# Word-id input of variable sequence length, looked up in a frozen embedding
# layer initialised from the pre-trained wordEmbeddings matrix.
words_input = Input(name='words_input', shape=(None,), dtype='int32')
word_embedding_layer = Embedding(
    input_dim=wordEmbeddings.shape[0],
    output_dim=wordEmbeddings.shape[1],
    weights=[wordEmbeddings],
    trainable=False,
)
words = word_embedding_layer(words_input)

In [17]:
# Casing-id input of variable sequence length, mapped through a frozen
# embedding layer whose weights are the identity matrix caseEmbeddings
# (i.e. each casing id becomes a one-hot vector).
casing_input = Input(name='casing_input', shape=(None,), dtype='int32')
casing_embedding_layer = Embedding(
    input_dim=caseEmbeddings.shape[0],
    output_dim=caseEmbeddings.shape[1],
    weights=[caseEmbeddings],
    trainable=False,
)
casing = casing_embedding_layer(casing_input)

In [18]:
# Join word and casing features per token, run a bidirectional LSTM over the
# sequence, and emit a softmax over the label set at every time step.
merged_features = concatenate([words, casing])
sequence_encoder = Bidirectional(
    LSTM(50, return_sequences=True, dropout=0.25, recurrent_dropout=0.25)
)
encoded_sequence = sequence_encoder(merged_features)
label_classifier = TimeDistributed(Dense(len(POS_index_dict), activation='softmax'))
output = label_classifier(encoded_sequence)

In [19]:
# Assemble the two-input tagger, compile it for integer-encoded labels, and
# print the layer overview.
model = Model(
    inputs=[words_input, casing_input],
    outputs=[output],
)
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_input (InputLayer)         (None, None)          0                                            
____________________________________________________________________________________________________
casing_input (InputLayer)        (None, None)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, None, 100)     6670600     words_input[0][0]                
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, None, 8)       64          casing_input[0][0]               
___________________________________________________________________________________________

In [20]:
def iterate_minibatches(dataset):
    """Yield one single-sentence batch per entry of ``dataset``.

    Each entry is a ``(tokens, casing, labels)`` triple of equal-length index
    sequences. For every entry the generator yields, in this order:
    the labels as an array of shape (1, n, 1), the tokens as an array of
    shape (1, n) and the casing ids as an array of shape (1, n).
    """
    for position in range(len(dataset)):
        token_ids, case_ids, label_ids = dataset[position]

        # Add the batch axis by wrapping in a list, then a trailing axis so
        # every label sits in its own length-1 vector.
        label_batch = np.asarray([label_ids])[..., np.newaxis]
        token_batch = np.asarray([token_ids])
        case_batch = np.asarray([case_ids])

        yield label_batch, token_batch, case_batch


def tag_dataset(dataset):
    """Predict labels for every sentence in ``dataset`` with the global model.

    Each entry of ``dataset`` is a ``(tokens, casing, labels)`` triple. Every
    sentence is fed to ``model.predict`` as a batch of one, and the class with
    the highest score is taken at each position.

    Returns:
        ``(predLabels, correctLabels)``: the per-sentence arrays of predicted
        label ids and the per-sentence gold label sequences, in dataset order.
    """
    gold_sequences, predicted_sequences = [], []

    for token_ids, case_ids, gold_ids in dataset:
        # Wrap each sequence in a list to add the batch dimension.
        model_inputs = [np.asarray([token_ids]), np.asarray([case_ids])]
        scores = model.predict(model_inputs, verbose=False)[0]

        gold_sequences.append(gold_ids)
        predicted_sequences.append(scores.argmax(axis=-1))  # Predict the classes

    return predicted_sequences, gold_sequences
        
# Number of passes over the training set performed by the training loop.
number_of_epochs = 20

In [21]:
import random

In [22]:
import time

In [23]:
# Inverse of POS_index_dict: integer id -> label string.
idx2Label = dict((index, label) for label, index in POS_index_dict.items())

In [None]:
import BIOF1Validation

In [None]:
# Train for number_of_epochs epochs; after each epoch report precision,
# recall and F1 on the dev and test sets.
for epoch in range(number_of_epochs):
    print("--------- Epoch %d -----------" % epoch)
    random.shuffle(train_set)
    start_time = time.time()

    #Train one sentence at a time (i.e. online training) to avoid padding of sentences
    # Batch-local names are used so the loop does not overwrite the global
    # `casing` tensor that the model-building cells created.
    for cnt, batch in enumerate(iterate_minibatches(train_set), start=1):
        batch_labels, batch_tokens, batch_casing = batch
        model.train_on_batch([batch_tokens, batch_casing], batch_labels)

        if cnt % 100 == 0:
            print('Sentence: %d / %d' % (cnt, len(train_set)), end='\r')
    # Trailing spaces overwrite what is left of the '\r' progress line.
    print("%.2f sec for training                 " % (time.time() - start_time))

    # Restart the clock so the figure printed below covers evaluation only
    # (previously it also included the training time of this epoch).
    start_time = time.time()

    #Performance on dev dataset
    predLabels, correctLabels = tag_dataset(dev_set)
    pre_dev, rec_dev, f1_dev = BIOF1Validation.compute_f1(predLabels, correctLabels, idx2Label)
    print("Dev-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_dev, rec_dev, f1_dev))

    #Performance on test dataset
    predLabels, correctLabels = tag_dataset(test_set)
    pre_test, rec_test, f1_test = BIOF1Validation.compute_f1(predLabels, correctLabels, idx2Label)
    print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_test, rec_test, f1_test))

    print("%.2f sec for evaluation" % (time.time() - start_time))
    print("")
        

--------- Epoch 0 -----------
Sentence: 16300 / 24000