In [1]:
#Download your favourite embeddings( e. g. from https://nlp.stanford.edu/projects/glove/ ) and write down path to them
import numpy as np
import tensorflow as tf
import keras
import theano

Using TensorFlow backend.


In [3]:
word_embeddings_path = './data/glove.6B.50d.txt'

word2idx = {}
word_embeddings = []
embedding_size = None
#Loading embeddings
with open(word_embeddings_path, 'r', encoding="utf-8") as f_em:
    for line in f_em:
        split = line.strip().split(" ")
        if len(split) <= 2:
            continue
        if embedding_size is None:
            embedding_size = len(split) - 1
            # Embeddings iniatilization for paddings and unknown words
            word2idx["PADDING_TOKEN"] = len(word2idx)
            word_embeddings.append(np.zeros(embedding_size))

            word2idx["UNKNOWN_TOKEN"] = len(word2idx)
            word_embeddings.append(np.random.uniform(-0.25, 0.25, embedding_size))
        if len(split) - 1 != embedding_size:
            continue
        word_embeddings.append(np.asarray(split[1:], dtype='float32'))
        word2idx[split[0]] = len(word2idx)

word_embeddings = np.array(word_embeddings, dtype='float32')

In [4]:
#Idea for improvement: use capitalization templates (as in Nadeau and Sekine 2007)
case2idx = {'numeric': 0, 'all_lower':1, 'all_upper':2, 'initial_upper':3, 'other':4, 'mainly_numeric':5, 
            'contains_digit': 6, 'PADDING_TOKEN':7}
case_embeddings = np.identity(len(case2idx), dtype=theano.config.floatX)

def get_casing(word, case_lookup):   
    casing = 'other'
    
    num_digits = 0
    for char in word:
        if char.isdigit():
            num_digits += 1
            
    digit_fraction = num_digits / float(len(word))
    
    if word.isdigit(): #Digit
        casing = 'numeric'
    elif digit_fraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower
        casing = 'all_lower'
    elif word.isupper(): #All upper
        casing = 'all_upper'
    elif word[0].isupper(): #First upper,other lower
        casing = 'initial_upper'
    elif num_digits > 0:
        casing = 'contains_digit'  
   
    return case_lookup[casing]

In [5]:
MAX_COLUMNS = 2
WORD_COL_NUM = 0
LABEL_COL_NUM = 1
def read_file(file_path):
    """
    :param file_path: path for corpus in CoNLL-format
    :return: corpus_sentences - list of sentences, splitted into words
    """
    corpus_sentences = []
    input_sentence = []
    with open(file_path, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            line = line.strip()

            if len(line) == 0 or line[0] == '#':
                if len(input_sentence) > 0:
                    corpus_sentences.append(input_sentence)
                    input_sentence = []
                continue
            if len(line.split('\t')) < MAX_COLUMNS:
                print(line)
                continue
            input_sentence.append(line.split('\t'))

    if len(input_sentence) > 0:
        corpus_sentences.append(input_sentence)

    print(file_path, len(corpus_sentences), "sentences")
    return corpus_sentences

#Path for parts of CoNLL-2003 corpus
train_path = './conll.train'
train_sentences = read_file(train_path)

dev_path = './conll.dev'
dev_sentences = read_file(dev_path)

test_path = './conll.test'
test_sentences = read_file(test_path)

# Often we have one corpus from which dev and test should be splitted (each 0.1-0.2 of corpus). 
# Impement solution for such situation

./conll.train 14987 sentences
./conll.dev 3466 sentences
./conll.test 3684 sentences


In [6]:
#Loading all class labels and adding new label for paddings
label_set = set()
label_set.add('PADDING_LABEL')
for dataset in [train_sentences, dev_sentences, test_sentences]:
    for sentence in dataset:
        for token in sentence:
            label = token[LABEL_COL_NUM]
            label_set.add(label)    

# Turing labels into indices
label2idx = {}
idx2label = {}
for label in label_set:
    label2idx[label] = len(label2idx)
    
print(label2idx)

{'B-MISC': 0, 'I-MISC': 1, 'I-PER': 2, 'B-ORG': 3, 'E-LOC': 4, 'PADDING_LABEL': 5, 'B-PER': 6, 'S-MISC': 7, 'OUT': 8, 'S-PER': 9, 'E-MISC': 10, 'S-LOC': 11, 'E-ORG': 12, 'B-LOC': 13, 'I-LOC': 14, 'I-ORG': 15, 'S-ORG': 16, 'E-PER': 17}


In [11]:
def get_token_indices(token, word2idx, case2idx, unknown_idx):

    token_unknown = False
    # Each token has several corresponding columns. Token text is in first column
    word = token[WORD_COL_NUM]
    # First trying to find word in embedding dictionary, if unable trying to find decapitalized word, if unable
    # word is considered unknown
    if word2idx.get(word) is not None:
        word_idx = word2idx[word]
    elif word2idx.get(word.lower()) is not None:
        word_idx = word2idx[word.lower()]
    else:
        word_idx = unknown_idx
        token_unknown = True

    case_idx = get_casing(word, case2idx)
    return token_unknown, word_idx, case_idx

In [23]:
train_sentences[1]

[['EU', 'S-ORG'],
 ['rejects', 'OUT'],
 ['German', 'S-MISC'],
 ['call', 'OUT'],
 ['to', 'OUT'],
 ['boycott', 'OUT'],
 ['British', 'S-MISC'],
 ['lamb', 'OUT'],
 ['.', 'OUT']]

In [7]:
def create_matrices(sentences, word2idx, label2idx, case2idx):   
    
    unknown_idx = word2idx['UNKNOWN_TOKEN']
    padding_casing = case2idx['PADDING_TOKEN']
    padding_idx = word2idx['PADDING_TOKEN'] 
    padding_label = label2idx['PADDING_LABEL']  
    
    dataset = []
    total_tokens = 0
    unknown_tokens = 0
    for sentence in sentences:
        
        # Index of first non-padding in sentence
        proper_sentence_start = 1

        word_indices = np.array([padding_idx] * (len(sentence) + 2))
        case_indices = np.array([padding_casing] * (len(sentence) + 2))
        label_indices = np.array([padding_label] * (len(sentence) + 2))
        
        #Here each sentence starts with and ends with exactly one padding
        #However when using GPUs each sentence in one batch should have identical shapes. Thus sentences should be splitted
        #into such groups that each sentence in one group should have identical lengths (and thus sentences of one group can
        #be present in one batch). The easiest way to do this is to have all sentences padded up to max length(i. e. have 
        #1 group). Most sentnces are short but in most corpora there exist really long ones thus this solution has massive 
        #overhead. Thus it is more reasonable to have more sophisticated grouping principle e. g. padding sentence to 
        #length of nearest 2^n + 1

        for pos_in_sentence, word in enumerate(sentence):

            token_unknown, word_idx, case_idx = get_token_indices(word, word2idx, case2idx, unknown_idx)
            pos_in_padded_sentence = pos_in_sentence + proper_sentence_start
            word_indices[pos_in_padded_sentence] = word_idx
            case_indices[pos_in_padded_sentence] = case_idx
            label_indices[pos_in_padded_sentence] = label2idx[word[LABEL_COL_NUM]]

            # Calculating percent of tokens not covered by embeddings
            total_tokens += 1
            if token_unknown:
                unknown_tokens += 1

        # All data for one sentence put in one list
        dataset.append([word_indices, case_indices, label_indices])
        
    percent = 0.0
    if total_tokens != 0:
        percent = float(unknown_tokens) / total_tokens * 100
    print("{} tokens, {} unknown, {:.3}%".format(total_tokens, unknown_tokens, percent ))
    return dataset



train_data = create_matrices(train_sentences, word2idx, label2idx, case2idx)
dev_data = create_matrices(dev_sentences, word2idx, label2idx, case2idx)
test_data = create_matrices(test_sentences, word2idx, label2idx, case2idx)

for sentence in train_data[:5]:
    print(sentence)

204567 tokens, 5578 unknown, 2.73%
51578 tokens, 1270 unknown, 2.46%
46666 tokens, 1562 unknown, 3.35%
[array([0, 1, 0]), array([7, 2, 7]), array([5, 8, 5])]
[array([    0,   646,  7580,   516,   582,     6,  5262,   299, 10240,
           4,     0]), array([7, 2, 1, 3, 1, 1, 1, 3, 1, 4, 7]), array([ 5, 16,  8,  7,  8,  8,  8,  7,  8,  8,  5])]
[array([   0, 1296, 9005,    0]), array([7, 3, 3, 7]), array([ 5,  6, 17,  5])]
[array([   0, 3881,    1,    0]), array([7, 2, 5, 7]), array([ 5, 11,  8,  5])]
[array([    0,     2,   293,   629,    18,    15,   186,    22, 10468,
          19,   516,  3242,     6,  2036,     6, 18291,   299, 10240,
         209,  2156,  2389,   403,  5123,  6474,  1291,    88,    32,
        7595,     6,  7377,     4,     0]), array([7, 3, 3, 3, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 4, 7]), array([ 5,  8,  3, 12,  8,  8,  8,  8,  8,  8,  7,  8,  8,  8,  8,  8,  7,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 

In [8]:
#Idea for improvement: implement char features by taking first k or last k symbols (or both) for each token and
#map through convolutional or recurrent layer
# Resulting vector should be merged with other token features into merged_embeddings
# More detailed description https://arxiv.org/pdf/1603.01360v1.pdf 


from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional, Input, concatenate
from keras.models import Model
from keras.optimizers import Adam
# Practical implementations should have dim of 100 or more
SENTENCE_LSTM_DIM = 10

n_out = len(label2idx)

tokens_input = Input(dtype='int32', shape=(None,), name='tokens_input')
tokens_embedding_layer = Embedding(input_dim=word_embeddings.shape[0], 
                                   output_dim=word_embeddings.shape[1],
                                   weights=[word_embeddings], trainable=False, 
                                   name='tokens_embeddings')
tokens = tokens_embedding_layer(tokens_input)


casing_input = Input(dtype='int32', shape=(None,), name='casing_input')
casing_embedding_layer = Embedding(input_dim=case_embeddings.shape[0], 
                                   output_dim=case_embeddings.shape[1],
                                   weights=[case_embeddings], trainable=True, 
                                   name='casing_embeddings')
casing = casing_embedding_layer(casing_input)

merged_embeddings = concatenate([tokens, casing], name='merged_embeddings')
for_lstm = Dropout(0.2)(merged_embeddings)
# If GPU is used  choose implementation=2
blstm = Bidirectional(LSTM(SENTENCE_LSTM_DIM, return_sequences=True, implementation=1), 
                      name='blstm')(for_lstm)
#Try several architectures here - GRU, convolutions, several recurrent layers stacked etc 
#result = Conv1d(n_out,1, activation='softmax', name='result'))(blstm
result = TimeDistributed(Dense(n_out, activation='softmax', name='result'))(blstm)

model = Model(inputs=[tokens_input, casing_input], outputs=result)

# default lr = 0.001, beta_1=0.9
adam = Adam(lr=0.001, beta_1=0.9)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tokens_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
casing_input (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
tokens_embeddings (Embedding)   (None, None, 50)     20000100    tokens_input[0][0]               
__________________________________________________________________________________________________
casing_embeddings (Embedding)   (None, None, 8)      64          casing_input[0][0]               
__________________________________________________________________________________________________
merged_emb

In [10]:
import random
import time
# If sentences were successfully groupt into groups of identical length here you should yield list of BATCH_SIZE 
# sentences of one length (BATCH_SIZE is such that one batch can be loaded into GPU memory)
def iterate_minibatches(dataset):   
    for sentence in dataset:
        tokens, casing, labels = sentence     
            
        labels = np.expand_dims(labels, -1) 
        yield np.asarray([tokens]), np.asarray([casing]), np.asarray([labels])

# Here again code should be adapted for batches of sentences     
def tag_dataset(dataset):
    predicted_labels = []
    correct_labels = []
    for tokens, casing, labels in dataset:
        pred = model.predict_on_batch([np.asarray([tokens]), np.asarray([casing])])[0]
        pred_labels = [el.tolist().index(max(el)) for el in pred]
        predicted_labels.append(pred_labels)
        correct_labels.append(labels)
        #print(predicted_labels, correct_labels)
    return predicted_labels, correct_labels

# The best evaluation metric is f-measure on entities constucted from several tokens.  
# See https://github.com/mit-nlp/MITIE/blob/master/tools/ner_conll/conlleval for reference
def compute_accuracy(predictions, correct, padding_label):
    """
    """
    total_tokens = 0
    guessed_tokens = 0
    for guessed_sentence, correct_sentence in zip(predictions, correct):
        #print(guessed_sentence, correct_sentence)
        assert (len(guessed_sentence) == len(correct_sentence)), "Guessed and correct sentences do not match"
        for j in range(len(guessed_sentence)):
            if correct_sentence[j] != padding_label:
                total_tokens += 1
                if guessed_sentence[j] == correct_sentence[j]:
                    guessed_tokens += 1

    if total_tokens == 0:
        return float(0)
    else:
        accuracy = float(guessed_tokens) / total_tokens
        return accuracy

        
number_of_epochs = 10
print("%d epochs" % number_of_epochs)

print("%d train sentences" % len(train_data))
print("%d dev sentences" % len(dev_data))
print("%d test sentences" % len(test_data))

padding_label = label2idx['PADDING_LABEL']

for epoch in range(number_of_epochs):    
    print("--------- Epoch %d -----------" % epoch)
    random.shuffle(train_data)
    
    start_time = time.time()    
    for batch in iterate_minibatches(train_data):
        #print(batch)
        tokens, casing, labels = batch       
        model.train_on_batch([tokens, casing], labels)   
    print("%.2f sec for training" % (time.time() - start_time))
               
    #Train Dataset       
    start_time = time.time()  
    print("================================== Train Data ==================================")
    predicted_labels, correct_labels = tag_dataset(train_data)        
    accuracy = compute_accuracy(predicted_labels, correct_labels, padding_label)
    print("Accuracy = ", accuracy)

    #Dev Dataset 
    print("================================== Dev Data: ==================================")
    predicted_labels, correct_labels = tag_dataset(dev_data)  
    accuracy = compute_accuracy(predicted_labels, correct_labels, padding_label)
    print("Accuracy = ", accuracy)


    #Test Dataset 
    #state-of-the-art f-мера~0.91 на test
    print("================================== Test Data: ==================================")
    predicted_labels, correct_labels = tag_dataset(test_data)  
    accuracy = compute_accuracy(predicted_labels, correct_labels, padding_label)
    print("Accuracy = ", accuracy)

        
    print("%.2f sec for evaluation" % (time.time() - start_time))


10 epochs
14987 train sentences
3466 dev sentences
3684 test sentences
--------- Epoch 0 -----------


KeyboardInterrupt: 

In [None]:
# Improvements for credit
# 1. Create train-test-dev split function form one corpus
# 2. Instead of 8 casing types use general capitalization templates
# 3. Orginize sentence padding to (2^n + 1) tokens where (2^n + 1) is closest to current sentence length
# 4. Given item 3 realization organize reasanable work with batches i. e. make iterate_minibatches yield batches of
#  BATCH_SIZE sentences of same padded length(necessary if you want to utilize learning on GPU properly)
# 5. Add symbol features to neural net- either feed embeddings of CHAR_SIZE first and CHAR_SIZE last symbols of each token
# to LSTM(or GRU) or feed embeddings of all symbols of token to CNN. The resulting vector of this layer(it should be
# of same length for every token) has to be concatenated with word embeddings and other token features
# 6. Conduct experiments with architecture: try stacking several LSTM instead of 1, try replacing it with GRU or CNN etc.
# 7. Instead of accuracy evaluate with f-measure on entities constucted from several tokens as in conlleval
# If all the improvements are implemented correcly you should be able to achieve results close to state-of-the-art:
# f-measure on test-set >= 0.905