In [194]:

from numpy.random import seed
seed(13)
from tensorflow import set_random_seed
set_random_seed(13)

from keras import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, MaxPooling3D, Dropout, Embedding, Bidirectional, SimpleRNN, LSTM, GRU
from keras import layers
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.utils import to_categorical
from keras.models import model_from_json 
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tempfile import TemporaryFile
from tqdm import tqdm
from operator import itemgetter
import numpy as np
import os
import cv2             
import pandas as pd
import random as rand
import matplotlib.pyplot as plt
import shutil
import os
import pickle

# Load Data

In [2]:
# %load conll_dictorizer.py
"""
CoNLL 2009 file readers and writers for the parts of speech.
Version with a class modeled as a vectorizer
"""
__author__ = "Pierre Nugues"

import regex as re


def save(file, corpus_dict, column_names):
    """
    Saves the corpus in a file
    :param file:
    :param corpus_dict:
    :param column_names:
    :return:
    """
    with open(file, 'w') as f_out:
        for sentence in corpus_dict:
            sentence_lst = []
            for row in sentence:
                items = map(lambda x: row.get(x, '_'), column_names)
                sentence_lst += '\t'.join(items) + '\n'
            sentence_lst += '\n'
            f_out.write(''.join(sentence_lst))


class Token(dict):
    pass


class CoNLLDictorizer:

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        pass

    def transform(self, corpus):
        corpus = corpus.strip()
        sentences = re.split(self.sent_sep, corpus)
        return list(map(self._split_in_words, sentences))

    def fit_transform(self, corpus):
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        rows = re.split('\n', sentence)
        return [Token(dict(zip(self.column_names,
                               re.split(self.col_sep, row))))
                for row in rows]


if __name__ == '__main__':
    
    BASE = os.getcwd()
    train_file = os.path.join(BASE, 'datasets/train.txt')

    column_names = ['id', 'form', 'lemma', 'cpos', 'pos', 'feats']
    train = open(train_file).read().strip()
    conll_dict = CoNLLDictorizer(column_names, col_sep='\t')
    train_dict = conll_dict.transform(train)

    print(train_dict[0])
    print(train_dict[0][0])
    print(type(train_dict[0][0]))
    #print(train_dict[0][0]['form'])
    print(train_dict[1])
    tok = Token({'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'})
    print(tok['form'])
    print('form' in tok)

    save('out', train_dict, column_names)

    tok_dict = {'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}
    tok_dict2 = {'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}

    tok_set = set(tok_dict)
    print(tok_set)

    tok_set = tok_set.union(tok_dict2)
    #print(tok_set)

    #print(tok.keys())

    # exit()
    word_set = set()
    word_set = set(tok_dict.values())
    #print(list(word_set))

    word_set = set()
    word_set = set(tok.values())
    #print(list(word_set))

    word_set = set()
    word_set.update(tok.values())
    #print(list(word_set))

    word_set = set()
    #print("Token value:", tok.values())
    word_set = word_set.union(set(tok.values()))
    #print(list(word_set))

[{'id': '-DOCSTART- -X- -X- O'}]
{'id': '-DOCSTART- -X- -X- O'}
<class '__main__.Token'>
[{'id': 'EU NNP B-NP B-ORG'}, {'id': 'rejects VBZ B-VP O'}, {'id': 'German JJ B-NP B-MISC'}, {'id': 'call NN I-NP O'}, {'id': 'to TO B-VP O'}, {'id': 'boycott VB I-VP O'}, {'id': 'British JJ B-NP B-MISC'}, {'id': 'lamb NN I-NP O'}, {'id': '. . O O'}]
La
True
{'pos', 'lemma', 'cpos', 'feats', 'form', 'id'}


In [3]:
# %load datasets.py
from conll_dictorizer import CoNLLDictorizer, Token
import os

def load_conll2009_pos():
    train_file = 'datasets\train.txt'
    dev_file = 'datasets\valid.txt'
    test_file = 'datasets\test.txt'
    test2_file = 'simple_pos_test.txt'

    column_names = ['id', 'form', 'lemma', 'plemma', 'pos', 'ppos']

    train_sentences = open(train_file).read().strip()
    dev_sentences = open(dev_file).read().strip()
    test_sentences = open(test_file).read().strip()
    test2_sentences = open(test2_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names

def load_conll2003_en():
    BASE_DIR = os.getcwd()
    train_file = BASE_DIR + '/datasets/train.txt'
    dev_file = BASE_DIR + '/datasets/valid.txt'
    test_file = BASE_DIR + '/datasets/test.txt'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = open(train_file).read().strip()
    dev_sentences = open(dev_file).read().strip()
    test_sentences = open(test_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names


if __name__ == '__main__':
    train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()

    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    train_dict = conll_dict.transform(train_sentences)
    val_dict = conll_dict.transform(dev_sentences)
    test_dict = conll_dict.transform(test_sentences)
    print(train_dict[0])
    print(train_dict[1])

[{'form': '-DOCSTART-', 'ppos': '-X-', 'pchunk': '-X-', 'ner': 'O'}]
[{'form': 'EU', 'ppos': 'NNP', 'pchunk': 'B-NP', 'ner': 'B-ORG'}, {'form': 'rejects', 'ppos': 'VBZ', 'pchunk': 'B-VP', 'ner': 'O'}, {'form': 'German', 'ppos': 'JJ', 'pchunk': 'B-NP', 'ner': 'B-MISC'}, {'form': 'call', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': 'to', 'ppos': 'TO', 'pchunk': 'B-VP', 'ner': 'O'}, {'form': 'boycott', 'ppos': 'VB', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'British', 'ppos': 'JJ', 'pchunk': 'B-NP', 'ner': 'B-MISC'}, {'form': 'lamb', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': '.', 'ppos': '.', 'pchunk': 'O', 'ner': 'O'}]


In [4]:
def load_glove(file):
    embeddings_dict = {}
    glove = open(file, encoding='utf-8')
    
    for line in glove:
        line = line.strip().split()
        word = line[0]
        embedding_vec_word = np.array(line[1:], dtype='float32')
        embeddings_dict[word] = embedding_vec_word
        
    glove.close()
    return embeddings_dict

# Load and Save files

In [6]:
def load_file(file_name):
    with open('files/' + file_name + '.pkl', 'rb') as f:
        obj = pickle.load(f)
    return obj

In [7]:
def save_file(file_name, file):
    with open('files/' + file_name + '.pkl', 'wb') as f:
        pickle.dump(file, f)

In [8]:
embeddings_dict = load_file('embeddings_dict')

# Data preprocessing

In [212]:
# Train_dict is a list of lists of dictionaries
def extract_features(train_dict):
    X, y = [], []
    
    for sentence in train_dict:
        X_sentence = []
        y_sentence = []
        for word in sentence:
            w = word['form'].lower()
            n = word['ner']
            X_sentence.append(w)
            y_sentence.append(n)
    
        X.append(X_sentence)
        y.append(y_sentence)
    
    return X, y

#### Extract words and ner tags - X, Y

In [213]:
X, y = extract_features(train_dict)
print('Sentence words: ', X[1])
print('Sentence NER: ', y[1])

Sentence words:  ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
Sentence NER:  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


#### Create vocabularies

In [214]:
def create_vocabulary(X, WORDS=True):
    X_vocabulary = set()
    
    if WORDS:
        X_vocabulary.add("UNKNOWN_WORD")
        
    for sentence in X:
        for word in sentence:
            X_vocabulary.add(word)
    
    return sorted(list(X_vocabulary))

In [215]:
X_vocabulary = create_vocabulary(X, WORDS=True)
print("Vocabulary size WORDS: ", len(X_vocabulary))

Vocabulary size WORDS:  21011


In [216]:
y_vocabulary = create_vocabulary(y, WORDS=False)
print("Vocabulary size NER: ", len(y_vocabulary))
nbr_of_classes = len(y_vocabulary) + 2

Vocabulary size NER:  9


In [217]:
y_vocabulary

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']

#### Add words from GloVe

In [218]:
for word in embeddings_dict.keys():
    X_vocabulary.append(word)

X_vocabulary = sorted(list(set(X_vocabulary)))
total_word_count = len(X_vocabulary)
print('Words in the vocabulary total, X_vocabulary:', total_word_count)

Words in the vocabulary total, X_vocabulary: 402596


#### Create indices and inverted indices

In [219]:
def create_indices(X):
    return dict(enumerate(X), start=2) # 0 is padding, 1 is unknown

In [220]:
i = 2
X_indices_to_words = {}

for w in X_vocabulary:
    X_indices_to_words[i] = w
    i += 1

y_indices_to_len = {}
y_indices_to_len[0] = 'O' #PADDING
y_indices_to_len[1] = 'UNKNOWN_WORD'
i = 2

for l in y_vocabulary:
    if l != 'O':
        y_indices_to_len[i] = l
        i += 1
        
#X_indices_to_len = dict(enumerate(X_vocabulary), start=2)
#y_indices_to_len = dict(enumerate(y_vocabulary), start=2)

In [221]:
y_indices_to_len

{0: 'O',
 1: 'UNKNOWN_WORD',
 2: 'B-LOC',
 3: 'B-MISC',
 4: 'B-ORG',
 5: 'B-PER',
 6: 'I-LOC',
 7: 'I-MISC',
 8: 'I-ORG',
 9: 'I-PER'}

In [222]:
def create_inverted_indices(X):
    return {v: k for k, v in X.items()}

In [223]:
X_words_to_indices = create_inverted_indices(X_indices_to_words)
y_len_to_indices = create_inverted_indices(y_indices_to_len)

In [224]:
y_len_to_indices

{'O': 0,
 'UNKNOWN_WORD': 1,
 'B-LOC': 2,
 'B-MISC': 3,
 'B-ORG': 4,
 'B-PER': 5,
 'I-LOC': 6,
 'I-MISC': 7,
 'I-ORG': 8,
 'I-PER': 9}

In [225]:
print('Word index:', list(X_words_to_indices.items())[:3])
print('LEN index:', list(y_len_to_indices.items())[:3])

Word index: [('!', 2), ('!!', 3), ('!!!', 4)]
LEN index: [('O', 0), ('UNKNOWN_WORD', 1), ('B-LOC', 2)]


#### Encode lists - Convert to indices

In [226]:
def encode_to_indices(X, X_words_to_indices, num_words=None):
    X_encoded = []
    for x in X:
        X_encoded_words = []
        if num_words:
            # We map the unknown words to the second first index of the matrix, for the test set
            X_encoded_words = list(map(lambda x: X_words_to_indices.get(x,1), x))
        else:
             X_encoded_words = list(map(X_words_to_indices.get, x))
            #for val in x:
            #    X_encoded_words.append(X_words_to_indices.get(val))
        X_encoded += [X_encoded_words]
    return X_encoded

In [227]:
X_only_indices = encode_to_indices(X, X_words_to_indices)
y_only_indices = encode_to_indices(y, y_len_to_indices)
print('First sentences, word indices', X_only_indices[4])
print("")
print('First sentences, LEN indices', y_only_indices[4])

First sentences, word indices [359699, 143138, 107474, 318005, 271940, 361488, 195554, 126463, 391264, 161837, 48420, 363369, 109493, 363369, 332754, 85853, 218261, 375629, 324031, 123767, 389005, 231734, 112304, 126757, 92049, 72526, 366525, 363369, 330316, 936]

First sentences, LEN indices [0, 4, 8, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Embeddings

In [228]:
def most_similar_embeddings(embeddings_dict, key_word, n):
    
    d = {}
    key_word_embedding = embeddings_dict[key_word]
    
    for word, embedding in embeddings_dict.items():
        d_val = cosine_similarity([key_word_embedding], [embedding])[0][0]
        d[word] = d_val
    
    d_sorted = sorted(d.items(), key=itemgetter(1), reverse=True)[1:n+1] # Do not return table
        
    return d_sorted

In [229]:
#table = most_similar_embeddings(embeddings_dict, 'table', 5)
#france = most_similar_embeddings(embeddings_dict, 'france', 5)
#sweden = most_similar_embeddings(embeddings_dict, 'sweden', 5)

In [230]:
#print(table)
#print(france)
#print(sweden)

In [231]:
def fill_glove_matrix(X_vocabulary, embeddings_dict):
    for word in X_vocabulary:
        if word in embeddings_dict:
            i = X_words_to_indices[word]
            embedding = embeddings_dict[word]
            word_embedding_matrix[i] = embedding
    return word_embedding_matrix

In [232]:
word_embedding_matrix = np.random.random((len(X_vocabulary)+2, 100))
word_embedding_matrix = fill_glove_matrix(X_vocabulary, embeddings_dict)
print('Shape of embedding matrix:', word_embedding_matrix.shape)
#print('Embedding of the padding symbol, idx 0, random numbers', word_embedding_matrix[0])

Shape of embedding matrix: (402598, 100)


# Padding the sequences

#### Find the longest sequence in either train, val, test

In [233]:
max_seq_len_train = max(len(s) for s in X)
print("Maximum sentence length in train: ", max_seq_len_train)

X_val, _ = extract_features(val_dict)
max_seq_len_val = max(len(s) for s in X_val)
print("Maximum sentence length in val: ", max_seq_len_val)

X_test, _ = extract_features(test_dict)
max_seq_len_test = max(len(s) for s in X_test)
print("Maximum sentence length in test: ", max_seq_len_test)

max_seq_len = max(max_seq_len_train, max_seq_len_val, max_seq_len_test)

print("Maximum sentence length total: ", max_seq_len)

Maximum sentence length in train:  113
Maximum sentence length in val:  109
Maximum sentence length in test:  124
Maximum sentence length total:  124


In [234]:
X_train = pad_sequences(X_only_indices, maxlen=max_seq_len)
y_train = pad_sequences(y_only_indices, maxlen=max_seq_len)

print(X_train[1])
print(y_train[1])

# The number of classes and 0 (padding symbol)
y_train = to_categorical(y_train, num_classes=nbr_of_classes + 1)
print(y_train[1])

[     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0 142144 307144 161837  91322 363369
  83767  85853 218261    936]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

# Load validation data

In [235]:
X_val, y_val = extract_features(val_dict)

# We create the parallel sequences of indexes
X_only_indices_val = encode_to_indices(X_val, X_words_to_indices, num_words=total_word_count)
y_only_indices_val = encode_to_indices(y_val, y_len_to_indices)

X_val = pad_sequences(X_only_indices_val, maxlen=max_seq_len)
y_val = pad_sequences(y_only_indices_val, maxlen=max_seq_len)

print(X_val[1])
print(y_val[1])

y_val = to_categorical(y_val, num_classes=nbr_of_classes + 1)
print(y_val[1])

[     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0 113352    679 221876 354361 275585  63472 364506
  49151 192164 381012    936]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

# Params

In [236]:
epochs = 1
batch_size = 128

# Model

#### SIMPLE RNN

In [237]:
def build_simpleRNN():
    model = Sequential()
    model.add(Embedding(total_word_count+2,
                               100,
                               mask_zero=True,
                               input_length=max_seq_len))
    model.layers[0].set_weights([word_embedding_matrix])
    model.layers[0].trainable = True
    model.add(SimpleRNN(32, return_sequences=True))
    model.add(Dense(nbr_of_classes + 1, activation='softmax'))
    return model

#### LSTM 

In [238]:
def build_LSTM():
    model = Sequential()
    model.add(Embedding(total_word_count+2,
                               100,
                               mask_zero=True,
                               input_length=max_seq_len))
    model.layers[0].set_weights([word_embedding_matrix])
    model.layers[0].trainable = True
    model.add(LSTM(128, return_sequences=True))
    model.add(Dense(nbr_of_classes + 1, activation='softmax'))
    return model

#### LSTM BIDIRECTIONAL

In [239]:
def build_LSTM_BIDIRECTIONAL():
    model = Sequential()
    model.add(Embedding(total_word_count+2,
                               100,
                               mask_zero=True,
                               input_length=max_seq_len))
    model.layers[0].set_weights([word_embedding_matrix])
    model.layers[0].trainable = True
    model.add(Bidirectional(layers.SimpleRNN(nbr_of_classes + 1, return_sequences=True)))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dense(nbr_of_classes + 1, activation='softmax'))
    return model

In [240]:
model = build_LSTM_BIDIRECTIONAL()

In [241]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 124, 100)          40259800  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 124, 24)           2712      
_________________________________________________________________
lstm_4 (LSTM)                (None, 124, 128)          78336     
_________________________________________________________________
dense_6 (Dense)              (None, 124, 12)           1548      
Total params: 40,342,396
Trainable params: 40,342,396
Non-trainable params: 0
_________________________________________________________________


In [242]:
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(X_val, y_val))

Train on 14987 samples, validate on 3466 samples
Epoch 1/1


<keras.callbacks.History at 0x7f6388def080>

# Evaluate

#### Load test set

In [262]:
X_test, y_test = extract_features(test_dict)

# We create the parallel sequences of indexes
X_only_indices_test = encode_to_indices(X_test, X_words_to_indices, num_words=total_word_count)
y_only_indices_test = encode_to_indices(y_test, y_len_to_indices)

X_test_padded = pad_sequences(X_only_indices_test, maxlen=max_seq_len)
y_test_padded = pad_sequences(y_only_indices_test, maxlen=max_seq_len)

print(X_test_padded[1])
print(y_test_padded[1])

# The number of LEN classes and 0 (padding symbol)
y_test_vectorized = to_categorical(y_test_padded, num_classes=nbr_of_classes + 1)
print(y_test_vectorized[1])

[     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0 338645    679 197601 162138 229068 390519    517 100681
 190292 350950 120819    936]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [263]:
y_indices_to_len

{0: 'O',
 1: 'UNKNOWN_WORD',
 2: 'B-LOC',
 3: 'B-MISC',
 4: 'B-ORG',
 5: 'B-PER',
 6: 'I-LOC',
 7: 'I-MISC',
 8: 'I-ORG',
 9: 'I-PER'}

In [264]:
y_test[1]

['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']

In [246]:
print(len(X_test), len(y_test), len(X_only_indices_test), len(y_only_indices_test))

3684 3684 3684 3684


In [247]:
loss, acc = model.evaluate(X_test_padded, y_test_vectorized, batch_size=batch_size, verbose=1)



In [248]:
print(loss)

0.5609088026868921


In [249]:
print(acc)

0.8352069379711254


#### Connleval

In [287]:
y_pred_probs = model.predict(X_test_padded)

In [288]:
y_pred_probs[1]

array([[8.7175146e-02, 8.0977969e-02, 8.4987573e-02, ..., 8.3782949e-02,
        8.1285521e-02, 8.1265487e-02],
       [8.7175146e-02, 8.0977969e-02, 8.4987573e-02, ..., 8.3782949e-02,
        8.1285521e-02, 8.1265487e-02],
       [8.7175146e-02, 8.0977969e-02, 8.4987573e-02, ..., 8.3782949e-02,
        8.1285521e-02, 8.1265487e-02],
       ...,
       [9.3399054e-01, 1.1692542e-04, 2.3872891e-02, ..., 1.2507252e-03,
        7.9098267e-05, 1.0645347e-04],
       [9.6727854e-01, 6.9592221e-05, 1.1995676e-02, ..., 8.4547827e-04,
        4.0951207e-05, 6.5927372e-05],
       [9.8896211e-01, 1.6966827e-05, 4.4216886e-03, ..., 1.5991685e-04,
        9.9133158e-06, 1.3971513e-05]], dtype=float32)

In [291]:
print('X_test', X_test[1])
print('X_test_padded', X_test_padded[1])
print('Y_test', y_test[1])
print('Y_test_padded', y_test_padded[1])
print('predictions', y_pred_probs[1])

X_test ['soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.']
X_test_padded [     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0 338645    679 197601 162138 229068 390519    517 100681
 190292 350950 120819    936]
Y_

In [294]:
# Remove padding
y_pred_probs_no_padd = []
for sent_nbr, sent_len_predictions in enumerate(corpus_pos_predictions):
    y_pred_probs_no_padd += [sent_len_predictions[-len(X_test[sent_nbr]):]]
print(y_pred_probs_no_padd[0])

[[0.7705599  0.00751109 0.04419868 0.02887081 0.03264279 0.02014346
  0.01727237 0.01992202 0.02555126 0.01793962 0.00776042 0.00762758]]


In [302]:
# Extract prediction with highest probability and convert indices to symbols
y_pred = []
for sentence in y_pred_probs_no_padd:
    len_idx = list(map(np.argmax, sentence))
    len_cat = list(map(y_indices_to_len.get, len_idx))
    y_pred += [len_cat]

print(y_pred[:3])
print("")
print(y_test[:3])

[['O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], [None, None]]

[['O'], ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O'], ['B-PER', 'I-PER']]


In [303]:
len(y_pred), len(y_test)

(3684, 3684)

In [304]:
c = 0
for arr in y_test:
    for x in arr:
        c += 1

print(c)

46666


In [305]:
c = 0
for arr in y_pred:
    for x in arr:
        c += 1

print(c)

46666


In [306]:
c = 0
for i, arr in enumerate(y_test):
    for j, word in enumerate(arr):
        
        try:
            
            if y_pred[i][j] == y_test[i][j]:
                c += 1
        except:
            print(i, j)

    
print(c)

39616


In [307]:
len(y_pred[213]), len(y_test[213])

(124, 124)

In [308]:
total, correct, total_ukn, correct_ukn = 0, 0, 0, 0
for id_s, sentence in enumerate(X_test):
    for id_w, word in enumerate(sentence):
        
        if word not in X_words_to_indices:
            total += 1
            
print(total)

1143


In [309]:
total, correct, total_ukn, correct_ukn = 0, 0, 0, 0
missing = 0

for id_s, sentence in enumerate(X_test):
    for id_w, word in enumerate(sentence):
        
        try:
            if y_pred[id_s][id_w] == y_test[id_s][id_w]:
                correct += 1
        except:
            print(id_s, id_w)
            missing += 1
        # The word is not in the dictionary
        if word not in X_words_to_indices:
            total_ukn += 1
            if pos_pred[id_s][id_w] == y_test[id_s][id_w]:
                correct_ukn += 1

total += correct
total += total_ukn
total += correct_ukn
print('total %d, correct %d, accuracy %f' % (total, correct, correct / total))
if total_ukn != 0:
    print('total unknown %d, correct %d, accuracy %f' % (total_ukn, correct_ukn, correct_ukn / total_ukn))

total 41620, correct 39616, accuracy 0.951850
total unknown 1143, correct 861, accuracy 0.753281


In [186]:
f = open("F1_SCORE.txt", "w")

x = 0

for id_s, sentence in enumerate(y_test):
    
    #if x == 1000:
    #    break
        
    for id_w, word in enumerate(sentence):
        x += 1
        
        word = test_dict[id_s][id_w]['form']
        ppos = test_dict[id_s][id_w]['ppos']
        pchunck = test_dict[id_s][id_w]['pchunk']
        y_true = y_test[id_s][id_w]
        pred = y_pred[id_s][id_w]

        f.write(str(word) + " " + str(ppos) + " " + str(pchunck) + " " + str(y_true).upper() + " " + str(pred).upper() + "\n")

        if word == "-DOCSTART-" or word == ".":
            f.write("\n")
                
        except Exception as e:
            print(e)
            print(id_s, id_w)
        
f.close()
        