In [4]:
from numpy.random import seed
seed(13)
from tensorflow import set_random_seed
set_random_seed(13)

from keras import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, MaxPooling3D, Dropout, Embedding, SimpleRNN, LSTM, GRU
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.utils import to_categorical
from keras.models import model_from_json
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from operator import itemgetter
import numpy as np
import os
import cv2             
import pandas as pd
import random as rand
import matplotlib.pyplot as plt
import shutil
import os

Using TensorFlow backend.


# Load Data

In [5]:
# %load conll_dictorizer.py
"""
CoNLL 2009 file readers and writers for the parts of speech.
Version with a class modeled as a vectorizer
"""
__author__ = "Pierre Nugues"

import regex as re


def save(file, corpus_dict, column_names):
    """
    Saves the corpus in a file
    :param file:
    :param corpus_dict:
    :param column_names:
    :return:
    """
    with open(file, 'w') as f_out:
        for sentence in corpus_dict:
            sentence_lst = []
            for row in sentence:
                items = map(lambda x: row.get(x, '_'), column_names)
                sentence_lst += '\t'.join(items) + '\n'
            sentence_lst += '\n'
            f_out.write(''.join(sentence_lst))


class Token(dict):
    pass


class CoNLLDictorizer:

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        pass

    def transform(self, corpus):
        corpus = corpus.strip()
        sentences = re.split(self.sent_sep, corpus)
        return list(map(self._split_in_words, sentences))

    def fit_transform(self, corpus):
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        rows = re.split('\n', sentence)
        return [Token(dict(zip(self.column_names,
                               re.split(self.col_sep, row))))
                for row in rows]


if __name__ == '__main__':
    
    BASE = os.getcwd()
    train_file = os.path.join(BASE, 'datasets/train.txt')

    column_names = ['id', 'form', 'lemma', 'cpos', 'pos', 'feats']
    train = open(train_file).read().strip()
    conll_dict = CoNLLDictorizer(column_names, col_sep='\t')
    train_dict = conll_dict.transform(train)

    print(train_dict[0])
    print(train_dict[0][0])
    print(type(train_dict[0][0]))
    #print(train_dict[0][0]['form'])
    print(train_dict[1])
    tok = Token({'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'})
    print(tok['form'])
    print('form' in tok)

    save('out', train_dict, column_names)

    tok_dict = {'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}
    tok_dict2 = {'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}

    tok_set = set(tok_dict)
    print(tok_set)

    tok_set = tok_set.union(tok_dict2)
    print(tok_set)

    print(tok.keys())

    # exit()
    word_set = set()
    word_set = set(tok_dict.values())
    print(list(word_set))

    word_set = set()
    word_set = set(tok.values())
    print(list(word_set))

    word_set = set()
    word_set.update(tok.values())
    print(list(word_set))

    word_set = set()
    print("Token value:", tok.values())
    word_set = word_set.union(set(tok.values()))
    print(list(word_set))

[{'id': '-DOCSTART- -X- -X- O'}]
{'id': '-DOCSTART- -X- -X- O'}
<class '__main__.Token'>
[{'id': 'EU NNP B-NP B-ORG'}, {'id': 'rejects VBZ B-VP O'}, {'id': 'German JJ B-NP B-MISC'}, {'id': 'call NN I-NP O'}, {'id': 'to TO B-VP O'}, {'id': 'boycott VB I-VP O'}, {'id': 'British JJ B-NP B-MISC'}, {'id': 'lamb NN I-NP O'}, {'id': '. . O O'}]
La
True
{'form', 'pos', 'lemma', 'cpos', 'id', 'feats'}
{'form', 'lemma', 'pos', 'cpos', 'id', 'feats'}
dict_keys(['id', 'form', 'lemma', 'cpos', 'pos', 'feats'])
['1', 'da', 'La', 'num=s|gen=f', 'el', 'd']
['1', 'da', 'La', 'num=s|gen=f', 'el', 'd']
['1', 'da', 'La', 'num=s|gen=f', 'el', 'd']
Token value: dict_values(['1', 'La', 'el', 'd', 'da', 'num=s|gen=f'])
['1', 'La', 'num=s|gen=f', 'da', 'el', 'd']


In [6]:
# %load datasets.py
from conll_dictorizer import CoNLLDictorizer, Token
import os

def load_conll2009_pos():
    train_file = 'datasets\train.txt'
    dev_file = 'datasets\valid.txt'
    test_file = 'datasets\test.txt'
    test2_file = 'simple_pos_test.txt'

    column_names = ['id', 'form', 'lemma', 'plemma', 'pos', 'ppos']

    train_sentences = open(train_file).read().strip()
    dev_sentences = open(dev_file).read().strip()
    test_sentences = open(test_file).read().strip()
    test2_sentences = open(test2_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names

def load_conll2003_en():
    BASE_DIR = os.getcwd()
    train_file = BASE_DIR + '/datasets/train.txt'
    dev_file = BASE_DIR + '/datasets/valid.txt'
    test_file = BASE_DIR + '/datasets/test.txt'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = open(train_file).read().strip()
    dev_sentences = open(dev_file).read().strip()
    test_sentences = open(test_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names


if __name__ == '__main__':
    train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()

    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    train_dict = conll_dict.transform(train_sentences)
    val_dict = conll_dict.transform(dev_sentences)
    test_dict = conll_dict.transform(test_sentences)
    print(train_dict[0])
    print(train_dict[1])

[{'form': '-DOCSTART-', 'ppos': '-X-', 'pchunk': '-X-', 'ner': 'O'}]
[{'form': 'EU', 'ppos': 'NNP', 'pchunk': 'B-NP', 'ner': 'B-ORG'}, {'form': 'rejects', 'ppos': 'VBZ', 'pchunk': 'B-VP', 'ner': 'O'}, {'form': 'German', 'ppos': 'JJ', 'pchunk': 'B-NP', 'ner': 'B-MISC'}, {'form': 'call', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': 'to', 'ppos': 'TO', 'pchunk': 'B-VP', 'ner': 'O'}, {'form': 'boycott', 'ppos': 'VB', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'British', 'ppos': 'JJ', 'pchunk': 'B-NP', 'ner': 'B-MISC'}, {'form': 'lamb', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': '.', 'ppos': '.', 'pchunk': 'O', 'ner': 'O'}]


In [7]:
len(train_dict)

14987

In [8]:
train_dict[1]

[{'form': 'EU', 'ppos': 'NNP', 'pchunk': 'B-NP', 'ner': 'B-ORG'},
 {'form': 'rejects', 'ppos': 'VBZ', 'pchunk': 'B-VP', 'ner': 'O'},
 {'form': 'German', 'ppos': 'JJ', 'pchunk': 'B-NP', 'ner': 'B-MISC'},
 {'form': 'call', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'},
 {'form': 'to', 'ppos': 'TO', 'pchunk': 'B-VP', 'ner': 'O'},
 {'form': 'boycott', 'ppos': 'VB', 'pchunk': 'I-VP', 'ner': 'O'},
 {'form': 'British', 'ppos': 'JJ', 'pchunk': 'B-NP', 'ner': 'B-MISC'},
 {'form': 'lamb', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'},
 {'form': '.', 'ppos': '.', 'pchunk': 'O', 'ner': 'O'}]

# Data preprocessing

In [9]:
# train_dict is a list of lists of dictionaries
def extract_WORDS_NER_TAGS(train_dict):
    X, Y = [], []
    
    for list_dict in train_dict: # list of dicts, each list is a sentence
        X_sentence = []
        Y_sentence = []
        for d in list_dict: # each dict is a word
            word = d['form'].lower()
            ner = d['ner'].lower()
            if word != "-docstart-":
                X_sentence.append(word)
                Y_sentence.append(ner)
        
        X.append(X_sentence)
        Y.append(Y_sentence)
    
    return X, Y

In [10]:
def create_indices_words(X, glove):
    d_indices = {}
    # 0 = padding,  1 = unknown
    c = 2
            
    # Add all words from training set       
    for sentence in X:
        for word in sentence:
            if word not in d_indices:
                d_indices[word] = c
                c += 1
            
    print(c)
    
        
    # Add all glove words
    for word in glove:
        if word not in d_indices:
            d_indices[word] = c
            c += 1
        
    return d_indices

In [11]:
def create_indices_NER(Y):
    d_indices = {}
    c = 0
    
    # Add all NER tags from training set       
    for sentence in Y:
        for tag in sentence:
            if tag not in d_indices:
                d_indices[tag] = c
                c += 1
        
    return d_indices

In [12]:
def create_inverted_indices(d_indices):
    return {v: k for k, v in d_indices.items()}

In [13]:
def load_GloVe(file):
    embeddings_dict = {}
    glove = open(file, encoding='utf-8')
    
    for line in glove:
        line = line.strip().split()
        word = line[0]
        embedding_vec_word = np.array(line[1:], dtype='float32')
        embeddings_dict[word] = embedding_vec_word
        
    glove.close()
    return embeddings_dict

#### Extract WORDS and NER Tags

In [14]:
X, Y = extract_WORDS_NER_TAGS(train_dict)

#### Load GloVe vector embeddings 

In [15]:
embeddings_dict = load_GloVe('glove.6B/glove.6B.100d.txt')

#### Create indices and inverted indices

In [16]:
X_indices = create_indices_words(X, embeddings_dict)
Y_indices = create_indices_NER(Y)

21011


In [17]:
X_inv_indicies = create_inverted_indices(X_indices)
Y_inv_indicies = create_inverted_indices(Y_indices)

#### Calculate most similar word embeddings 

In [18]:
def most_similar_embeddings(embeddings_dict, key_word, n):
    
    cs_dict = {}
    key_word_embedding = embeddings_dict[key_word]
    
    for word, embedding in embeddings_dict.items():
        cs = cosine_similarity([key_word_embedding], [embedding])[0][0]
        cs_dict[word] = cs
    
    cs_sorted = sorted(cs_dict.items(), key=itemgetter(1), reverse=True)[1:n+1] # Do not return table
        
    return cs_sorted

In [19]:
#cs_most_similar = most_similar_embeddings(embeddings_dict, 'table', 5)

In [20]:
#cs_most_similar

#### Create GloVe word embedding matrix

In [21]:
def fill_GloVe_matrix(X_indices, embeddings_dict):
    c = 0
    for word in X_indices:
        embedding = embeddings_dict.get(word)
        if embedding is not None:
            word_embedding_matrix[c] = embedding
        else:
            word_embedding_matrix[c] = np.random.rand(1, 100)
        c += 1
    return word_embedding_matrix

In [22]:
word_embedding_matrix = np.random.rand(len(X_indices), 100)
word_embedding_matrix = fill_GloVe_matrix(X_indices, embeddings_dict)

#### Create sequenced data

In [23]:
len(X)

14987

In [24]:
len(X_indices)

402594

In [25]:
Y[1][0]

'b-org'

In [50]:
def build_sequences(X, Y, X_indices, Y_indices):
    # Find sequences. Each sequence is seperated by '.'
    X_all_seq = []
    X_seq = []
    Y_all_seq = []
    Y_seq = []
    
    for i_s, sentence in enumerate(X):
        for i_w, w in enumerate(sentence):
            
            word_encdeod = X_indices.get(w)
            tag = Y[i_s][i_w]
            tag_encoded = Y_indices.get(tag)
            #print(w, word_encdeod)
            #print(tag, tag_encoded)
            
            X_seq.append(word_encdeod)
            Y_seq.append(tag_encoded)
        
        X_all_seq.append(np.array(X_seq))
        X_seq = []
        Y_all_seq.append(np.array(Y_seq))
        Y_seq = []

    print(len(X_all_seq))
    max_len = max(len(l) for l in X_all_seq)
    #     avg_len = np.average(np.array([len(x) for x in X_all_seq]))
        
    X_all_seq = pad_sequences(X_all_seq, 50)
    Y_all_seq = pad_sequences(Y_all_seq, 50)
    
    return np.array(X_all_seq), np.array(Y_all_seq)

In [51]:
X_seq, Y_seq = build_sequences(X, Y, X_indices, Y_indices)

14987


In [55]:
Y_seq.shape[0]

14987

### Load validation set

In [53]:
X_val, Y_val = extract_WORDS_NER_TAGS(val_dict)
X_indices_val = create_indices_words(X_val, embeddings_dict)
Y_indices_val = create_indices_NER(Y_val)
X_inv_indicies_val = create_inverted_indices(X_indices_val)
Y_inv_indicies_val = create_inverted_indices(Y_indices_val)
X_seq_val, Y_seq_val = build_sequences(X_val, Y_val, X_indices_val, Y_indices_val)

9004
3466


In [54]:
X_seq.max()

21010

In [39]:
np.average(len(X_seq[:]))

14987.0

In [None]:
# 21011 = number of unique words

In [None]:
Y_seq[1042]

In [None]:
X_seq[0]

# Model

In [60]:
model = Sequential()

vocab_size = 21010
model.add(Embedding(input_dim=vocab_size, output_dim=10, input_length=50))
model.add(SimpleRNN(10, activation='relu', return_sequences=False))
model.add(Dense(50, activation='softmax'))
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 50, 10)            210100    
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 10)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 50)                550       
Total params: 210,860
Trainable params: 210,860
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_seq, Y_seq, epochs=5, batch_size=10, verbose=1, validation_data=(X_seq_val, Y_seq_val))

Train on 14987 samples, validate on 3466 samples
Epoch 1/5
