In [None]:
# Seed the NumPy and TensorFlow random number generators (both with 13)
# before Keras is imported below, presumably so that runs are repeatable.
from numpy.random import seed
seed(13)
from tensorflow import set_random_seed
set_random_seed(13)

from keras import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, MaxPooling3D, Dropout, Embedding, SimpleRNN, LSTM, GRU
from keras import layers
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.utils import to_categorical
from keras.models import model_from_json
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tempfile import TemporaryFile
from tqdm import tqdm
from operator import itemgetter
import numpy as np
import os
import cv2             
import pandas as pd
import random as rand
import matplotlib.pyplot as plt
import shutil
import os
import pickle

# Load Data

In [None]:
# %load conll_dictorizer.py
"""
CoNLL 2009 file readers and writers for the parts of speech.
Version with a class modeled as a vectorizer
"""
__author__ = "Pierre Nugues"

import regex as re


def save(file, corpus_dict, column_names):
    """
    Saves the corpus in a file, one row per line with tab-separated
    columns, and one empty line after every sentence.

    :param file: path of the output file; opened in text mode 'w', so an
        existing file is overwritten
    :param corpus_dict: iterable of sentences, each sentence being an
        iterable of dict-like rows (anything offering ``get``)
    :param column_names: keys written for every row, in column order; a
        key missing from a row is written as '_'
    :return: None
    """
    with open(file, 'w') as f_out:
        for sentence in corpus_dict:
            # Build complete lines. The previous code used ``list += str``,
            # which extends the list one character at a time and only
            # produced the right text because of the final ''.join().
            lines = ['\t'.join(row.get(name, '_') for name in column_names) + '\n'
                     for row in sentence]
            # Blank line that terminates the sentence
            lines.append('\n')
            f_out.write(''.join(lines))


class Token(dict):
    """A plain dict subclass; it adds no attributes or methods of its own."""


class CoNLLDictorizer:
    """
    Converts the text of a column-formatted corpus into a list of
    sentences, each sentence being a list of Token dictionaries.

    The method names (fit / transform / fit_transform) follow the
    vectorizer convention; nothing is learned from the data.
    """

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        """
        :param column_names: keys given to the columns of a row, in order
        :param sent_sep: regular expression separating two sentences
        :param col_sep: regular expression separating two columns
        """
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        """Does nothing; present only for interface symmetry."""
        return None

    def transform(self, corpus):
        """
        Splits the corpus text into sentences and every sentence into rows.

        :param corpus: the whole corpus as one string
        :return: list (one item per sentence) of lists of Token
        """
        blocks = re.split(self.sent_sep, corpus.strip())
        return [self._split_in_words(block) for block in blocks]

    def fit_transform(self, corpus):
        """Same result as transform(corpus)."""
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        """
        Builds one Token per line of the sentence by pairing the column
        names with the column values; zip() stops at the shorter of the two.
        """
        tokens = []
        for row in re.split('\n', sentence):
            columns = re.split(self.col_sep, row)
            tokens.append(Token(dict(zip(self.column_names, columns))))
        return tokens


if __name__ == '__main__':
    # Demo kept from the conll_dictorizer.py script.
    # NOTE: in a notebook kernel __name__ is '__main__', so this block runs
    # every time the cell is executed: it reads datasets/train.txt and
    # writes a file named 'out' in the current working directory.
    BASE = os.getcwd()
    train_file = os.path.join(BASE, 'datasets/train.txt')

    column_names = ['id', 'form', 'lemma', 'cpos', 'pos', 'feats']
    # Context manager so the file handle is closed right after reading
    with open(train_file) as f_in:
        train = f_in.read().strip()
    conll_dict = CoNLLDictorizer(column_names, col_sep='\t')
    train_dict = conll_dict.transform(train)

    # First sentence, its first token, the token type, then the second sentence
    print(train_dict[0])
    print(train_dict[0][0])
    print(type(train_dict[0][0]))
    print(train_dict[1])
    tok = Token({'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'})
    print(tok['form'])
    print('form' in tok)

    # Round trip: write the parsed corpus back to disk
    save('out', train_dict, column_names)

    tok_dict = {'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}
    tok_dict2 = {'id': '1', 'form': 'La', 'lemma': 'el', 'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}

    # A set built from a dict contains the dict's keys
    tok_set = set(tok_dict)
    print(tok_set)

    tok_set = tok_set.union(tok_dict2)

    # Four equivalent ways of collecting the values of a token in a set
    word_set = set(tok_dict.values())

    word_set = set(tok.values())

    word_set = set()
    word_set.update(tok.values())

    word_set = set()
    word_set = word_set.union(set(tok.values()))

In [None]:
# %load datasets.py
from conll_dictorizer import CoNLLDictorizer, Token
import os

def load_conll2009_pos():
    """
    Reads the train, dev and test files of the ``datasets`` directory,
    resolved against the current working directory.

    :return: tuple (train_sentences, dev_sentences, test_sentences,
        column_names); the three texts have leading and trailing
        whitespace stripped, column_names is the list of column keys
    :raises OSError: if one of the three files cannot be opened
    """
    def read_stripped(path):
        # Context manager so the handle is closed as soon as the text is read
        with open(path) as f_in:
            return f_in.read().strip()

    # The paths are assembled with os.path.join. The former literals
    # ('datasets\train.txt', 'datasets\valid.txt', 'datasets\test.txt')
    # were ordinary strings, so \t and \v were read as TAB and VERTICAL TAB
    # characters and the resulting file names did not exist.
    train_file = os.path.join('datasets', 'train.txt')
    dev_file = os.path.join('datasets', 'valid.txt')
    test_file = os.path.join('datasets', 'test.txt')

    column_names = ['id', 'form', 'lemma', 'plemma', 'pos', 'ppos']

    train_sentences = read_stripped(train_file)
    dev_sentences = read_stripped(dev_file)
    test_sentences = read_stripped(test_file)
    # 'simple_pos_test.txt' used to be read here as well, but its content
    # was never returned; the read was dropped so the file is not required.
    return train_sentences, dev_sentences, test_sentences, column_names

def load_conll2003_en():
    """
    Reads datasets/train.txt, datasets/valid.txt and datasets/test.txt
    below the current working directory.

    :return: tuple (train_sentences, dev_sentences, test_sentences,
        column_names); the three texts have leading and trailing
        whitespace stripped, column_names is
        ['form', 'ppos', 'pchunk', 'ner']
    :raises OSError: if one of the three files cannot be opened
    """
    def read_stripped(path):
        # Context manager so the handle is closed as soon as the text is read
        with open(path) as f_in:
            return f_in.read().strip()

    BASE_DIR = os.getcwd()
    train_file = os.path.join(BASE_DIR, 'datasets', 'train.txt')
    dev_file = os.path.join(BASE_DIR, 'datasets', 'valid.txt')
    test_file = os.path.join(BASE_DIR, 'datasets', 'test.txt')
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = read_stripped(train_file)
    dev_sentences = read_stripped(dev_file)
    test_sentences = read_stripped(test_file)
    return train_sentences, dev_sentences, test_sentences, column_names


if __name__ == '__main__':
    # Load the three raw splits and the column layout of the corpus
    (train_sentences,
     dev_sentences,
     test_sentences,
     column_names) = load_conll2003_en()

    # Columns are separated by one or more spaces
    conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
    # Parse every split into a list of sentences (lists of Token)
    train_dict, val_dict, test_dict = [
        conll_dict.transform(split)
        for split in (train_sentences, dev_sentences, test_sentences)
    ]
    # Show the first two training sentences
    print(train_dict[0])
    print(train_dict[1])

In [None]:
def load_glove(file):
    """
    Reads word embeddings from a text file in which every line holds a
    word followed by the components of its vector, separated by whitespace.

    :param file: path of the embeddings file, read as UTF-8
    :return: dict mapping each word to a float32 numpy array
    """
    embeddings_dict = {}
    # The context manager closes the file even if a line fails to parse
    with open(file, encoding='utf-8') as glove:
        for line in glove:
            fields = line.strip().split()
            if not fields:
                # Blank line: nothing to index (fields[0] would raise IndexError)
                continue
            embeddings_dict[fields[0]] = np.array(fields[1:], dtype='float32')
    return embeddings_dict

# Load and Save files

In [None]:
def load_file(file_name):
    """
    Unpickles the object stored in files/<file_name>.pkl, relative to the
    current working directory.

    :param file_name: base name of the pickle, without directory or extension
    :return: the unpickled object
    """
    pickle_path = 'files/' + file_name + '.pkl'
    with open(pickle_path, 'rb') as f:
        # pickle.load can execute arbitrary code: only load trusted files
        return pickle.load(f)

In [None]:
def save_file(file_name, file):
    """
    Pickles an object to files/<file_name>.pkl, relative to the current
    working directory, overwriting any existing file.

    :param file_name: base name of the pickle, without directory or extension
    :param file: the object to pickle (despite its name, not a file handle)
    :return: None
    """
    # Create the target directory on first use instead of failing with
    # FileNotFoundError
    os.makedirs('files', exist_ok=True)
    with open('files/' + file_name + '.pkl', 'wb') as f:
        pickle.dump(file, f)

In [None]:
# Pickle the training arrays to files/X_train.pkl and files/y_train.pkl.
# NOTE(review): X_train and y_train are only assigned further down in the
# notebook (the load cell below and the "Padding the sequences" section),
# so on a fresh kernel this cell raises NameError if run in document order.
save_file('X_train', X_train)
save_file('y_train', y_train)

In [None]:
# Restore previously pickled intermediate results from the files/ directory
# so the preprocessing cells below do not have to be re-run.
X = load_file('X')
y = load_file('y')
X_indices_to_words = load_file('X_indices_to_words')
y_indices_to_len = load_file('y_indices_to_len')
X_words_to_indices = load_file('X_words_to_indices')
y_len_to_indices = load_file('y_len_to_indices')
X_vocabulary = load_file('X_vocabulary')
# Was load_file('X_vocabulary'): a copy-paste slip that made the tag
# vocabulary a copy of the word vocabulary.
# NOTE(review): requires files/y_vocabulary.pkl to have been saved.
y_vocabulary = load_file('y_vocabulary')
X_only_indices = load_file('X_only_indices')
y_only_indices = load_file('y_only_indices')
word_embedding_matrix = load_file('word_embedding_matrix')
embeddings_dict = load_file('embeddings_dict')
cs_most_similar = load_file('cs_most_similar')
X_train = load_file('X_train')
y_train = load_file('y_train')

# Data preprocessing

In [None]:
def extract_features(train_dict):
    """
    Extracts the lower-cased words and NER tags of a corpus.

    :param train_dict: a list of lists of dictionaries (sentences made of
        tokens); every token must provide the keys 'form' and 'ner'
    :return: (X, y), two parallel lists with one inner list per sentence:
        X holds the lower-cased 'form' values, y the lower-cased 'ner' values
    """
    X, y = [], []
    for sentence in train_dict:
        # Read both fields of a token before moving on to the next token
        pairs = [(token['form'].lower(), token['ner'].lower()) for token in sentence]
        X.append([form for form, _ in pairs])
        y.append([tag for _, tag in pairs])
    return X, y

#### Extract words and NER tags - X, y

In [None]:
# Split the parsed training sentences into parallel lists of lower-cased
# words (X) and lower-cased NER tags (y), then show the second sentence.
X, y = extract_features(train_dict)
print('Sentence words: ', X[1])
print('Sentence NER: ', y[1])

#### Create vocabularies

In [None]:
def create_vocabulary(X, WORDS=True):
    """
    Collects the distinct items found in a list of sequences.

    :param X: iterable of sequences (e.g. sentences) of sortable, hashable
        items such as strings
    :param WORDS: when true, the placeholder "UNKNOWN WORD" is added to
        the vocabulary
    :return: list of the distinct items in sorted order. Sorting makes the
        result, and therefore any index built from it, identical from one
        run to the next; the order of list(set(...)) is not guaranteed.
    """
    vocabulary = {word for sentence in X for word in sentence}

    if WORDS:
        vocabulary.add("UNKNOWN WORD")

    return sorted(vocabulary)

In [None]:
# Distinct lower-cased training words; WORDS defaults to True, so the
# "UNKNOWN WORD" placeholder is included in the count.
X_vocabulary = create_vocabulary(X)
print("Vocabulary size WORDS: ", len(X_vocabulary))

In [None]:
# Distinct lower-cased NER tags.
# NOTE(review): WORDS=True also adds "UNKNOWN WORD" to the tag set, so it is
# counted as a class -- confirm this is intended.
y_vocabulary = create_vocabulary(y, WORDS=True)
print("Vocabulary size NER: ", len(y_vocabulary))
# One more than the tag vocabulary size; later cells pass nbr_of_classes + 1
# to to_categorical and to the output Dense layer.
nbr_of_classes = len(y_vocabulary) + 1

#### Add words from GloVe

In [None]:
# Merge the GloVe words into the vocabulary.
# The result is a sorted list rather than a set, so that (a) the cell can be
# re-run (the previous version left a set behind and then failed on
# .append) and (b) the index built from it is the same on every run.
X_vocabulary = sorted(set(X_vocabulary).union(embeddings_dict.keys()))
total_word_count = len(X_vocabulary)
print('Words in the vocabulary total, X_vocabulary:', total_word_count)

#### Create indices and inverted indices

In [None]:
def create_indices(X):
    """
    Numbers the items of X consecutively, starting at 2.

    Indices 0 and 1 are left free: 0 is padding, 1 is unknown.

    :param X: iterable of items (e.g. a vocabulary)
    :return: dict mapping index -> item, in iteration order of X
    """
    # start=2 belongs to enumerate(); passed to dict() it was taken as an
    # extra entry {'start': 2} and the numbering began at 0.
    return dict(enumerate(X, start=2))

In [None]:
# index -> word and index -> NER tag lookup tables
X_indices_to_words = create_indices(X_vocabulary) 
y_indices_to_len = create_indices(y_vocabulary)

In [None]:
def create_inverted_indices(X):
    """
    Swaps the keys and values of a dictionary.

    :param X: dict whose values are hashable
    :return: new dict mapping each value of X to its key; if a value occurs
        several times, the key met last in iteration order is kept
    """
    inverted = {}
    for index, item in X.items():
        inverted[item] = index
    return inverted

In [None]:
# word -> index and NER tag -> index lookup tables (inverse of the tables above)
X_words_to_indices = create_inverted_indices(X_indices_to_words)
y_len_to_indices = create_inverted_indices(y_indices_to_len)

In [None]:
# Peek at the first three entries of each lookup table
print('Word index:', list(X_words_to_indices.items())[:3])
print('LEN index:', list(y_len_to_indices.items())[:3])

#### Encode lists - Convert to indices

In [None]:
def encode_to_indices(X, X_words_to_indices, num_words=None):
    """
    Replaces every item of every sequence by its index.

    :param X: iterable of sequences of items (words or tags)
    :param X_words_to_indices: dict mapping item -> index
    :param num_words: kept for backward compatibility. It used to select
        whether unknown items were mapped to 1 (truthy) or to None
        (falsy); unknown items are now always mapped to 1.
    :return: list of lists of indices, parallel to X
    """
    # Index 1 is the slot reserved for unknown items (0 is padding).
    # Returning None for them, as the former default branch did, yields
    # sequences that cannot be padded or fed to a model.
    unknown_index = 1
    return [[X_words_to_indices.get(item, unknown_index) for item in sequence]
            for sequence in X]

In [None]:
# Create the parallel sequences of indices for the training words and tags.
# The sentence printed below is the one at position 4, and y holds NER
# tags even though the second label says POS.
X_only_indices = encode_to_indices(X, X_words_to_indices)
y_only_indices = encode_to_indices(y, y_len_to_indices)
print('First sentences, word indices', X_only_indices[4])
print("")
print('First sentences, POS indices', y_only_indices[4])

# Embeddings

In [None]:
def most_similar_embeddings(embeddings_dict, key_word, n):
    """
    Finds the words whose embeddings are closest to that of key_word,
    measured by cosine similarity.

    :param embeddings_dict: dict mapping word -> embedding vector; all
        vectors must have the same length
    :param key_word: the reference word; it is never part of the result
    :param n: maximum number of neighbours to return (values below 1
        give an empty list)
    :return: list of (word, similarity) tuples, most similar first
    :raises KeyError: if key_word is not in embeddings_dict
    """
    key_word_embedding = embeddings_dict[key_word]

    words = list(embeddings_dict)
    vectors = np.array([embeddings_dict[word] for word in words])
    # A single call for the whole vocabulary instead of one call per word
    similarities = cosine_similarity([key_word_embedding], vectors)[0]

    # Leave the key word out by name rather than assuming it sorts first:
    # another word with an identical vector could otherwise take its place.
    scored = [(word, score)
              for word, score in zip(words, similarities)
              if word != key_word]
    scored.sort(key=itemgetter(1), reverse=True)

    return scored[:max(n, 0)]

In [None]:
#cs_most_similar = most_similar_embeddings(embeddings_dict, 'table', 5)

In [None]:
# Display the neighbours held in cs_most_similar. The call that would
# compute them is commented out above, so the value comes from the pickle
# loaded earlier in the notebook.
print("Most similar words: ")
cs_most_similar

In [None]:
def fill_glove_matrix(X_vocabulary, embeddings_dict, words_to_indices=None, embedding_matrix=None):
    """
    Copies the known embeddings into the rows of the embedding matrix.

    :param X_vocabulary: iterable of words to process
    :param embeddings_dict: dict mapping word -> embedding vector; words
        of the vocabulary that are absent from it keep their current row
    :param words_to_indices: dict mapping word -> row index; defaults to
        the notebook-level X_words_to_indices
    :param embedding_matrix: 2-D array modified in place; defaults to the
        notebook-level word_embedding_matrix
    :return: the (modified) embedding matrix
    :raises KeyError: if a word found in embeddings_dict has no index
    """
    # Fall back to the notebook globals the original version relied on
    if words_to_indices is None:
        words_to_indices = X_words_to_indices
    if embedding_matrix is None:
        embedding_matrix = word_embedding_matrix

    for word in X_vocabulary:
        if word in embeddings_dict:
            embedding_matrix[words_to_indices[word]] = embeddings_dict[word]
    return embedding_matrix

In [None]:
# Start from a random matrix with one row per vocabulary entry plus two
# extra rows (indices 0 and 1 are reserved for padding and unknown words),
# then overwrite the rows of the words that have a GloVe vector.
# NOTE(review): 100 is assumed to be the length of the loaded GloVe
# vectors -- confirm against the embeddings file.
word_embedding_matrix = np.random.random((len(X_vocabulary)+2, 100))
word_embedding_matrix = fill_glove_matrix(X_vocabulary, embeddings_dict)
print('Shape of embedding matrix:', word_embedding_matrix.shape)
print('Embedding of the padding symbol, idx 0, random numbers', word_embedding_matrix[0])

# Padding the sequences

In [None]:
# Longest training sentence, in tokens; used as the padded length of
# every data set and as the input length of the models below.
max_seq_len = max(len(s) for s in X)
print("Maximum sentence length: ", max_seq_len)

In [None]:
# Pad every training sequence at the end ("post") up to max_seq_len
X_train = pad_sequences(X_only_indices, maxlen=max_seq_len, padding="post")
y_train = pad_sequences(y_only_indices, maxlen=max_seq_len, padding="post")

print(X_train[1])
print(y_train[1])

# One-hot encode the tag indices (the tags are NER tags); the vector
# size must match the nbr_of_classes + 1 units of the output Dense layer
y_train = to_categorical(y_train, num_classes=nbr_of_classes + 1)
print(y_train[1])

# Load validation data

In [None]:
X_val, y_val = extract_features(val_dict)

print(len(val_dict))

# Create the parallel sequences of indices for the validation set.
# A truthy num_words makes encode_to_indices map words that are not in the
# vocabulary to index 1 (unknown) instead of None, as done for the test set.
X_only_indices_val = encode_to_indices(X_val, X_words_to_indices,
                                       num_words=len(X_words_to_indices))
y_only_indices_val = encode_to_indices(y_val, y_len_to_indices)

# Pad the *validation* indices. The previous version padded X_only_indices
# and y_only_indices, i.e. the training set, so the model was validated on
# its own training data.
X_val = pad_sequences(X_only_indices_val, maxlen=max_seq_len, padding="post")
y_val = pad_sequences(y_only_indices_val, maxlen=max_seq_len, padding="post")

print(X_val[1])
print(y_val[1])

# One-hot encode the NER tag indices; same vector size as for training
y_val = to_categorical(y_val, num_classes=nbr_of_classes + 1)
print(y_val[1])

# Params

In [None]:
# Training settings passed to model.fit (both) and model.evaluate (batch_size)
epochs = 1
batch_size = 128

# Model

#### Simple RNN

In [None]:
def build_simpleRNN():
    """
    Builds (without compiling) a sequence tagger made of an embedding
    layer, a SimpleRNN layer and a softmax output per time step.

    Reads the notebook-level names X_vocabulary, max_seq_len,
    word_embedding_matrix and nbr_of_classes.

    :return: the Sequential model
    """
    embedding = Embedding(len(X_vocabulary)+2,
                          100,
                          mask_zero=True,
                          input_length=max_seq_len)
    model = Sequential()
    model.add(embedding)
    # Initialise the embedding layer (layer 0) with the prepared matrix
    embedding.set_weights([word_embedding_matrix])
    # The default is True: the embeddings are fine-tuned during training
    embedding.trainable = True
    # 128 recurrent units, one output per input position
    model.add(SimpleRNN(128, return_sequences=True))
    model.add(Dense(nbr_of_classes + 1, activation='softmax'))
    return model

#### LSTM 

In [None]:
def build_LSTM():
    """
    Builds (without compiling) a sequence tagger made of an embedding
    layer, an LSTM layer and a softmax output per time step.

    Reads the notebook-level names X_vocabulary, max_seq_len,
    word_embedding_matrix and nbr_of_classes.

    :return: the Sequential model
    """
    embedding = Embedding(len(X_vocabulary)+2,
                          100,
                          mask_zero=True,
                          input_length=max_seq_len)
    model = Sequential()
    model.add(embedding)
    # Initialise the embedding layer (layer 0) with the prepared matrix
    embedding.set_weights([word_embedding_matrix])
    # The default is True: the embeddings are fine-tuned during training
    embedding.trainable = True
    # 128 recurrent units, one output per input position
    model.add(LSTM(128, return_sequences=True))
    model.add(Dense(nbr_of_classes + 1, activation='softmax'))
    return model

#### LSTM BIDIRECTIONAL

#### GRU 

In [None]:
# Instantiate the SimpleRNN tagger (build_LSTM is defined above but not used here)
model = build_simpleRNN()

In [None]:
# Categorical cross-entropy matches the one-hot targets produced by
# to_categorical; accuracy is reported under the name 'acc'.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()

In [None]:
# Train on the padded training set and report metrics on (X_val, y_val) after each epoch
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(X_val, y_val))

# Evaluate

#### Load test set

In [None]:
X_test, y_test = extract_features(test_dict)

# Create the parallel sequences of indices. Passing num_words makes
# encode_to_indices map words missing from the vocabulary to index 1.
X_only_indices_test = encode_to_indices(X_test, X_words_to_indices, num_words=total_word_count)
y_only_indices_test = encode_to_indices(y_test, y_len_to_indices)

# Pad at the end up to the longest *training* sentence.
# NOTE(review): test sentences longer than max_seq_len are presumably
# truncated by pad_sequences -- confirm the truncation side that is wanted.
X_test_padded = pad_sequences(X_only_indices_test, maxlen=max_seq_len, padding="post")
y_test_padded = pad_sequences(y_only_indices_test, maxlen=max_seq_len, padding="post")

print(X_test_padded[1])
print(y_test_padded[1])

# One-hot encode the NER tag indices; same vector size as for training
y_test_vectorized = to_categorical(y_test_padded, num_classes=nbr_of_classes + 1)
print(y_test_vectorized[1])

In [None]:
# Loss and the 'acc' metric (as configured in model.compile) on the padded test set
loss, acc = model.evaluate(X_test_padded, y_test_vectorized, batch_size=batch_size, verbose=1)

In [None]:
# Test loss returned by model.evaluate
print(loss)

In [None]:
# Test accuracy returned by model.evaluate
print(acc)

#### Conlleval

In [None]:
# Predicted class index for every position of every padded test sentence
# (the classes are NER tags, despite the variable name).
# NOTE(review): Sequential.predict_classes is not available in recent
# TensorFlow/Keras releases; np.argmax(model.predict(...), axis=-1) is the
# usual replacement -- confirm against the installed version.
corpus_pos_predictions = model.predict_classes(X_test_padded)