# Assignment 3 : Sequence labelling with RNNs
In this assignement we will ask you to perform POS tagging.

You are asked to follow these steps:
*   Download the corpora and split it in training and test sets, structuring a dataframe.
*   Embed the words using GloVe embeddings
*   Create a baseline model, using a simple neural architecture
*   Experiment doing small modifications to the model
*   Evaluate your best model
*   Analyze the errors of your model

**Corpora**:
Ignore the numeric value in the third column, use only the words/symbols and its label.
https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip 

**Splits**: documents 1-100 are the train set, 101-150 validation set, 151-199 test set.

**Baseline**: two layers architecture: a Bidirectional LSTM and a Dense/Fully-Connected layer on top.

**Modifications**: experiment using a GRU instead of the LSTM, adding an additional LSTM layer, and using a CRF in addition to the LSTM. Each of this change must be done by itself (don't mix these modifications).

**Training and Experiments**: all the experiments must involve only the training and validation sets.

**Evaluation**: in the end, only the best model of your choice must be evaluated on the test set. The main metric must be F1-Macro computed between the various part of speech (without considering punctuation classes).

**Error Analysis** (optional) : analyze the errors done by your model, try to understand which may be the causes and think about how to improve it.

**Report**: You are asked to deliver a small report of about 4-5 lines in the .txt file that sums up your findings.

In [1]:
%autosave 1
import os
import re
import numpy as np
from matplotlib import pyplot as plt

from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000

import seaborn as sns

from gensim.models import KeyedVectors

from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import TimeDistributed
from keras.layers import LSTM, GRU, Bidirectional, SimpleRNN, RNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

Autosaving every 1 seconds


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Load data

In [2]:
# all senteces
data = []
base_dir = 'dependency_treebank/'
for filename in os.listdir(base_dir):
    with open(base_dir + filename) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    content = [re.sub('\s+', ' ', x) for x in content]
    # remove empty strings
    content = [x for x in content if x]
    data.append(content)

In [3]:
def split_features_labels(data):
    X = []
    y = []
    for document in data:
        X_i = []
        y_i = []
        for i in document:
            X_i.append(i.split(' ')[0])
            y_i.append(i.split(' ')[1])
        X.append(X_i)
        y.append(y_i)
    return X, y

In [4]:
X, y = split_features_labels(data[:100])

### Load GloVe model

In [5]:
import gensim.downloader as gloader

def load_embedding_model(model_type, embedding_dimension=50):
    """
    Loads a pre-trained word embedding model via gensim library.

    :param model_type: name of the word embedding model to load.
    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """

    download_path = ""
    '''
    # Find the correct embedding model name
    if model_type.strip().lower() == 'word2vec':
        download_path = "word2vec-google-news-300"
    '''
    if model_type.strip().lower() == 'glove':
        download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)

    else:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove")

    # Check download
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model

In [6]:
embedding_model_type = "glove"
embedding_dimension = 50
embedding_model = load_embedding_model(embedding_model_type, embedding_dimension)

### Build embedding matrix

In [17]:
def build_vocabulary(df):
    flat_list = [item for sublist in df for item in sublist]
    return flat_list
vocabulary = build_vocabulary(X)

In [18]:
len(vocabulary)

50371

In [19]:
def check_OOV_terms(embedding_model, vocabulary):
    model_vocab_array = np.array(list(embedding_model.vocab.keys()))
    return list(np.setdiff1d(vocabulary, model_vocab_array))

oov_terms = check_OOV_terms(embedding_model, vocabulary)

In [20]:
print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_terms), float(len(oov_terms)) / len(vocabulary)))

Total OOV terms: 2258 (0.04%)


In [24]:
def build_embedding_matrix(embedding_model, embedding_dimension, documents, vocabulary, oov_terms):
    embedding_matrix = np.zeros((len(vocabulary), embedding_dimension))
    idx = 0
    for doc in documents:
        for i in doc:
            if i not in oov_terms:
                embedding_matrix[idx, :] = np.array(embedding_model.wv[i])
            else:
                mu, sigma = 0.11, 0.67
                s = np.random.normal(mu, sigma, embedding_dimension)
                embedding_matrix[idx, :] = np.array(s)
            idx += 1
    return embedding_matrix


In [25]:
# Testing
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, X, vocabulary, oov_terms)

print("Embedding matrix shape: {}".format(embedding_matrix.shape))

  import sys


Embedding matrix shape: (50371, 50)


In [None]:
# use Keras' to_categorical function to one-hot encode Y
y = to_categorical(y)

X_train, y_train = X[:100, :], y[:100, :, :]
X_val, y_val = X[100:150, :], y[100:150, :, :]
X_test, y_test = X[150:, :], y[150:, :, :]

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape