# Part-of-Speech Tagging using Feedforward Networks

Author: Pierre Nugues

## Python Headers

### The Modules

In [None]:
import sys
import os
from sklearn.feature_extraction import DictVectorizer
import time
from keras import models, layers
import sys
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.models import load_model
import math

### Some Parameters

In [None]:
# Name of the Keras optimizer passed to model.compile()
OPTIMIZER = 'rmsprop'
# If True, the vectorized features are rescaled with StandardScaler
SCALER = True
# If True, the network is a single softmax layer;
# otherwise a hidden ReLU layer with dropout is inserted before the softmax
SIMPLE_MODEL = False
# Batch size passed to model.fit()
BATCH_SIZE = 128
# Number of training epochs passed to model.fit()
EPOCHS = 1
# If True, only the first fifth of the training sentences is kept
MINI_CORPUS = True

## Preprocessing

### Loading the Corpus

In [None]:
def load_conll2009_pos(
        corpus_dir='/Users/pierre/Documents/Cours/EDAN20/corpus/conll2009/en/'):
    """
    Load the CoNLL 2009 English POS corpora as raw strings.

    :param corpus_dir: directory that contains the three corpus files.
        The default is the location originally hard-coded in this notebook.
    :return: a tuple (train_sentences, dev_sentences, test_sentences,
        column_names). The first three items are the whole content of the
        training, development, and test files with the leading and trailing
        whitespace removed; column_names is the list of column names
        of the corpus.
    :raises OSError: if one of the files cannot be opened.
    """
    file_names = (
        'CoNLL2009-ST-English-train-pos.txt',
        'CoNLL2009-ST-English-development-pos.txt',
        'CoNLL2009-ST-test-words-pos.txt',
    )
    column_names = ['id', 'form', 'lemma', 'plemma', 'pos', 'ppos']

    corpora = []
    for file_name in file_names:
        # The context manager closes the file even if reading fails, and the
        # explicit encoding avoids depending on the platform default
        with open(os.path.join(corpus_dir, file_name),
                  encoding='utf-8') as corpus_file:
            corpora.append(corpus_file.read().strip())

    train_sentences, dev_sentences, test_sentences = corpora
    return train_sentences, dev_sentences, test_sentences, column_names

# Read the training, development, and test corpora as raw strings,
# together with the names of the corpus columns
train_sentences, dev_sentences, test_sentences, column_names = load_conll2009_pos()

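### Converting the Corpus into a Dictionary
We follow the fit-transform pattern of sklearn: each sentence becomes a list of tokens, and each token is a dictionary that maps the column names to the values of the row.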

In [None]:
import regex as re

class Token(dict):
    """A corpus row stored as a dictionary: column name -> value."""


class CoNLLDictorizer:
    """
    Convert a CoNLL-formatted corpus, given as one string, into a list of
    sentences, where each sentence is a list of Token dictionaries.
    """

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        """
        :param column_names: names given to the columns, in order
        :param sent_sep: regular expression separating the sentences
        :param col_sep: regular expression separating the columns of a row
        """
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self, corpus=None):
        """
        Nothing to learn. The method exists to follow the sklearn pattern.
        :param corpus: ignored
        :return: self, so that calls can be chained
        """
        return self

    def transform(self, corpus):
        """
        Split the corpus into sentences and the sentences into tokens.
        :param corpus: the whole corpus as a string
        :return: a list of sentences, each sentence being a list of Token.
            An empty (or blank) corpus yields an empty list
        """
        corpus = corpus.strip()
        if not corpus:
            # Splitting '' would produce one phantom sentence
            return []
        sentences = (self._split_in_words(sentence)
                     for sentence in re.split(self.sent_sep, corpus))
        # Surplus blank lines between sentences produce empty sentences,
        # which we discard
        return [sentence for sentence in sentences if sentence]

    def fit_transform(self, corpus):
        """
        Same as transform() as there is nothing to fit.
        :param corpus: the whole corpus as a string
        :return: a list of sentences, each sentence being a list of Token
        """
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        """
        Convert the rows of one sentence into tokens.
        :param sentence: the rows of the sentence, separated by newlines
        :return: a list of Token, one per nonblank row. If a row has fewer
            or more cells than column_names, zip() drops the excess
        """
        return [Token(zip(self.column_names, re.split(self.col_sep, row)))
                for row in sentence.split('\n')
                if row.strip()]

In [None]:
# The corpus columns are separated by tabulations
conll_dict = CoNLLDictorizer(column_names, col_sep='\t')
test_dict = conll_dict.transform(test_sentences)
train_dict = conll_dict.transform(train_sentences)
if MINI_CORPUS:
    # We only keep the first fifth of the training sentences
    del train_dict[len(train_dict) // 5:]
print('First sentence, train:', train_dict[0])

### Extracting the Context and Dictorizing it

In [None]:
class ContextDictorizer():
    """
    Extract contexts of words in a sequence
    Contexts are of w_size to the left and to the right
    Builds an X matrix in the form of a list of dictionaries
    and possibly extracts the output, y, if in the training step
    If training_step is False, transform() returns y = []
    """

    def __init__(self, input='form', output='pos', w_size=2, tolower=True):
        """
        :param input: key of the token column used to build the features
        :param output: key of the token column used as output, y
        :param w_size: number of context words on each side of the word
        :param tolower: if True, the input values are set in lower case
        """
        self.BOS_symbol = '__BOS__'
        self.EOS_symbol = '__EOS__'
        self.input = input
        self.output = output
        self.w_size = w_size
        self.tolower = tolower
        # The indices are zero-padded to the width of the largest one,
        # 2 * w_size, so that the lexical order of the feature names
        # is the same as the order of the positions in the window
        width = len(str(2 * w_size))
        self.feature_names = [input + '_' + str(i).zfill(width)
                              for i in range(2 * w_size + 1)]

    def fit(self, sentences):
        """
        Build the padding rows
        :param sentences: list of sentences, each one a list of tokens.
            The columns of the padding tokens are those of the first token.
            If there is no first token, the padding tokens only have
            the input and output columns
        :return: self
        """
        if len(sentences) > 0 and len(sentences[0]) > 0:
            self.column_names = list(sentences[0][0].keys())
        else:
            # dict.fromkeys removes the duplicate if input == output
            self.column_names = list(
                dict.fromkeys([self.input, self.output]))
        start_token = Token(dict.fromkeys(self.column_names,
                                          self.BOS_symbol))
        end_token = Token(dict.fromkeys(self.column_names,
                                        self.EOS_symbol))
        self.start_rows = [start_token] * self.w_size
        self.end_rows = [end_token] * self.w_size
        return self

    def transform(self, sentences, training_step=True):
        """
        Extract the features, and possibly the output, of all the words
        :param sentences: list of sentences, each one a list of tokens
        :param training_step: if True, the output column is extracted too
        :return: (X_corpus, y_corpus), where X_corpus is a list of feature
            dictionaries, one per word, and y_corpus the list of the
            corresponding outputs, or [] if training_step is False
        """
        X_corpus = []
        y_corpus = []
        for sentence in sentences:
            X, y = self._transform_sentence(sentence, training_step)
            X_corpus.extend(X)
            if training_step:
                y_corpus.extend(y)
        return X_corpus, y_corpus

    def fit_transform(self, sentences):
        """
        Build the padding rows from the sentences and transform them
        :param sentences: list of sentences, each one a list of tokens
        :return: (X_corpus, y_corpus) as returned by transform()
        """
        self.fit(sentences)
        return self.transform(sentences)

    def _transform_sentence(self, sentence, training_step=True):
        """
        Extract the features, and possibly the output, of one sentence.
        The padding rows must have been built by fit() before
        :param sentence: sequence of tokens
        :param training_step: if True, the output column is extracted too
        :return: (X, y), where X is a list of feature dictionaries,
            one per word, and y the list of outputs or None
            if training_step is False
        """
        # We accept any sequence of tokens, not only lists
        sentence = list(sentence)

        # We extract y
        if training_step:
            y = [row[self.output] for row in sentence]
        else:
            y = None

        # We pad the sentence and we only keep the input column
        padded = list(self.start_rows) + sentence + list(self.end_rows)
        words = [row[self.input] for row in padded]
        if self.tolower:
            words = [word.lower() for word in words]

        # We extract the features: One window per word of the sentence
        # We represent each feature vector as a dictionary
        window = 2 * self.w_size + 1
        X = [dict(zip(self.feature_names, words[i:i + window]))
             for i in range(len(words) - 2 * self.w_size)]
        return X, y

    def print_example(self, sentences, id=1968):
        """
        Print the features and the output of one sentence
        :param sentences: list of sentences, each one a list of tokens
        :param id: index of the sentence to print
        :return: None
        :raises IndexError: if id is not a valid index of sentences
        """
        if not -len(sentences) <= id < len(sentences):
            raise IndexError(
                'sentence index {} out of range: the corpus has {} sentences'
                .format(id, len(sentences)))
        # We print the features to check they match Table 8.1 in my book (second edition)
        # We use the training step extraction with the dynamic features
        Xs, ys = self._transform_sentence(sentences[id])
        print('X for sentence #', id, Xs)
        print('y for sentence #', id, ys)

### Running the Feature Extraction

In [None]:
# We build the padding rows from the training set and we extract
# the feature dictionaries and the parts of speech in one call
context_dictorizer = ContextDictorizer()
X_dict, y_cat = context_dictorizer.fit_transform(train_dict)

In [None]:
# We print the features and the parts of speech of one training sentence.
# The default index is 1968: The training set must have more sentences
context_dictorizer.print_example(train_dict)

### Vectorizing the $\mathbf{X}$ Matrix

In [None]:
# We map the symbolic features of X_dict onto numerical vectors
dict_vectorizer = DictVectorizer()
X_num = dict_vectorizer.fit_transform(X_dict)

# By default, we use the vectorized features as they are
X = X_num
if SCALER:
    # We rescale X_num, without centering (with_mean=False)
    scaler = StandardScaler(with_mean=False)
    X = scaler.fit_transform(X_num)

### Vectorizing $\mathbf{y}$

In [None]:
# The sorted set of parts of speech observed in the training set
pos_list = sorted(set(y_cat))
# One more class than parts of speech as index 0 is kept for the
# symbols of the test set unseen in the training set
NB_CLASSES = len(pos_list) + 1

# The part-of-speech index, starting at 1, and its inverse
pos_idx = {pos: idx for idx, pos in enumerate(pos_list, start=1)}
pos_rev_idx = {idx: pos for pos, idx in pos_idx.items()}

# We replace the parts of speech of y_cat with their indices
y = [pos_idx[pos] for pos in y_cat]
print(y_cat[:10])
y[:10]

## The Tagger

In [None]:
model = models.Sequential()
if not SIMPLE_MODEL:
    # A hidden layer, twice as large as the output, followed by dropout
    model.add(layers.Dense(NB_CLASSES * 2,
                           input_dim=X.shape[1],
                           activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(NB_CLASSES, activation='softmax'))
else:
    # A single softmax layer connected to the input
    model.add(layers.Dense(NB_CLASSES,
                           input_dim=X.shape[1],
                           activation='softmax'))

# The classes are encoded as integers, hence the sparse loss
model.compile(optimizer=OPTIMIZER,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

### Fitting the Model

In [None]:
# We seed the NumPy random generator before fitting the model
np.random.seed(0)

# We fit the model on the training set and we measure the elapsed time
start_time = time.perf_counter()
model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE)
# model.save('out.model')
# perf_counter() is in seconds: We print the duration in minutes
print('Time:', (time.perf_counter() - start_time) / 60)

### Testing the Model

In [None]:
# We extract the feature dictionaries and the parts of speech of the test set
X_test_dict, y_test_cat = context_dictorizer.transform(test_dict)

# We transform the symbols into numbers with the vectorizer
# fitted on the training set
X_test_num = dict_vectorizer.transform(X_test_dict)
# The scaler only exists when SCALER is True: We apply to the test set
# exactly the preprocessing applied to the training set
if SCALER:
    X_test = scaler.transform(X_test_num)
else:
    X_test = X_test_num
# The parts of speech unseen in the training set are mapped to 0
y_test = [pos_idx.get(i, 0) for i in y_test_cat]

test_loss, test_acc = model.evaluate(X_test, y_test)
print('Optimizer', OPTIMIZER, 'Scaler', SCALER, 'Epochs', EPOCHS, 'Batch size', 
      BATCH_SIZE, 'Simple model', SIMPLE_MODEL, 'Mini corpus', MINI_CORPUS)
print('Loss:', test_loss)
print('Accuracy:', test_acc)