# Chapter 14: Part-of-Speech and Sequence Annotation
A baseline POS tagger that uses the most frequent part of speech to annotate the words

Programs from the book: [Python for Natural Language Processing](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

## Modules

We import a CoNLL reader

In [23]:

from urllib.request import urlopen
import regex as re
from collections import Counter
from sklearn.metrics import confusion_matrix
import ud_datasets
from conll_dictorizer import CoNLLDictorizer

## Loading the corpus

We load a corpus from Universal Dependencies: https://universaldependencies.org/

In [24]:
CORPUS = 'EWT'

In [25]:
# We load the train, validation, and test sets of the selected corpus
# together with the names of the columns
if CORPUS == 'EWT':
    train_sentences, val_sentences, test_sentences, column_names = ud_datasets.load_ud_en_ewt()
elif CORPUS == 'Talbanken':
    train_sentences, val_sentences, test_sentences, column_names = ud_datasets.load_ud_sv_talbanken()
elif CORPUS == 'GSD':
    train_sentences, val_sentences, test_sentences, column_names = ud_datasets.load_ud_fr_gsd()
else:
    # Without this branch, an unsupported name would only show up later
    # as a NameError on column_names or train_sentences
    raise ValueError('Unknown corpus: {!r}. Expected EWT, Talbanken, or GSD'.format(CORPUS))

In [26]:
conll_dict = CoNLLDictorizer(column_names)

In [27]:
train_dict = conll_dict.transform(train_sentences)
val_dict = conll_dict.transform(val_sentences)
test_dict = conll_dict.transform(test_sentences)

### Looking at the corpus

Printing some sentences

In [28]:
print('First sentence:', train_dict[0])
print('First word:', train_dict[0][0])
print('Type of the first word', type(train_dict[0][0]))
print('Form of the first word', train_dict[0][0]['FORM'])
print('Second sentence:', train_dict[1])

First sentence: [{'ID': '1', 'FORM': 'Al', 'LEMMA': 'Al', 'UPOS': 'PROPN', 'XPOS': 'NNP', 'FEATS': 'Number=Sing', 'HEAD': '0:root', 'DEPREL': 'root', 'DEPS': 'SpaceAfter=No'}, {'ID': '2', 'FORM': '-', 'LEMMA': '-', 'UPOS': 'PUNCT', 'XPOS': 'HYPH', 'FEATS': '_', 'HEAD': '3:punct', 'DEPREL': 'punct', 'DEPS': 'SpaceAfter=No'}, {'ID': '3', 'FORM': 'Zaman', 'LEMMA': 'Zaman', 'UPOS': 'PROPN', 'XPOS': 'NNP', 'FEATS': 'Number=Sing', 'HEAD': '1:flat', 'DEPREL': 'flat', 'DEPS': '_'}, {'ID': '4', 'FORM': ':', 'LEMMA': ':', 'UPOS': 'PUNCT', 'XPOS': ':', 'FEATS': '_', 'HEAD': '7:punct', 'DEPREL': 'punct', 'DEPS': '_'}, {'ID': '5', 'FORM': 'American', 'LEMMA': 'American', 'UPOS': 'ADJ', 'XPOS': 'JJ', 'FEATS': 'Degree=Pos', 'HEAD': '6:amod', 'DEPREL': 'amod', 'DEPS': '_'}, {'ID': '6', 'FORM': 'forces', 'LEMMA': 'force', 'UPOS': 'NOUN', 'XPOS': 'NNS', 'FEATS': 'Number=Plur', 'HEAD': '7:nsubj', 'DEPREL': 'nsubj', 'DEPS': '_'}, {'ID': '7', 'FORM': 'killed', 'LEMMA': 'kill', 'UPOS': 'VERB', 'XPOS': 'VBD'

## Word/POS distribution functions

We extract the word/POS statistics from the corpus. We proceed in two steps:
1. We count the words
2. We count the POS per word

This corresponds to the two functions below

In [29]:
def norm_and_clean(corpus, lc=False):
    """
    Removes the rows whose ID contains a hyphen and optionally
    lowercases the word forms.
    In the CoNLL-U format, hyphenated IDs such as 1-2 are the
    ranges of multiword tokens.
    The input corpus is not modified: a row that needs lowercasing
    is copied first.
    :param corpus: list of sentences, each a list of row dictionaries
        with at least the keys 'ID' and 'FORM'
    :param lc: if True, the value of 'FORM' is set in lowercase
    :return: a new list of sentences containing the retained rows
    """
    new_corpus = []
    for sent in corpus:
        new_sent = []
        for row in sent:
            if '-' in row['ID']:
                continue
            if lc:
                # We copy the row so that the caller's dictionaries
                # keep their original form
                row = dict(row, FORM=row['FORM'].lower())
            new_sent.append(row)
        new_corpus.append(new_sent)
    return new_corpus

In [30]:
test_dict[0:2]

[[{'ID': '1',
   'FORM': 'What',
   'LEMMA': 'what',
   'UPOS': 'PRON',
   'XPOS': 'WP',
   'FEATS': 'PronType=Int',
   'HEAD': '0:root',
   'DEPREL': 'root',
   'DEPS': '_'},
  {'ID': '2',
   'FORM': 'if',
   'LEMMA': 'if',
   'UPOS': 'SCONJ',
   'XPOS': 'IN',
   'FEATS': '_',
   'HEAD': '4:mark',
   'DEPREL': 'mark',
   'DEPS': '_'},
  {'ID': '3',
   'FORM': 'Google',
   'LEMMA': 'Google',
   'UPOS': 'PROPN',
   'XPOS': 'NNP',
   'FEATS': 'Number=Sing',
   'HEAD': '4:nsubj',
   'DEPREL': 'nsubj',
   'DEPS': '_'},
  {'ID': '4',
   'FORM': 'Morphed',
   'LEMMA': 'morph',
   'UPOS': 'VERB',
   'XPOS': 'VBD',
   'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin',
   'HEAD': '1:advcl:if',
   'DEPREL': 'advcl',
   'DEPS': '_'},
  {'ID': '5',
   'FORM': 'Into',
   'LEMMA': 'into',
   'UPOS': 'ADP',
   'XPOS': 'IN',
   'FEATS': '_',
   'HEAD': '6:case',
   'DEPREL': 'case',
   'DEPS': '_'},
  {'ID': '6',
   'FORM': 'GoogleOS',
   'LEMMA': 'GoogleOS',
   'UPOS': 'PROPN',
   'XPO

In [31]:
NORM = False

In [32]:
# Optional cleaning step, run only when NORM is set.
# norm_and_clean is called with its default lc=False: the rows whose ID
# contains a hyphen are removed and the forms keep their case.
if NORM:
    train_dict = norm_and_clean(train_dict)
    val_dict = norm_and_clean(val_dict)
    test_dict = norm_and_clean(test_dict)

In [33]:
def count_word(corpus, word_key='FORM'):
    """
    Counts the occurrences of each word in a corpus
    :param corpus: sentences, each a sequence of row dictionaries
    :param word_key: key of the word in a row
    :return: a Counter mapping each word to its frequency
    """
    return Counter(row[word_key]
                   for sentence in corpus
                   for row in sentence)

In [34]:
def distribution(corpus,
                 word_key='FORM',
                 pos_key='UPOS'):
    """
    Computes the pos distribution by word
    The result is stored in a dictionary
    The corpus is traversed only once, so that a one-shot iterator
    of sentences is also accepted.
    :param corpus: sentences, each a sequence of row dictionaries
    :param word_key: key of the word in a row
    :param pos_key: key of the part of speech in a row
    :return: a dictionary mapping each word to a Counter of its
        parts of speech
    """
    pos_dist = {}
    for sentence in corpus:
        for row in sentence:
            word = row[word_key]
            # We keep a plain dict, and not a defaultdict, so that
            # the lookup of an unseen word still raises a KeyError
            if word not in pos_dist:
                pos_dist[word] = Counter()
            pos_dist[word][row[pos_key]] += 1
    return pos_dist

## The word/POS association

Now we compute the word/POS association

In [35]:
pos_dist = distribution(train_dict)

In [36]:
pos_dist['That']

Counter({'PRON': 58, 'DET': 15, 'SCONJ': 6})

In [37]:
pos_dist['round']

Counter({'NOUN': 4, 'ADV': 3, 'ADJ': 2, 'ADP': 2})

In [38]:
pos_dist['table']

Counter({'NOUN': 14})

In [39]:
pos_dist['might']

Counter({'AUX': 77})

In [40]:
pos_dist['collapse']

Counter({'NOUN': 2, 'VERB': 1})

In [41]:
if CORPUS == 'EWT':
    print(pos_dist['move'])
elif CORPUS == 'Talbanken':
    print(pos_dist.get('den'))
elif CORPUS == 'GSD':
    print(pos_dist.get('le'))

Counter({'VERB': 26, 'NOUN': 9})


In [42]:
# For each word, we keep the part of speech with the highest count
word_pos = {word: max(tag_counts, key=tag_counts.get)
            for word, tag_counts in pos_dist.items()}

In [43]:
if CORPUS == 'EWT':
    print(word_pos.get('move'))
elif CORPUS == 'Talbanken':
    print(word_pos.get('den'))
elif CORPUS == 'GSD':
    print(word_pos.get('le'))

VERB


## Unknown words

In a text, we will certainly encounter unseen words. What is their most frequent part of speech? We use the validation set to determine the POS distribution of these unseen words.

In [44]:
def unseen_words_pos_dist(corpus,
                          word_pos,
                          word_key='FORM',
                          pos_key='UPOS'):
    """
    Counts the parts of speech of the words absent from word_pos
    :param corpus: sentences, each a sequence of word dictionaries
    :param word_pos: container of the known words
    :param word_key: key of the word in a word dictionary
    :param pos_key: key of the part of speech in a word dictionary
    :return: a Counter mapping each part of speech to the number of
        unseen word occurrences carrying it
    """
    return Counter(token[pos_key]
                   for sentence in corpus
                   for token in sentence
                   if token[word_key] not in word_pos)

In [45]:
Counter(word_pos.values()).most_common(5)

[('NOUN', 7317), ('PROPN', 4029), ('VERB', 3412), ('ADJ', 2327), ('NUM', 977)]

In [46]:
unseen_dist = unseen_words_pos_dist(val_dict, word_pos)
unseen_dist.most_common(5)

[('PROPN', 818), ('NOUN', 622), ('VERB', 206), ('ADJ', 189), ('NUM', 109)]

In [47]:
default_pos = max(unseen_dist, key=unseen_dist.get)
default_pos

'PROPN'

## The prediction

We use the best association to predict the part of speech. If the word is unseen, we assign a predefined POS. We add a key to the dictionaries for the predicted part of speech.

In [48]:
def predict(sentences, word_pos, word_key='FORM', ppos_key='PPOS', default_pos='PROPN'):
    """
    Annotates each word with a predicted part of speech
    The word dictionaries are modified in place: the prediction is
    stored under ppos_key.
    :param sentences: sentences, each a sequence of word dictionaries
    :param word_pos: dictionary mapping a word to its part of speech
    :param word_key: key of the word in a word dictionary
    :param ppos_key: key under which the prediction is stored
    :param default_pos: part of speech assigned to the words absent
        from word_pos
    :return: the same sentences object, now annotated
    """
    for sentence in sentences:
        for token in sentence:
            token[ppos_key] = word_pos.get(token[word_key], default_pos)
    return sentences

In [49]:
word_key = 'FORM'
ppos_key = 'PPOS'

In [50]:
test_annotated = predict(test_dict, word_pos, word_key=word_key,
                         ppos_key=ppos_key, default_pos=default_pos)
test_annotated[0]

[{'ID': '1',
  'FORM': 'What',
  'LEMMA': 'what',
  'UPOS': 'PRON',
  'XPOS': 'WP',
  'FEATS': 'PronType=Int',
  'HEAD': '0:root',
  'DEPREL': 'root',
  'DEPS': '_',
  'PPOS': 'PRON'},
 {'ID': '2',
  'FORM': 'if',
  'LEMMA': 'if',
  'UPOS': 'SCONJ',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '4:mark',
  'DEPREL': 'mark',
  'DEPS': '_',
  'PPOS': 'SCONJ'},
 {'ID': '3',
  'FORM': 'Google',
  'LEMMA': 'Google',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '4:nsubj',
  'DEPREL': 'nsubj',
  'DEPS': '_',
  'PPOS': 'PROPN'},
 {'ID': '4',
  'FORM': 'Morphed',
  'LEMMA': 'morph',
  'UPOS': 'VERB',
  'XPOS': 'VBD',
  'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin',
  'HEAD': '1:advcl:if',
  'DEPREL': 'advcl',
  'DEPS': '_',
  'PPOS': 'PROPN'},
 {'ID': '5',
  'FORM': 'Into',
  'LEMMA': 'into',
  'UPOS': 'ADP',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '6:case',
  'DEPREL': 'case',
  'DEPS': '_',
  'PPOS': 'ADP'},
 {'ID': '6',
  'FORM': 'GoogleOS',
  'LEM

## Evaluation

We carry out the evaluation by comparing the value of two keys: the truth and the prediction

In [51]:
def evaluate(sentences, truth_key='UPOS', pred_key='PPOS'):
    """
    Counts the correctly and incorrectly predicted words
    :param sentences: sentences, each a sequence of word dictionaries
    :param truth_key: key of the true tag in a word dictionary
    :param pred_key: key of the predicted tag in a word dictionary
    :return: a pair (good, bad), the number of words for which the
        two values are equal and the number for which they differ
    """
    matches = [bool(token[truth_key] == token[pred_key])
               for sentence in sentences
               for token in sentence]
    good = sum(matches)
    return good, len(matches) - good

In [52]:
truth_key = 'UPOS'
pred_key = ppos_key

In [53]:
good, bad = evaluate(test_annotated, truth_key=truth_key, pred_key=pred_key)
print('Accuracy:', good / (good + bad))

Accuracy: 0.8682121807465619


In [54]:
len(train_dict)

12544

We count the training sentences that contain contracted forms, i.e. rows whose `ID` is a range such as `1-2`

In [55]:
def has_contractions(sent):
    """
    Tells if a sentence has at least one row whose ID contains a hyphen
    :param sent: a sequence of word dictionaries with the key 'ID'
    :return: True if such a row exists, False otherwise
    """
    return any('-' in token['ID'] for token in sent)

In [56]:
# Number of training sentences that contain at least one contracted form
sum(1 for sent in train_dict if has_contractions(sent))

2192

`UPOS` is the true tag and `PPOS` is the prediction

In [57]:
tag_pairs_test = [(word['UPOS'], word['PPOS'])
                  for sent in test_annotated for word in sent]

In [58]:
y_true_test, y_pred_test = zip(*tag_pairs_test)

In [59]:
labels = sorted(list(set(y_true_test)))

In [60]:
print(labels)

['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', '_']


In [61]:
import numpy as np
np.set_printoptions(precision=1, suppress=True)

In [62]:
cm = 100 * confusion_matrix(y_true_test, y_pred_test,
                            labels=labels, normalize='true')
cm

array([[82.8,  0.6,  1.5,  0. ,  0. ,  0.1,  0. ,  1.9,  0. ,  0. ,  0. ,
        11.5,  0. ,  0. ,  0. ,  1.6,  0. ,  0. ],
       [ 0. , 88.2,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. , 10.3,  0. ,
         0.3,  0. ,  0.6,  0. ,  0. ,  0. ,  0. ],
       [ 5.4,  7.1, 78.6,  0.1,  0.2,  1.3,  0.2,  0.8,  0. ,  0.3,  2.6,
         2.1,  0. ,  1.4,  0. ,  0.1,  0.1,  0. ],
       [ 0. ,  0. ,  0. , 88.9,  0. ,  0. ,  0. ,  0.1,  0.1,  4.1,  0.1,
         0.3,  0. ,  0. ,  0. ,  6.5,  0. ,  0. ],
       [ 0. ,  0.1,  0. ,  0. , 99.7,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0.1,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.2,  0. ,  0.1,  0. ,  0.2, 96.8,  0. ,  0.1,  0. ,  0. ,  1.7,
         0.1,  0. ,  0.9,  0. ,  0. ,  0. ,  0. ],
       [ 4.2,  3.3,  2.5,  0. ,  0. ,  7.5, 69.2,  0. ,  0. ,  0. ,  1.7,
        11.7,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.8,  0.1,  0.2,  0.1,  0. ,  0. ,  0. , 76.2,  0.4,  0. ,  0. ,
        19. ,  0. ,  0.1,  0. ,  3.1,  0. ,  0. ],


In [63]:
list(enumerate(labels))

[(0, 'ADJ'),
 (1, 'ADP'),
 (2, 'ADV'),
 (3, 'AUX'),
 (4, 'CCONJ'),
 (5, 'DET'),
 (6, 'INTJ'),
 (7, 'NOUN'),
 (8, 'NUM'),
 (9, 'PART'),
 (10, 'PRON'),
 (11, 'PROPN'),
 (12, 'PUNCT'),
 (13, 'SCONJ'),
 (14, 'SYM'),
 (15, 'VERB'),
 (16, 'X'),
 (17, '_')]

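To obtain the reduced confusion matrix shown in the book, we keep only the rows and columns of a subset of the tags, selected by their indices in `labels`: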

In [64]:
cm[[0, 1, 2, 3, 4, 5, 7, 10, 11, 13, 15],
   :][:, [0, 1, 2, 3, 4, 5, 7, 10, 11, 13, 15]]

array([[82.8,  0.6,  1.5,  0. ,  0. ,  0.1,  1.9,  0. , 11.5,  0. ,  1.6],
       [ 0. , 88.2,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0.3,  0.6,  0. ],
       [ 5.4,  7.1, 78.6,  0.1,  0.2,  1.3,  0.8,  2.6,  2.1,  1.4,  0.1],
       [ 0. ,  0. ,  0. , 88.9,  0. ,  0. ,  0.1,  0.1,  0.3,  0. ,  6.5],
       [ 0. ,  0.1,  0. ,  0. , 99.7,  0. ,  0. ,  0. ,  0.1,  0. ,  0. ],
       [ 0.2,  0. ,  0.1,  0. ,  0.2, 96.8,  0.1,  1.7,  0.1,  0.9,  0. ],
       [ 0.8,  0.1,  0.2,  0.1,  0. ,  0. , 76.2,  0. , 19. ,  0.1,  3.1],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  2.1,  0. , 93. ,  0.1,  4.4,  0. ],
       [ 1. ,  0.2,  0. ,  0. ,  0. ,  0. ,  3.9,  0. , 94. ,  0. ,  0.4],
       [ 0. , 33.3,  1.6,  0. ,  0. ,  0. ,  0. ,  0.3,  1.6, 60.4,  0. ],
       [ 0.6,  0.9,  0.2,  3.6,  0. ,  0. ,  5.7,  0. ,  7.2,  0. , 81.5]])