# Baseline part-of-speech tagger

A baseline POS tagger that uses the most frequent part of speech to annotate the words

Author: Pierre Nugues

## Imports

We import a CoNLL reader

In [1]:
from conll_dictorizer import CoNLLDictorizer
import regex as re
from urllib.request import urlopen

## Loading the corpus

We load a corpus from Universal Dependencies: https://universaldependencies.org/

### CONLL-U columns

In [2]:
column_names = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
                'FEATS', 'HEAD', 'DEPREL', 'HEAD', 'DEPS', 'MISC']

column_names = list(map(str.lower, column_names))

### The corpus

In [3]:
lang = 'english'

In [4]:
if lang == 'english':
    url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/'
    train_file = url + 'en_ewt-ud-train.conllu'
    dev_file = url + 'en_ewt-ud-dev.conllu'
    test_file = url + 'en_ewt-ud-test.conllu'
if lang == 'swedish':
    url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Swedish-Talbanken/master/'
    train_file = url + 'sv_talbanken-ud-train.conllu'
    dev_file = url + 'sv_talbanken-ud-dev.conllu'
    test_file = url + 'sv_talbanken-ud-test.conllu'
if lang == 'french':    
    url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/master/'
    train_file = url + 'fr_gsd-ud-train.conllu'
    dev_file = url + 'fr_gsd-ud-dev.conllu'
    test_file = url + 'fr_gsd-ud-test.conllu'
if lang == 'persian':
    url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_Persian-PerDT/master/'
    train_file = url + 'fa_perdt-ud-train.conllu'
    dev_file = url + 'fa_perdt-ud-dev.conllu'
    test_file = url + 'fa_perdt-ud-test.conllu'

We load it from GitHub

In [5]:
train_sentences = urlopen(train_file).read().decode('utf-8').strip()
dev_sentences = urlopen(dev_file).read().decode('utf-8').strip()
test_sentences = urlopen(test_file).read().decode('utf-8').strip()

conll_dict = CoNLLDictorizer(column_names)
train_dict = conll_dict.transform(train_sentences)
dev_dict = conll_dict.transform(dev_sentences)
test_dict = conll_dict.transform(test_sentences)

### Looking at the corpus

Printing some sentences

In [6]:
print('First sentence:', train_dict[0])
print('First word:', train_dict[0][0])
print('Type of the first word', type(train_dict[0][0]))
print('Form of the first word', train_dict[0][0]['form'])
print('Second sentence:', train_dict[1])

First sentence: [{'id': '1', 'form': 'Al', 'lemma': 'Al', 'upos': 'PROPN', 'xpos': 'NNP', 'feats': 'Number=Sing', 'head': '0:root', 'deprel': 'root', 'deps': 'SpaceAfter=No'}, {'id': '2', 'form': '-', 'lemma': '-', 'upos': 'PUNCT', 'xpos': 'HYPH', 'feats': '_', 'head': '1:punct', 'deprel': 'punct', 'deps': 'SpaceAfter=No'}, {'id': '3', 'form': 'Zaman', 'lemma': 'Zaman', 'upos': 'PROPN', 'xpos': 'NNP', 'feats': 'Number=Sing', 'head': '1:flat', 'deprel': 'flat', 'deps': '_'}, {'id': '4', 'form': ':', 'lemma': ':', 'upos': 'PUNCT', 'xpos': ':', 'feats': '_', 'head': '1:punct', 'deprel': 'punct', 'deps': '_'}, {'id': '5', 'form': 'American', 'lemma': 'American', 'upos': 'ADJ', 'xpos': 'JJ', 'feats': 'Degree=Pos', 'head': '6:amod', 'deprel': 'amod', 'deps': '_'}, {'id': '6', 'form': 'forces', 'lemma': 'force', 'upos': 'NOUN', 'xpos': 'NNS', 'feats': 'Number=Plur', 'head': '7:nsubj', 'deprel': 'nsubj', 'deps': '_'}, {'id': '7', 'form': 'killed', 'lemma': 'kill', 'upos': 'VERB', 'xpos': 'VBD'

## Word/POS distribution functions

We extract the word/POS statistics from the corpus. We proceed in two steps:
1. We count the words
2. We count the POS per word

This corresponds to the two functions below

In [7]:
def count_word(corpus, word_key='form'):
    """
    Computes the word distribution
    in a CoNLL file
    :param corpus:
    :return:
    """
    word_cnt = {}
    for sentence in corpus:
        for row in sentence:
            if row[word_key] in word_cnt:
                word_cnt[row[word_key]] += 1
            else:
                word_cnt[row[word_key]] = 1
    return word_cnt

In [8]:
def distribution(corpus, word_key='form', pos_key='upos'):
    """
    Computes the pos distribution by word
    The result is stored in a dictionary
    :param corpus:
    :return:
    """
    
    word_cnt = count_word(corpus, word_key)
    
    # We compute the POS distribution by word
    pos_dist = {key: {} for key in word_cnt.keys()}
    for sentence in corpus:
        for row in sentence:
            distribution = pos_dist[row[word_key]]
            if row[pos_key] in distribution:
                distribution[row[pos_key]] += 1
            else:
                distribution[row[pos_key]] = 1
    return pos_dist

## The word/POS association

Now we compute the word/POS association

In [9]:
pos_dist = distribution(train_dict)

In [10]:
if lang == 'english':
    print(pos_dist.get('move'))
elif lang == 'swedish':
    print(pos_dist.get('den'))
elif lang == 'french':
    print(pos_dist.get('le'))

{'NOUN': 9, 'VERB': 26}


In [11]:
# We determine the best association
word_pos = {}
for word in pos_dist:
    word_pos[word] = max(pos_dist[word], key=pos_dist[word].get)

In [12]:
if lang == 'english':
    print(word_pos.get('move'))
elif lang == 'swedish':   
    print(word_pos.get('den'))
elif lang == 'french':
    print(word_pos.get('le'))

VERB


## Unknown words

In a text, we will certainly encounter unseen words. What is their most frequent part of speech? We use the development set to determine the POS distribution of these unseen words.

In [13]:
def unseen_words_pos_distribution(sentences, word_pos, word_key='form', pos_key='upos'):
    unseen_words = {}
    for sentence in sentences:
        for word in sentence:
            if not word[word_key] in word_pos:
                if word[pos_key] in unseen_words:
                    unseen_words[word[pos_key]] += 1
                else:
                    unseen_words[word[pos_key]] = 1
    return unseen_words

In [14]:
unseen_dist = unseen_words_pos_distribution(dev_dict, word_pos)
unseen_dist

{'NOUN': 621,
 'PROPN': 737,
 'ADJ': 189,
 'X': 101,
 '_': 66,
 'VERB': 207,
 'ADV': 44,
 'NUM': 109,
 'INTJ': 19,
 'ADP': 4,
 'PRON': 8,
 'SCONJ': 6,
 'SYM': 9,
 'PUNCT': 24,
 'AUX': 5,
 'DET': 3,
 'CCONJ': 2}

In [15]:
default_pos = max(unseen_dist, key=unseen_dist.get)
default_pos

'PROPN'

## The prediction

We use the best association to predict the part of speech. If the word is unseen, we assign a predefined POS. We add a key to the dictionaries for the predicted part of speech.

In [16]:
def predict(sentences, word_pos, word_key='form', ppos_key='ppos', default_pos='PROPN'):
    for sentence in sentences:
        for word in sentence:
            if word[word_key] in word_pos:
                word[ppos_key] = word_pos[word[word_key]]
            else:
                word[ppos_key] = default_pos
    return sentences

In [17]:
word_key = 'form'
ppos_key = 'ppos'

In [18]:
test_annotated = predict(test_dict, word_pos, word_key=word_key, ppos_key=ppos_key, default_pos=default_pos)
test_annotated[0]

[{'id': '1',
  'form': 'What',
  'lemma': 'what',
  'upos': 'PRON',
  'xpos': 'WP',
  'feats': 'PronType=Int',
  'head': '0:root',
  'deprel': 'root',
  'deps': '_',
  'ppos': 'PRON'},
 {'id': '2',
  'form': 'if',
  'lemma': 'if',
  'upos': 'SCONJ',
  'xpos': 'IN',
  'feats': '_',
  'head': '4:mark',
  'deprel': 'mark',
  'deps': '_',
  'ppos': 'SCONJ'},
 {'id': '3',
  'form': 'Google',
  'lemma': 'Google',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '4:nsubj',
  'deprel': 'nsubj',
  'deps': '_',
  'ppos': 'PROPN'},
 {'id': '4',
  'form': 'Morphed',
  'lemma': 'morph',
  'upos': 'VERB',
  'xpos': 'VBD',
  'feats': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': '1:advcl:if',
  'deprel': 'advcl',
  'deps': '_',
  'ppos': 'PROPN'},
 {'id': '5',
  'form': 'Into',
  'lemma': 'into',
  'upos': 'ADP',
  'xpos': 'IN',
  'feats': '_',
  'head': '6:case',
  'deprel': 'case',
  'deps': '_',
  'ppos': 'ADP'},
 {'id': '6',
  'form': 'GoogleOS',
  'lemma': 'GoogleOS',
  'u

## Evaluation

We carry out the evaluation by comparing the value of two keys: the truth and the prediction

In [19]:
def evaluate(sentences, truth_key='upos', pred_key='ppos'):
    """
    Computes the accuracy
    :param sentences:
    :param gold:
    :param system:
    :return:
    """
    bad = 0
    good = 0
    for sentence in sentences:
        for word in sentence:
            if word[truth_key] == word[pred_key]:
                good += 1
            else:
                bad += 1
    return good, bad

In [20]:
truth_key = 'upos'
pred_key = ppos_key

In [21]:
good, bad = evaluate(test_annotated, truth_key=truth_key, pred_key=pred_key)
print('Accuracy:', good / (good + bad))

Accuracy: 0.8602239245727755
