# Part-of-speech tagging with logistic regression

Author: Pierre Nugues

A simple POS tagger using a context of five words and logistic regression.

## Imports

In [1]:
import sys
import os
import time
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
import datasets
from context_dictorizer import ContextDictorizer, evaluate
from ch06.python.conll_dictorizer import CoNLLDictorizer

## Loading the corpus

In [2]:
CORPUS = 'EWT'  # 'EWT' or 'PTB' # The English Web Treebank or the Penn Treebank

In [3]:
if CORPUS == 'EWT':
    train_sentences, dev_sentences, test_sentences, column_names = datasets.load_ud_en_ewt()
else:
    train_sentences, dev_sentences, test_sentences, column_names = datasets.load_conll2009_pos()

In [4]:
column_names

['id',
 'form',
 'lemma',
 'upos',
 'xpos',
 'feats',
 'head',
 'deprel',
 'head',
 'deps',
 'misc']

In [5]:
train_sentences[:500]

'# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000\n# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001\n# newpar id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-p0001\n# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.\n1\tAl\tAl\tPROPN\tNNP\tNumber=Sing\t0\troot\t0:root\tSpaceAfter=No\n2\t-\t-\tPUNCT\tHYPH\t_\t1\tpunct\t1:punct\tSpaceAfter=No\n3\tZaman\tZam'

## Dictorizing the corpus

We store the corpus word in a dictionary, where the keys are the CoNLL-U columns

In [6]:
conll_dict = CoNLLDictorizer(column_names)
train_dict = conll_dict.transform(train_sentences)
train_dict[0][:10]

[{'id': '1',
  'form': 'Al',
  'lemma': 'Al',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '0:root',
  'deprel': 'root',
  'deps': 'SpaceAfter=No'},
 {'id': '2',
  'form': '-',
  'lemma': '-',
  'upos': 'PUNCT',
  'xpos': 'HYPH',
  'feats': '_',
  'head': '1:punct',
  'deprel': 'punct',
  'deps': 'SpaceAfter=No'},
 {'id': '3',
  'form': 'Zaman',
  'lemma': 'Zaman',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '1:flat',
  'deprel': 'flat',
  'deps': '_'},
 {'id': '4',
  'form': ':',
  'lemma': ':',
  'upos': 'PUNCT',
  'xpos': ':',
  'feats': '_',
  'head': '1:punct',
  'deprel': 'punct',
  'deps': '_'},
 {'id': '5',
  'form': 'American',
  'lemma': 'American',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'feats': 'Degree=Pos',
  'head': '6:amod',
  'deprel': 'amod',
  'deps': '_'},
 {'id': '6',
  'form': 'forces',
  'lemma': 'force',
  'upos': 'NOUN',
  'xpos': 'NNS',
  'feats': 'Number=Plur',
  'head': '7:nsubj',
  'deprel': 'nsubj',
  'deps

## Feature extraction

We extract the features and we store them in a dictionary. This $y$ output is extracted from the `upos` column for the Universal dependencies or `pos` column for the Penn Treebank.

In [7]:
context_dictorizer = ContextDictorizer(output='upos', w_size=2)
context_dictorizer.fit(train_dict)
# Feature and response extraction
X_dict, y = context_dictorizer.transform(train_dict)

When using the Penn Treebank, we print the features to check they match Table 8.1 in my book (second edition). Use the range: `range(48759, 48795)` then.
We use the training step extraction with the dynamic features

In [8]:
#context_dictorizer.print_example(train_dict)
for i in range(10):
#for i in range(48759, 48795): # For the Penn Treebank
    print(str(X_dict[i]) + '\t' + y[i])

{'form_0': '__bos__', 'form_1': '__bos__', 'form_2': 'al', 'form_3': '-', 'form_4': 'zaman'}	PROPN
{'form_0': '__bos__', 'form_1': 'al', 'form_2': '-', 'form_3': 'zaman', 'form_4': ':'}	PUNCT
{'form_0': 'al', 'form_1': '-', 'form_2': 'zaman', 'form_3': ':', 'form_4': 'american'}	PROPN
{'form_0': '-', 'form_1': 'zaman', 'form_2': ':', 'form_3': 'american', 'form_4': 'forces'}	PUNCT
{'form_0': 'zaman', 'form_1': ':', 'form_2': 'american', 'form_3': 'forces', 'form_4': 'killed'}	ADJ
{'form_0': ':', 'form_1': 'american', 'form_2': 'forces', 'form_3': 'killed', 'form_4': 'shaikh'}	NOUN
{'form_0': 'american', 'form_1': 'forces', 'form_2': 'killed', 'form_3': 'shaikh', 'form_4': 'abdullah'}	VERB
{'form_0': 'forces', 'form_1': 'killed', 'form_2': 'shaikh', 'form_3': 'abdullah', 'form_4': 'al'}	PROPN
{'form_0': 'killed', 'form_1': 'shaikh', 'form_2': 'abdullah', 'form_3': 'al', 'form_4': '-'}	PROPN
{'form_0': 'shaikh', 'form_1': 'abdullah', 'form_2': 'al', 'form_3': '-', 'form_4': 'ani'}	PROPN


We transform the symbols into numbers

In [9]:
dict_vectorizer = DictVectorizer()
X = dict_vectorizer.fit_transform(X_dict)

## Training the model

In [10]:
classifier = linear_model.LogisticRegression()
model = classifier.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
model

LogisticRegression()

## Prediction

### Process the test corpus

We first dictorize the test corpus

In [12]:
test_dict = conll_dict.transform(test_sentences)
test_dict[0]

[{'id': '1',
  'form': 'What',
  'lemma': 'what',
  'upos': 'PRON',
  'xpos': 'WP',
  'feats': 'PronType=Int',
  'head': '0:root',
  'deprel': 'root',
  'deps': '_'},
 {'id': '2',
  'form': 'if',
  'lemma': 'if',
  'upos': 'SCONJ',
  'xpos': 'IN',
  'feats': '_',
  'head': '4:mark',
  'deprel': 'mark',
  'deps': '_'},
 {'id': '3',
  'form': 'Google',
  'lemma': 'Google',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '4:nsubj',
  'deprel': 'nsubj',
  'deps': '_'},
 {'id': '4',
  'form': 'Morphed',
  'lemma': 'morph',
  'upos': 'VERB',
  'xpos': 'VBD',
  'feats': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': '1:advcl:if',
  'deprel': 'advcl',
  'deps': '_'},
 {'id': '5',
  'form': 'Into',
  'lemma': 'into',
  'upos': 'ADP',
  'xpos': 'IN',
  'feats': '_',
  'head': '6:case',
  'deprel': 'case',
  'deps': '_'},
 {'id': '6',
  'form': 'GoogleOS',
  'lemma': 'GoogleOS',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '4:obl:into',
  'deprel'

### Predict the sentences

In [13]:
def predict_sentence(sentence, model,
                     context_dictorizer,
                     dict_vectorizer,
                     ppos_key='ppos'):
    """
    Prediction using the words (lexical values)
    :param sentence:
    :return:
    """
    X_dict, y = context_dictorizer.transform([sentence],
                                             training_step=False)
    X = dict_vectorizer.transform(X_dict)
    y_pred_vec = model.predict(X)
    
    # We add the predictions in the ppos column
    for row, y_pred in zip(sentence, y_pred_vec):
        row[ppos_key] = y_pred

    return sentence

In [14]:
for sentence in test_dict:
    sentence = predict_sentence(sentence,
                                model,
                                context_dictorizer,
                                dict_vectorizer)

In [15]:
test_dict[0]

[{'id': '1',
  'form': 'What',
  'lemma': 'what',
  'upos': 'PRON',
  'xpos': 'WP',
  'feats': 'PronType=Int',
  'head': '0:root',
  'deprel': 'root',
  'deps': '_',
  'ppos': 'PRON'},
 {'id': '2',
  'form': 'if',
  'lemma': 'if',
  'upos': 'SCONJ',
  'xpos': 'IN',
  'feats': '_',
  'head': '4:mark',
  'deprel': 'mark',
  'deps': '_',
  'ppos': 'SCONJ'},
 {'id': '3',
  'form': 'Google',
  'lemma': 'Google',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '4:nsubj',
  'deprel': 'nsubj',
  'deps': '_',
  'ppos': 'PROPN'},
 {'id': '4',
  'form': 'Morphed',
  'lemma': 'morph',
  'upos': 'VERB',
  'xpos': 'VBD',
  'feats': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': '1:advcl:if',
  'deprel': 'advcl',
  'deps': '_',
  'ppos': 'VERB'},
 {'id': '5',
  'form': 'Into',
  'lemma': 'into',
  'upos': 'ADP',
  'xpos': 'IN',
  'feats': '_',
  'head': '6:case',
  'deprel': 'case',
  'deps': '_',
  'ppos': 'ADP'},
 {'id': '6',
  'form': 'GoogleOS',
  'lemma': 'GoogleOS',
  'up

## Evaluate the prediction

In [16]:
good, bad = evaluate(test_dict, 'upos', 'ppos')
print('Accuracy, lexical model:', good / (good + bad))

Accuracy, lexical model: 0.9039874287959143


## Applying the model to sentences

A few sentences:

In [17]:
sentences = ['That round table might collapse .',
             'That man can learn well .',
             'This man can swim well .',
             'The man can simwo .',
             'That round table might collapsex .']

We convert them into CoNLL-like tables

In [18]:
def sentence_to_conll(sentence):
    """
    Convert a sentence to a CoNLL dict
    :param sentence:
    :return:
    """
    column_names = ['id', 'form']
    sentence = list(enumerate(sentence.split(), start=1))
    conll_cols = ''
    for tuple in sentence:
        conll_cols += str(tuple[0]) + '\t' + tuple[1] + '\n'

    conll_dict = CoNLLDictorizer(column_names)
    sent_dict = conll_dict.transform(conll_cols)
    return sent_dict[0]

And we tag them

In [19]:
for sentence in sentences:
    sentence = sentence_to_conll(sentence.lower())
    y_test_pred_cat = predict_sentence(sentence,
                                       model,
                                       context_dictorizer,
                                       dict_vectorizer)
    print([y['form'] for y in y_test_pred_cat])
    print([y['ppos'] for y in y_test_pred_cat])

['that', 'round', 'table', 'might', 'collapse', '.']
['PRON', 'ADJ', 'NOUN', 'AUX', 'VERB', 'PUNCT']
['that', 'man', 'can', 'learn', 'well', '.']
['DET', 'NOUN', 'AUX', 'VERB', 'INTJ', 'PUNCT']
['this', 'man', 'can', 'swim', 'well', '.']
['DET', 'NOUN', 'AUX', 'VERB', 'ADV', 'PUNCT']
['the', 'man', 'can', 'simwo', '.']
['DET', 'NOUN', 'AUX', 'VERB', 'PUNCT']
['that', 'round', 'table', 'might', 'collapsex', '.']
['PRON', 'ADJ', 'NOUN', 'AUX', 'VERB', 'PUNCT']
