# Chapter 14: Part-of-Speech and Sequence Annotation
A POS tagger using logistic regression

Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

## Modules

We import the scikit-learn components, the Universal Dependencies dataset loaders, and a CoNLL dictorizer

In [1]:
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import ud_datasets
from conll_dictorizer import CoNLLDictorizer

## Loading the corpus

We load a corpus from Universal Dependencies (https://universaldependencies.org/), hosted on GitHub

In [2]:
# Corpus selector; the loading code below recognizes 'EWT', 'Talbanken', and 'GSD'
CORPUS = 'EWT'

In [3]:
# Load the training, validation, and test splits of the selected corpus,
# together with the names of its columns.
if CORPUS == 'EWT':
    train_sentences, val_sentences, test_sentences, column_names = ud_datasets.load_ud_en_ewt()
elif CORPUS == 'Talbanken':
    train_sentences, val_sentences, test_sentences, column_names = ud_datasets.load_ud_sv_talbanken()
elif CORPUS == 'GSD':
    train_sentences, val_sentences, test_sentences, column_names = ud_datasets.load_ud_fr_gsd()
else:
    # Fail here with a clear message instead of a NameError in a later cell
    raise ValueError(
        f"Unknown corpus {CORPUS!r}; expected 'EWT', 'Talbanken', or 'GSD'")

In [4]:
# Column names returned by the loader; they become the dictionary keys below
column_names

['ID',
 'FORM',
 'LEMMA',
 'UPOS',
 'XPOS',
 'FEATS',
 'HEAD',
 'DEPREL',
 'HEAD',
 'DEPS',
 'MISC']

In [5]:
# Peek at the raw training corpus: the first 500 characters of the CoNLL-U text
train_sentences[:500]

'# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000\n# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001\n# newpar id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-p0001\n# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.\n1\tAl\tAl\tPROPN\tNNP\tNumber=Sing\t0\troot\t0:root\tSpaceAfter=No\n2\t-\t-\tPUNCT\tHYPH\t_\t3\tpunct\t3:punct\tSpaceAfter=No\n3\tZaman\tZam'

## Dictorizing the corpus

We store each word of the corpus in a dictionary whose keys are the CoNLL-U column names

In [6]:
# Converter from the raw CoNLL-U text to per-word dictionaries keyed by column name.
# NOTE(review): the column_names displayed above has 11 entries with 'HEAD' listed
# twice, and in the dictionaries shown below 'HEAD' holds values such as '0:root'
# while 'DEPS' holds 'SpaceAfter=No'. This looks like a shift in the loader's
# column names -- confirm in ud_datasets. The FORM and UPOS values used by the
# tagger appear correct in the displayed output.
conll_dict = CoNLLDictorizer(column_names)

In [7]:
# Convert each split into a list of sentences, where a sentence is a list of
# per-word dictionaries
train_dict = conll_dict.transform(train_sentences)
val_dict = conll_dict.transform(val_sentences)
test_dict = conll_dict.transform(test_sentences)

In [8]:
# First ten words of the first training sentence
train_dict[0][:10]

[{'ID': '1',
  'FORM': 'Al',
  'LEMMA': 'Al',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '0:root',
  'DEPREL': 'root',
  'DEPS': 'SpaceAfter=No'},
 {'ID': '2',
  'FORM': '-',
  'LEMMA': '-',
  'UPOS': 'PUNCT',
  'XPOS': 'HYPH',
  'FEATS': '_',
  'HEAD': '3:punct',
  'DEPREL': 'punct',
  'DEPS': 'SpaceAfter=No'},
 {'ID': '3',
  'FORM': 'Zaman',
  'LEMMA': 'Zaman',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '1:flat',
  'DEPREL': 'flat',
  'DEPS': '_'},
 {'ID': '4',
  'FORM': ':',
  'LEMMA': ':',
  'UPOS': 'PUNCT',
  'XPOS': ':',
  'FEATS': '_',
  'HEAD': '7:punct',
  'DEPREL': 'punct',
  'DEPS': '_'},
 {'ID': '5',
  'FORM': 'American',
  'LEMMA': 'American',
  'UPOS': 'ADJ',
  'XPOS': 'JJ',
  'FEATS': 'Degree=Pos',
  'HEAD': '6:amod',
  'DEPREL': 'amod',
  'DEPS': '_'},
 {'ID': '6',
  'FORM': 'forces',
  'LEMMA': 'force',
  'UPOS': 'NOUN',
  'XPOS': 'NNS',
  'FEATS': 'Number=Plur',
  'HEAD': '7:nsubj',
  'DEPREL': 'nsubj',
  'DEPS

## Feature extraction

We extract the features and store them in a dictionary. The $y$ output is extracted from the `UPOS` column of the Universal Dependencies annotation.

In [9]:
def extract_cols(sent_dict, x='FORM', y='UPOS'):
    """Split a sentence into two parallel columns.

    Args:
        sent_dict: iterable of word dictionaries (one per word).
        x: key of the input column; every word must have it (a missing
            key raises KeyError).
        y: key of the target column; a word without it yields None.

    Returns:
        A pair (inputs, targets) of lists with one item per word.
    """
    # One pass over the words, so that one-shot iterables are supported too
    pairs = [(word[x], word.get(y)) for word in sent_dict]
    inputs = [x_value for x_value, _ in pairs]
    targets = [y_value for _, y_value in pairs]
    return inputs, targets

In [10]:
# One (words, tags) pair per training sentence
train_cols = [extract_cols(sent_dict)
              for sent_dict in train_dict]

In [11]:
# Regroup the pairs into two parallel sequences: all the word lists, all the tag lists
train_sent_words, train_sent_pos = zip(*train_cols)

In [12]:
# Words of one training sentence, used as an example in the cells below
train_sent_words[8131]

['Or', 'you', 'can', 'visit', 'temples', 'or', 'shrines', 'in', 'Okinawa', '.']

In [13]:
# UPOS tags of the same sentence, one per word
train_sent_pos[8131]

['CCONJ',
 'PRON',
 'AUX',
 'VERB',
 'NOUN',
 'CCONJ',
 'NOUN',
 'ADP',
 'PROPN',
 'PUNCT']

## The `X` Matrix

In [14]:
def create_X_cat(sentence: list[str],
                 w_size: int = 2) -> list[dict[int, str]]:
    """Build one context-window feature dictionary per word of a sentence.

    The sentence is padded with `w_size` '__BOS__' symbols on the left and
    `w_size` '__EOS__' symbols on the right, so that every word has a full
    window of 2 * w_size + 1 tokens centered on it.

    Args:
        sentence: the list of words of the sentence.
        w_size: number of context words taken on each side of the word.

    Returns:
        A list with one dictionary per word. The keys are the positions in
        the window, 0 to 2 * w_size, and the values the tokens found there;
        the word itself is at position w_size.
    """
    start_pads = ['__BOS__'] * w_size
    end_pads = ['__EOS__'] * w_size
    padded = start_pads + sentence + end_pads
    width = 2 * w_size + 1
    # The window of word i starts at index i of the padded sentence
    return [{j: padded[i + j] for j in range(width)}
            for i in range(len(padded) - 2 * w_size)]

In [15]:
# Words of the first training sentence
train_sent_words[0]

['Al',
 '-',
 'Zaman',
 ':',
 'American',
 'forces',
 'killed',
 'Shaikh',
 'Abdullah',
 'al',
 '-',
 'Ani',
 ',',
 'the',
 'preacher',
 'at',
 'the',
 'mosque',
 'in',
 'the',
 'town',
 'of',
 'Qaim',
 ',',
 'near',
 'the',
 'Syrian',
 'border',
 '.']

In [16]:
# Feature windows of the example sentence, with the default context of two
# words on each side
create_X_cat(train_sent_words[8131])

[{0: '__BOS__', 1: '__BOS__', 2: 'Or', 3: 'you', 4: 'can'},
 {0: '__BOS__', 1: 'Or', 2: 'you', 3: 'can', 4: 'visit'},
 {0: 'Or', 1: 'you', 2: 'can', 3: 'visit', 4: 'temples'},
 {0: 'you', 1: 'can', 2: 'visit', 3: 'temples', 4: 'or'},
 {0: 'can', 1: 'visit', 2: 'temples', 3: 'or', 4: 'shrines'},
 {0: 'visit', 1: 'temples', 2: 'or', 3: 'shrines', 4: 'in'},
 {0: 'temples', 1: 'or', 2: 'shrines', 3: 'in', 4: 'Okinawa'},
 {0: 'or', 1: 'shrines', 2: 'in', 3: 'Okinawa', 4: '.'},
 {0: 'shrines', 1: 'in', 2: 'Okinawa', 3: '.', 4: '__EOS__'},
 {0: 'in', 1: 'Okinawa', 2: '.', 3: '__EOS__', 4: '__EOS__'}]

In [17]:
# Tags of the example sentence: one target per feature dictionary
train_sent_pos[8131]

['CCONJ',
 'PRON',
 'AUX',
 'VERB',
 'NOUN',
 'CCONJ',
 'NOUN',
 'ADP',
 'PROPN',
 'PUNCT']

In [18]:
# Flatten the corpus: one feature dictionary per word, over all the sentences
X_train_cat = [row for sent in train_sent_words
               for row in create_X_cat(sent)]

In [19]:
# Flatten the tags in the same sentence order, so that they stay aligned
# with the rows of X_train_cat
y_train_cat = [pos for sent in train_sent_pos
               for pos in sent]

In [20]:
# Number of training examples (one per word)
len(y_train_cat)

207229

In [21]:
# First ten feature dictionaries of the training set
X_train_cat[:10]

[{0: '__BOS__', 1: '__BOS__', 2: 'Al', 3: '-', 4: 'Zaman'},
 {0: '__BOS__', 1: 'Al', 2: '-', 3: 'Zaman', 4: ':'},
 {0: 'Al', 1: '-', 2: 'Zaman', 3: ':', 4: 'American'},
 {0: '-', 1: 'Zaman', 2: ':', 3: 'American', 4: 'forces'},
 {0: 'Zaman', 1: ':', 2: 'American', 3: 'forces', 4: 'killed'},
 {0: ':', 1: 'American', 2: 'forces', 3: 'killed', 4: 'Shaikh'},
 {0: 'American', 1: 'forces', 2: 'killed', 3: 'Shaikh', 4: 'Abdullah'},
 {0: 'forces', 1: 'killed', 2: 'Shaikh', 3: 'Abdullah', 4: 'al'},
 {0: 'killed', 1: 'Shaikh', 2: 'Abdullah', 3: 'al', 4: '-'},
 {0: 'Shaikh', 1: 'Abdullah', 2: 'al', 3: '-', 4: 'Ani'}]

## Vectorizing the Matrix

In [22]:
# Vectorizer that maps the feature dictionaries to a numerical matrix
dict_vectorizer = DictVectorizer()

In [23]:
# Fit the vectorizer on the training features and encode them; the same
# fitted vectorizer is reused to encode the sentences at prediction time
X_train = dict_vectorizer.fit_transform(X_train_cat)

## Training the model

In [24]:
# Fit a logistic regression classifier with scikit-learn's default settings.
# NOTE(review): the recorded run ends with "TOTAL NO. of ITERATIONS REACHED
# LIMIT", i.e. the solver stopped at its iteration cap. As the warning itself
# suggests, passing a larger max_iter should let it converge. The figures
# reported further down come from this unconverged fit.
classifier = linear_model.LogisticRegression()
model = classifier.fit(X_train, y_train_cat)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# Display the fitted estimator
model

## Prediction

### Process the test corpus

We first dictorize the test corpus

In [26]:
# Same transform as the one applied to the test split earlier, repeated here;
# presumably so that the prediction section can be run on its own.
test_dict = conll_dict.transform(test_sentences)
# Words of the first test sentence
test_dict[0]

[{'ID': '1',
  'FORM': 'What',
  'LEMMA': 'what',
  'UPOS': 'PRON',
  'XPOS': 'WP',
  'FEATS': 'PronType=Int',
  'HEAD': '0:root',
  'DEPREL': 'root',
  'DEPS': '_'},
 {'ID': '2',
  'FORM': 'if',
  'LEMMA': 'if',
  'UPOS': 'SCONJ',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '4:mark',
  'DEPREL': 'mark',
  'DEPS': '_'},
 {'ID': '3',
  'FORM': 'Google',
  'LEMMA': 'Google',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '4:nsubj',
  'DEPREL': 'nsubj',
  'DEPS': '_'},
 {'ID': '4',
  'FORM': 'Morphed',
  'LEMMA': 'morph',
  'UPOS': 'VERB',
  'XPOS': 'VBD',
  'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin',
  'HEAD': '1:advcl:if',
  'DEPREL': 'advcl',
  'DEPS': '_'},
 {'ID': '5',
  'FORM': 'Into',
  'LEMMA': 'into',
  'UPOS': 'ADP',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '6:case',
  'DEPREL': 'case',
  'DEPS': '_'},
 {'ID': '6',
  'FORM': 'GoogleOS',
  'LEMMA': 'GoogleOS',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '4:

In [27]:
# One (words, tags) pair per test sentence
test_cols = [extract_cols(sent_dict) for sent_dict in test_dict]

In [28]:
# Parallel sequences of word lists and tag lists for the test set
# NOTE(review): neither name is used again in the visible cells; the
# evaluation reads the tags from test_dict instead.
test_sent_words, test_sent_pos = zip(*test_cols)

### Predict the sentences

In [29]:
def predict_sentence(sentence,
                     model,
                     dict_vectorizer,
                     ppos_key='PPOS'):
    """Tag the words of a sentence and store the predictions in place.

    Args:
        sentence: list of word dictionaries; each must have a 'FORM' key.
        model: fitted classifier exposing predict().
        dict_vectorizer: fitted vectorizer exposing transform(); it must be
            the one used to encode the training features.
        ppos_key: key under which the predicted tag is stored in each word
            dictionary.

    Returns:
        The same `sentence` object, whose word dictionaries have been
        updated with the predicted tag under `ppos_key`.
    """
    # Nothing to tag: return early rather than hand an empty batch to the
    # vectorizer and the classifier
    if len(sentence) == 0:
        return sentence
    sent_words, _ = extract_cols(sentence)
    X_cat = create_X_cat(sent_words)
    X = dict_vectorizer.transform(X_cat)
    y_pred_vec = model.predict(X)
    # We add the predictions in the PPOS column
    for row, y_pred in zip(sentence, y_pred_vec):
        row[ppos_key] = y_pred
    return sentence

In [30]:
# Tag the first test sentence; the dictionaries of test_dict[0] are updated
# in place with a PPOS key
predict_sentence(test_dict[0], model, dict_vectorizer)

[{'ID': '1',
  'FORM': 'What',
  'LEMMA': 'what',
  'UPOS': 'PRON',
  'XPOS': 'WP',
  'FEATS': 'PronType=Int',
  'HEAD': '0:root',
  'DEPREL': 'root',
  'DEPS': '_',
  'PPOS': 'PRON'},
 {'ID': '2',
  'FORM': 'if',
  'LEMMA': 'if',
  'UPOS': 'SCONJ',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '4:mark',
  'DEPREL': 'mark',
  'DEPS': '_',
  'PPOS': 'SCONJ'},
 {'ID': '3',
  'FORM': 'Google',
  'LEMMA': 'Google',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '4:nsubj',
  'DEPREL': 'nsubj',
  'DEPS': '_',
  'PPOS': 'PROPN'},
 {'ID': '4',
  'FORM': 'Morphed',
  'LEMMA': 'morph',
  'UPOS': 'VERB',
  'XPOS': 'VBD',
  'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin',
  'HEAD': '1:advcl:if',
  'DEPREL': 'advcl',
  'DEPS': '_',
  'PPOS': 'VERB'},
 {'ID': '5',
  'FORM': 'Into',
  'LEMMA': 'into',
  'UPOS': 'ADP',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '6:case',
  'DEPREL': 'case',
  'DEPS': '_',
  'PPOS': 'NOUN'},
 {'ID': '6',
  'FORM': 'GoogleOS',
  'LEM

In [31]:
# Tag every test sentence. predict_sentence() writes the predicted tag into
# the word dictionaries and returns the very same list, so test_dict is
# updated in place and the return value is not needed.
for sentence in test_dict:
    predict_sentence(sentence, model, dict_vectorizer)

## Evaluate the prediction

In [32]:
# First two test sentences, now carrying the predicted PPOS tags
test_dict[:2]

[[{'ID': '1',
   'FORM': 'What',
   'LEMMA': 'what',
   'UPOS': 'PRON',
   'XPOS': 'WP',
   'FEATS': 'PronType=Int',
   'HEAD': '0:root',
   'DEPREL': 'root',
   'DEPS': '_',
   'PPOS': 'PRON'},
  {'ID': '2',
   'FORM': 'if',
   'LEMMA': 'if',
   'UPOS': 'SCONJ',
   'XPOS': 'IN',
   'FEATS': '_',
   'HEAD': '4:mark',
   'DEPREL': 'mark',
   'DEPS': '_',
   'PPOS': 'SCONJ'},
  {'ID': '3',
   'FORM': 'Google',
   'LEMMA': 'Google',
   'UPOS': 'PROPN',
   'XPOS': 'NNP',
   'FEATS': 'Number=Sing',
   'HEAD': '4:nsubj',
   'DEPREL': 'nsubj',
   'DEPS': '_',
   'PPOS': 'PROPN'},
  {'ID': '4',
   'FORM': 'Morphed',
   'LEMMA': 'morph',
   'UPOS': 'VERB',
   'XPOS': 'VBD',
   'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin',
   'HEAD': '1:advcl:if',
   'DEPREL': 'advcl',
   'DEPS': '_',
   'PPOS': 'VERB'},
  {'ID': '5',
   'FORM': 'Into',
   'LEMMA': 'into',
   'UPOS': 'ADP',
   'XPOS': 'IN',
   'FEATS': '_',
   'HEAD': '6:case',
   'DEPREL': 'case',
   'DEPS': '_',
   'PPOS': 

In [33]:
# Collect the annotated tags (UPOS) and the predicted tags (PPOS) of the whole
# test set as two flat, aligned lists. extract_cols() is reused with other
# columns: x selects the annotated tag and y the predicted one.
(y_true, y_pred) = ([], [])
for sent in test_dict:
    a, b = extract_cols(sent, x='UPOS', y='PPOS')
    y_true += a
    y_pred += b

In [34]:
# Precision, recall, and F1 score for each tag on the test set
print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

         ADJ     0.8878    0.7964    0.8396      1788
         ADP     0.9364    0.9553    0.9457      2034
         ADV     0.9155    0.8384    0.8753      1176
         AUX     0.9792    0.9773    0.9783      1543
       CCONJ     0.9973    0.9864    0.9918       737
         DET     0.9873    0.9821    0.9847      1897
        INTJ     0.9775    0.7250    0.8325       120
        NOUN     0.7709    0.9248    0.8409      4137
         NUM     0.9196    0.6753    0.7787       542
        PART     0.9407    0.9784    0.9592       649
        PRON     0.9782    0.9773    0.9778      2162
       PROPN     0.7842    0.6683    0.7216      2077
       PUNCT     0.9938    0.9829    0.9883      3096
       SCONJ     0.8902    0.7604    0.8202       384
         SYM     0.8155    0.7706    0.7925       109
        VERB     0.8949    0.9079    0.9013      2606
           X     0.0000    0.0000    0.0000        39
           _     1.0000    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Applying the model to sentences

A few sentences:

In [35]:
# Example sentences, already tokenized: the tokens are separated by spaces.
# NOTE(review): 'simwo' and 'collapsex' look like deliberately made-up words,
# presumably to show how words unseen in training are tagged from their
# context -- confirm.
sentences = ['That round table might collapse .',
             'That man can learn well .',
             'This man can swim well .',
             'The man can simwo .',
             'That round table might collapsex .']

We convert them into CoNLL-like tables

In [36]:
def sentence_to_conll(sentence: str):
    """Convert a whitespace-tokenized sentence into a CoNLL-like table.

    Args:
        sentence: a string whose tokens are separated by whitespace.

    Returns:
        A list with one dictionary per token, holding its rank in the
        sentence (an integer starting at 1) under 'ID' and the token
        itself under 'FORM'.
    """
    sent_dict = []
    for rank, token in enumerate(sentence.split(), start=1):
        sent_dict.append({'ID': rank, 'FORM': token})
    return sent_dict

And we tag them

In [37]:
# Tag each example sentence and print its words followed by the predicted tags
for sentence in sentences:
    # The loop variable is rebound from the raw string to its CoNLL-like table
    sentence = sentence_to_conll(sentence)
    y_test_pred_cat = predict_sentence(sentence,
                                       model,
                                       dict_vectorizer)
    print([y['FORM'] for y in y_test_pred_cat])
    print([y['PPOS'] for y in y_test_pred_cat])

['That', 'round', 'table', 'might', 'collapse', '.']
['PRON', 'NOUN', 'NOUN', 'AUX', 'VERB', 'PUNCT']
['That', 'man', 'can', 'learn', 'well', '.']
['DET', 'NOUN', 'AUX', 'VERB', 'ADV', 'PUNCT']
['This', 'man', 'can', 'swim', 'well', '.']
['DET', 'NOUN', 'AUX', 'VERB', 'ADV', 'PUNCT']
['The', 'man', 'can', 'simwo', '.']
['DET', 'NOUN', 'AUX', 'VERB', 'PUNCT']
['That', 'round', 'table', 'might', 'collapsex', '.']
['PRON', 'NOUN', 'NOUN', 'AUX', 'VERB', 'PUNCT']
