# Part-of-speech tagging with logistic regression

Author: Pierre Nugues

A simple POS tagger using a context of five words and logistic regression. We use GloVe embeddings to represent the words.

## Imports

In [1]:
import sys
import os
import time
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
import datasets
from context_dictorizer import ContextDictorizer, evaluate
from ch06.python.conll_dictorizer import CoNLLDictorizer
import numpy as np
from tqdm import tqdm

## Loading the corpus

In [2]:
CORPUS = 'EWT'  # 'EWT' or 'PTB' # The English Web Treebank or the Penn Treebank

In [3]:
if CORPUS == 'EWT':
    train_sentences, dev_sentences, test_sentences, column_names = datasets.load_ud_en_ewt()
else:
    train_sentences, dev_sentences, test_sentences, column_names = datasets.load_conll2009_pos()

In [4]:
column_names

['id',
 'form',
 'lemma',
 'upos',
 'xpos',
 'feats',
 'head',
 'deprel',
 'head',
 'deps',
 'misc']

In [5]:
train_sentences[:500]

'# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000\n# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001\n# newpar id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-p0001\n# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.\n1\tAl\tAl\tPROPN\tNNP\tNumber=Sing\t0\troot\t0:root\tSpaceAfter=No\n2\t-\t-\tPUNCT\tHYPH\t_\t1\tpunct\t1:punct\tSpaceAfter=No\n3\tZaman\tZam'

## Dictorizing the corpus

We store the corpus word in a dictionary, where the keys are the CoNLL-U columns

In [6]:
conll_dict = CoNLLDictorizer(column_names)
train_dict = conll_dict.transform(train_sentences)

In [7]:
train_dict[0][:5]

[{'id': '1',
  'form': 'Al',
  'lemma': 'Al',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '0:root',
  'deprel': 'root',
  'deps': 'SpaceAfter=No'},
 {'id': '2',
  'form': '-',
  'lemma': '-',
  'upos': 'PUNCT',
  'xpos': 'HYPH',
  'feats': '_',
  'head': '1:punct',
  'deprel': 'punct',
  'deps': 'SpaceAfter=No'},
 {'id': '3',
  'form': 'Zaman',
  'lemma': 'Zaman',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '1:flat',
  'deprel': 'flat',
  'deps': '_'},
 {'id': '4',
  'form': ':',
  'lemma': ':',
  'upos': 'PUNCT',
  'xpos': ':',
  'feats': '_',
  'head': '1:punct',
  'deprel': 'punct',
  'deps': '_'},
 {'id': '5',
  'form': 'American',
  'lemma': 'American',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'feats': 'Degree=Pos',
  'head': '6:amod',
  'deprel': 'amod',
  'deps': '_'}]

## Feature extraction

We extract the features and we store them in a dictionary

In [8]:
context_dictorizer = ContextDictorizer(output='upos', w_size=2)
context_dictorizer.fit(train_dict)
# Feature and response extraction
X_dict, y = context_dictorizer.transform(train_dict)

We print the features to check they match Table 8.1 in my book (second edition)
We use the training step extraction with the dynamic features

In [9]:
#context_dictorizer.print_example(train_dict)
for i in range(10):
    print(str(X_dict[i]) + '\t' + y[i])

{'form_0': '__bos__', 'form_1': '__bos__', 'form_2': 'al', 'form_3': '-', 'form_4': 'zaman'}	PROPN
{'form_0': '__bos__', 'form_1': 'al', 'form_2': '-', 'form_3': 'zaman', 'form_4': ':'}	PUNCT
{'form_0': 'al', 'form_1': '-', 'form_2': 'zaman', 'form_3': ':', 'form_4': 'american'}	PROPN
{'form_0': '-', 'form_1': 'zaman', 'form_2': ':', 'form_3': 'american', 'form_4': 'forces'}	PUNCT
{'form_0': 'zaman', 'form_1': ':', 'form_2': 'american', 'form_3': 'forces', 'form_4': 'killed'}	ADJ
{'form_0': ':', 'form_1': 'american', 'form_2': 'forces', 'form_3': 'killed', 'form_4': 'shaikh'}	NOUN
{'form_0': 'american', 'form_1': 'forces', 'form_2': 'killed', 'form_3': 'shaikh', 'form_4': 'abdullah'}	VERB
{'form_0': 'forces', 'form_1': 'killed', 'form_2': 'shaikh', 'form_3': 'abdullah', 'form_4': 'al'}	PROPN
{'form_0': 'killed', 'form_1': 'shaikh', 'form_2': 'abdullah', 'form_3': 'al', 'form_4': '-'}	PROPN
{'form_0': 'shaikh', 'form_1': 'abdullah', 'form_2': 'al', 'form_3': '-', 'form_4': 'ani'}	PROPN


In [10]:
X_dict[0]

{'form_0': '__bos__',
 'form_1': '__bos__',
 'form_2': 'al',
 'form_3': '-',
 'form_4': 'zaman'}

## Embeddings

We transform the symbols into numbers

In [11]:
DIM = 100

In [12]:
glove_vec = datasets.load_glove_vectors(dim=DIM)

In [13]:
glove_vec['table']

array([-0.61454  ,  0.89693  ,  0.56771  ,  0.39102  , -0.22437  ,
        0.49035  ,  0.10868  ,  0.27411  , -0.23833  , -0.52153  ,
        0.73551  , -0.32654  ,  0.51304  ,  0.32415  , -0.46709  ,
        0.68051  , -0.25497  , -0.040484 , -0.54418  , -1.0548   ,
       -0.46692  ,  0.23557  ,  0.31234  , -0.34537  ,  0.14793  ,
       -0.53745  , -0.43215  , -0.48724  , -0.51019  , -0.9051   ,
       -0.17919  , -0.018376 ,  0.09719  , -0.31623  ,  0.7512   ,
        0.92236  , -0.49965  ,  0.14036  , -0.28296  , -0.97443  ,
       -0.0094408, -0.62944  ,  0.14711  , -0.94376  ,  0.0075222,
        0.18565  , -0.99172  ,  0.072789 , -0.18474  , -0.52901  ,
        0.38995  , -0.45677  , -0.21932  ,  1.3723   , -0.29636  ,
       -2.2342   , -0.36667  ,  0.04987  ,  0.63421  ,  0.53275  ,
       -0.53955  ,  0.31398  , -0.44698  , -0.38389  ,  0.066668 ,
       -0.02168  ,  0.20558  ,  0.59456  , -0.24892  , -0.52795  ,
       -0.3761   ,  0.077104 ,  0.75222  , -0.2647   , -0.0587

We extract the unique words and see if they are in GloVe

In [14]:
words = []
for sentence in train_dict:
    for word in sentence:
        words += [word['form'].lower()]

In [15]:
len(words)

207224

In [16]:
unique_words = set(words)
len(unique_words)

17111

In [17]:
unk_cnt = 0
for word in unique_words:
    if word not in glove_vec:
        unk_cnt += 1
unk_cnt

2007

We create random vectors for the special tokens

In [18]:
mark_tokens = {'__unk__': np.random.uniform(-1, 1, (1,DIM)).astype(np.float32)[0], 
               '__bos__': np.random.uniform(-1, 1, (1,DIM)).astype(np.float32)[0], 
               '__eos__': np.random.uniform(-1, 1, (1,DIM)).astype(np.float32)[0]}

In [19]:
mark_tokens['__unk__']

array([ 0.30558416, -0.20719153,  0.9562487 , -0.3019126 , -0.547967  ,
       -0.08337779, -0.6247664 ,  0.1445823 , -0.06643973,  0.23288693,
        0.11315586, -0.9649046 , -0.31624758, -0.4254379 , -0.37405428,
        0.8577804 , -0.47334865, -0.46018428, -0.5249374 , -0.27850774,
       -0.6411228 , -0.36215857,  0.6181061 ,  0.22992958, -0.00305828,
        0.5247872 , -0.9450432 , -0.59263706,  0.6969545 ,  0.94673496,
        0.17516367, -0.4254662 , -0.2677989 , -0.19289763, -0.48943555,
        0.4565137 ,  0.06892559, -0.08399918, -0.8954855 , -0.6059567 ,
       -0.73702675, -0.37068933,  0.06296079, -0.07221411, -0.17154104,
       -0.13912769,  0.7133528 , -0.05226912, -0.7306064 ,  0.6642804 ,
       -0.7314352 ,  0.7373011 ,  0.732513  ,  0.22730777,  0.36284977,
        0.45550278, -0.79514295,  0.851078  ,  0.78982085, -0.05651645,
        0.00998254, -0.24247408, -0.7035762 , -0.90906155, -0.3698797 ,
       -0.9981354 , -0.57340664,  0.81710875, -0.44264305,  0.76

## Embedding Matrices

We map the words to their embeddings

In [20]:
def map_embeddings(x_feats_words):
    x_embeddings = []
    for form in ['form_0', 'form_1', 'form_2', 'form_3', 'form_4']:
        #print(X_dict[0][key])
        if x_feats_words[form] in glove_vec:
            x_embeddings += list(glove_vec[x_feats_words[form]])
        elif x_feats_words[form] == '__bos__':
            x_embeddings += list(mark_tokens['__bos__'])
        elif x_feats_words[form] == '__eos__':
            x_embeddings += list(mark_tokens['__eos__'])
        else:
            x_embeddings += list(mark_tokens['__unk__'])
    return x_embeddings

In [21]:
X_emb = []
for x_dict in tqdm(X_dict):
    X_emb += [map_embeddings(x_dict)]

100%|████████████████████████████████| 207224/207224 [00:08<00:00, 24300.85it/s]


In [22]:
X_emb[0]

[0.15426832,
 0.46791613,
 0.55827695,
 0.048383486,
 0.20225365,
 0.7837949,
 0.41214815,
 0.2664557,
 0.9791968,
 0.719599,
 0.36310297,
 0.12455861,
 0.3494414,
 -0.034840103,
 0.18241648,
 0.05140237,
 -0.7763707,
 0.22084323,
 -0.044053063,
 0.9380405,
 -0.86235565,
 0.959623,
 0.3030166,
 -0.5981812,
 -0.46362373,
 -0.33278468,
 -0.2667399,
 -0.90579313,
 0.53356904,
 0.31991467,
 0.78728616,
 0.56640106,
 0.40221268,
 -0.65917754,
 -0.9471683,
 0.70805454,
 0.19053006,
 -0.6830383,
 0.35104567,
 0.58508676,
 -0.84287864,
 0.18092436,
 -0.8353727,
 -0.6951879,
 0.009115021,
 0.058422387,
 -0.39469734,
 -0.85807425,
 0.22544967,
 0.012701015,
 -0.40694708,
 0.36644873,
 0.422661,
 -0.54022586,
 0.07717917,
 -0.8448863,
 0.1103071,
 0.29149973,
 0.95717764,
 0.19376212,
 0.17607318,
 0.81135666,
 -0.021448247,
 0.79296494,
 0.72035646,
 0.67124605,
 0.4822406,
 0.11080468,
 0.18427423,
 0.3139711,
 -0.07899373,
 -0.46738642,
 0.72235656,
 -0.46379134,
 -0.67996764,
 -0.3765637,
 0.

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [24]:
X_emb_scaled = scaler.fit_transform(X_emb)

## Training the model

In [25]:
classifier = linear_model.LogisticRegression(max_iter=500)

In [26]:
#model = classifier.fit(X_emb, y)
model = classifier.fit(X_emb_scaled, y)
model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=500)

## Prediction

### Process the test corpus

We first dictorize the test corpus

In [27]:
test_dict = conll_dict.transform(test_sentences)
test_dict[0][:5]

[{'id': '1',
  'form': 'What',
  'lemma': 'what',
  'upos': 'PRON',
  'xpos': 'WP',
  'feats': 'PronType=Int',
  'head': '0:root',
  'deprel': 'root',
  'deps': '_'},
 {'id': '2',
  'form': 'if',
  'lemma': 'if',
  'upos': 'SCONJ',
  'xpos': 'IN',
  'feats': '_',
  'head': '4:mark',
  'deprel': 'mark',
  'deps': '_'},
 {'id': '3',
  'form': 'Google',
  'lemma': 'Google',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '4:nsubj',
  'deprel': 'nsubj',
  'deps': '_'},
 {'id': '4',
  'form': 'Morphed',
  'lemma': 'morph',
  'upos': 'VERB',
  'xpos': 'VBD',
  'feats': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': '1:advcl:if',
  'deprel': 'advcl',
  'deps': '_'},
 {'id': '5',
  'form': 'Into',
  'lemma': 'into',
  'upos': 'ADP',
  'xpos': 'IN',
  'feats': '_',
  'head': '6:case',
  'deprel': 'case',
  'deps': '_'}]

In [28]:
X_dict_test, y_test = context_dictorizer.transform(test_dict)

In [29]:
X_dict_test[:5]

[{'form_0': '__bos__',
  'form_1': '__bos__',
  'form_2': 'what',
  'form_3': 'if',
  'form_4': 'google'},
 {'form_0': '__bos__',
  'form_1': 'what',
  'form_2': 'if',
  'form_3': 'google',
  'form_4': 'morphed'},
 {'form_0': 'what',
  'form_1': 'if',
  'form_2': 'google',
  'form_3': 'morphed',
  'form_4': 'into'},
 {'form_0': 'if',
  'form_1': 'google',
  'form_2': 'morphed',
  'form_3': 'into',
  'form_4': 'googleos'},
 {'form_0': 'google',
  'form_1': 'morphed',
  'form_2': 'into',
  'form_3': 'googleos',
  'form_4': '?'}]

In [30]:
y_test[:5]

['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP']

In [31]:
X_test_emb = []
for x_dict in tqdm(X_dict_test):
    X_test_emb += [map_embeddings(x_dict)]

100%|██████████████████████████████████| 25455/25455 [00:01<00:00, 19708.46it/s]


### Predict

In [32]:
X_test_emb_scaled = scaler.transform(X_test_emb)

In [33]:
y_test_pred = model.predict(X_test_emb_scaled)
#y_test_pred = model.predict(X_test_emb)

In [34]:
y_test_pred[:20]

array(['PRON', 'SCONJ', 'VERB', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'PRON',
       'SCONJ', 'VERB', 'VERB', 'ADP', 'PRON', 'NOUN', 'PUNCT', 'NOUN',
       'PUNCT', 'CCONJ', 'ADV', 'NOUN'], dtype='<U5')

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

0.884737772539776

In [36]:
# accuracy when nonscaled. Would require more experiments
# 0.8837949322333529

## Applying the model to sentences

In [37]:
sentences = ['That round table might collapse .',
             'The man can learn well .',
             'The man can swim well .',
             'The man can simwo .',
             'that round table might collapsex .']

We create CoNLL-like sentences

In [38]:
def sentence_to_conll(sentence):
    """
    Convert a sentence to a CoNLL dict
    :param sentence:
    :return:
    """
    column_names = ['id', 'form']
    sentence = list(enumerate(sentence.split(), start=1))
    conll_cols = ''
    for tuple in sentence:
        conll_cols += str(tuple[0]) + '\t' + tuple[1] + '\n'

    conll_dict = CoNLLDictorizer(column_names)
    sent_dict = conll_dict.transform(conll_cols)
    return sent_dict[0]

In [39]:
sentence_to_conll(sentences[0])

[{'id': '1', 'form': 'That'},
 {'id': '2', 'form': 'round'},
 {'id': '3', 'form': 'table'},
 {'id': '4', 'form': 'might'},
 {'id': '5', 'form': 'collapse'},
 {'id': '6', 'form': '.'}]

And we tag them

In [41]:
for sentence in sentences:
    sentence_conll = sentence_to_conll(sentence.lower().strip())
    X_feats, y = context_dictorizer.transform([sentence_conll], training_step=False)
    X = []
    for x_dict in X_feats:
        X += [map_embeddings(x_dict)]
    print(sentence)
    print(model.predict(scaler.transform(X)))
    #print(model.predict(X))

That round table might collapse .
['DET' 'ADJ' 'NOUN' 'AUX' 'VERB' 'PUNCT']
The man can learn well .
['DET' 'NOUN' 'AUX' 'VERB' 'ADV' 'PUNCT']
The man can swim well .
['DET' 'NOUN' 'AUX' 'VERB' 'ADV' 'PUNCT']
The man can simwo .
['DET' 'NOUN' 'AUX' 'VERB' 'PUNCT']
that round table might collapsex .
['DET' 'ADJ' 'NOUN' 'VERB' 'VERB' 'PUNCT']
