In [1]:
# load necessary dependencies
import nltk, re
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from collections import Counter

In [2]:
# Banner naming what this notebook demonstrates.
notebook_title = 'POS Tagging using CRF'
print(notebook_title)

POS Tagging using CRF


In [3]:
# Read NLTK's `treebank` corpus as tagged sentences, asking the reader to map
# its tags onto the 'universal' tagset.
# NOTE(review): presumably requires the NLTK 'treebank' and 'universal_tagset'
# data packages to be installed already — confirm on a fresh environment.
treebank_corpus = nltk.corpus.treebank
tagged_sentences = treebank_corpus.tagged_sents(tagset='universal')
print('Number of tagged sentences = ', len(tagged_sentences))

Number of tagged sentences =  3914


In [4]:
# Flatten the corpus into a single list with one (word, tag) pair per token.
tagged_words = [
    pair
    for tagged_sentence in tagged_sentences
    for pair in tagged_sentence
]
token_count = len(tagged_words)
print('Total number of tagged words = ', token_count)

Total number of tagged words =  100676


In [5]:
# Distinct word forms in the corpus (words are used as-is, so case matters).
vocab = {token for token, _tag in tagged_words}
print('Vocabulary of the corpus = ', len(vocab))

Vocabulary of the corpus =  12408


In [6]:
# Distinct tag labels that occur anywhere in the corpus.
tags = {label for _token, label in tagged_words}
print('The unique tags in the corpus: ', tags)
print('Number of tags in the corpus = ', len(tags))

The unique tags in the corpus:  {'NOUN', 'ADP', 'DET', 'PRT', 'VERB', 'X', 'PRON', '.', 'CONJ', 'ADJ', 'ADV', 'NUM'}
Number of tags in the corpus =  12


In [7]:
# Hold out 20% of the sentences for testing; the fixed seed makes the split
# repeatable from run to run.
TEST_FRACTION = 0.2
SPLIT_SEED = 1234
train_set, test_set = train_test_split(
    tagged_sentences,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)
print('Number of sentences in training data = ', len(train_set))
print('Number of sentences in testing data = ', len(test_set))

Number of sentences in training data =  3131
Number of sentences in testing data =  783


In [8]:
# Define the feature function
def features(sentence, index):
    """Build the feature dict describing the token at ``index`` in ``sentence``.

    Parameters
    ----------
    sentence : list of str
        Untagged tokens of one sentence: ``[w1, w2, w3, ...]``.
    index : int
        Position of the token within ``sentence``.

    Returns
    -------
    dict
        Feature name -> value. Boolean flags are stored as 0/1 ints; the
        neighbouring words and the prefixes/suffixes are stored as strings.

    Raises
    ------
    IndexError
        If ``index`` is outside ``sentence``.

    Notes
    -----
    * ``is_complete_capital`` is 1 whenever ``word.upper() == word``, which is
      also true for tokens that contain no letters (digits, punctuation).
    * ``is_alphanumeric`` is 1 when the token contains at least one ASCII
      letter; it does not require a digit.
    * An empty-string token yields empty prefixes/suffixes instead of raising.
    """
    word = sentence[index]
    at_start = index == 0
    at_end = index == len(sentence) - 1
    return {
        # Slices ([:1], [-1:]) rather than [0] / [-1] so an empty token is safe;
        # for any non-empty token the values are identical.
        'is_first_capital': int(word[:1].isupper()),
        'is_first_word': int(at_start),
        'is_last_word': int(at_end),
        'is_complete_capital': int(word.upper() == word),
        # Sentence boundaries are encoded as an empty neighbour word.
        'prev_word': '' if at_start else sentence[index - 1],
        'next_word': '' if at_end else sentence[index + 1],
        'is_numeric': int(word.isdigit()),
        # The lookahead succeeds if any ASCII letter appears in the token.
        'is_alphanumeric': int(bool(re.match('^(?=.*[a-zA-Z])', word))),
        'prefix_1': word[:1],
        'prefix_2': word[:2],
        'prefix_3': word[:3],
        'prefix_4': word[:4],
        'suffix_1': word[-1:],
        'suffix_2': word[-2:],
        'suffix_3': word[-3:],
        'suffix_4': word[-4:],
        'word_has_hyphen': 1 if '-' in word else 0
    }

In [9]:
# Separate labels and the sentences in both training and testing data
def untag(sentence):
    """Return just the words of a tagged sentence, in their original order.

    ``sentence`` is an iterable of ``(word, tag)`` pairs; the tags are dropped.
    """
    words = []
    for token, _tag in sentence:
        words.append(token)
    return words

def prepare_data(tagged_sentss):
    """Split tagged sentences into parallel feature and label sequences.

    Parameters
    ----------
    tagged_sentss : iterable of sequences of (word, tag) pairs
        The tagged sentences to convert.

    Returns
    -------
    tuple of (list, list)
        ``X`` holds, per sentence, one ``features(...)`` dict per token;
        ``y`` holds, per sentence, the matching list of tags.
    """
    X, y = [], []
    for sent in tagged_sentss:
        # Strip the tags once per sentence rather than once per token.
        words = untag(sent)
        X.append([features(words, index) for index in range(len(sent))])
        y.append([tag for word, tag in sent])
    return X, y

In [10]:
# Convert each split into per-sentence lists of feature dicts (X) and tags (y).
X_train, y_train = prepare_data(train_set)
X_test, y_test = prepare_data(test_set)

# Preview the first training sentence to eyeball the feature layout.
first_features, first_labels = X_train[0], y_train[0]
print('X training data: ', first_features)
print('y training data: ', first_labels)

X training data:  [{'is_first_capital': 1, 'is_first_word': 1, 'is_last_word': 0, 'is_complete_capital': 0, 'prev_word': '', 'next_word': 'Wall', 'is_numeric': 0, 'is_alphanumeric': 1, 'prefix_1': 'O', 'prefix_2': 'On', 'prefix_3': 'On', 'prefix_4': 'On', 'suffix_1': 'n', 'suffix_2': 'On', 'suffix_3': 'On', 'suffix_4': 'On', 'word_has_hyphen': 0}, {'is_first_capital': 1, 'is_first_word': 0, 'is_last_word': 0, 'is_complete_capital': 0, 'prev_word': 'On', 'next_word': 'Street', 'is_numeric': 0, 'is_alphanumeric': 1, 'prefix_1': 'W', 'prefix_2': 'Wa', 'prefix_3': 'Wal', 'prefix_4': 'Wall', 'suffix_1': 'l', 'suffix_2': 'll', 'suffix_3': 'all', 'suffix_4': 'Wall', 'word_has_hyphen': 0}, {'is_first_capital': 1, 'is_first_word': 0, 'is_last_word': 0, 'is_complete_capital': 0, 'prev_word': 'Wall', 'next_word': 'men', 'is_numeric': 0, 'is_alphanumeric': 1, 'prefix_1': 'S', 'prefix_2': 'St', 'prefix_3': 'Str', 'prefix_4': 'Stre', 'suffix_1': 't', 'suffix_2': 'et', 'suffix_3': 'eet', 'suffix_4': 

In [11]:
# Fit a CRF model. The hyperparameters below are set explicitly rather than
# left at the library defaults.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = CRF(**crf_params)
# Kept as the cell's last expression so the fitted estimator's repr is shown.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.01, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [12]:
# Predict tags for both splits, then report the weighted F1 over the labels the
# model has learned (crf.classes_).
y_pred_train = crf.predict(X_train)
y_pred = crf.predict(X_test)

f1_options = dict(average='weighted', labels=crf.classes_)
train_f1 = metrics.flat_f1_score(y_train, y_pred_train, **f1_options)
test_f1 = metrics.flat_f1_score(y_test, y_pred, **f1_options)

print('F1 score on train data: ', train_f1)
print('F1 score on test data: ', test_f1)

F1 score on train data:  0.9963656348416452
F1 score on test data:  0.9740179687208006


In [13]:
# Accuracy of the predicted tags against the gold tags, on both splits.
train_accuracy = metrics.flat_accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print('Score on y_train and y_pred_train: ', train_accuracy)
print('Score on y_test and y_pred: ', test_accuracy)

Score on y_train and y_pred_train:  0.9963688461823481
Score on y_test and y_pred:  0.974124809741248


In [14]:
# Per-tag precision / recall / F1 on the test split, to three decimals.
class_report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=crf.classes_,
    digits=3,
)
print('Classwise scores: ', class_report)



Classwise scores:                precision    recall  f1-score   support

         ADP      0.979     0.985     0.982      1869
        NOUN      0.966     0.977     0.972      5606
        CONJ      0.994     0.994     0.994       480
        VERB      0.964     0.961     0.962      2722
         ADJ      0.909     0.877     0.893      1274
           .      1.000     1.000     1.000      2354
           X      1.000     0.996     0.998      1278
         NUM      0.993     0.993     0.993       671
         DET      0.994     0.995     0.994      1695
         ADV      0.929     0.911     0.920       585
        PRON      0.998     0.998     0.998       562
         PRT      0.984     0.982     0.983       614

    accuracy                          0.974     19710
   macro avg      0.976     0.972     0.974     19710
weighted avg      0.974     0.974     0.974     19710



In [17]:
# Learned transition weights: how many there are, plus the ten highest- and
# ten lowest-weighted (tag, next tag) pairs.
transition_weights = Counter(crf.transition_features_)
print('Number of transition features: ', len(transition_weights))
print()
print('Top 10 transition features: ', transition_weights.most_common(10))
print()
print('Bottom 10 transition features: ', transition_weights.most_common()[-10:])

Number of transition features:  144

Top 10 transition features:  [(('VERB', 'PRT'), 2.852403), (('ADJ', 'NOUN'), 2.733202), (('X', 'VERB'), 2.222562), (('PRON', 'VERB'), 1.890524), (('NOUN', 'PRT'), 1.82336), (('DET', 'X'), 1.612082), (('NUM', 'NUM'), 1.565432), (('NUM', 'NOUN'), 1.56025), (('ADP', 'PRON'), 1.494321), (('ADP', 'NOUN'), 1.475512)]

Bottom 10 transition features:  [(('NOUN', 'ADJ'), -1.356654), (('PRON', 'PRT'), -1.368557), (('X', 'NOUN'), -1.536805), (('ADV', 'NOUN'), -1.540835), (('ADJ', 'PRON'), -1.543827), (('CONJ', 'X'), -1.620481), (('DET', 'ADP'), -1.791202), (('ADP', 'X'), -2.604306), (('.', 'PRT'), -2.704322), (('DET', 'PRT'), -3.860879)]


In [18]:
# Learned state weights: how many there are, plus the ten highest- and ten
# lowest-weighted (feature, tag) pairs.
state_weights = Counter(crf.state_features_)
print('Number of state features: ', len(state_weights))
print()
print('Top 10 state features: ', state_weights.most_common(10))
print()
print('Bottom 10 state features: ', state_weights.most_common()[-10:])

Number of state features:  32285

Top 10 state features:  [(('prev_word:will', 'VERB'), 6.758106), (('prefix_1:*', 'X'), 6.250123), (('prev_word:would', 'VERB'), 5.910319), (('suffix_4:rest', 'NOUN'), 5.619756), (('suffix_2:ly', 'ADV'), 5.339965), (('prev_word:could', 'VERB'), 4.994689), (('suffix_3:ous', 'ADJ'), 4.864249), (('prev_word:to', 'VERB'), 4.536444), (('prev_word:how', 'PRT'), 4.420125), (('suffix_4:will', 'VERB'), 4.402834)]

Bottom 10 state features:  [(('prev_word:their', 'VERB'), -2.751858), (('prev_word:was', 'NOUN'), -2.754931), (('next_word:currency', 'NOUN'), -2.788666), (('suffix_4:good', 'NOUN'), -2.902023), (('next_word:of', 'PRT'), -3.215567), (('suffix_4:rter', 'ADJ'), -3.28299), (('prev_word:*U*', 'VERB'), -3.312148), (('next_word:swap', 'ADJ'), -3.391222), (('prev_word:his', 'VERB'), -3.648272), (('word_has_hyphen', 'VERB'), -4.765276)]
