## Part-of-Speech tagging using CRF

### Data Preparation

In [1]:
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import sklearn
import sklearn

In [2]:
# reading the Treebank tagged sentences
wsj = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# first few tagged sentences
print(wsj[:2])
print(len(wsj))

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]
3914


In [4]:
# reading the Brown tagged sentences
brown= list(nltk.corpus.brown.tagged_sents(tagset='universal'))

In [5]:
# first few tagged sentences
print(brown[:2])
print(len(brown))

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [6]:
# reading the conll2000 tagged sentences
conll2000= list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))

In [7]:
# first few tagged sentences
print(conll2000[:2])
print(len(conll2000))

[[('Confidence', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('pound', 'NOUN'), ('is', 'VERB'), ('widely', 'ADV'), ('expected', 'VERB'), ('to', 'PRT'), ('take', 'VERB'), ('another', 'DET'), ('sharp', 'ADJ'), ('dive', 'NOUN'), ('if', 'ADP'), ('trade', 'NOUN'), ('figures', 'NOUN'), ('for', 'ADP'), ('September', 'NOUN'), (',', '.'), ('due', 'ADJ'), ('for', 'ADP'), ('release', 'NOUN'), ('tomorrow', 'NOUN'), (',', '.'), ('fail', 'VERB'), ('to', 'PRT'), ('show', 'VERB'), ('a', 'DET'), ('substantial', 'ADJ'), ('improvement', 'NOUN'), ('from', 'ADP'), ('July', 'NOUN'), ('and', 'CONJ'), ('August', 'NOUN'), ("'s", 'PRT'), ('near-record', 'ADJ'), ('deficits', 'NOUN'), ('.', '.')], [('Chancellor', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Exchequer', 'NOUN'), ('Nigel', 'NOUN'), ('Lawson', 'NOUN'), ("'s", 'PRT'), ('restated', 'VERB'), ('commitment', 'NOUN'), ('to', 'PRT'), ('a', 'DET'), ('firm', 'NOUN'), ('monetary', 'ADJ'), ('policy', 'NOUN'), ('has', 'VERB'), ('helped', 'VERB'), ('to', 'PRT'), ('prev

In [8]:
nltk_data = wsj + brown + conll2000

In [9]:
print(nltk_data[:2])
print(len(nltk_data))

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]
72202


In [10]:
nltk_data[1]

[('Mr.', 'NOUN'),
 ('Vinken', 'NOUN'),
 ('is', 'VERB'),
 ('chairman', 'NOUN'),
 ('of', 'ADP'),
 ('Elsevier', 'NOUN'),
 ('N.V.', 'NOUN'),
 (',', '.'),
 ('the', 'DET'),
 ('Dutch', 'NOUN'),
 ('publishing', 'VERB'),
 ('group', 'NOUN'),
 ('.', '.')]

### 1. Build your CRF

In [11]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [12]:
# extract features from a given sentence

def word_features(sent, i):
    word = sent[i][0]    
    features = {
            'word':word,
            'bias': 1.0,
            'is_capitalized': word[0].upper() == word[0],
            'is_all_caps': word.upper() == word,
            'is_all_lower': word.lower() == word,
            'is_numeric': word.isdigit(),
            'has_no_digit': None==re.search(r'[0-9]+',word),
            'has_no_punc': None==re.search(r'[\.\,\:\;\(\)\[\]\?\!]+',word),
            'has_no_only_punc': None==re.search(r'^[\.\,\:\;\(\)\[\]\?\!]+$',word),
            'capitals_inside': word[1:].lower() != word[1:],
            'has_no_ing': None==re.search(r'.*ing$', word),             
            'has_no_ed': None==re.search(r'.*ed$', word),              
            'has_no_es': None==re.search(r'.*es$', word),               
            'has_no_ould': None==re.search(r'.*ould$', word),              
            'is_no_possessive':None==re.search(r'.*\'s$', word),              
            'is_no_plural': None==re.search(r'.*s$', word), 
            'is_no_cardinal': None==re.search(r'^-?[0-9]+(.[0-9]+)?$', word),
            'suff_1': word[-1:],  
            'suff_2': word[-2:],  
            'suff_3': word[-3:],  
            'suff_4': word[-4:], 
            'pref_1': word[:1],  
            'pref_2': word[:2],  
            'pref_3': word[:3], 
            'pref_4': word[:4],
            'lemma': stemmer.stem(word),
    }

    if i > 0:
        word1 = sent[i-1][0]

        features.update({   
            '-1:word' : word1,
            '-1:is_capitalized': word1.upper() == word1,
            '-1:is_all_caps': word1.upper() == word1,
            '-1:is_all_lower': word1.lower() == word1,
            '-1:has_no_digit': None==re.search(r'[0-9]+',word1),
            '-1:has_no_punc': None==re.search(r'[\.\,\:\;\(\)\[\]\?\!]+',word1),
            '-1:has_no_only_punc': None==re.search(r'^[\.\,\:\;\(\)\[\]\?\!]+$',word1),
            '-1:capitals_inside': word[1:].lower() != word1[1:],
            '-1:has_no_ing': None==re.search(r'.*ing$', word1),             
            '-1:has_no_ed': None==re.search(r'.*ed$', word1),              
            '-1:has_no_es': None==re.search(r'.*es$', word1),               
            '-1:has_no_ould': None==re.search(r'.*ould$', word1),              
            '-1:is_no_possessive':None==re.search(r'.*\'s$', word1),              
            '-1:is_no_plural': None==re.search(r'.*s$', word1), 
            '-1:is_no_cardinal': None==re.search(r'^-?[0-9]+(.[0-9]+)?$', word1),
            '-1:suff_1': word1[-1:],  
            '-1:suff_2': word1[-2:],  
            '-1:suff_3': word1[-3:],  
            '-1:suff_4': word1[-4:], 
            '-1:pref_1': word1[:1],  
            '-1:pref_2': word1[:2],  
            '-1:pref_3': word1[:3], 
            '-1:pref_4': word1[:4],
            '-1:lemma': stemmer.stem(word1),
        })
    else:
        features.update({   
            '-1:word': '<START>',
        })
    
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word': word1,
            '+1:is_capitalized': word1.upper() == word1,
            '+1:is_all_caps': word1.upper() == word1,
            '+1:is_all_lower': word1.lower() == word1,
            '+1:is_numeric': word1.isdigit(),
            '+1:has_no_digit': None==re.search(r'[0-9]+',word1),
            '+1:has_no_punc': None==re.search(r'[\.\,\:\;\(\)\[\]\?\!]+',word1),
            '+1:has_no_only_punc': None==re.search(r'^[\.\,\:\;\(\)\[\]\?\!]+$',word1),
            '+1:capitals_inside': word[1:].lower() != word1[1:],
            '+1:has_no_ing': None==re.search(r'.*ing$', word1),             
            '+1:has_no_ed': None==re.search(r'.*ed$', word1),              
            '+1:has_no_es': None==re.search(r'.*es$', word1),               
            '+1:has_no_ould': None==re.search(r'.*ould$', word1),              
            '+1:is_no_possessive':None==re.search(r'.*\'s$', word1),              
            '+1:is_no_plural': None==re.search(r'.*s$', word1), 
            '+1:is_no_cardinal': None==re.search(r'^-?[0-9]+(.[0-9]+)?$', word1),
            '+1:suff_1': word1[-1:],  
            '+1:suff_2': word1[-2:],  
            '+1:suff_3': word1[-3:],  
            '+1:suff_4': word1[-4:], 
            '+1:pref_1': word1[:1],  
            '+1:pref_2': word1[:2],  
            '+1:pref_3': word1[:3], 
            '+1:pref_4': word1[:4],
            '+1:lemma': stemmer.stem(word1),
        })
    else:
        features.update({
            '+1:word': '<END>',
        })
    
    return features

In [13]:
def sent2features(sent):
    return [word_features(sent, i) for i in range(len(sent))]

def sent2pos(sent):
    return [postag for token, postag in sent]

def sent2tokens(sent):
    return [token for token, postag in sent]

In [14]:
X = [sent2features(sent) for sent in nltk_data]
y = [sent2pos(sent) for sent in nltk_data]

In [15]:
X[0]

[{'+1:capitals_inside': True,
  '+1:has_no_digit': True,
  '+1:has_no_ed': True,
  '+1:has_no_es': True,
  '+1:has_no_ing': True,
  '+1:has_no_only_punc': True,
  '+1:has_no_ould': True,
  '+1:has_no_punc': True,
  '+1:is_all_caps': False,
  '+1:is_all_lower': False,
  '+1:is_capitalized': False,
  '+1:is_no_cardinal': True,
  '+1:is_no_plural': True,
  '+1:is_no_possessive': True,
  '+1:is_numeric': False,
  '+1:lemma': 'vinken',
  '+1:pref_1': 'V',
  '+1:pref_2': 'Vi',
  '+1:pref_3': 'Vin',
  '+1:pref_4': 'Vink',
  '+1:suff_1': 'n',
  '+1:suff_2': 'en',
  '+1:suff_3': 'ken',
  '+1:suff_4': 'nken',
  '+1:word': 'Vinken',
  '-1:word': '<START>',
  'bias': 1.0,
  'capitals_inside': False,
  'has_no_digit': True,
  'has_no_ed': True,
  'has_no_es': True,
  'has_no_ing': True,
  'has_no_only_punc': True,
  'has_no_ould': True,
  'has_no_punc': True,
  'is_all_caps': False,
  'is_all_lower': False,
  'is_capitalized': True,
  'is_no_cardinal': True,
  'is_no_plural': True,
  'is_no_possess

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
from sklearn.preprocessing import LabelBinarizer

# pip/conda install sklearn_crfsuite
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

In [19]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 16min 45s


In [20]:
labels = list(crf.classes_)

In [21]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9509576492962789

In [22]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

          .      0.999     0.999     0.999     38387
          X      0.981     0.833     0.901      1589
        ADJ      0.860     0.844     0.852     21451
        ADP      0.959     0.967     0.963     36649
        ADV      0.910     0.891     0.900     13706
       VERB      0.952     0.950     0.951     46647
        DET      0.974     0.983     0.978     34071
       CONJ      0.992     0.992     0.992      9518
       NOUN      0.942     0.956     0.949     77256
       PRON      0.981     0.921     0.950     11865
        PRT      0.880     0.874     0.877      8289
        NUM      0.975     0.962     0.969      5776

avg / total      0.951     0.951     0.951    305204



In [None]:
%%time
import scipy
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=3,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


### 2. Evaluate the model performance

In [None]:
test

### 3. Interpret the model (enlist important state and transition features)