Load the Browns corpus from NLTK (nltk.corpus.brown) with fiction category (pass the category to the loader functions). 
From the corpus, load the tagged and untagged sentences. 
Make sure that the tags are using the universal tag set. 

In [1]:
import nltk
nltk.download('book', quiet=True)
nltk.corpus.brown

<CategorizedTaggedCorpusReader in 'C:\\Users\\USER\\AppData\\Roaming\\nltk_data\\corpora\\brown'>

In [2]:
DATA = nltk.corpus.brown.sents()
DATA_TAGGED = nltk.corpus.brown.tagged_sents(tagset='universal')

In [3]:
train_split = int(len(DATA_TAGGED) * 0.75)
DATA_TRAIN = DATA_TAGGED[:train_split]
DATA_TEST = DATA_TAGGED[train_split:]

In [4]:
len(DATA), len(DATA_TAGGED), len(DATA_TRAIN), len(DATA_TEST)

(57340, 57340, 43005, 14335)

Uni-gram Tagger (1-a)

In [5]:
unigram_tagger = nltk.tag.UnigramTagger(DATA_TRAIN)
unigram_tagger.evaluate(DATA_TEST)

0.9090116513014942

In [6]:
unigram_tagger.tag_sents(DATA)[:5]

[[('The', 'DET'),
  ('Fulton', 'NOUN'),
  ('County', 'NOUN'),
  ('Grand', 'ADJ'),
  ('Jury', 'NOUN'),
  ('said', 'VERB'),
  ('Friday', 'NOUN'),
  ('an', 'DET'),
  ('investigation', 'NOUN'),
  ('of', 'ADP'),
  ("Atlanta's", 'NOUN'),
  ('recent', 'ADJ'),
  ('primary', 'ADJ'),
  ('election', 'NOUN'),
  ('produced', 'VERB'),
  ('``', '.'),
  ('no', 'DET'),
  ('evidence', 'NOUN'),
  ("''", '.'),
  ('that', 'ADP'),
  ('any', 'DET'),
  ('irregularities', 'NOUN'),
  ('took', 'VERB'),
  ('place', 'NOUN'),
  ('.', '.')],
 [('The', 'DET'),
  ('jury', 'NOUN'),
  ('further', 'ADJ'),
  ('said', 'VERB'),
  ('in', 'ADP'),
  ('term-end', 'NOUN'),
  ('presentments', 'NOUN'),
  ('that', 'ADP'),
  ('the', 'DET'),
  ('City', 'NOUN'),
  ('Executive', 'ADJ'),
  ('Committee', 'NOUN'),
  (',', '.'),
  ('which', 'DET'),
  ('had', 'VERB'),
  ('over-all', 'ADJ'),
  ('charge', 'NOUN'),
  ('of', 'ADP'),
  ('the', 'DET'),
  ('election', 'NOUN'),
  (',', '.'),
  ('``', '.'),
  ('deserves', 'VERB'),
  ('the', 'DET'),


Unigram Tagger with a Verb Back-off (1-b)

In [7]:
default_tagger = nltk.tag.DefaultTagger('VB')
unigram_tagger_backoff = nltk.tag.UnigramTagger(DATA_TRAIN, backoff=default_tagger)

In [8]:
unigram_tagger_backoff.evaluate(DATA_TRAIN)

0.9585078173831979

In [9]:
unigram_tagger_backoff.evaluate(DATA_TEST)

0.9090116513014942

Trigram Tagger with Unigram Tagger and adjective backoff (1-c)

In [10]:
adjective_tagger = nltk.tag.DefaultTagger('JJ')
unigram_tagger_backoff2 = nltk.tag.UnigramTagger(DATA_TRAIN, backoff=adjective_tagger)

In [11]:
unigram_tagger_backoff.evaluate(DATA_TRAIN), unigram_tagger_backoff.evaluate(DATA_TEST)

(0.9585078173831979, 0.9090116513014942)

In [12]:
trigram_tagger_backoff = nltk.tag.TrigramTagger(DATA_TRAIN, backoff=unigram_tagger_backoff)
trigram_tagger_backoff.evaluate(DATA_TRAIN), trigram_tagger_backoff.evaluate(DATA_TEST)

(0.9739687219341117, 0.9148954244274946)

Trigram Tagger with a Bigram Tagger backoff (1-d)

In [13]:
bigram_tagger_backoff = nltk.tag.BigramTagger(DATA_TRAIN, backoff=unigram_tagger_backoff)
bigram_tagger_backoff.evaluate(DATA_TRAIN), bigram_tagger_backoff.evaluate(DATA_TEST)

(0.9703261562879323, 0.917309738628133)

In [14]:
trigram_tagger_backoff = nltk.tag.TrigramTagger(DATA_TRAIN, backoff=bigram_tagger_backoff)
trigram_tagger_backoff.evaluate(DATA_TRAIN), trigram_tagger_backoff.evaluate(DATA_TEST)

(0.9739185899911043, 0.9166659215079628)

#Average Perceptron Tagger (1 iteration) - 2A

In [15]:
perceptron_pretrained = nltk.perceptron.PerceptronTagger()

In [16]:
perceptron_pretrained.evaluate(DATA_TEST)

0.06772598428013198

In [17]:
perceptron_pretrained.tag_sents(DATA)[:5]

[[('The', 'DT'),
  ('Fulton', 'NNP'),
  ('County', 'NNP'),
  ('Grand', 'NNP'),
  ('Jury', 'NNP'),
  ('said', 'VBD'),
  ('Friday', 'NNP'),
  ('an', 'DT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NNP'),
  ('recent', 'JJ'),
  ('primary', 'JJ'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'DT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'IN'),
  ('any', 'DT'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'DT'),
  ('jury', 'NN'),
  ('further', 'RB'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'JJ'),
  ('presentments', 'NNS'),
  ('that', 'IN'),
  ('the', 'DT'),
  ('City', 'NNP'),
  ('Executive', 'NNP'),
  ('Committee', 'NNP'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'VBD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'DT'),
  ('praise', 'NN'),
  ('and', 'CC'),
  ('than

In [18]:
perceptron_trained = nltk.perceptron.PerceptronTagger(load=False)
perceptron_trained.train(DATA_TRAIN, nr_iter=1)

In [19]:
perceptron_trained.evaluate(DATA_TRAIN), perceptron_trained.evaluate(DATA_TEST)

(0.9767494448153972, 0.9623769370400508)

#Average Perceptron Tagger (5 iteration) - 2B

In [20]:
perceptron_trained = nltk.perceptron.PerceptronTagger(load=False)
perceptron_trained.train(DATA_TRAIN, nr_iter=5)

In [21]:
perceptron_trained.evaluate(DATA_TRAIN), perceptron_trained.evaluate(DATA_TEST)

(0.9909325181381636, 0.9720789033648386)

#Average Perceptron Tagger (10 iteration) - 2C

In [22]:
perceptron_trained = nltk.perceptron.PerceptronTagger(load=False)
perceptron_trained.train(DATA_TRAIN, nr_iter=10)

In [23]:
perceptron_trained.evaluate(DATA_TRAIN), perceptron_trained.evaluate(DATA_TEST)

(0.9946806808557843, 0.9729820357139664)

Conditional Random Fields Model A 

In [24]:
import nltk
crf_default = nltk.crf.CRFTagger()
crf_default.train(DATA_TRAIN, 'crf_default.tag')

In [25]:
crf_default.evaluate(DATA_TRAIN), crf_default.evaluate(DATA_TEST)

(0.9676787630423049, 0.9576019600654547)

In [26]:
crf_default.tag_sents(DATA)[:5]

[[('The', 'DET'),
  ('Fulton', 'NOUN'),
  ('County', 'NOUN'),
  ('Grand', 'NOUN'),
  ('Jury', 'NOUN'),
  ('said', 'VERB'),
  ('Friday', 'NOUN'),
  ('an', 'DET'),
  ('investigation', 'NOUN'),
  ('of', 'ADP'),
  ("Atlanta's", 'NOUN'),
  ('recent', 'ADJ'),
  ('primary', 'ADJ'),
  ('election', 'NOUN'),
  ('produced', 'VERB'),
  ('``', '.'),
  ('no', 'DET'),
  ('evidence', 'NOUN'),
  ("''", '.'),
  ('that', 'ADP'),
  ('any', 'DET'),
  ('irregularities', 'NOUN'),
  ('took', 'VERB'),
  ('place', 'NOUN'),
  ('.', '.')],
 [('The', 'DET'),
  ('jury', 'NOUN'),
  ('further', 'ADV'),
  ('said', 'VERB'),
  ('in', 'ADP'),
  ('term-end', 'NOUN'),
  ('presentments', 'NOUN'),
  ('that', 'ADP'),
  ('the', 'DET'),
  ('City', 'NOUN'),
  ('Executive', 'ADJ'),
  ('Committee', 'NOUN'),
  (',', '.'),
  ('which', 'DET'),
  ('had', 'VERB'),
  ('over-all', 'ADJ'),
  ('charge', 'NOUN'),
  ('of', 'ADP'),
  ('the', 'DET'),
  ('election', 'NOUN'),
  (',', '.'),
  ('``', '.'),
  ('deserves', 'VERB'),
  ('the', 'DET'),

In [28]:
def custom_crf_features(tokens, idx):
    feature_list = []
    
    # WORDS
    feature_list.append(f'WORD_{tokens[idx]}')
    try:
        feature_list.append(f'WORD-1_{tokens[idx-1]}')
    except IndexError:
        pass
    try:
        feature_list.append(f'WORD+1_{tokens[idx+1]}')
    except IndexError:
        pass
    
    # SUFFIX
    token = tokens[idx]  
    if len(token) > 1:
        feature_list.append("SUF_" + token[-1:])
    if len(token) > 2:
        feature_list.append("SUF_" + token[-2:])
    if len(token) > 3:
        feature_list.append("SUF_" + token[-3:])
                
    return feature_list

In [29]:
crf_custom = nltk.crf.CRFTagger(feature_func=custom_crf_features)
crf_custom.train(DATA_TRAIN, 'crf_custom.tag')

In [30]:
crf_custom.evaluate(DATA_TRAIN), crf_custom.evaluate(DATA_TEST)

(0.9797968269679987, 0.9647420707662318)