## Default Tagger

In [1]:
# Build a tagger that labels every token "NN" and try it on a toy sentence.
from nltk.tag import DefaultTagger
tagger=DefaultTagger("NN")
tagger.tag(["This","is","defualt","tagger"])

[('This', 'NN'), ('is', 'NN'), ('defualt', 'NN'), ('tagger', 'NN')]

## Evaluating Accuracy

In [2]:
import nltk

# Fetch the treebank corpus only when it is not already installed, so the cell
# does not need network access once the data is present locally.
try:
    nltk.data.find("corpora/treebank")
except LookupError:
    # nltk.download signals failure by returning False rather than raising.
    nltk.download("treebank")

[nltk_data] Error loading treebank: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>


False

In [3]:
# Load the treebank corpus as untagged sentences (lists of word tokens).
from nltk.corpus import treebank
sents=treebank.sents()

In [4]:
# Preview the sentences; each one is a list of token strings.
sents

[['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'], ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.'], ...]

In [5]:
# Same corpus, but with every token paired with its part-of-speech tag.
tagged_sents=treebank.tagged_sents()

In [6]:
# Preview the tagged sentences; each one is a list of (token, tag) tuples.
tagged_sents

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

In [7]:
# Number of sentences in the corpus; used to judge the train/test split sizes.
len(sents)

3914

In [8]:
# Hold out every tagged sentence from index 3000 onwards as the test set and
# score the current `tagger` against those gold tags.
# NOTE(review): newer NLTK releases reportedly deprecate evaluate() in favour
# of accuracy() — confirm against the installed NLTK version.
test_sents=tagged_sents[3000:]
tagger.evaluate(test_sents)

0.14331966328512843

## Unigram Tagger

In [9]:
# Train a unigram tagger on the first 3000 tagged sentences; the remainder
# (index 3000 onwards) is the held-out test set used for scoring.
from nltk.tag import UnigramTagger
train_sents=tagged_sents[:3000]
tagger=UnigramTagger(train_sents)

In [10]:
# Tag the first corpus sentence (index 0 lies inside the training slice).
tagger.tag(sents[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [11]:
# Score the trained unigram tagger on the held-out sentences.
tagger.evaluate(test_sents)

0.8571551910209367

### Context Model

In [12]:
# Supply a hand-written word -> tag mapping instead of training data. Only
# "Pierre" is in the mapping and no backoff is given, so every other token is
# tagged None.
tagger=UnigramTagger(model={"Pierre":"NN"})
tagger.tag(sents[0])

[('Pierre', 'NN'),
 ('Vinken', None),
 (',', None),
 ('61', None),
 ('years', None),
 ('old', None),
 (',', None),
 ('will', None),
 ('join', None),
 ('the', None),
 ('board', None),
 ('as', None),
 ('a', None),
 ('nonexecutive', None),
 ('director', None),
 ('Nov.', None),
 ('29', None),
 ('.', None)]

### Frequency Cutoff

In [13]:
# Retrain with cutoff=3 and score on the held-out sentences.
# NOTE(review): cutoff presumably drops contexts seen too rarely in training —
# confirm the exact threshold semantics in the NLTK docs.
tagger=UnigramTagger(train_sents,cutoff=3)
tagger.evaluate(test_sents)

0.775350744657889

## Backoff Tagger

In [14]:
# Chain two taggers: the unigram tagger is given the "NN" default tagger as
# its backoff, then the combination is scored on the held-out sentences.
tagger1=DefaultTagger("NN")
tagger2=UnigramTagger(train_sents,backoff=tagger1)
tagger2.evaluate(test_sents)

0.8741204403194475

## Saving and Loading a trained Tagger

In [15]:
import pickle

# Serialise the current `tagger` to tagger.pickle. The context manager closes
# the file even if pickling raises, so no handle is leaked.
with open("tagger.pickle","wb") as f:
    pickle.dump(tagger,f)

In [16]:
# Read the tagger back from tagger.pickle; the context manager guarantees the
# file handle is closed once loading finishes.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open("tagger.pickle","rb") as f:
    tagger=pickle.load(f)

## n-gram Tagger

In [17]:
# Train a bigram tagger on its own (no backoff tagger is supplied) and score
# it on the held-out sentences.
from nltk.tag import BigramTagger,TrigramTagger
bitagger=BigramTagger(train_sents)
bitagger.evaluate(test_sents)

0.11318799913662854

In [18]:
# Same experiment with a trigram tagger, again without any backoff.
tritagger=TrigramTagger(train_sents)
tritagger.evaluate(test_sents)

0.06902654867256637

### Backoff Tagger Function

In [19]:
def backoff_tagger(train_sents,tagger_classes,backoff=None):
    """Build a chain of taggers in which each one backs off to the previous.

    Args:
        train_sents: Training data handed unchanged to every tagger class.
        tagger_classes: Iterable of callables accepting
            ``(train_sents, backoff=...)``; they are instantiated in order.
        backoff: Tagger placed at the bottom of the chain (may be None).

    Returns:
        The instance built from the last class in ``tagger_classes``, or
        ``backoff`` itself when ``tagger_classes`` is empty.
    """
    chain_head = backoff
    for tagger_cls in tagger_classes:
        # The tagger built so far becomes the fallback of the next one.
        chain_head = tagger_cls(train_sents, backoff=chain_head)
    return chain_head

In [20]:
# Build the chain unigram -> bigram -> trigram on top of an "NN" default
# tagger. backoff_tagger returns the last tagger built (the trigram one),
# which falls back through the others; score it on the held-out sentences.
backoff=DefaultTagger("NN")
tagger=backoff_tagger(train_sents,[UnigramTagger,BigramTagger,TrigramTagger],backoff=backoff)
tagger.evaluate(test_sents)

0.8806388948845241

### Taggers with n > 3 (NgramTagger)

In [21]:
# NgramTagger takes n explicitly; build a 4-gram tagger without a backoff and
# score it on the held-out sentences.
from nltk.tag import NgramTagger
quadtagger=NgramTagger(4,train_sents)
quadtagger.evaluate(test_sents)

0.058493416792575005

## Word Tag Frequencies

In [22]:
# Count how often each word token occurs across the whole treebank corpus.
from nltk.probability import FreqDist
from nltk.corpus import treebank

fd=FreqDist(treebank.words())

In [23]:
# Occurrence count of the token "book".
fd["book"]

8

In [24]:
# Build a conditional distribution from (word, tag) pairs: the word is the
# condition and the tags seen with it are counted.
from nltk.probability import ConditionalFreqDist

cfd=ConditionalFreqDist(treebank.tagged_words())

In [25]:
# Tag counts recorded for the token "book".
cfd["book"]

FreqDist({'NN': 7, 'VB': 1})

## Regular Expression Tagger

In [26]:
# Tag purely from surface patterns: all-digit tokens -> CD, "-ing" -> VBG,
# "-ment" -> NN, "-ful" -> JJ. No catch-all pattern or backoff is supplied.
from nltk.tag import RegexpTagger

pattern=[(r"^\d+$","CD"),(r".*ing$","VBG"),(r".*ment$","NN"),(r".*ful$","JJ")]
tagger=RegexpTagger(pattern)
tagger.evaluate(test_sents)

0.037470321605870924

## Affix Tagger

In [27]:
# Train an affix tagger with its default settings and score it on the
# held-out sentences.
from nltk.tag import AffixTagger

tagger=AffixTagger(train_sents)
tagger.evaluate(test_sents)

0.27507014893157783

In [28]:
# Affix tagger with affix_length=3.
# NOTE(review): the variable name implies a positive length selects a
# 3-character prefix — confirm against the AffixTagger documentation.
prefix_tagger=AffixTagger(train_sents,affix_length=3)
prefix_tagger.evaluate(test_sents)

0.2365637815670192

In [29]:
# Affix tagger with affix_length=-2.
# NOTE(review): the variable name implies a negative length selects a
# 2-character suffix — confirm against the AffixTagger documentation.
suffix_tagger=AffixTagger(train_sents,affix_length=-2)
suffix_tagger.evaluate(test_sents)

0.3196201165551478

In [30]:
# Rebuild suffix_tagger (replacing the earlier binding) so that it falls back
# to prefix_tagger, then score the combination on the held-out sentences.
suffix_tagger=AffixTagger(train_sents,affix_length=-2,backoff=prefix_tagger)
suffix_tagger.evaluate(test_sents)

0.3213036909130153

## TnT Tagger

In [31]:
# Create a TnT tagger with default options and train it on the training
# sentences; unlike the taggers above, training is a separate train() call.
from nltk.tag import tnt
tnt_tagger=tnt.TnT()
tnt_tagger.train(train_sents)

In [32]:
# Score the trained TnT tagger on the held-out sentences.
tnt_tagger.evaluate(test_sents)

0.875545003237643

In [33]:
# Rebuild the TnT tagger with an "NN" default tagger passed as `unk`.
# NOTE(review): `unk` presumably handles unknown words, and Trained=True
# presumably tells TnT not to train that tagger itself — confirm in NLTK docs.
unk=DefaultTagger("NN")
tnt_tagger=tnt.TnT(unk=unk,Trained=True)
tnt_tagger.train(train_sents)

In [34]:
# Score the TnT tagger that was given the "NN" default tagger as `unk`.
tnt_tagger.evaluate(test_sents)

0.892467083962875

## Classifier Based POS Tagger

In [35]:
# Train the classifier-based POS tagger on the training sentences and score
# it on the held-out sentences.
from nltk.tag.sequential import ClassifierBasedPOSTagger
tagger=ClassifierBasedPOSTagger(train=train_sents)
tagger.evaluate(test_sents)

0.9309734513274336

In [36]:
# Retrain the classifier-based tagger with an "NN" default tagger as backoff.
# NOTE(review): cutoff_prob=0.3 presumably makes the tagger defer to the
# backoff when the classifier's confidence is below 0.3 — confirm in NLTK docs.
# NOTE(review): `defualt` looks like a misspelling of "default"; the name is
# left unchanged here in case other cells refer to it.
defualt=DefaultTagger("NN")
tagger=ClassifierBasedPOSTagger(train=train_sents,cutoff_prob=0.3,backoff=defualt)
tagger.evaluate(test_sents)

0.9311029570472696