In [1]:
import nltk

In [2]:
# Generic dataset: validated data set is used here
# for easy starting we use 'treebank' dataset
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/rajendrakarki/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [3]:
wsj = nltk.corpus.treebank.tagged_sents()
print(len(wsj))

# see structure of data
wsj[:2]

3914


[[('Pierre', 'NNP'),
  ('Vinken', 'NNP'),
  (',', ','),
  ('61', 'CD'),
  ('years', 'NNS'),
  ('old', 'JJ'),
  (',', ','),
  ('will', 'MD'),
  ('join', 'VB'),
  ('the', 'DT'),
  ('board', 'NN'),
  ('as', 'IN'),
  ('a', 'DT'),
  ('nonexecutive', 'JJ'),
  ('director', 'NN'),
  ('Nov.', 'NNP'),
  ('29', 'CD'),
  ('.', '.')],
 [('Mr.', 'NNP'),
  ('Vinken', 'NNP'),
  ('is', 'VBZ'),
  ('chairman', 'NN'),
  ('of', 'IN'),
  ('Elsevier', 'NNP'),
  ('N.V.', 'NNP'),
  (',', ','),
  ('the', 'DT'),
  ('Dutch', 'NNP'),
  ('publishing', 'VBG'),
  ('group', 'NN'),
  ('.', '.')]]

In [4]:
# Tagged word using comphrension of python to get desired result
tagged_word = [tup for sent in wsj for tup in sent]
print(tagged_word[:5])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS')]


In [5]:
# Now get the probability. get the number of Tags
# two element in tagged word. iterate over tagged word and return first index data to tags
tags = [pair[1] for pair in tagged_word]
print(len(tags))
unique_tags = set(tags)
print(len(unique_tags))

100676
46


In [6]:
# practise loop and comprehension of python 
tag_s1 = []
for pair in tagged_word:
    tag_s1.append(pair[1])
# which is equivalent to below
tag_s2 = [pair[1] for pair in tagged_word]
print(len(tag_s1))
print(len(tag_s2))

100676
100676


In [7]:
from collections import Counter
tag_counts = Counter(tags)
tag_counts

Counter({'NN': 13166,
         'IN': 9857,
         'NNP': 9410,
         'DT': 8165,
         '-NONE-': 6592,
         'NNS': 6047,
         'JJ': 5834,
         ',': 4886,
         '.': 3874,
         'CD': 3546,
         'VBD': 3043,
         'RB': 2822,
         'VB': 2554,
         'CC': 2265,
         'TO': 2179,
         'VBN': 2134,
         'VBZ': 2125,
         'PRP': 1716,
         'VBG': 1460,
         'VBP': 1321,
         'MD': 927,
         'POS': 824,
         'PRP$': 766,
         '$': 724,
         '``': 712,
         "''": 694,
         ':': 563,
         'WDT': 445,
         'JJR': 381,
         'NNPS': 244,
         'WP': 241,
         'RP': 216,
         'JJS': 182,
         'WRB': 178,
         'RBR': 136,
         '-RRB-': 126,
         '-LRB-': 120,
         'EX': 88,
         'RBS': 35,
         'PDT': 27,
         '#': 16,
         'WP$': 14,
         'LS': 13,
         'FW': 4,
         'UH': 3,
         'SYM': 1})

In [8]:
bank = [pair for pair in tagged_word if pair[0].lower() == 'bank']
bank
print(len(bank))

70


In [9]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(wsj, test_size = 0.3)
print(len(train_set))
print(len(test_set))

2739
1175


In [10]:
unigram_tagger = nltk.UnigramTagger(train_set)
unigram_tagger.accuracy(test_set)

0.8677768526228143

Rule Based Tagger

In [11]:
patterns = [
    (r'.*ing', 'VBG'),                          # Greund
    (r'.*ed$', 'VBD'),                          # past Tense
    (r'.*es$', 'VBZ'),                          # 3rd singular present
    (r'.*ould$', 'MD'),                          # modals 
    (r'.*\'s$', 'NN$'),                         # possessive nouns
    (r'.*s$', 'NNS'),                           # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),            # cardinal number
    (r'.*', 'NN')                               # nouns
]

In [12]:
rule_based_tagger = nltk.RegexpTagger(patterns)
rule_based_tagger.accuracy(test_set)

0.21751873438801

In [13]:
lexicon_tagger = nltk.UnigramTagger(train_set, backoff = rule_based_tagger)
lexicon_tagger.accuracy(test_set)

0.9019150707743547

In [14]:
lexicon_tagger.tag('This is POS tagging test'.split())

[('This', 'DT'),
 ('is', 'VBZ'),
 ('POS', 'NN'),
 ('tagging', 'VBG'),
 ('test', 'NN')]

In [15]:
nltk.corpus.treebank.raw('wsj_0142.mrg')

"\n( (S \n    (PP (IN Despite) \n      (NP \n        (NP (DT a) (NN deluge) )\n        (PP (IN of) \n          (NP (JJ economic) (NN news) ))))\n    (, ,) \n    (S \n      (NP-SBJ (DT the) (NNP Treasury) (NN market) )\n      (VP (VBD remained) \n        (ADJP-PRD (JJ quiet) )))\n    (CC but) \n    (S \n      (NP-SBJ (DT the) (JJ corporate) (NN market) )\n      (VP (VBD was) \n        (ADJP-PRD (JJ abuzz) \n          (PP (IN over) \n            (NP \n              (NP (NNP International) (NNP Business) (NNPS Machines) (NNP Corp.) (POS 's) )\n              (JJ huge) (NN debt) (NN offering) )))))\n    (. .) ))\n( (SINV (`` ``) \n    (S-TPC-1 \n      (S \n        (NP-SBJ (EX There) )\n        (VP (VBD were) \n          (NP-PRD \n            (ADJP (RB so) (JJ many) )\n            (JJ economic) (NNS reports) )))\n      (CC but) \n      (S \n        (NP-SBJ (DT the) (NN market) )\n        (VP (VBD did) (RB n't) \n          (VP (VB care) \n            (PP-CLR (IN about) \n              (NP \n 