In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

# Sentence : I left my phone on the left side of the room.

In [3]:
# Split the example sentence into tokens. Despite its name, `text` holds a
# list of tokens (punctuation included), not a string.
text = nltk.word_tokenize("I left my phone on the left side of the room.")

In [4]:
# Show the token list produced by the tokenizer.
print(text)

['I', 'left', 'my', 'phone', 'on', 'the', 'left', 'side', 'of', 'the', 'room', '.']


In [6]:
# Tag every token with NLTK's default tagger; the result is a list of
# (token, tag) pairs.
tagged_text = nltk.pos_tag(text)

In [7]:
# The two occurrences of 'left' receive different tags (VBD, then JJ).
tagged_text

[('I', 'PRP'),
 ('left', 'VBD'),
 ('my', 'PRP$'),
 ('phone', 'NN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('left', 'JJ'),
 ('side', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('room', 'NN'),
 ('.', '.')]

# Sentence : Who will play with the dog while they are at the play?

In [8]:
# Second example sentence, in which 'play' occurs twice.
# NOTE: this overwrites the `text` token list from the first example.
text = nltk.word_tokenize("Who will play with the dog while they are at the play?")

In [9]:
# Tag the second sentence; this overwrites `tagged_text` from the first example.
tagged_text = nltk.pos_tag(text)

In [10]:
# 'play' is tagged VB after 'will' and NN after 'the'.
tagged_text

[('Who', 'WP'),
 ('will', 'MD'),
 ('play', 'VB'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('dog', 'NN'),
 ('while', 'IN'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('play', 'NN'),
 ('?', '.')]

In [11]:
from nltk.corpus import brown

In [17]:
# Two views over the Brown corpus "news" category: the sentences as plain
# token lists, and the same category with a gold tag attached to every token.
brown_sents = brown.sents(categories="news")
brown_tagged_sents = brown.tagged_sents(categories="news")

In [20]:
# Quick check of the container type and the number of news sentences.
type(brown_sents), len(brown_sents)

(nltk.corpus.reader.util.ConcatenatedCorpusView, 4623)

In [28]:
# Regular Expression Tagger
#
# Ordered (regex, tag) rules handed to nltk.RegexpTagger. Order matters: a
# more specific suffix such as "es" is listed before the generic "s", and the
# catch-all rule stays last so that it only applies when nothing else matched.
patterns = [
    # Suffix-based guesses
    (r".*ing$", "VBG"),                # words ending in "ing"
    (r".*ed$", "VBD"),                 # words ending in "ed"
    (r".*es$", "VBZ"),                 # words ending in "es"
    (r".*ould$", "MD"),                # would / could / should
    (r".*\'s$", "NN$"),                # possessive "'s"
    (r".*s$", "NNS"),                  # any other trailing "s"
    # Numbers: optional sign, digits, optional decimal part
    (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),
    # Fallback for everything else
    (r".*", "NN"),
]

In [16]:
# Build a tagger from the (regex, tag) rules above.
# NOTE(review): this cell's saved execution count (16) is lower than the
# `patterns` cell's (28), so the saved outputs may come from an earlier
# version of `patterns` -- restart the kernel and run all cells to confirm.
regexp_tagger = nltk.RegexpTagger(patterns)

In [22]:
# A sample news sentence to try the taggers on.
print(brown_sents[50])

['The', 'largest', 'hurdle', 'the', 'Republicans', 'would', 'have', 'to', 'face', 'is', 'a', 'state', 'law', 'which', 'says', 'that', 'before', 'making', 'a', 'first', 'race', ',', 'one', 'of', 'two', 'alternative', 'courses', 'must', 'be', 'taken', ':', '1']


In [24]:
# Tokens that match none of the suffix or number rules fall through to the
# catch-all rule and come back tagged 'NN'.
print(regexp_tagger.tag(brown_sents[50]))

[('The', 'NN'), ('largest', 'NN'), ('hurdle', 'NN'), ('the', 'NN'), ('Republicans', 'NNS'), ('would', 'MD'), ('have', 'NN'), ('to', 'NN'), ('face', 'NN'), ('is', 'NNS'), ('a', 'NN'), ('state', 'NN'), ('law', 'NN'), ('which', 'NN'), ('says', 'NNS'), ('that', 'NN'), ('before', 'NN'), ('making', 'VBG'), ('a', 'NN'), ('first', 'NN'), ('race', 'NN'), (',', 'NN'), ('one', 'NN'), ('of', 'NN'), ('two', 'NN'), ('alternative', 'NN'), ('courses', 'VBZ'), ('must', 'NN'), ('be', 'NN'), ('taken', 'NN'), (':', 'NN'), ('1', 'CD')]


In [26]:
# For comparison: NLTK's default tagger on the same sentence.
print(nltk.pos_tag(brown_sents[50]))

[('The', 'DT'), ('largest', 'JJS'), ('hurdle', 'NN'), ('the', 'DT'), ('Republicans', 'NNPS'), ('would', 'MD'), ('have', 'VB'), ('to', 'TO'), ('face', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('state', 'NN'), ('law', 'NN'), ('which', 'WDT'), ('says', 'VBZ'), ('that', 'IN'), ('before', 'IN'), ('making', 'VBG'), ('a', 'DT'), ('first', 'JJ'), ('race', 'NN'), (',', ','), ('one', 'CD'), ('of', 'IN'), ('two', 'CD'), ('alternative', 'JJ'), ('courses', 'NNS'), ('must', 'MD'), ('be', 'VB'), ('taken', 'VBN'), (':', ':'), ('1', 'CD')]


In [27]:
# Score the regexp tagger against the gold tags of the news sentences.
# NOTE(review): newer NLTK releases deprecate `evaluate` in favour of
# `accuracy` -- confirm against the installed NLTK version before switching.
regexp_tagger.evaluate(brown_tagged_sents)

0.20186168625812995

In [29]:
# Lookup Tagger
# Step 1: frequency of every token in the Brown news text.
fd = nltk.FreqDist(brown.words(categories="news"))

In [32]:
# For each word, count how many times each tag was assigned to it.
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))

In [33]:
# The 100 most common tokens as (token, count) pairs; punctuation tokens are
# included in the ranking.
most_freq_words = fd.most_common(100)

In [35]:
# Inspect the frequency ranking.
most_freq_words

[('the', 5580),
 (',', 5188),
 ('.', 4030),
 ('of', 2849),
 ('and', 2146),
 ('to', 2116),
 ('a', 1993),
 ('in', 1893),
 ('for', 943),
 ('The', 806),
 ('that', 802),
 ('``', 732),
 ('is', 732),
 ('was', 717),
 ("''", 702),
 ('on', 657),
 ('at', 598),
 ('with', 545),
 ('be', 526),
 ('by', 497),
 ('as', 481),
 ('he', 451),
 ('said', 402),
 ('his', 399),
 ('will', 389),
 ('it', 363),
 ('from', 344),
 ('are', 328),
 (';', 314),
 ('an', 300),
 ('has', 300),
 ('--', 300),
 ('had', 279),
 ('who', 268),
 ('have', 265),
 ('not', 254),
 ('Mrs.', 253),
 ('were', 252),
 ('this', 250),
 ('which', 244),
 ('would', 244),
 ('their', 219),
 ('been', 212),
 ('they', 205),
 ('He', 191),
 ('one', 184),
 ('I', 179),
 ('but', 174),
 ('its', 174),
 ('or', 173),
 (')', 171),
 ('more', 171),
 ('Mr.', 170),
 ('(', 168),
 ('up', 168),
 ('all', 163),
 ('out', 161),
 ('last', 161),
 ('two', 157),
 ('other', 149),
 (':', 149),
 ('new', 148),
 ('first', 143),
 ('than', 138),
 ('year', 138),
 ('A', 137),
 ('about', 13

In [36]:
# Map each of the most frequent words to the tag it carries most often.
likely_tags = {word: cfd[word].max() for word, _count in most_freq_words}

In [37]:
# Inspect the word -> most-frequent-tag lookup table.
likely_tags

{'the': 'AT',
 ',': ',',
 '.': '.',
 'of': 'IN',
 'and': 'CC',
 'to': 'TO',
 'a': 'AT',
 'in': 'IN',
 'for': 'IN',
 'The': 'AT',
 'that': 'CS',
 '``': '``',
 'is': 'BEZ',
 'was': 'BEDZ',
 "''": "''",
 'on': 'IN',
 'at': 'IN',
 'with': 'IN',
 'be': 'BE',
 'by': 'IN',
 'as': 'CS',
 'he': 'PPS',
 'said': 'VBD',
 'his': 'PP$',
 'will': 'MD',
 'it': 'PPS',
 'from': 'IN',
 'are': 'BER',
 ';': '.',
 'an': 'AT',
 'has': 'HVZ',
 '--': '--',
 'had': 'HVD',
 'who': 'WPS',
 'have': 'HV',
 'not': '*',
 'Mrs.': 'NP',
 'were': 'BED',
 'this': 'DT',
 'which': 'WDT',
 'would': 'MD',
 'their': 'PP$',
 'been': 'BEN',
 'they': 'PPSS',
 'He': 'PPS',
 'one': 'CD',
 'I': 'PPSS',
 'but': 'CC',
 'its': 'PP$',
 'or': 'CC',
 ')': ')',
 'more': 'AP',
 'Mr.': 'NP',
 '(': '(',
 'up': 'RP',
 'all': 'ABN',
 'out': 'RP',
 'last': 'AP',
 'two': 'CD',
 'other': 'AP',
 ':': ':',
 'new': 'JJ',
 'first': 'OD',
 'than': 'IN',
 'year': 'NN',
 'A': 'AT',
 'about': 'IN',
 'there': 'EX',
 'when': 'WRB',
 'home': 'NN',
 'after':

In [38]:
# Use the table directly as the tagger's model. No backoff tagger is given,
# so words missing from the table are tagged None.
lookup_tagger = nltk.UnigramTagger(model=likely_tags)

In [39]:
# Score the lookup tagger on the same news sentences its table was built from.
# NOTE(review): newer NLTK releases deprecate `evaluate` in favour of
# `accuracy` -- confirm against the installed NLTK version before switching.
lookup_tagger.evaluate(brown_tagged_sents)

0.45578495136941344

In [42]:
# Pick one news sentence to run through the lookup tagger. Reuse the
# `brown_sents` view loaded earlier instead of building a second view of the
# same category, consistent with the other cells that index `brown_sents`.
sent = brown_sents[3]
sent

['``',
 'Only',
 'a',
 'relative',
 'handful',
 'of',
 'such',
 'reports',
 'was',
 'received',
 "''",
 ',',
 'the',
 'jury',
 'said',
 ',',
 '``',
 'considering',
 'the',
 'widespread',
 'interest',
 'in',
 'the',
 'election',
 ',',
 'the',
 'number',
 'of',
 'voters',
 'and',
 'the',
 'size',
 'of',
 'this',
 'city',
 "''",
 '.']

In [43]:
# Words that are not in the lookup table come back with a None tag.
lookup_tagger.tag(sent)

[('``', '``'),
 ('Only', None),
 ('a', 'AT'),
 ('relative', None),
 ('handful', None),
 ('of', 'IN'),
 ('such', None),
 ('reports', None),
 ('was', 'BEDZ'),
 ('received', None),
 ("''", "''"),
 (',', ','),
 ('the', 'AT'),
 ('jury', None),
 ('said', 'VBD'),
 (',', ','),
 ('``', '``'),
 ('considering', None),
 ('the', 'AT'),
 ('widespread', None),
 ('interest', None),
 ('in', 'IN'),
 ('the', 'AT'),
 ('election', None),
 (',', ','),
 ('the', 'AT'),
 ('number', None),
 ('of', 'IN'),
 ('voters', None),
 ('and', 'CC'),
 ('the', 'AT'),
 ('size', None),
 ('of', 'IN'),
 ('this', 'DT'),
 ('city', None),
 ("''", "''"),
 ('.', '.')]

# Part-Of-Speech (POS) Tagging

# Rule-Based Tagger:
## Regular Expression Tagger
## Lookup Tagger

In [44]:
# Stochastic taggers
#
# Unigram tagger: index at which the tagged news sentences are split, placing
# the first 90% before the cut.
split_size = int(0.9 * len(brown_tagged_sents))
split_size

4160

In [45]:
# Everything before the cut is used for training; the remainder is held out
# for testing.
train_sents, test_sents = (
    brown_tagged_sents[:split_size],
    brown_tagged_sents[split_size:],
)

In [46]:
# Train a unigram tagger on the training slice only.
unigram_tagger = nltk.UnigramTagger(train_sents)

In [47]:
# Score the unigram tagger on the held-out sentences.
# NOTE(review): newer NLTK releases deprecate `evaluate` in favour of
# `accuracy` -- confirm against the installed NLTK version before switching.
unigram_tagger.evaluate(test_sents)

0.8121200039868434

In [48]:
# Bigram Tagger (N-gram Tagger)
# Trained on the same sentences as the unigram tagger; no backoff tagger is
# supplied.
bigram_tagger = nltk.BigramTagger(train_sents)

In [49]:
# Index 1000 is below split_size, so this sentence was presumably part of the
# training data (assumes brown_sents and brown_tagged_sents are index-aligned).
bigram_tagger.tag(brown_sents[1000])

[('800', 'CD'),
 ('in', 'IN'),
 ('Southern', 'JJ-TL'),
 ('New', 'JJ-TL'),
 ('England', 'NP'),
 (',', ','),
 ('we', 'PPSS'),
 ('have', 'HV'),
 ('60', 'CD'),
 (';', '.'),
 (';', '.')]

In [50]:
# Score the bigram tagger on the held-out sentences. The saved result is far
# below the unigram tagger's score on the same test slice.
# NOTE(review): newer NLTK releases deprecate `evaluate` in favour of
# `accuracy` -- confirm against the installed NLTK version before switching.
bigram_tagger.evaluate(test_sents)

0.10206319146815508

In [None]:
# Example: I left my phone on the left side of the room.
# unigrams : I, left, my, phone, on, the, left, side, of, the, room
# bigrams  : I left, left my, my phone, phone on, on the, the left, left side, side of, of the, the room
# (the final "." token is not listed in either line)

In [51]:
# Index 4500 is past split_size, so this sentence presumably lies in the
# held-out slice (assumes brown_sents and brown_tagged_sents are index-aligned).
unseen_sent = brown_sents[4500]

In [53]:
# Show the held-out sentence.
print(unseen_sent)

['Japan', ',', 'since', '1957', ',', 'has', 'been', '``', 'voluntarily', "''", 'curbing', 'exports', 'of', 'textiles', 'to', 'the', 'U.S.', '.']


In [55]:
# Every token comes back as None in the saved output -- presumably because
# the tagger meets a context it never saw in training and has no backoff.
print(bigram_tagger.tag(unseen_sent))

[('Japan', None), (',', None), ('since', None), ('1957', None), (',', None), ('has', None), ('been', None), ('``', None), ('voluntarily', None), ("''", None), ('curbing', None), ('exports', None), ('of', None), ('textiles', None), ('to', None), ('the', None), ('U.S.', None), ('.', None)]


In [56]:
# For comparison: NLTK's default tagger on the same held-out sentence.
print(nltk.pos_tag(unseen_sent))

[('Japan', 'NNP'), (',', ','), ('since', 'IN'), ('1957', 'CD'), (',', ','), ('has', 'VBZ'), ('been', 'VBN'), ('``', '``'), ('voluntarily', 'RB'), ("''", "''"), ('curbing', 'VBG'), ('exports', 'NNS'), ('of', 'IN'), ('textiles', 'NNS'), ('to', 'TO'), ('the', 'DT'), ('U.S.', 'NNP'), ('.', '.')]


In [57]:
# The unigram tagger labels most tokens; the few None results are presumably
# words that never occurred in the training slice.
print(unigram_tagger.tag(unseen_sent))

[('Japan', 'NP'), (',', ','), ('since', 'IN'), ('1957', 'CD'), (',', ','), ('has', 'HVZ'), ('been', 'BEN'), ('``', '``'), ('voluntarily', 'RB'), ("''", "''"), ('curbing', None), ('exports', None), ('of', 'IN'), ('textiles', None), ('to', 'TO'), ('the', 'AT'), ('U.S.', 'NP'), ('.', '.')]
