In [None]:
# POS tagging : 문장의 단어를 POS와 연결하는 것
# Parts of Speech : 문장에 존재하는 단어를 특정 문법적 기능으로 분류하는 것
# English의 경우: parts of speech는 noun, pronoun, adjective, verb, adverb, preposition, determiner, conjunction

# NLTK에서 가장 많이 사용하는 tagging 데이터셋 : Penn Treebank, Brown Corpus

In [2]:
from nltk.corpus import brown
# Show the container type returned by tagged_words() and how many tagged tokens it holds.
type(brown.tagged_words()), len(brown.tagged_words())

(nltk.corpus.reader.util.ConcatenatedCorpusView, 1161192)

In [3]:
# Peek at ten (word, tag) pairs (indices 30-39); no tagset argument is passed here.
brown.tagged_words()[30:40]

[('term-end', 'NN'),
 ('presentments', 'NNS'),
 ('that', 'CS'),
 ('the', 'AT'),
 ('City', 'NN-TL'),
 ('Executive', 'JJ-TL'),
 ('Committee', 'NN-TL'),
 (',', ','),
 ('which', 'WDT'),
 ('had', 'HVD')]

In [None]:
# ('City', 'NN-TL') : NN = noun; the -TL suffix marks a word that appears in the context of a title

In [4]:
# Same ten tokens as the previous slice, this time requesting the "universal" tagset.
brown.tagged_words(tagset="universal")[30:40]

[('term-end', 'NOUN'),
 ('presentments', 'NOUN'),
 ('that', 'ADP'),
 ('the', 'DET'),
 ('City', 'NOUN'),
 ('Executive', 'ADJ'),
 ('Committee', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('had', 'VERB')]

In [11]:
# `from nltk.corpus import brown` does not bind the name `nltk`, so import it
# here; otherwise this cell raises NameError on a fresh kernel.
import nltk

# Download the 'averaged_perceptron_tagger' resource before the nltk.pos_tag() calls that follow.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\한국IT비즈니스진흥협회\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [12]:
# POS tagging is used for Named Entity Recognition (NER), sentiment analysis,
# question-answering systems, and determining the intended sense of a word.
import nltk

# Two short sentences that both contain the word "left"; each is tokenized here
# and POS-tagged in the following cells.
text1 = nltk.word_tokenize("I left the room")
text2 = nltk.word_tokenize("Left of the room")


In [13]:
# POS-tag the tokens of "I left the room" using the universal tagset.
nltk.pos_tag(text1, tagset="universal")

[('I', 'PRON'), ('left', 'VERB'), ('the', 'DET'), ('room', 'NOUN')]

In [14]:
# POS-tag the tokens of "Left of the room" using the universal tagset.
nltk.pos_tag(text2, tagset="universal")

[('Left', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('room', 'NOUN')]

In [15]:
import nltk
# Tokenize a sample sentence; the token list is reused by the tagging and chunking cells below.
example_sentence = nltk.word_tokenize("The company is located in South Africa")

In [16]:
# Sanity check: container type, token count, and the tokens themselves.
type(example_sentence), len(example_sentence), example_sentence

(list, 7, ['The', 'company', 'is', 'located', 'in', 'South', 'Africa'])

In [17]:
# POS-tag the tokenized sentence (no tagset argument is passed in this call).
tagging_example_sentence = nltk.pos_tag(example_sentence)
# Show the result's type, its length, and the (token, tag) pairs.
type(tagging_example_sentence), len(tagging_example_sentence), tagging_example_sentence

(list,
 7,
 [('The', 'DT'),
  ('company', 'NN'),
  ('is', 'VBZ'),
  ('located', 'VBN'),
  ('in', 'IN'),
  ('South', 'NNP'),
  ('Africa', 'NNP')])

In [18]:
# Named-entity chunking of the POS-tagged sentence.
# The input is derived from `example_sentence` rather than from the previous value
# of `tagging_example_sentence`, so re-running this cell never feeds an already
# chunked Tree back into ne_chunk (the cell is idempotent).
tagging_example_sentence = nltk.ne_chunk(nltk.pos_tag(example_sentence))
type(tagging_example_sentence), len(tagging_example_sentence), tagging_example_sentence
# GPE: abbreviation of "geopolitical entity"

(nltk.tree.Tree,
 6,
 Tree('S', [('The', 'DT'), ('company', 'NN'), ('is', 'VBZ'), ('located', 'VBN'), ('in', 'IN'), Tree('GPE', [('South', 'NNP'), ('Africa', 'NNP')])]))

In [20]:
from nltk.corpus import movie_reviews 

# Category labels defined by the movie_reviews corpus; used below to label each review.
cats = movie_reviews.categories()
type(cats), len(cats), cats

(list, 2, ['neg', 'pos'])

In [26]:
# Number of review files overall, then in the "neg" and "pos" categories.
len(movie_reviews.fileids()), len(movie_reviews.fileids("neg")), len(movie_reviews.fileids("pos"))

(2000, 1000, 1000)

In [34]:
# One (word_list, category) pair per review file, grouped by category in the
# order given by `cats`.
reviews = [
    (list(movie_reviews.words(file_id)), category)
    for category in cats
    for file_id in movie_reviews.fileids(category)
]

In [36]:
# Sanity check: container type and number of (word_list, category) pairs.
type(reviews), len(reviews)

(list, 2000)

In [37]:
import random

# Shuffle with a fixed seed so the review order, and therefore the later
# train/test split, is identical on every Restart & Run All. A dedicated
# Random instance is used so the global `random` state is left untouched.
# Note: the shuffle is in place, so re-running only this cell reorders the
# already-shuffled list; rebuild `reviews` first to get the canonical order.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(reviews)

In [38]:
# Frequency distribution over every lower-cased token in the movie_reviews corpus.
all_wd_id_reviews = nltk.FreqDist(map(str.lower, movie_reviews.words()))

In [40]:
# Check the type and length of the 2000 most frequent entries.
type(all_wd_id_reviews.most_common(2000)), len(all_wd_id_reviews.most_common(2000))

(list, 2000)

In [52]:
# Keep only the words from the 2000 most frequent (word, count) pairs, most
# frequent first. Unpacking each pair directly avoids materialising the unused
# counts and yields an empty list (not an IndexError) if the distribution is empty.
top_wd_in_reviews = [wd for wd, _count in all_wd_id_reviews.most_common(2000)]
type(top_wd_in_reviews), len(top_wd_in_reviews)

(list, 2000)

In [56]:
# binary features
def extract_feature(review, top_words):
    """Build binary word-presence features for a single review.

    Args:
        review: Iterable of hashable tokens making up one review.
        top_words: Iterable of vocabulary words to test for.

    Returns:
        dict with one entry per word in ``top_words``; the key is the word
        formatted into the "word_presen({}" template and the value is True
        when the word occurs in ``review``, otherwise False.
    """
    # A set makes each membership test constant-time regardless of review length.
    tokens_seen = set(review)
    # NOTE(review): the key template looks like a typo for "word_presence({})";
    # it is kept byte-for-byte so existing feature names do not change.
    return {
        "word_presen({}".format(word): (word in tokens_seen)
        for word in top_words
    }

In [57]:
# Pair each review's word-presence features with its category label.
featuresets = [
    (extract_feature(words, top_wd_in_reviews), label)
    for words, label in reviews
]

In [58]:
# Sanity check: container type and number of (features, label) pairs.
type(featuresets), len(featuresets)

(list, 2000)

In [None]:
# Hold out the first 200 feature sets for testing; train on everything after them.
test_set = featuresets[:200]
train_set = featuresets[200:]