In [1]:
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Toy labelled corpus for the English example: (sentence, sentiment label) pairs.
train = [
    ('i like you', 'pos'),
    ('i hate you', 'neg'),
    ('you like me', 'neg'),
    ('i like her', 'pos'),
]

In [3]:
# Vocabulary of the training corpus: every distinct lower-cased token.
all_words = {
    token.lower()
    for text, _label in train
    for token in word_tokenize(text)
}
all_words

{'hate', 'her', 'i', 'like', 'me', 'you'}

In [4]:
# Same vocabulary as the previous cell, spelled out with explicit loops.
all_words = []
for sentence in train:
    for word in word_tokenize(sentence[0]):
        all_words.append(word.lower())

# Drop repeated tokens while keeping first-seen order: later cells iterate over
# all_words to build feature dicts, so each word should appear exactly once.
all_words = list(dict.fromkeys(all_words))

set(all_words)

{'hate', 'her', 'i', 'like', 'me', 'you'}

In [5]:
# Turn each training sentence into (feature dict, label), where the feature dict
# maps every vocabulary word to whether it occurs in the sentence.
# Each sentence is tokenised once (not once per vocabulary word) and lower-cased
# first, so tokens are compared against the lower-cased vocabulary the same way
# the test-sentence cell does it.
t = []
for text, label in train:
    sentence_tokens = set(word_tokenize(text.lower()))
    t.append(({word: (word in sentence_tokens) for word in all_words}, label))
t

[({'i': True,
   'like': True,
   'you': True,
   'hate': False,
   'me': False,
   'her': False},
  'pos'),
 ({'i': True,
   'like': False,
   'you': True,
   'hate': True,
   'me': False,
   'her': False},
  'neg'),
 ({'i': False,
   'like': True,
   'you': True,
   'hate': False,
   'me': True,
   'her': False},
  'neg'),
 ({'i': True,
   'like': True,
   'you': False,
   'hate': False,
   'me': False,
   'her': True},
  'pos')]

In [8]:
# Fit NLTK's Naive Bayes classifier on the labelled feature dicts in `t`, then
# print the features it reports as most informative.
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()

Most Informative Features
                    like = True              pos : neg    =      1.7 : 1.0
                     her = False             neg : pos    =      1.7 : 1.0
                     you = True              neg : pos    =      1.7 : 1.0
                       i = True              pos : neg    =      1.7 : 1.0
                      me = False             pos : neg    =      1.7 : 1.0
                    hate = False             pos : neg    =      1.7 : 1.0


In [9]:
# Encode an unseen sentence with the same presence features used for training.
test_sentence = 'I like MeRui'

# Tokenise the lower-cased sentence once instead of once per vocabulary word.
test_tokens = set(word_tokenize(test_sentence.lower()))

# Lower-case the vocabulary word for both the key and the membership test so the
# two can never disagree; tokens outside the vocabulary are simply ignored.
test_sent_features = {word.lower(): (word.lower() in test_tokens)
                      for word in all_words}
test_sent_features

{'i': True,
 'like': True,
 'you': False,
 'hate': False,
 'me': False,
 'her': False}

In [10]:
# Predict the sentiment label for the feature dict built from test_sentence.
classifier.classify(test_sent_features)

'pos'

### 한글에서 Naive Bayes Classifier 연습

In [11]:
from konlpy.tag import Okt

In [12]:
# Instantiate KoNLPy's Okt tagger; it is reused below for POS tagging (.pos)
# and for splitting sentences into morphemes (.morphs).
pos_tagger = Okt()

In [13]:
# Toy labelled corpus for the Korean example: (sentence, sentiment label) pairs.
# NOTE: this rebinds `train`, replacing the English corpus defined earlier.
train = [
    ('난 수업이 빨리 마치면 좋겠어', 'pos'),
    ('내일은 수업이 없어서 좋아', 'pos'),
    ('내일은 놀러가야지', 'pos'),
    ('오늘 수업은 정말 지루해', 'neg'),
    ('수업은 아직 시작도 안했어', 'neg'),
    ('나는 왜 이런걸 해야하는지 모르겠어', 'neg'),
]

In [14]:
# Inspect how Okt tags the first training sentence: (token, part-of-speech) pairs.
pos_tagger.pos(train[0][0])

[('난', 'Noun'),
 ('수업', 'Noun'),
 ('이', 'Josa'),
 ('빨리', 'Adverb'),
 ('마치', 'Noun'),
 ('면', 'Josa'),
 ('좋겠어', 'Adjective')]

In [16]:
# Split every training sentence into Okt morphemes, keeping its label alongside.
train_docs = [
    (pos_tagger.morphs(text), label)
    for text, label in train
]
train_docs

[(['난', '수업', '이', '빨리', '마치', '면', '좋겠어'], 'pos'),
 (['내일', '은', '수업', '이', '없어서', '좋아'], 'pos'),
 (['내일', '은', '놀러', '가야', '지'], 'pos'),
 (['오늘', '수업', '은', '정말', '지루해'], 'neg'),
 (['수업', '은', '아직', '시작', '도', '안', '했어'], 'neg'),
 (['나', '는', '왜', '이런', '걸', '해야하는지', '모르겠어'], 'neg')]

In [17]:
# Vocabulary of the Korean corpus: every distinct morpheme seen in training.
# NOTE: this rebinds `all_words`, replacing the English vocabulary built earlier.
all_words = {
    morph
    for morphs, _label in train_docs
    for morph in morphs
}
all_words

{'가야',
 '걸',
 '나',
 '난',
 '내일',
 '놀러',
 '는',
 '도',
 '마치',
 '면',
 '모르겠어',
 '빨리',
 '수업',
 '시작',
 '아직',
 '안',
 '없어서',
 '오늘',
 '왜',
 '은',
 '이',
 '이런',
 '정말',
 '좋겠어',
 '좋아',
 '지',
 '지루해',
 '해야하는지',
 '했어'}

In [18]:
def term_exists(doc, vocabulary=None):
    """Encode a tokenised document as presence flags over a vocabulary.

    Args:
        doc: Iterable of tokens (e.g. the morpheme list of one sentence).
            One-shot iterators are supported because the tokens are
            materialised exactly once.
        vocabulary: Optional iterable of words to use as feature keys.
            When omitted, the notebook-level ``all_words`` is used, which
            matches the original single-argument behaviour.

    Returns:
        dict mapping each vocabulary word to True if it occurs in ``doc``,
        otherwise False. Tokens outside the vocabulary are ignored.
    """
    if vocabulary is None:
        vocabulary = all_words
    # Build the lookup set once; rebuilding it for every vocabulary word is
    # wasted work and would exhaust an iterator after the first word.
    doc_terms = set(doc)
    return {word: (word in doc_terms) for word in vocabulary}

In [19]:
# Build the classifier's training set: (presence-feature dict, label) per document.
train_xy = [
    (term_exists(morphs), label)
    for morphs, label in train_docs
]
train_xy

[({'빨리': True,
   '해야하는지': False,
   '는': False,
   '시작': False,
   '내일': False,
   '왜': False,
   '없어서': False,
   '나': False,
   '면': True,
   '마치': True,
   '오늘': False,
   '이런': False,
   '이': True,
   '좋겠어': True,
   '좋아': False,
   '도': False,
   '수업': True,
   '가야': False,
   '모르겠어': False,
   '안': False,
   '했어': False,
   '아직': False,
   '지': False,
   '정말': False,
   '걸': False,
   '놀러': False,
   '은': False,
   '지루해': False,
   '난': True},
  'pos'),
 ({'빨리': False,
   '해야하는지': False,
   '는': False,
   '시작': False,
   '내일': True,
   '왜': False,
   '없어서': True,
   '나': False,
   '면': False,
   '마치': False,
   '오늘': False,
   '이런': False,
   '이': True,
   '좋겠어': False,
   '좋아': True,
   '도': False,
   '수업': True,
   '가야': False,
   '모르겠어': False,
   '안': False,
   '했어': False,
   '아직': False,
   '지': False,
   '정말': False,
   '걸': False,
   '놀러': False,
   '은': True,
   '지루해': False,
   '난': False},
  'pos'),
 ({'빨리': False,
   '해야하는지': False,
   '는': False,
   '시작': False,
   

In [20]:
# Fit a Naive Bayes classifier on the Korean feature dicts (this rebinds
# `classifier`, replacing the English model) and print the features it reports
# as most informative.
classifier = nltk.NaiveBayesClassifier.train(train_xy)
classifier.show_most_informative_features()

Most Informative Features
                       이 = False             neg : pos    =      2.3 : 1.0
                      내일 = False             neg : pos    =      2.3 : 1.0
                      마치 = False             neg : pos    =      1.4 : 1.0
                      이런 = False             pos : neg    =      1.4 : 1.0
                      빨리 = False             neg : pos    =      1.4 : 1.0
                     지루해 = False             pos : neg    =      1.4 : 1.0
                      놀러 = False             neg : pos    =      1.4 : 1.0
                      정말 = False             pos : neg    =      1.4 : 1.0
                    모르겠어 = False             pos : neg    =      1.4 : 1.0
                      가야 = False             neg : pos    =      1.4 : 1.0


In [21]:
# Unseen sentence used to try out the Korean classifier in the cells below.
test_sentence = "수업 마치면 놀러 가야지"

In [24]:
# Split the test sentence into morphemes with the same tagger used for training.
test_docs = pos_tagger.morphs(test_sentence)
test_docs

['수업', '마치', '면', '놀러', '가야', '지']

In [25]:
# Encode the test morphemes as presence flags over the training vocabulary.
test_xy = term_exists(test_docs)
test_xy

{'빨리': False,
 '해야하는지': False,
 '는': False,
 '시작': False,
 '내일': False,
 '왜': False,
 '없어서': False,
 '나': False,
 '면': True,
 '마치': True,
 '오늘': False,
 '이런': False,
 '이': False,
 '좋겠어': False,
 '좋아': False,
 '도': False,
 '수업': True,
 '가야': True,
 '모르겠어': False,
 '안': False,
 '했어': False,
 '아직': False,
 '지': True,
 '정말': False,
 '걸': False,
 '놀러': True,
 '은': False,
 '지루해': False,
 '난': False}

In [26]:
# Predict the label for the encoded test sentence.
classifier.classify(test_xy)

'pos'

In [27]:
# Classify another unseen sentence: morphemes -> presence features -> label.
test_sentence = '오늘은 언제 수업 마칠려나'
sentence_morphs = pos_tagger.morphs(test_sentence)
sentence_features = term_exists(sentence_morphs)
classifier.classify(sentence_features)

'neg'

In [28]:
# Classify another unseen sentence: morphemes -> presence features -> label.
test_sentence = "오늘 수업 마치고 일어나면 내일이다"
sentence_morphs = pos_tagger.morphs(test_sentence)
sentence_features = term_exists(sentence_morphs)
classifier.classify(sentence_features)

'pos'

In [37]:
# Classify another unseen sentence: morphemes -> presence features -> label.
# NOTE(review): only morphemes already in the training vocabulary can influence
# the prediction; words unseen during training contribute nothing.
test_sentence = "오늘 수업은 재미있었다"
sentence_morphs = pos_tagger.morphs(test_sentence)
sentence_features = term_exists(sentence_morphs)
classifier.classify(sentence_features)

'neg'