# 建立一個bag of words模型

Bag of words 詞袋 = 從文本中統計字詞出現的頻率

In [1]:
import pandas as pd
import nltk
#nltk.download()
import numpy as np
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
corpus = dataset['Review'].values

從文本中提出所有單字

In [2]:
whole_words = []
for sentence in corpus:
    tokenized_sentence = nltk.word_tokenize(sentence)
    for word in tokenized_sentence:
        whole_words.append(word)

移除重複字詞

In [3]:
whole_words = set(whole_words)
print('There are total {} words'.format(len(whole_words)))

There are total 2351 words


建立字典使每一個單字有對應數值

In [4]:
word_index = {}
index_word = {}
for idx, word in enumerate(whole_words):
    word_index[word] = idx
    index_word[idx] = word

In [5]:
list(word_index.items())[:10]

[('venturing', 0),
 ('Shrimp-', 1),
 ('guys', 2),
 ('Cibo', 3),
 ('fabulous', 4),
 ('tender', 5),
 ('opportunity', 6),
 ('weekend', 7),
 ('oven', 8),
 ('containers', 9)]

In [6]:
list(index_word.items())[:10]

[(0, 'venturing'),
 (1, 'Shrimp-'),
 (2, 'guys'),
 (3, 'Cibo'),
 (4, 'fabulous'),
 (5, 'tender'),
 (6, 'opportunity'),
 (7, 'weekend'),
 (8, 'oven'),
 (9, 'containers')]

轉換句子為bag of words型式

In [7]:
def _get_bag_of_words_vector(sentence, word_index_dict, whole_words):
    vector = np.zeros(len(whole_words))
    for word in nltk.word_tokenize(sentence):
        if word in whole_words:
            vector[word_index_dict[word]] += 1
    return vector

In [8]:
_get_bag_of_words_vector('Wow... Loved this place.', word_index, whole_words)

array([0., 0., 0., ..., 0., 0., 0.])