# 特徴量1 Bag of Words

文書をBag of Wordsに変換します

テキストデータにはニュース記事（Brown Corpus）を使用します
- 500文書
- 116万単語
- 15カテゴリー

In [1]:
import csv
from nltk.corpus import brown
from tqdm import tqdm_notebook as tqdm
tqdm.monitor_interval = 0

In [2]:
# 文書一例
brown.words(fileids=['ca16'])[:10]

['Romantic',
 'news',
 'concerns',
 'Mrs.',
 'Joan',
 'Monroe',
 'Armour',
 'and',
 'F.',
 'Lee']

In [3]:
# カテゴリー
cat_to_id = {c:i for i, c in enumerate(brown.categories())}
id_to_cat = {v:k for k, v in cat_to_id.items()}
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

---

In [4]:
# 全文書から語彙の作成
# 前処理は小文字化だけ
all_words = []
for fileid in tqdm(brown.fileids()):
    all_words.extend(brown.words(fileids=[fileid]))

vocab = list(set([w.lower() for w in all_words]))
print('語彙数: {}'.format(len(vocab)))
print(vocab[:5])

# 単語とインデックスの変換辞書を作っておく
word_to_id = {v:i+1 for i, v in enumerate(vocab)}
id_to_word = {v:k for k, v in word_to_id.items()}


語彙数: 49815
['coveting', 'next-to-last', 'over-', 'capability', 'schwarzen']


In [5]:
# 文書をBoWに変換する関数
def doc_to_bow(words, vocab):
    bow = [0] * len(vocab)
    for word in words:
        bow[vocab.index(word.lower())] += 1
    return bow

In [6]:
# 文書のBoW、カテゴリーを取得
bows = []
labels = []
for fileid in tqdm(brown.fileids()):
    words = brown.words(fileids=[fileid])
    bows.append(doc_to_bow(words, vocab))
    labels.extend(brown.categories(fileids=[fileid]))




In [7]:
# 保存
with open('../data/vocab.txt', 'w') as f:
    writer = csv.writer(f)
    for v in vocab:
        writer.writerow([v])

with open('../data/label.csv', 'w') as f:
    writer = csv.writer(f)
    for label in labels:
        writer.writerow([cat_to_id[label]])

with open('../data/label_map.csv', 'w') as f:
    writer = csv.writer(f)
    for i in range(len(id_to_cat)):
        writer.writerow([id_to_cat[i]])        
        
with open('../data/bow.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(bows)