# データ取得

テキストデータには Brown Corpus（ニュース・小説・論説など複数ジャンルの英文コーパス）を使用します
- 500文書
- 116万単語
- 15カテゴリー

In [1]:
import csv
from nltk.corpus import brown
from tqdm import tqdm_notebook as tqdm
tqdm.monitor_interval = 0

In [2]:
# Example document: show the first 10 tokens of the file 'ca16'
brown.words(fileids=['ca16'])[:10]

['Romantic',
 'news',
 'concerns',
 'Mrs.',
 'Joan',
 'Monroe',
 'Armour',
 'and',
 'F.',
 'Lee']

In [3]:
# Category list, plus lookup tables in both directions:
#   cat_to_id: category name -> position in brown.categories()
#   id_to_cat: position -> category name
cat_to_id = {name: position for position, name in enumerate(brown.categories())}
id_to_cat = dict(zip(cat_to_id.values(), cat_to_id.keys()))
# Last expression of the cell, so the category list is displayed.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

---

In [4]:
# Collect every document and its label. This is document classification, so
# all sentences of a file are flattened into one space-separated string.
docs = []
labels = []
for fileid in tqdm(brown.fileids()):
    sents = brown.sents(fileids=[fileid])
    flat_words = []
    for sent in sents:
        flat_words.extend(sent)
    docs.append(' '.join(flat_words))
    # NOTE(review): docs and labels stay aligned only if each file has exactly
    # one category — confirm for this corpus.
    labels += brown.categories(fileids=[fileid])




In [5]:
# Map each category name to an integer label index, and back.
# The distinct labels are sorted before enumeration: iterating a bare set of
# strings has no stable order across kernel restarts (string hashing is
# randomized per process), which would make the saved label indices differ
# from run to run.
label_to_idx = {l: i for i, l in enumerate(sorted(set(labels)))}
idx_to_label = {v: k for k, v in label_to_idx.items()}

In [6]:
# Save to disk:
#   data.txt      - one document per line
#   label.txt     - one label index per line, in the same order as data.txt
#   lable_map.txt - one label name per line, where the line number is the label index
with open('../data/data.txt', 'w') as f:
    f.write('\n'.join(docs))

with open('../data/label.txt', 'w') as f:
    idx = [label_to_idx[l] for l in labels]
    # Convert each index to a string separately. str(idx) would stringify the
    # whole list ("[0, 1, ...]") and join() would then write one character
    # per line, including brackets, commas and spaces.
    f.write('\n'.join(map(str, idx)))

# NOTE(review): 'lable_map.txt' looks like a typo for 'label_map.txt'. The
# path is left unchanged because other code may read this exact file name.
with open('../data/lable_map.txt', 'w') as f:
    for i in range(len(idx_to_label)):
        f.write(idx_to_label[i]+'\n')