## Bag of Words (BOW) 实现

In [1]:
from collections import defaultdict # 为字典元素赋初始值
import time
import random
import dynet as dy
import numpy as np

In [2]:
# 数据初始化
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]
def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            tag, words = line.lower().strip().split(" ||| ")
            yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 

In [3]:
train = list(read_dataset("data/classes/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("data/classes/test.txt"))
nwords = len(w2i)
ntags = len(t2i)

In [4]:
# 定义模型和参数
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [5]:
W_sm = model.add_lookup_parameters((nwords, ntags))
b_sm = model.add_parameters((ntags)) 

In [6]:
def calc_scores(words):
    dy.renew_cg()
    b_sm_exp = dy.parameter(b_sm)
    score = dy.esum([dy.lookup(W_sm, x) for x in words])
    
    return score + b_sm_exp

In [7]:
for ITER in range(100):
    # 训练集表现
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for words, tag in train:
        my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)
        train_loss += my_loss.value()
        my_loss.backward()
        trainer.update()
    print("iter %s: train loss=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start))
    # 测试集表现
    test_correct = 0.0
    for wirds, tag in dev:
        scores = calc_scores(words).npvalue()
        predict = np.argmax(scores)
        if predict == tag:
            test_correct += 1
    print("iter %s: test acc=%.4f" %(ITER, test_correct/len(dev)))

iter 0: train loss=2.4340, time=0.23s
iter 0: test acc=0.2864
iter 1: train loss=2.0407, time=0.24s
iter 1: test acc=0.2308
iter 2: train loss=1.8487, time=0.24s
iter 2: test acc=0.2864
iter 3: train loss=1.7146, time=0.24s
iter 3: test acc=0.1262
iter 4: train loss=1.6076, time=0.23s
iter 4: test acc=0.1760
iter 5: train loss=1.5172, time=0.23s
iter 5: test acc=0.1760
iter 6: train loss=1.4414, time=0.24s
iter 6: test acc=0.1760
iter 7: train loss=1.3737, time=0.26s
iter 7: test acc=0.1760
iter 8: train loss=1.3132, time=0.26s
iter 8: test acc=0.2864
iter 9: train loss=1.2596, time=0.27s
iter 9: test acc=0.1805
iter 10: train loss=1.2098, time=0.25s
iter 10: test acc=0.2308
iter 11: train loss=1.1656, time=0.23s
iter 11: test acc=0.1262
iter 12: train loss=1.1247, time=0.24s
iter 12: test acc=0.2864
iter 13: train loss=1.0869, time=0.28s
iter 13: test acc=0.1805
iter 14: train loss=1.0516, time=0.23s
iter 14: test acc=0.2864
iter 15: train loss=1.0190, time=0.24s
iter 15: test acc=0.2