# Continuous Bag of Words (CBOW) Text Classifier

The code below implements a continuous bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- The lookup allows for extracting embeddings for each tokenized input
- The embedding vectors are added together
- The resulting vector is multiplied by a weight matrix and a bias vector is added; this yields the scores
- A softmax is applied to the scores to generate probabilities, which are used for the final classification

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](../img/cbow.png?raw=true)

In [1]:
import torch
import random
import torch.nn as nn

In [None]:
%%capture
''' uncomment to download the data
# download the files
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes
'''

## Read and Process Data

In [2]:
# function to read in data, process each line and split columns by " ||| "
def read_data(filename):
    """Read a labelled text file into a list of column lists.

    Every non-blank line is lower-cased, stripped of surrounding whitespace
    and split on the literal separator ``' ||| '``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        columns produced by the split.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # A blank line has no columns to split; skipping it avoids
                # emitting a one-element [''] row.
                continue
            data.append(line.split(' ||| '))
    return data


# Load the training and test splits.
train_data = read_data('data/classes/train.txt')
test_data = read_data('data/classes/test.txt')

# Vocabularies, filled in by create_dict: word -> index and tag -> index.
# "<unk>" is registered first, so it receives index 0.
word_to_index = {"<unk>": 0}
tag_to_index = {}
# create word to index dictionary and tag to index dictionary from data
def create_dict(data, check_unk=False):
    """Populate the module-level word and tag vocabularies from ``data``.

    Args:
        data: Iterable of rows where ``row[0]`` is the tag and ``row[1]`` is
            the space-separated text.
        check_unk: When false, every unseen word receives the next free
            index. When true, unseen words are aliased to the index of
            ``"<unk>"`` instead of receiving an index of their own.

    Side effects:
        Mutates the globals ``word_to_index`` and ``tag_to_index``. Unseen
        tags always receive the next free tag index, regardless of
        ``check_unk``.
    """
    for line in data:
        for word in line[1].split(" "):
            if word in word_to_index:
                continue  # already registered; keep its existing index
            if check_unk:
                # Unknown-word mode: share the "<unk>" index.
                word_to_index[word] = word_to_index["<unk>"]
            else:
                word_to_index[word] = len(word_to_index)

        if line[0] not in tag_to_index:
            tag_to_index[line[0]] = len(tag_to_index)

# Build the word-to-index and tag-to-index dictionaries from the training data
# first, then from the test data. With check_unk=True, words first seen in the
# test data are mapped to the "<unk>" index rather than given a new index.
create_dict(train_data)
create_dict(test_data, check_unk=True)

# create word and tag tensors from data
def create_tensor(data):
    """Lazily encode rows as ``(word_indices, tag_index)`` pairs.

    Despite the name, this yields plain Python values, not tensors.

    Args:
        data: Iterable of rows where ``row[0]`` is the tag and ``row[1]`` is
            the space-separated text.

    Yields:
        A tuple of (list of word indices looked up in ``word_to_index``,
        tag index looked up in ``tag_to_index``).
    """
    for row in data:
        word_ids = []
        for token in row[1].split(" "):
            word_ids.append(word_to_index[token])
        tag_id = tag_to_index[row[0]]
        yield (word_ids, tag_id)


# Materialise the encoded examples as lists so they can be shuffled and
# iterated over once per epoch.
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# Size the embedding table by the largest index in use rather than by the
# number of dictionary keys: several keys can share one index (words aliased
# to "<unk>"), and counting them would allocate embedding rows that no input
# can ever select.
number_of_words = max(word_to_index.values()) + 1
number_of_tags = len(tag_to_index)  # number of distinct tags

## Model

In [1]:
# Pick the compute device: the GPU when CUDA is available in the current
# environment, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Continuous bag-of-words classifier: embedding lookup, sum over the
# sequence, then a linear layer (with bias) producing one score per tag.
class CBoW(torch.nn.Module):
    """Bag-of-words text classifier over summed word embeddings.

    Args:
        nwords: Number of rows in the embedding table.
        ntags: Number of output scores (one per tag).
        emb_size: Dimensionality of each embedding vector.
    """

    def __init__(self, nwords, ntags, emb_size):
        super().__init__()

        # Embedding table (word index -> vector) followed by the projection
        # from embedding space to tag scores.
        self.embedding = nn.Embedding(nwords, emb_size)
        self.linear = nn.Linear(emb_size, ntags)

        # Xavier-uniform initialisation for both weight matrices; the linear
        # bias is not re-initialised here.
        for layer in (self.embedding, self.linear):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return tag scores of shape (1, ntags) for one sequence.

        ``x`` is expected to be a 1-D tensor of word indices.
        """
        vectors = self.embedding(x)          # seq_len x emb_size
        pooled = vectors.sum(dim=0)          # emb_size
        pooled = pooled.view(1, -1)          # 1 x emb_size
        return self.linear(pooled)           # 1 x ntags


EMB_SIZE = 64  # dimensionality of the word embeddings
model = CBoW(number_of_words, number_of_tags, EMB_SIZE)
criterion = nn.CrossEntropyLoss()  # loss over the raw tag scores
optimizer = torch.optim.Adam(model.parameters())  # default hyper-parameters

# Tensor type used when casting inputs: the CPU LongTensor unless CUDA is
# available, in which case the model is moved to the device and the CUDA
# LongTensor is used.
# NOTE(review): `type` shadows the Python builtin; the name is kept because
# later cells may reference it.
if not torch.cuda.is_available():
    type = torch.LongTensor
else:
    model.to(device)
    type = torch.cuda.LongTensor

NameError: name 'torch' is not defined

## Train the Model

In [4]:
# Train the CBoW model for 10 epochs, evaluating on the test set after each
# epoch and printing loss/accuracy.

for epoch in range(10):
    # ---- training pass ----
    model.train()
    random.shuffle(train_data)  # new example order every epoch
    total_loss = 0.0
    train_correct = 0
    for sentence, tag in train_data:
        # Build int64 tensors directly on the device the model lives on.
        sentence = torch.tensor(sentence, dtype=torch.long, device=device)
        target = torch.tensor([tag], dtype=torch.long, device=device)
        output = model(sentence)
        # Index of the highest score; .item() yields a plain Python int.
        predicted = torch.argmax(output).item()

        loss = criterion(output, target)
        total_loss += loss.item()

        optimizer.zero_grad()  # clear gradients from the previous step
        loss.backward()
        optimizer.step()

        # Compare Python ints (not an int against a tensor).
        if predicted == tag:
            train_correct += 1

    # ---- evaluation pass ----
    model.eval()
    test_correct = 0
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for sentence, tag in test_data:
            sentence = torch.tensor(sentence, dtype=torch.long, device=device)
            output = model(sentence)
            predicted = torch.argmax(output).item()
            if predicted == tag:
                test_correct += 1

    # print model performance results
    log = f'epoch: {epoch+1} | ' \
        f'train loss/sent: {total_loss/len(train_data):.4f} | ' \
        f'train accuracy: {train_correct/len(train_data):.4f} | ' \
        f'test accuracy: {test_correct/len(test_data):.4f}'
    print(log)

epoch: 1 | train loss/sent: 1.4111 | train accuracy: 0.3841 | test accuracy: 0.3982
epoch: 2 | train loss/sent: 0.8886 | train accuracy: 0.6522 | test accuracy: 0.3991
epoch: 3 | train loss/sent: 0.5147 | train accuracy: 0.8148 | test accuracy: 0.3950
epoch: 4 | train loss/sent: 0.3200 | train accuracy: 0.8878 | test accuracy: 0.3796
epoch: 5 | train loss/sent: 0.2148 | train accuracy: 0.9247 | test accuracy: 0.3738
epoch: 6 | train loss/sent: 0.1536 | train accuracy: 0.9466 | test accuracy: 0.3783
epoch: 7 | train loss/sent: 0.1097 | train accuracy: 0.9618 | test accuracy: 0.3638
epoch: 8 | train loss/sent: 0.0797 | train accuracy: 0.9716 | test accuracy: 0.3692
epoch: 9 | train loss/sent: 0.0568 | train accuracy: 0.9805 | test accuracy: 0.3661
epoch: 10 | train loss/sent: 0.0435 | train accuracy: 0.9853 | test accuracy: 0.3529
