# Bag of Words Text Classifier

The code below implements a simple bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- The lookup allows for extracting an embedding for each tokenized input
- The embedding vectors are added together with a bias vector
- The resulting vector is referred to as the scores
- A softmax is applied to the scores to generate probabilities, which are used for the classification task

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](../img/bow.png?raw=true)

In [4]:
# imports
import torch
import random
import torch.nn as nn

### Download the Data

In [2]:
%%capture

# download the files
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes

### Read the Data

In [5]:
# function to read in data, process each line and split columns by " ||| "
def read_data(filename):
    """Read a ``label ||| text`` file into a list of lowercased fields.

    Every line is lowercased, stripped of leading/trailing whitespace and
    split on the literal separator ``' ||| '``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per line of the file. Each entry is the list of
        fields produced by the split; a line that does not contain the
        separator yields a single-element list.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return [line.lower().strip().split(' ||| ') for line in f]

# Each entry is the list of fields obtained by splitting one line on ' ||| '.
train_data = read_data('data/classes/train.txt')        # training split
test_data = read_data('data/classes/test.txt')          # test split

### Construct the Vocab and Datasets

In [6]:
# creating the word and tag indices
word_to_index = {}      # maps each word to its integer index
word_to_index["<unk>"] = len(word_to_index)  # reserve the first index (0) for unknown words
tag_to_index = {}       # maps each tag (label) to its integer index

# create word to index dictionary and tag to index dictionary from data
def create_dict(data, check_unk=False):
    """Populate the module-level ``word_to_index`` and ``tag_to_index`` dicts.

    Args:
        data: Iterable of entries where ``entry[0]`` is the tag and
            ``entry[1]`` is the text; the text is split on single spaces.
        check_unk: When false, every word not yet in ``word_to_index`` is
            given the next free index (the current size of the dict). When
            true, such words are registered under the ``<unk>`` index instead.

    Tags not yet in ``tag_to_index`` are always given the next free tag index.
    """
    for line in data:
        for word in line[1].split(" "):
            if word not in word_to_index:
                # Unseen word: either alias it to <unk> or allocate a new index.
                word_to_index[word] = (
                    word_to_index["<unk>"] if check_unk else len(word_to_index)
                )

        if line[0] not in tag_to_index:
            tag_to_index[line[0]] = len(tag_to_index)

# Build the word and tag dictionaries from the training set, then register
# test-set words that are not yet in the dictionary under the <unk> index.
create_dict(train_data)
create_dict(test_data, check_unk=True)

# Encode words and tags as integer indices
def create_tensor(data):
    """Lazily encode each entry of ``data`` as integer indices.

    For every entry, ``entry[1]`` (the text) is split on single spaces and
    each word is looked up in ``word_to_index``; ``entry[0]`` (the tag) is
    looked up in ``tag_to_index``.

    Yields:
        ``(word_indices, tag_index)`` tuples, where ``word_indices`` is a
        plain Python list of the looked-up values (not a torch tensor).
    """
    for entry in data:
        word_ids = []
        for token in entry[1].split(" "):
            word_ids.append(word_to_index[token])
        yield (word_ids, tag_to_index[entry[0]])

# Materialize the (word indices, tag index) pairs yielded by create_tensor
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# Number of entries in the word and tag dictionaries.
# NOTE: this counts dictionary keys, so words that were registered under the
# <unk> index are included in number_of_words as well.
number_of_words = len(word_to_index)
number_of_tags = len(tag_to_index)

### Model

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# create a simple neural network with embedding layer, bias, and xavier initialization
class BoW(torch.nn.Module):
    """Bag-of-words classifier: summed per-word score vectors plus a bias.

    Every word index looks up a vector of ``ntags`` scores in an embedding
    table; the vectors of all words in a sentence are summed and a learned
    bias vector is added.
    """

    def __init__(self, nwords, ntags):
        """Create the embedding table and the bias.

        Args:
            nwords: Number of rows in the embedding table (vocabulary size).
            ntags: Number of output classes; also the width of each embedding.
        """
        super().__init__()
        self.embedding = nn.Embedding(nwords, ntags)
        nn.init.xavier_uniform_(self.embedding.weight)

        # Register the bias as a Parameter so that it is returned by
        # model.parameters() (and therefore updated by the optimizer), moved
        # by .to(device) and stored in the state dict. A plain tensor
        # attribute gets none of that.
        self.bias = nn.Parameter(torch.zeros(ntags))

    def forward(self, x):
        """Return the unnormalized class scores for one sentence.

        Args:
            x: 1-D LongTensor holding the word indices of a single sentence.

        Returns:
            Tensor of shape ``(1, ntags)``.
        """
        emb = self.embedding(x)                  # (seq_len, ntags)
        out = torch.sum(emb, dim=0) + self.bias  # (ntags,)
        return out.view(1, -1)                   # (1, ntags)

### Pretest the Model

In [8]:
# function to convert sentence into tensor using word_to_index dictionary
def sentence_to_tensor(sentence, vocab=None):
    """Encode a space-separated sentence as a 1-D tensor of word indices.

    Args:
        sentence: Text whose words are separated by single spaces. No case
            normalization is applied here.
        vocab: Optional word -> index mapping that contains an ``"<unk>"``
            entry. Defaults to the module-level ``word_to_index``.

    Returns:
        A ``torch.long`` tensor with one index per word. Words missing from
        the mapping are encoded with the ``"<unk>"`` index instead of raising
        ``KeyError``.
    """
    if vocab is None:
        vocab = word_to_index
    unk_index = vocab["<unk>"]
    indices = [vocab.get(word, unk_index) for word in sentence.split(" ")]
    return torch.tensor(indices, dtype=torch.long)

# test the sentence_to_tensor function
# NOTE: `type` shadows the Python builtin of the same name at module level.
type = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
out = sentence_to_tensor("i love dogs").type(type)
test_model = BoW(number_of_words, number_of_tags).to(device)
test_model(out)     # forward pass on a single encoded sentence

tensor([[ 0.0124,  0.0164, -0.0182, -0.0014, -0.0120]], device='cuda:0',
       grad_fn=<ViewBackward0>)

### Train the Model

In [9]:
# train and test the BoW model
model = BoW(number_of_words, number_of_tags).to(device)     # instantiate the model and move it to the selected device
criterion = nn.CrossEntropyLoss()                           # cross-entropy loss
optimizer = torch.optim.Adam(model.parameters())            # Adam over the parameters the model reports
# `type` is the tensor type that inputs are converted to: the CUDA LongTensor
# when CUDA is available, otherwise the CPU LongTensor.
# NOTE: this name shadows the Python builtin `type`.
type = torch.LongTensor
if torch.cuda.is_available():
    model.to(device)        # the model was already moved to `device` when it was created above
    type = torch.cuda.LongTensor

# perform training of the Bow model
def train_bow(model, optimizer, criterion, train_data, *, eval_data=None, num_epochs=10):
    """Train ``model`` one sentence at a time and report metrics every epoch.

    Args:
        model: Module that maps a 1-D LongTensor of word indices to a score
            tensor of shape ``(1, ntags)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, target)`` where
            ``target`` is a LongTensor of shape ``(1,)``.
        train_data: List of ``(word_indices, tag_index)`` pairs. It is
            shuffled in place at the start of every epoch.
        eval_data: ``(word_indices, tag_index)`` pairs evaluated after every
            epoch. Defaults to the module-level ``test_data``.
        num_epochs: Number of passes over ``train_data``.

    Prints one summary line per epoch (mean training loss, training accuracy
    and evaluation accuracy) and returns ``None``.
    """
    if eval_data is None:
        eval_data = test_data

    # Build the inputs on whatever device the model lives on instead of
    # relying on a module-level tensor type.
    model_device = next(model.parameters()).device

    for ITER in range(num_epochs):
        # perform training
        model.train()
        random.shuffle(train_data)
        total_loss = 0.0
        train_correct = 0
        for sentence, tag in train_data:
            inputs = torch.tensor(sentence, dtype=torch.long, device=model_device)
            target = torch.tensor([tag], dtype=torch.long, device=model_device)
            output = model(inputs)
            # The prediction is taken before the parameter update below.
            predicted = torch.argmax(output.detach()).item()

            loss = criterion(output, target)
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Compare plain Python ints rather than an int with a tensor.
            if predicted == tag:
                train_correct += 1

        # perform testing of the model
        model.eval()
        test_correct = 0
        with torch.no_grad():  # no autograd graph is needed for evaluation
            for sentence, tag in eval_data:
                inputs = torch.tensor(sentence, dtype=torch.long, device=model_device)
                predicted = torch.argmax(model(inputs)).item()
                if predicted == tag:
                    test_correct += 1

        # Guard the averages so an empty dataset reports 0 instead of raising
        # ZeroDivisionError.
        n_train = max(len(train_data), 1)
        n_eval = max(len(eval_data), 1)

        # print model performance results
        log = f'ITER: {ITER+1} | ' \
            f'train loss/sent: {total_loss/n_train:.4f} | ' \
            f'train accuracy: {train_correct/n_train:.4f} | ' \
            f'test accuracy: {test_correct/n_eval:.4f}'
        print(log)

# call the train_bow function
train_bow(model, optimizer, criterion, train_data)      # train, evaluating on the test set after every epoch

ITER: 1 | train loss/sent: 1.4733 | train accuracy: 0.3631 | test accuracy: 0.4009
ITER: 2 | train loss/sent: 1.1216 | train accuracy: 0.6040 | test accuracy: 0.4118
ITER: 3 | train loss/sent: 0.9123 | train accuracy: 0.7117 | test accuracy: 0.4154
ITER: 4 | train loss/sent: 0.7688 | train accuracy: 0.7664 | test accuracy: 0.4140
ITER: 5 | train loss/sent: 0.6631 | train accuracy: 0.8065 | test accuracy: 0.4068
ITER: 6 | train loss/sent: 0.5814 | train accuracy: 0.8324 | test accuracy: 0.4059
ITER: 7 | train loss/sent: 0.5171 | train accuracy: 0.8507 | test accuracy: 0.4077
ITER: 8 | train loss/sent: 0.4640 | train accuracy: 0.8695 | test accuracy: 0.4036
ITER: 9 | train loss/sent: 0.4191 | train accuracy: 0.8830 | test accuracy: 0.3991
ITER: 10 | train loss/sent: 0.3818 | train accuracy: 0.8929 | test accuracy: 0.3964
