### Thinking1
常用的文本分类方法都有哪些

答：有传统的机器学习方法，如决策树、SVM、KNN，也有深度学习方法如TextCNN（将Text的词向量拼接成形如一个channel的图，然后使用CNN）、Bi-LSTM（将Text按正序和反序输入的结果进行拼接，相较单向的LSTM能更好地获取上下文信息）、基于双向Transformer构建的BERT。

### Thinking2
RNN为什么会出现梯度消失

### Action1
cnews 中文文本分类：
由清华大学根据新浪新闻RSS订阅频道2005-2011年间的历史数据筛选过滤生成
训练集 50000
验证集 5000
测试集 10000
词汇（字） 5000
10个分类，包括：'体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐'

In [None]:
# 数据探索
filename = 'cnews.train.txt'
with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()
lines[:2]

In [None]:
# 将数据集变成小样本
num_cat = {}
num_max = 100 # 每一类最多采100个

contents, labels = [], []
with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        label, content = line.strip().split('\t')
        #print(label)
        if content:
            if label not in num_cat:
                num_cat[label] = 1
                contents.append(content)
                labels.append(label)
            else:
                if num_cat[label] < num_max:
                    num_cat[label] = num_cat[label] + 1
                    contents.append(content)
                    labels.append(label)

# 写文件
with open('cnews.train.small.txt', 'w', encoding='utf-8', errors='ignore') as f:
    for content, label in zip(contents, labels):
        f.write(label + '\t' + content+'\n')
    f.close()
print(len(contents))
print(contents[0])
print(labels[0])
print(num_cat)

#### 模型定义

In [None]:
# 定义模型
import torch
from torch import nn

class TextRNN(nn.Module):   
    def __init__(self):
        super(TextRNN, self).__init__()
        # 三个待输入的数据
        self.embedding = nn.Embedding(5000, 64)  # 进行词嵌入，5000个词，每个64维
        self.rnn = nn.LSTM(input_size=64, hidden_size=128, num_layers=2, bidirectional=True) # bidirectional指双向的LSTM,输出为128*2维
#         self.rnn = nn.GRU(input_size=64, hidden_size=128, num_layers=2, bidirectional=True)
        self.f1 = nn.Sequential(nn.Linear(256,128),
                                nn.Dropout(0.8),
                                nn.ReLU())
        self.f2 = nn.Sequential(nn.Linear(128,10),
                                nn.Softmax(dim=1))
        self.dropout = nn.Dropout(p=0.8)
 
    def forward(self, x):
        out = self.embedding(x)
        out,_ = self.rnn(out)
        out = self.dropout(out)
        out = self.f1(out[:,-1,:]) #相当于return_sequences=False
        out = self.f2(out)
        return out

In [None]:
from torch import optim

def train(epochs=100):
    model = TextRNN().cuda()
    # 定义损失函数
    Loss = nn.MultiLabelSoftMarginLoss()
    # 定义优化器
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    best_val_acc = 0
    for epoch in range(epochs):
        print('epoch=', epoch)
        # 分批训练
        for step, (x_batch, y_batch) in enumerate(train_loader):
            x = x_batch.cuda()
            y = y_batch.cuda()
            # 前向传播
            out = model(x)
            loss = Loss(out, y)
            print('loss=', loss)
            # 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            accuracy = np.mean((torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
#             print('accuracy:', accuracy)
            
        # 对模型进行验证
        if epoch % 5 == 0:
            sum_acc = 0
            for step, (x_batch, y_batch) in enumerate(val_loader):
                x = x_batch.cuda()
                y = y_batch.cuda()
                # 前向传播
                out = model(x)
                sum_acc += np.mean((torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
            if sum_acc > best_val_acc:
                torch.save(model, 'model.pkl')
                best_val_acc = sum_acc
                print('model saved')

In [None]:
def test(model):
    model.eval() # 固定参数
    correct, total = 0, 0
    for x_batch, y_batch in test_loader:
        x = x_batch.cuda()
        y = y_batch.cuda()
        # 前向传播
        out = model(x)
        # 预测结果
        _, pred = torch.max(out.data, dim=1)# 取评分最高的类(dim=0是每列的最大值，dim=1是每行的最大值，返回两个值，一个是最大值tensor组一个是最大值所在的位置)
        _, label = torch.max(y.data, dim=1)
        # 对比结果与真实值
        total += y.size()[0]
        correct += (pred==label).sum().item()
    print('测试准确率为：{: .4f}%'.format(100*correct/total))          

#### 载入数据

In [7]:
# 设置数据目标文件名
train_file = 'cnews.train.txt'
test_file = 'cnews.test.txt'
val_file = 'cnews.val.txt'
vocab_file = 'cnews.vocab.txt'

In [8]:
from cnews_loader import read_vocab, read_category, process_file
# 获取文本的类别及其对应id的字典
categories, cat_to_id = read_category()
print(categories)
# 获取训练文本中所出现过的字及其对应的id
words, word_to_id = read_vocab('cnews.vocab.txt')
# print(words)
# 获取训练书记每一个字的id和对应标签的one_hot编码
x_train, y_train = process_file(train_file, word_to_id, cat_to_id, 600)
print(x_train.shape)
print('x_train:', x_train)
x_val, y_val = process_file(val_file, word_to_id, cat_to_id, 600)
x_test, y_test = process_file(test_file, word_to_id, cat_to_id, 600)

['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
(50000, 600)
x_train: [[1609  659   56 ...    9  311    3]
 [   2  101   16 ... 1168    3   24]
 [ 465  855  521 ...  116  136   85]
 ...
 [  49   18   79 ...  836 1928 1072]
 [ 166  110  714 ...  836 1928 1072]
 [   1   80  551 ...   78  192    3]]


In [10]:
import numpy as np
import torch.utils.data as Data
# 设置GPU
cuda = torch.device('cuda')
x_train, y_train = torch.LongTensor(x_train), torch.Tensor(y_train)
x_val, y_val = torch.LongTensor(x_val), torch.Tensor(y_val)
x_test, y_test = torch.LongTensor(x_test), torch.Tensor(y_test)

train_dataset = Data.TensorDataset(x_train, y_train)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_dataset = Data.TensorDataset(x_val, y_val)
val_loader = Data.DataLoader(dataset=val_dataset, batch_size=64)
test_dataset = Data.TensorDataset(x_test, y_test)
test_loader = Data.DataLoader(dataset=test_dataset, batch_size=64)
train()

In [11]:
best_model = torch.load('model.pkl')
test(best_model)

测试准确率为： 41.8700%
