# CNN文本分类

使用CNN模型搭建文本分类器

数据集：intent分类数据集 https://arxiv.org/abs/1909.02027


# 准备数据

我们使用意图分类数据，每一行数据由以下三个字段组成，字段之间以制表符（tab）分隔：

[domain]  [intent]  [utterance]

该数据集共有150个类别标签

In [1]:
import random
# Locations of the training split, the validation split and the label list.
train_file = "/home/data/tmp/nlp5_text_cnn/ind_train"
val_file = "/home/data/tmp/nlp5_text_cnn/ind_val"
ind_label_list = "/home/data/tmp/nlp5_text_cnn/ind_label_list"

# Preview five randomly chosen training records.
print('------------train data------')
# Read with an explicit encoding so this preview decodes the file the same
# way ClsDataset does, independent of the platform's default locale.
with open(train_file, encoding='utf8') as f:
    res = f.readlines()
random.shuffle(res)
for i in res[:5]:
    print(i.strip())

# Preview the label inventory: number of lines, then the first four labels.
print('------------labels----------')
with open(ind_label_list, encoding='utf8') as f:
    res = f.readlines()
print(len(res))
for i in res[:4]:
    print(i.strip())

------------train data------
travel	international_visa	do i need a visa to go to cancun
travel	plug_type	should i bring my socket converter while traveling to england
banking	routing_number	let me know my routing number
small_talk	what_are_your_hobbies	what are some things that you enjoy
work	pto_request_status	has my vacation time been signed off on
------------labels----------
150
auto_&_commute@current_location
auto_&_commute@directions
auto_&_commute@distance
auto_&_commute@gas


为了简单起见，我们直接使用了transformers库中所提供的tokenizer

In [2]:
from transformers import BertTokenizer

# Load the tokenizer from the local directory and round-trip one sentence:
# text -> tokens -> vocabulary ids -> text.
bert_path = "/home/data/tmp/nlp5_text_cnn"
tokz = BertTokenizer.from_pretrained(bert_path)
utter = "The man who changed China"

tokens = tokz.tokenize(utter)
token_ids = tokz.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)
print(tokz.decode(token_ids))

['the', 'man', 'who', 'changed', 'china']
[1996, 2158, 2040, 2904, 2859]
the man who changed china


读取标签，并生成标签词典

In [3]:
# Build the label -> index mapping: blank lines are skipped, labels are
# lower-cased, and each label's index is its position among the kept lines.
with open(ind_label_list) as f:
    res = [name.lower() for name in (line.strip() for line in f) if name]
label2index = {name: idx for idx, name in enumerate(res)}

# 定义Dataset

pytorch程序中最重要的就是根据自己的数据特点定义数据集

In [4]:
from torch.utils.data import Dataset
import torch
import json


class ClsDataset(Dataset):
    """Intent-classification dataset read from tab-separated text files.

    Every non-blank input line is lower-cased and split on tabs into
    ``domain``, ``intent`` and ``utterance``. The key looked up in
    ``intent2id`` is ``"<domain>@<intent>"``; the utterance is tokenized with
    ``tokz`` and truncated to ``max_lengths`` tokens.

    Args:
        paths: A list of file paths, or a single path given as a ``str``.
        tokz: Tokenizer exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id`` and ``sep_token_id``.
        intent2id: Mapping from ``"<domain>@<intent>"`` to an integer label.
        max_lengths: Maximum number of utterance tokens kept per record. The
            CLS/SEP ids added in ``__getitem__`` are not counted, so returned
            sequences can be up to ``max_lengths + 2`` ids long.

    Raises:
        KeyError: If a line's ``"<domain>@<intent>"`` is not in ``intent2id``.
        IndexError: If a line has fewer than three tab-separated fields.
    """

    def __init__(self, paths, tokz, intent2id, max_lengths=100):
        self.tokz = tokz
        self.max_lengths = max_lengths
        self.intent2id = intent2id
        self.data = self.make_dataset(paths, tokz, intent2id, max_lengths)

    def make_dataset(self, paths, tokz, intent2id, max_lengths):
        """Read all files and return a list of ``[label_id, token_ids]`` records."""
        print('reading ind data from {}'.format(paths))
        # A bare string would otherwise be iterated character by character,
        # with each character opened as a file; treat it as one path.
        if isinstance(paths, str):
            paths = [paths]

        dataset = []
        for path in paths:
            with open(path, 'r', encoding='utf8') as f:
                for raw in f:
                    line = raw.strip()
                    if not line:
                        continue  # skip blank lines
                    fields = line.lower().split('\t')
                    label = intent2id[fields[0] + '@' + fields[1]]
                    tokens = tokz.tokenize(fields[2])[:max_lengths]
                    dataset.append([label, tokz.convert_tokens_to_ids(tokens)])
        print('{} data record loaded'.format(len(dataset)))
        return dataset

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one record as a dict with ``text``, ``text_len`` and ``label``."""
        label, text = self.data[idx]
        # The label was already mapped to its id in make_dataset; here only
        # the CLS and SEP ids are wrapped around the stored token ids.
        text = [self.tokz.cls_token_id] + text + [self.tokz.sep_token_id]
        return {"text": text, "text_len": len(text), "label": int(label)}


# 自定义batch操作
class PadBatchSeq:
    """Collate callable that right-pads token-id lists to the batch maximum.

    Args:
        pad_id: Id appended to shorter sequences.

    Calling the instance with a list of sample dicts (keys ``text``,
    ``text_len``, ``label``) returns a dict of ``LongTensor`` objects under
    the same three keys.
    """

    def __init__(self, pad_id):
        self.pad_id = pad_id

    def __call__(self, batch):
        labels = [sample['label'] for sample in batch]
        lengths = [sample['text_len'] for sample in batch]
        sequences = [sample['text'] for sample in batch]

        # Pad every sequence on the right up to the longest one in the batch.
        longest = max(len(seq) for seq in sequences)
        padded = [seq + [self.pad_id] * (longest - len(seq)) for seq in sequences]

        return {
            'label': torch.LongTensor(labels),
            'text_len': torch.LongTensor(lengths),
            'text': torch.LongTensor(padded),
        }

测试一下刚才所定义的数据集类

In [5]:
# Smoke-test the dataset on the validation file: its size, one raw sample,
# and that sample's ids decoded back to text.
tmp = ClsDataset([val_file], tokz, label2index)
print(len(tmp))
sample = tmp[100]
print(sample)
print(tokz.decode(sample['text']))

reading ind data from ['/home/data/tmp/nlp5_text_cnn/ind_val']
3000 data record loaded
3000
{'text': [101, 4863, 2129, 2079, 1045, 2131, 2047, 5427, 102], 'text_len': 9, 'label': 138}
[CLS] explain how do i get new insurance [SEP]


当我们把句子传进模型的时候，我们是按照一个个 batch 传进去的，也就是说，我们一次传入了好几个句子，而且每个 batch 中的句子必须是相同的长度。为了确保句子的长度相同，我们需要把不够长的句子补齐。这个行为在 PadBatchSeq 中控制。

In [6]:
# Collate the first five samples into one batch to show the right-padding.
Pad_class = PadBatchSeq(tokz.pad_token_id)
first_five = [tmp[idx] for idx in range(5)]
print(Pad_class(first_five))

{'text': tensor([[  101,  1999,  3009,  1010,  3113,  2033,  4826,  2003,  2056,  2129,
           102,     0,     0],
        [  101,  1999,  2413,  1010,  2129,  2079,  1045,  2360,  1010,  2156,
          2017,  2101,   102],
        [  101,  2129,  2079,  2017,  2360,  7592,  1999,  2887,   102,     0,
             0,     0,     0],
        [  101,  2129,  2079,  1045,  3198,  2055,  1996,  4633,  1999,  2822,
           102,     0,     0],
        [  101,  2129,  2064,  1045,  2360,  1000, 17542,  2026,  2344,  1000,
          1999,  2413,   102]]), 'text_len': tensor([11, 13,  9, 11, 13]), 'label': tensor([115, 115, 115, 115, 115])}


# 定义CNN模型

把模型超参数定义在一起

In [7]:
from attrdict import AttrDict
# Model and training hyper-parameters, gathered in one place.
config = {
    "max_length": 40,                      # utterance tokens kept per record
    "embedding_size": 768,
    "feature_size": [256, 256, 256, 256],  # output channels of each convolution
    "kernel_size": [1, 2, 3, 4],           # kernel width of each convolution
    "fc_size": [256, 256],                 # hidden fully connected layer sizes
    "embedding_dropout": 0.1,
    "dropout": 0.1,
    "eval_steps": 90,                      # validate every this many optimizer steps
    "lr": 4e-5,
    "batch_size": 60,
    "n_epochs": 30,
}
config = AttrDict(config)

# Device selection. The original setup targets GPU index 3; when that device
# does not exist fall back to the first GPU, and to the CPU when CUDA is not
# available at all, so the notebook still runs on other machines.
if torch.cuda.is_available():
    device = torch.device('cuda', 3 if torch.cuda.device_count() > 3 else 0)
else:
    device = torch.device('cpu')

定义模型

In [8]:
import torch.nn as nn
import torch.nn.functional as F
class Dense(nn.Module):
    """Fully connected layer followed by an activation function.

    Args:
        in_size: Number of input features.
        out_size: Number of output features.
        activation: Callable applied to the linear output (default ``F.relu``).
    """

    def __init__(self, in_size, out_size, activation=F.relu):
        super(Dense, self).__init__()
        self.linear = nn.Linear(in_size, out_size)
        self.activation = activation

    def forward(self, x):
        """Return ``activation(linear(x))``."""
        return self.activation(self.linear(x))


class CNNModule(nn.Module):
    """Text classifier: embedding -> parallel 1-D convolutions -> FC stack.

    Each convolution is max-pooled over the sequence dimension; the pooled
    features are concatenated, passed through the ``Dense`` layers and
    projected to ``num_cls`` logits.

    Args:
        num_cls: Number of output classes.
        n_embeddings: Size of the embedding table.
        embedding_size: Embedding dimension (input channels of the convolutions).
        padding_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
        embed_dropout: Dropout probability applied to the embeddings.
        feature_size: Output channels of each convolution.
        kernel_size: Kernel width of each convolution, paired with
            ``feature_size`` via ``zip``.
        fc_size: Output sizes of the hidden ``Dense`` layers; may be empty.
        dropout: Dropout probability applied before and after the FC stack.
    """

    # NOTE: the list defaults are kept for interface compatibility; they are
    # only read, never mutated.
    def __init__(self, num_cls, n_embeddings, embedding_size, padding_idx, embed_dropout,
                 feature_size=[128, 128, 128], kernel_size=[2, 3, 4], fc_size=[265, 265], dropout=0.2):
        super(CNNModule, self).__init__()
        self.embeddings = nn.Embedding(n_embeddings, embedding_size, padding_idx=padding_idx)
        self.embed_dropout = nn.Dropout(embed_dropout)

        self.convs = nn.ModuleList([nn.Conv1d(embedding_size, fs, ks) for fs, ks in zip(feature_size, kernel_size)])
        # Shortest sequence length that every convolution can still process.
        self.min_seq_len = max((conv.kernel_size[0] for conv in self.convs), default=1)
        self.dropout = nn.Dropout(dropout)

        fc_size = list(fc_size)
        # Width of the concatenated pooled features, taken from the
        # convolutions that were actually built.
        conv_features = sum(conv.out_channels for conv in self.convs)
        # Each hidden layer consumes the previous layer's output, so the input
        # sizes are the output sizes shifted right by one position.
        in_sizes = [conv_features] + fc_size[:-1]
        self.fc = nn.ModuleList([Dense(i, j) for i, j in zip(in_sizes, fc_size)])
        # Without hidden layers the classifier reads the pooled features directly.
        self.output_layer = nn.Linear(fc_size[-1] if fc_size else conv_features, num_cls)

    def forward(self, x, x_len):
        '''x: [bs, len], x_len: [bs]'''
        x_embed = self.embeddings(x)  # [bs, len, embed_size]
        x_embed = self.embed_dropout(x_embed)
        # Zero the embeddings at positions beyond each sequence's length.
        mask = torch.arange(x_embed.shape[1], device=x_len.device)[None, :] < x_len[:, None]  # [bs, len]
        x_embed = x_embed * mask.unsqueeze(2)  # [bs, len, embed_size]
        x_embed = x_embed.permute([0, 2, 1])   # [bs, embed_size, len]
        # Conv1d raises when the input is shorter than its kernel; right-pad
        # with zeros so short batches are still accepted.
        missing = self.min_seq_len - x_embed.size(2)
        if missing > 0:
            x_embed = F.pad(x_embed, (0, missing))
        x_embed = [conv(x_embed) for conv in self.convs]  # [(bs, fs, len - ks + 1), ...]
        # Max-pool over the whole remaining sequence dimension.
        x_embed = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x_embed]  # [(bs, fs), ...]
        x_embed = torch.cat(x_embed, 1)  # [bs, sum(fs)]
        x_embed = self.dropout(x_embed)
        for fc in self.fc:
            x_embed = fc(x_embed)
        x_embed = self.dropout(x_embed)
        logits = self.output_layer(x_embed)
        return logits   # [bs, num_cls]

# 模型训练

加载数据集以及定义DataLoader

In [9]:
from torch.utils.data import DataLoader
# Build both datasets, truncating utterances to config.max_length tokens.
train_dataset = ClsDataset([train_file], tokz, label2index, max_lengths=config.max_length)
val_dataset = ClsDataset([val_file], tokz, label2index, max_lengths=config.max_length)

# Training batches are drawn through a RandomSampler; the validation loader
# is given no sampler. Both pad each batch with the tokenizer's pad id.
pad_collate = PadBatchSeq(tokz.pad_token_id)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=config.batch_size,
    sampler=torch.utils.data.RandomSampler(train_dataset),
    collate_fn=pad_collate,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=config.batch_size,
    sampler=None,
    collate_fn=pad_collate,
)


reading ind data from ['/home/data/tmp/nlp5_text_cnn/ind_train']
15000 data record loaded
reading ind data from ['/home/data/tmp/nlp5_text_cnn/ind_val']
3000 data record loaded


定义模型，并搬运到相应的设备上

In [10]:
# Instantiate the classifier from the shared hyper-parameters and move it to
# the selected device.
model = CNNModule(
    num_cls=len(label2index),
    n_embeddings=len(tokz),
    embedding_size=config.embedding_size,
    padding_idx=tokz.pad_token_id,
    embed_dropout=config.embedding_dropout,
    feature_size=config.feature_size,
    kernel_size=config.kernel_size,
    fc_size=config.fc_size,
    dropout=config.dropout,
).to(device)

定义优化器，以及损失函数

In [11]:
import torch.nn as nn
from torch.optim import Adam
# Cross-entropy over the class logits, placed on the same device as the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)
# Adam over all model parameters with the configured learning rate.
optimizer = Adam(params=model.parameters(), lr=config.lr, weight_decay=0.01)

开始训练

In [12]:
# Training loop: one optimizer step per batch, with a validation pass and a
# progress report every config.eval_steps optimizer steps.
for epoch in range(config.n_epochs):
    print('---------epoch {}----------'.format(epoch))
    model.train()

    # Running sums since the last report. NOTE: they are also reset at the
    # start of every epoch, so steps between the last report of an epoch and
    # its end are not included in any reported training average.
    loss, acc, step_count = 0, 0, 0
    for data in train_dataloader:
        text, label = data['text'].to(device), data['label'].to(device)
        text_len = data['text_len'].to(device)

        outputs = model(text, text_len)
        batch_loss = criterion(outputs, label)
        batch_acc = (torch.argmax(outputs, dim=1) == label).float().mean()

        batch_loss.backward()

        loss += batch_loss.item()
        acc += batch_acc.item()
        step_count += 1

        # update weights
        optimizer.step()
        optimizer.zero_grad()
        # Read the global step from the optimizer state of the last parameter.
        # Newer PyTorch versions store it as a tensor, so normalize to int for
        # the modulo test and for printing.
        curr_step = int(optimizer.state[optimizer.param_groups[0]["params"][-1]]["step"])

        if curr_step % config.eval_steps == 0:

            # eval
            model.eval()

            val_logits = []
            val_label = []
            # No gradients are needed for validation; without no_grad the
            # autograd graph of every validation batch would be kept alive.
            with torch.no_grad():
                for val_data in val_dataloader:
                    val_text = val_data['text'].to(device)
                    val_text_len = val_data['text_len'].to(device)
                    val_label.append(val_data['label'].to(device))
                    val_logits.append(model(val_text, val_text_len))

                val_logits = torch.cat(val_logits, dim=0)
                val_label = torch.cat(val_label, dim=0)

                val_loss = criterion(val_logits, val_label).float()
                val_acc = (torch.argmax(val_logits, dim=1) == val_label).float().mean()

            # Average the training statistics accumulated since the last report.
            loss /= step_count
            acc /= step_count
            print('step {}, train loss {:>4.3f}, train acc {:>4.2f}%, val loss {:>4.3f}, val acc {:>4.2f}%'.format(curr_step, loss, acc * 100, val_loss, val_acc * 100))
            loss, acc, step_count = 0, 0, 0
            model.train()


---------epoch 0----------
step 90, train loss 4.995, train acc 1.30%, val loss 4.969, val acc 1.83%
step 180, train loss 4.941, train acc 3.52%, val loss 4.897, val acc 4.80%
---------epoch 1----------
step 270, train loss 4.779, train acc 11.42%, val loss 4.775, val acc 12.40%
step 360, train loss 4.666, train acc 12.24%, val loss 4.552, val acc 20.47%
step 450, train loss 4.390, train acc 16.30%, val loss 4.213, val acc 21.80%
---------epoch 2----------
step 540, train loss 3.863, train acc 23.13%, val loss 3.844, val acc 24.90%
step 630, train loss 3.666, train acc 26.07%, val loss 3.506, val acc 33.43%
step 720, train loss 3.354, train acc 31.44%, val loss 3.198, val acc 38.43%
---------epoch 3----------
step 810, train loss 2.954, train acc 39.47%, val loss 2.904, val acc 43.73%
step 900, train loss 2.776, train acc 42.06%, val loss 2.660, val acc 50.00%
step 990, train loss 2.502, train acc 48.04%, val loss 2.439, val acc 52.73%
---------epoch 4----------
step 1080, train loss 2

In [14]:
from tqdm import tqdm
# Final evaluation. NOTE: test_file points at the same ind_val file that is
# used for validation during training, so these numbers are not from a
# separate held-out test set.
test_file = "/home/data/tmp/nlp5_text_cnn/ind_val"
test_dataset = ClsDataset([test_file], tokz, label2index, max_lengths=config.max_length)
test_dataloader = DataLoader(test_dataset, sampler=None, batch_size=config.batch_size, collate_fn=PadBatchSeq(tokz.pad_token_id))

test_logits = []
test_label = []
model.eval()
# Inference only: disable gradient tracking so no autograd graph is built for
# the collected logits.
with torch.no_grad():
    for d_data in test_dataloader:
        text, label = d_data['text'].to(device), d_data['label'].to(device)
        text_len = d_data['text_len'].to(device)
        outputs = model(text, text_len)
        test_label.append(label)
        test_logits.append(outputs)

# Put the model back into training mode, as the original cell does.
model.train()
test_logits = torch.cat(test_logits, dim=0)
test_label = torch.cat(test_label, dim=0)

# Loss and accuracy over the whole evaluated set, converted to NumPy scalars.
test_loss = criterion(test_logits, test_label).cpu().detach().numpy()
test_acc = (torch.argmax(test_logits, dim=1) == test_label).cpu().float().mean().numpy()
print('test results:')
print('loss', test_loss)
print('acc', test_acc)

reading ind data from ['/home/data/tmp/nlp5_text_cnn/ind_val']
3000 data record loaded
test results:
loss 0.6381175
acc 0.86733335
