In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install tensorflow==1.14
!pip install pytorch==1.6.0
!pip install tqdm==4.47
!pip install transformers==3.0

In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

In [None]:
# Paths of the training CSV and the two test CSVs under /kaggle/input/nlp-news-text.
train_set = '/kaggle/input/nlp-news-text/train_set.csv'
test_set_a = '/kaggle/input/nlp-news-text/test_a.csv'
test_set_b = '/kaggle/input/nlp-news-text/test_b.csv'

In [None]:
# Row limits passed as `nrows` to pd.read_csv in dataset_split().
train_rows = 3000
test_rows = 300

In [None]:
import logging
import random


import torch

logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s', level=logging.INFO)

# Fix the seed of every random number generator used in the notebook.
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

# Pick the device: CUDA device `gpu` when CUDA is available, otherwise the CPU.
gpu = 0
use_cuda = torch.cuda.is_available() and gpu >= 0
device = torch.device("cuda", gpu) if use_cuda else torch.device("cpu")
if use_cuda:
    torch.cuda.set_device(gpu)
print("Use cuda: %s, gpu id: %d." % (use_cuda, gpu))

In [None]:
!mkdir user_data
!mkdir prediction_result
!mkdir model

# 数据加载

## 构造全局变量

In [None]:
# Module-level dictionary backing the `gl` helper.
_global_dict = {}


class gl(object):
    """Namespace for a key/value store shared between notebook cells.

    Both helpers are static methods, so they work when called on the class
    (`gl.get_value(...)`) as well as on an instance.
    """

    @staticmethod
    def set_value(name, value):
        """Store `value` under `name`, replacing any previous entry."""
        _global_dict[name] = value

    @staticmethod
    def get_value(name, defValue=None):
        """Return the value stored under `name`, or `defValue` when it is absent."""
        return _global_dict.get(name, defValue)


## 加载数据方法

In [None]:
from tqdm import tqdm, tqdm_notebook

# ### sentence_split: cut one document into fixed-length segments ("sentences")
def sentence_split(text, vocab, max_sent_len=254, max_segment=16):
    """Split a whitespace-tokenised document into segments of at most `max_sent_len` words.

    Args:
        text: the document as one string of whitespace-separated tokens.
        vocab: object exposing `_id2word`, the collection of in-vocabulary words.
        max_sent_len: maximum number of words per segment.
        max_segment: maximum number of segments kept for one document.

    Returns:
        A list of `[segment_length, segment_words]` pairs (lists, not tuples).
        Words that are not in `vocab._id2word` are replaced by '<UNK>'.
        When the document yields more than `max_segment` segments, only the
        first and the last `int(max_segment / 2)` segments are returned; if
        that half is zero, the first segment alone is returned.

    Raises:
        AssertionError: if `text` contains no tokens.
    """
    words = text.strip().split()
    # Build the membership set once; testing against the list for every word
    # would cost O(vocabulary size) per lookup.
    known_words = set(vocab._id2word)

    segments = []
    for start in range(0, len(words), max_sent_len):
        segment = [word if word in known_words else '<UNK>'
                   for word in words[start:start + max_sent_len]]
        segments.append([len(segment), segment])

    assert len(segments) > 0, 'document contains no tokens'

    if len(segments) <= max_segment:
        return segments

    # Too many segments: keep the head and the tail of the document.
    half = int(max_segment / 2)
    if half == 0:
        # `segments[-0:]` is the whole list, so slicing with a zero half would
        # return every segment instead of truncating.
        return segments[:1]
    return segments[:half] + segments[-half:]
    
# ### get_examples: turn raw (text, label) pairs into model-ready examples
# Relies on sentence_split() to cut every document into segments.
def get_examples(data, word_encoder, vocab, max_sent_len=256, max_segment=8):
    """Encode every document of `data` as token ids grouped by segment.

    Args:
        data: mapping with a 'text' and a 'label' entry of equal length.
        word_encoder: object whose `encode(words)` returns the token ids of one segment.
        vocab: vocabulary providing `label2id` (and used by sentence_split).
        max_sent_len: maximum encoded length of one segment.
        max_segment: maximum number of segments per document.

    Returns:
        A list with one `[label_id, segment_count, doc]` entry per document,
        where `doc` is a list of `[sent_len, token_ids, token_type_ids]`.
    """
    to_label_id = vocab.label2id
    examples = []

    for text, label in zip(data['text'], data['label']):
        label_id = to_label_id(label)

        # Two positions are held back per segment; the encoder is expected to
        # add a [CLS] and a [SEP] token around the words.
        segments = sentence_split(text, vocab, max_sent_len - 2, max_segment)

        doc = []
        for _, segment_words in segments:
            token_ids = word_encoder.encode(segment_words)
            # Length is re-measured after encoding because encode() may add tokens.
            n_tokens = len(token_ids)
            doc.append([n_tokens, token_ids, [0] * n_tokens])
        examples.append([label_id, len(doc), doc])

    print('Total %d docs.' % len(examples))
    return examples

# ### batch_slice: cut a list of examples into consecutive batches
# `data` is the list produced by get_examples():
# each entry is [label, segment_count, doc].
def batch_slice(data, batch_size):
    """Yield consecutive batches of `data`, each a list of at most `batch_size` items.

    Every batch holds exactly `batch_size` items except possibly the last one,
    which holds the remainder.
    """
    total = len(data)
    n_batches = int(np.ceil(total / float(batch_size)))
    for batch_no in range(n_batches):
        start = batch_no * batch_size
        # The final batch stops at the end of the data.
        stop = min(start + batch_size, total)
        yield [data[pos] for pos in range(start, stop)]

# ### data_iter: iterate over the data in batches
# Relies on batch_slice() to cut the (optionally reordered) data into batches.
# `data` is the list produced by get_examples():
# each entry is [label, segment_count, doc].
def data_iter(data, batch_size, shuffle=True, noise=1.0):
    """Yield batches of `data`.

    With `shuffle=True` the data is shuffled IN PLACE, then ordered by
    descending segment count (perturbed by uniform noise in [-noise, noise])
    so that documents of similar length share a batch, and finally the order
    of the batches is shuffled. With `shuffle=False` the data is batched in
    its given order.
    """
    if shuffle:
        np.random.shuffle(data)
        # Negated noisy segment counts: ascending argsort gives longest first.
        noisy_keys = []
        for example in data:
            noisy_keys.append(-(example[1] + np.random.uniform(-noise, noise)))
        ordered = [data[pos] for pos in np.argsort(noisy_keys).tolist()]
    else:
        ordered = data

    batches = list(batch_slice(ordered, batch_size))

    if shuffle:
        # Randomise the order in which the batches are served.
        np.random.shuffle(batches)

    yield from batches


# 构造字典

Vocab 的作用是：

- 创建 词 和 index 对应的字典，这里包括 2 份字典，分别是：_id2word 和 _id2extword。
- 其中 _id2word 是从新闻得到的， 把词频小于 5 的词替换为了 UNK。对应到模型输入的 batch_inputs1。
- _id2extword 是从 word2vec.txt 中得到的，有 5976 个词。对应到模型输入的 batch_inputs2。
- 后面会有两个 embedding 层，其中 _id2word 对应的 embedding 是可学习的，_id2extword 对应的 embedding 是从文件中加载的，是固定的。
- 创建 label 和 index 对应的字典。
- 上面这些字典，都是基于train_data创建的。

In [None]:
"""字典"""
from collections import Counter
from transformers import BasicTokenizer


# Instantiate the transformers BasicTokenizer with its default settings.
# NOTE(review): `basic_tokenizer` is not referenced in the visible code — confirm it is still needed.
basic_tokenizer = BasicTokenizer()

class Vocab():
    """Word and label vocabulary built from the training data.

    Holds the list/dict pairs mapping words, external (pretrained-embedding)
    words and labels to integer ids. Index 0 is '[PAD]' and index 1 is '[UNK]'
    in both word lists.
    """

    def __init__(self, train_data):
        """Build the vocabulary.

        Args:
            train_data: mapping with a 'text' entry (iterable of
                whitespace-tokenised documents) and a 'label' entry
                (iterable of labels).
        """
        self.min_count = 5  # words seen fewer times are left out of _id2word
        self.pad = 0        # index of '[PAD]'
        self.unk = 1        # index of '[UNK]'
        self._id2word = ['[PAD]', '[UNK]']
        self._id2extword = ['[PAD]', '[UNK]']

        self._id2label = []
        self.target_names = []

        self.build_vocab(train_data)

        # Reverse lookups: item -> index.
        self._word2id = {word: idx for idx, word in enumerate(self._id2word)}
        self._label2id = {label: idx for idx, label in enumerate(self._id2label)}
        # Defined up front so extword2id() works before embeddings are loaded.
        self._extword2id = {word: idx for idx, word in enumerate(self._id2extword)}

        logging.info("Build vocab: words %d, labels %d." % (self.word_size, self.label_size))

    def build_vocab(self, data):
        """Count words and labels in `data` and fill the id -> item lists."""
        self.word_counter = Counter()
        for text in data['text']:
            self.word_counter.update(text.split())

        # Keep only words that occur at least `min_count` times.
        for word, count in self.word_counter.most_common():
            if count >= self.min_count:
                self._id2word.append(word)

        label2name = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 6: '教育', 7: '财经',
                      8: '家居', 9: '游戏', 10: '房产', 11: '时尚', 12: '彩票', 13: '星座'}

        self.label_counter = Counter(data['label'])

        # Iterate over the labels that actually occur. `range(len(counter))`
        # would invent labels and drop real ones whenever the observed labels
        # are not exactly 0..k-1 (e.g. a class missing from a small sample).
        for label in sorted(self.label_counter):
            self._id2label.append(label)
            # Fall back to the label's text when it has no display name.
            self.target_names.append(label2name.get(label, str(label)))

    def load_pretrained_embs(self, embfile):
        """Load pretrained word vectors and register their words as external words.

        The first line of `embfile` holds the word count and the embedding
        dimension; every following line holds a word followed by its vector.

        Returns:
            A numpy array with one row per external word (rows 0 and 1 are
            '[PAD]' and '[UNK]'), divided by its overall standard deviation.
            The '[UNK]' row is the sum of all vectors divided by the word
            count from the header.
        """
        with open(embfile, encoding='utf-8') as f:
            lines = f.readlines()
        items = lines[0].split()
        word_count, embedding_dim = int(items[0]), int(items[1])

        # New words are appended after the entries already in _id2extword.
        index = len(self._id2extword)
        embeddings = np.zeros((word_count + index, embedding_dim))
        for line in lines[1:]:
            values = line.split()
            if not values:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            self._id2extword.append(values[0])               # first column: the word
            vector = np.array(values[1:], dtype='float64')   # remaining columns: the vector
            embeddings[self.unk] += vector
            embeddings[index] = vector
            index += 1
        # The '[UNK]' vector is the average of all word vectors.
        embeddings[self.unk] = embeddings[self.unk] / word_count
        embeddings = embeddings / np.std(embeddings)

        self._extword2id = {word: idx for idx, word in enumerate(self._id2extword)}

        assert len(set(self._id2extword)) == len(self._id2extword)

        return embeddings

    def word2id(self, xs):
        """Map a word (or list of words) to its id; unknown words map to `self.unk`."""
        if isinstance(xs, list):
            return [self._word2id.get(x, self.unk) for x in xs]
        return self._word2id.get(xs, self.unk)

    def extword2id(self, xs):
        """Map an external word (or list of them) to its id; unknown words map to `self.unk`."""
        if isinstance(xs, list):
            return [self._extword2id.get(x, self.unk) for x in xs]
        return self._extword2id.get(xs, self.unk)

    def label2id(self, xs):
        """Map a label (or list of labels) to its id.

        NOTE(review): unknown labels fall back to `self.unk` (1), which is also
        the id of a real label — confirm this fallback is intended.
        """
        if isinstance(xs, list):
            return [self._label2id.get(x, self.unk) for x in xs]
        return self._label2id.get(xs, self.unk)

    @property
    def word_size(self):
        """Number of entries in the word vocabulary, including '[PAD]' and '[UNK]'."""
        return len(self._id2word)

    @property
    def extword_size(self):
        """Number of entries in the external word vocabulary."""
        return len(self._id2extword)

    @property
    def label_size(self):
        """Number of distinct labels."""
        return len(self._id2label)

# 模型及各个模块

In [None]:
import logging
# Same logging configuration as the earlier setup cell.
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers, so this repeated call presumably has no effect — confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel


# Directory of the pretrained BERT files: vocab.txt is read from it and
# BertModel.from_pretrained() is pointed at it.
bert_path = '/kaggle/input/nlp-news-text/bert-mini/bert-mini/'
# Dropout probability used by the encoder modules below.
dropout = 0.15

# Hidden size and number of layers of the sentence-level LSTM.
sent_hidden_size = 256
sent_num_layers = 2

class WhitespaceTokenizer():
    """WhitespaceTokenizer with vocab.

    Maps pre-split tokens to ids using the `vocab.txt` file found under the
    module-level `bert_path`.
    """

    def __init__(self):
        vocab_file = bert_path + 'vocab.txt'
        self._token2id = self.load_vocab(vocab_file)
        # Inverse mapping: id -> token.
        self._id2token = {v: k for k, v in self._token2id.items()}
        self.max_len = 256
        # Id used for tokens missing from the vocabulary.
        # NOTE(review): assumes [UNK] sits on the second line of vocab.txt — confirm.
        self.unk = 1

        logging.info("Build Bert vocab with size %d." % (self.vocab_size))

    def load_vocab(self, vocab_file):
        """Read `vocab_file` (one token per line) and return a token -> line-index dict."""
        # `with` closes the file deterministically; the encoding is explicit so
        # the result does not depend on the platform default.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            tokens = [line.strip() for line in f]
        return dict(zip(tokens, range(len(tokens))))

    def tokenize(self, tokens):
        """Wrap `tokens` in [CLS] ... [SEP] and return the corresponding ids.

        Raises:
            AssertionError: if `tokens` is longer than `max_len - 2`.
        """
        assert len(tokens) <= self.max_len - 2
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        return self.token2id(tokens)

    def token2id(self, xs):
        """Map a token (or list of tokens) to its id; unknown tokens map to `self.unk`."""
        if isinstance(xs, list):
            return [self._token2id.get(x, self.unk) for x in xs]
        return self._token2id.get(xs, self.unk)

    @property
    def vocab_size(self):
        """Number of distinct ids in the vocabulary."""
        return len(self._id2token)


class WordBertEncoder(nn.Module):
    """Encodes each segment of token ids into one vector using a pretrained BERT model."""

    def __init__(self):
        super(WordBertEncoder, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.tokenizer = WhitespaceTokenizer()
        # Load the pretrained BERT weights from `bert_path`.
        self.bert = BertModel.from_pretrained(bert_path)

        # When True the pooled output is used, otherwise the hidden state at position 0.
        self.pooled = False
        logging.info('Build Bert encoder with pooled {}.'.format(self.pooled))

    def encode(self, tokens):
        """Return the token ids of `tokens` as produced by the tokenizer."""
        return self.tokenizer.tokenize(tokens)

    def get_bert_parameters(self):
        """Return two optimizer parameter groups for the BERT weights.

        Parameters whose name contains 'bias' or 'LayerNorm.weight' get a
        weight decay of 0.0; all other parameters get 0.01.
        """
        no_decay = ['bias', 'LayerNorm.weight']
        decayed, undecayed = [], []
        for name, param in self.bert.named_parameters():
            if any(marker in name for marker in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)
        return [
            {'params': decayed, 'weight_decay': 0.01},
            {'params': undecayed, 'weight_decay': 0.0},
        ]

    def forward(self, input_ids, token_type_ids, attention_mask=None):
        """Encode a batch of segments.

        Args:
            input_ids: token ids, indexed as (segment, position).
            token_type_ids: token type ids with the same layout as `input_ids`.
            attention_mask: optional mask forwarded to BERT. When None (the
                default) no mask is passed, which matches the previous
                behaviour of this method.

        Returns:
            One vector per segment: the pooled output when `self.pooled` is
            set, otherwise the last hidden state at position 0. Dropout is
            applied in training mode only.
        """
        outputs = self.bert(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask)
        # Index instead of tuple-unpacking so that both plain tuples and
        # indexable output objects are handled.
        sequence_output, pooled_output = outputs[0], outputs[1]

        reps = pooled_output if self.pooled else sequence_output[:, 0, :]

        # nn.Dropout is the identity in eval mode, so no explicit training check is needed.
        return self.dropout(reps)

class SentEncoder(nn.Module):
    """Bidirectional LSTM over the sequence of sentence vectors of each document."""

    def __init__(self, sent_rep_size):
        """Create the encoder; `sent_rep_size` is the size of one input sentence vector."""
        super(SentEncoder, self).__init__()
        self.dropout = nn.Dropout(dropout)

        lstm_config = dict(
            input_size=sent_rep_size,      # size of each sentence vector
            hidden_size=sent_hidden_size,  # hidden size per direction
            num_layers=sent_num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.sent_lstm = nn.LSTM(**lstm_config)

    def forward(self, sent_reps, sent_masks):
        """Run the LSTM and zero the outputs at masked-out sentence positions.

        Args:
            sent_reps: sentence vectors, indexed (batch, sentence, feature).
            sent_masks: per-sentence mask, indexed (batch, sentence); it is
                multiplied into the LSTM output.

        Returns:
            The masked LSTM outputs, with dropout applied in training mode.
        """
        lstm_out = self.sent_lstm(sent_reps)[0]
        # Broadcast the mask over the feature axis so padded sentences become zero.
        masked = lstm_out * sent_masks.unsqueeze(2)
        return self.dropout(masked) if self.training else masked


class Attention(nn.Module):
    """Attention pooling with a learned linear projection and a learned query vector."""

    def __init__(self, hidden_size):
        """Create the projection weight/bias and the query, all of size `hidden_size`."""
        super(Attention, self).__init__()
        # Weight and query start from N(0, 0.05); the bias starts at zero.
        self.weight = nn.Parameter(torch.empty(hidden_size, hidden_size).normal_(mean=0.0, std=0.05))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.query = nn.Parameter(torch.empty(hidden_size).normal_(mean=0.0, std=0.05))

    def forward(self, batch_hidden, batch_masks):
        """Pool `batch_hidden` along dimension 1 using attention weights.

        Args:
            batch_hidden: hidden states, indexed (batch, position, hidden).
            batch_masks: mask indexed (batch, position); positions where
                `1 - mask` is non-zero are excluded from the attention.

        Returns:
            (batch_outputs, attn_scores): the weighted sum of the projected
            hidden states per batch entry, and the softmax scores as computed
            before masked positions are zeroed for the sum.
        """
        # Linear projection of every hidden state.
        key = batch_hidden.matmul(self.weight) + self.bias

        # One score per position: dot product of the projection with the query.
        logits = key.matmul(self.query)

        # True where a position must be ignored.
        padding = (1 - batch_masks).bool()
        attn_scores = F.softmax(logits.masked_fill(padding, float(-1e32)), dim=1)

        # A fully masked row softmaxes to a uniform distribution; zero it so
        # such rows contribute nothing to the weighted sum.
        masked_attn_scores = attn_scores.masked_fill(padding, 0.0)

        # Weighted sum over positions: (b, 1, len) x (b, len, hidden) -> (b, hidden).
        batch_outputs = torch.bmm(masked_attn_scores.unsqueeze(1), key).squeeze(1)

        return batch_outputs, attn_scores

class Model(nn.Module):
    """Model Complete Flow

    BERT segment encoder -> bidirectional LSTM over segments -> attention
    pooling -> linear classifier.
    """

    def __init__(self, vocab):
        """Build all sub-modules; `vocab.label_size` fixes the number of output classes."""
        super(Model, self).__init__()
        self.sent_rep_size = 256                  # size of the vector produced per segment
        self.doc_rep_size = sent_hidden_size * 2  # bidirectional LSTM output size
        self.all_parameters = {}

        def trainable(module):
            """Return the parameters of `module` that require gradients."""
            return [p for p in module.parameters() if p.requires_grad]

        parameters = []
        self.word_encoder = WordBertEncoder()
        # NOTE(review): the BERT weights are registered in the "basic" group, so
        # they are optimised with the same settings as the rest of the model and
        # the groups returned by get_bert_parameters() are never used — confirm
        # this is intended.
        parameters.extend(trainable(self.word_encoder))

        self.sent_encoder = SentEncoder(self.sent_rep_size)
        self.sent_attention = Attention(self.doc_rep_size)
        parameters.extend(trainable(self.sent_encoder))
        parameters.extend(trainable(self.sent_attention))

        self.out = nn.Linear(self.doc_rep_size, vocab.label_size, bias=True)
        parameters.extend(trainable(self.out))

        if use_cuda:
            self.to(device)

        if len(parameters) > 0:
            self.all_parameters["basic_parameters"] = parameters

        # The word encoder is BERT-based (WordBertEncoder), not a CNN.
        print('Build model with bert word encoder, lstm sent encoder.')

        # Count the parameters once and report through both logging and stdout.
        para_num = sum(p.numel() for p in self.parameters())
        logging.info('Model param num: %.2f M.' % (para_num / 1e6))
        print('Model param num: %.2f M.' % (para_num / 1e6))

    def forward(self, batch_inputs):
        """Classify a batch of documents.

        Args:
            batch_inputs: tuple `(input_ids, token_type_ids, masks)`, each
                indexed (batch, segment, token).

        Returns:
            Class scores indexed (batch, label).
        """
        batch_inputs1, token_type_ids, batch_masks = batch_inputs
        batch_size, max_doc_len, max_sent_len = batch_inputs1.shape[:3]

        # Flatten documents so that every segment is one row for the word encoder.
        flat_ids = batch_inputs1.view(batch_size * max_doc_len, max_sent_len)
        flat_types = token_type_ids.view(batch_size * max_doc_len, max_sent_len)
        # NOTE(review): batch_masks is not passed to the word encoder, so padded
        # token positions are visible to BERT — confirm this is intended.
        sent_reps = self.word_encoder(flat_ids, flat_types)

        # Back to one row of segment vectors per document.
        sent_reps = sent_reps.view(batch_size, max_doc_len, self.sent_rep_size)

        # A segment counts as present when at least one of its token masks is set.
        sent_masks = batch_masks.bool().any(2).float()

        sent_hiddens = self.sent_encoder(sent_reps, sent_masks)

        # Attention-pool the segment states into one document vector.
        doc_reps, atten_scores = self.sent_attention(sent_hiddens, sent_masks)

        return self.out(doc_reps)

# 优化器

In [None]:
"""优化器"""
from transformers import AdamW, get_linear_schedule_with_warmup
# Learning rate of the Adam optimizer built for "basic" parameter groups.
learning_rate = 2e-4
# Learning rate of the AdamW optimizer built for "bert" parameter groups.
bert_lr = 5e-5
# The "basic" learning rate is multiplied by `decay` once every `decay_step` scheduler steps.
decay = .75
decay_step = 5000

class Optimizer:
    """Bundle of optimizers and learning-rate schedulers, one pair per parameter group."""

    def __init__(self, model_parameters, steps):
        """Create one optimizer/scheduler pair per entry of `model_parameters`.

        Args:
            model_parameters: dict mapping a group name to its parameters.
                Names starting with "basic" get Adam with a step-wise decayed
                learning rate; names starting with "bert" get AdamW with a
                linear schedule over `steps` steps and expect a list of
                parameter-group dicts.
            steps: total number of training steps, used by the "bert" schedule.

        Raises:
            ValueError: if a group name starts with neither "basic" nor "bert".
        """
        self.all_params = []
        self.optims = []
        self.schedulers = []

        for name, parameters in model_parameters.items():
            if name.startswith("basic"):
                optim = torch.optim.Adam(parameters, lr=learning_rate)
                self.optims.append(optim)

                # Multiply the learning rate by `decay` every `decay_step` steps.
                scheduler = torch.optim.lr_scheduler.LambdaLR(
                    optim, lr_lambda=lambda step: decay ** (step // decay_step))
                self.schedulers.append(scheduler)
                self.all_params.extend(parameters)
            elif name.startswith("bert"):
                optim_bert = AdamW(parameters, bert_lr, eps=1e-8)
                self.optims.append(optim_bert)

                scheduler_bert = get_linear_schedule_with_warmup(optim_bert, 0, steps)
                self.schedulers.append(scheduler_bert)

                for group in parameters:
                    self.all_params.extend(group['params'])
            else:
                # Previously the exception object was created but never raised,
                # so parameters of unknown groups were silently left untrained.
                raise ValueError("no named parameters: %r" % (name,))

        self.num = len(self.optims)

    def step(self):
        """Apply one update of every optimizer, advance its scheduler and clear its gradients."""
        for optim, scheduler in zip(self.optims, self.schedulers):
            optim.step()
            scheduler.step()
            optim.zero_grad()

    def zero_grad(self):
        """Clear the gradients held by every optimizer."""
        for optim in self.optims:
            optim.zero_grad()

    @staticmethod
    def _last_lr(scheduler):
        """Return the most recent learning rate of the scheduler's last parameter group."""
        getter = getattr(scheduler, 'get_last_lr', None)
        if getter is None:
            # Older schedulers only expose get_lr().
            return scheduler.get_lr()[-1]
        return getter()[-1]

    def get_lr(self):
        """Return the current learning rates formatted as ' %.5f' per optimizer."""
        lrs = tuple(self._last_lr(scheduler) for scheduler in self.schedulers)
        return (' %.5f' * self.num) % lrs

# 训练器

In [None]:
"""训练器"""


import time

import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score


# Maximum gradient norm used for clipping in Trainer._train().
clip = 5.0
# Maximum number of training epochs.
epochs = 18
# epochs = 1
# Training stops after this many consecutive epochs without a dev-F1 improvement.
early_stops = 3
# A progress line is logged every `log_interval` training batches.
log_interval = 50

# Batch sizes for evaluation and for training.
test_batch_size = 32
train_batch_size = 32
# test_batch_size = 8
# train_batch_size = 8

class Trainer():
    """Runs training, validation and test-set prediction for a model.

    The raw data sets, the checkpoint path and the prediction path are read
    from the shared `gl` store under the keys 'train_data', 'dev_data',
    'test_data', 'save_model' and 'save_test'.
    """

    def __init__(self, model, vocab, is_train=True):
        """Encode the data sets and, when training, create the optimizer.

        Args:
            model: the model to train/evaluate; its `word_encoder` is used to
                encode the text.
            vocab: vocabulary providing label ids and `target_names`.
            is_train: when False only the test data is prepared and no
                optimizer is created.
        """
        train_data = gl.get_value('train_data')
        dev_data = gl.get_value('dev_data')
        test_data = gl.get_value('test_data')

        self.model = model
        self.report = True
        # get_examples() returns a list of [label, segment_count, doc] entries,
        # where doc is a list of [sent_len, token_ids, token_type_ids].
        if is_train:
            self.train_data = get_examples(train_data, model.word_encoder, vocab)
            self.batch_num = int(np.ceil(len(self.train_data) / float(train_batch_size)))
            self.dev_data = get_examples(dev_data, model.word_encoder, vocab)
        self.test_data = get_examples(test_data, model.word_encoder, vocab)

        # Loss function.
        self.criterion = nn.CrossEntropyLoss()

        # Display names of the labels, used in the classification report.
        self.target_names = vocab.target_names

        if is_train:
            self.optimizer = Optimizer(model.all_parameters, steps=self.batch_num * epochs)

        # Bookkeeping.
        self.step = 0
        self.early_stop = -1
        self.best_train_f1, self.best_dev_f1 = 0, 0
        self.last_epoch = epochs

    def train(self):
        """Train for up to `epochs` epochs with early stopping on the dev F1.

        The model state is saved whenever the dev F1 is at least as good as
        the best one seen so far.
        """
        logging.info('Start training...')
        for epoch in range(1, epochs + 1):
            train_f1 = self._train(epoch)

            dev_f1 = self._eval(epoch)

            if self.best_dev_f1 <= dev_f1:
                logging.info(
                    "Exceed history dev = %.2f, current dev = %.2f" % (self.best_dev_f1, dev_f1))

                save_model = gl.get_value('save_model')
                torch.save(self.model.state_dict(), save_model)

                self.best_train_f1 = train_f1
                self.best_dev_f1 = dev_f1
                self.early_stop = 0
            else:
                self.early_stop += 1
                if self.early_stop == early_stops:
                    logging.info(
                        "Eearly stop in epoch %d, best train: %.2f, dev: %.2f" % (
                            epoch - early_stops, self.best_train_f1, self.best_dev_f1))
                    self.last_epoch = epoch
                    break

    def test(self):
        """Load the saved model state and write predictions for the test data."""
        save_model = gl.get_value('save_model')
        # map_location lets a checkpoint saved on a GPU be loaded on the current device.
        self.model.load_state_dict(torch.load(save_model, map_location=device))
        self._eval(self.last_epoch + 1, test=True)

    def _log_report(self, y_true, y_pred):
        """Log a per-class report when reporting is enabled and both label sets match."""
        if self.report and set(y_true) == set(y_pred):
            # Passing `labels` keeps the report aligned with target_names even
            # when some classes do not occur in this split.
            report = classification_report(
                y_true, y_pred, digits=4,
                labels=list(range(len(self.target_names))),
                target_names=self.target_names)
            logging.info('\n' + report)

    def _train(self, epoch):
        """Run one training epoch and return the macro F1 on the training data."""
        self.optimizer.zero_grad()
        self.model.train()

        start_time = time.time()
        epoch_start_time = time.time()
        overall_losses = 0
        losses = 0
        y_pred = []
        y_true = []
        batches = data_iter(self.train_data, train_batch_size, shuffle=True)
        for batch_idx, batch_data in enumerate(batches, start=1):
            torch.cuda.empty_cache()
            # batch_inputs: (token ids, token type ids, masks), each (batch, segment, token).
            batch_inputs, batch_labels = self.batch2tensor(batch_data)
            # batch_outputs: (batch, label) scores.
            batch_outputs = self.model(batch_inputs)
            loss = self.criterion(batch_outputs, batch_labels)

            loss.backward()

            loss_value = loss.detach().cpu().item()
            losses += loss_value
            overall_losses += loss_value
            # Keep flat lists of predictions and labels for the epoch metrics.
            y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(batch_labels.cpu().numpy().tolist())
            # Gradient clipping.
            nn.utils.clip_grad_norm_(self.optimizer.all_params, max_norm=clip)
            # Updates the weights, advances the schedulers and clears the gradients.
            self.optimizer.step()

            self.step += 1

            if batch_idx % log_interval == 0:
                elapsed = time.time() - start_time

                lrs = self.optimizer.get_lr()
                logging.info(
                    '| epoch {:3d} | step {:3d} | batch {:3d}/{:3d} | lr{} | loss {:.4f} | s/batch {:.2f}'.format(
                        epoch, self.step, batch_idx, self.batch_num, lrs,
                        losses / log_interval,
                        elapsed / log_interval))

                losses = 0
                start_time = time.time()

        overall_losses /= self.batch_num
        during_time = time.time() - epoch_start_time

        # Keep four decimal places.
        overall_losses = reformat(overall_losses, 4)
        score, f1 = get_score(y_true, y_pred)

        logging.info(
            '| epoch {:3d} | score {} | f1 {} | loss {:.4f} | time {:.2f}'.format(epoch, score, f1,
                                                                                  overall_losses,
                                                                                  during_time))
        self._log_report(y_true, y_pred)

        return f1

    def _eval(self, epoch, test=False):
        """Evaluate on the dev data, or predict the test data when `test` is True.

        In test mode the predictions are written as a CSV with a single
        'label' column to the path stored under 'save_test'.

        Returns:
            The macro F1 computed from the labels attached to the data.
        """
        self.model.eval()
        start_time = time.time()
        data = self.test_data if test else self.dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = self.batch2tensor(batch_data)
                batch_outputs = self.model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, f1 = get_score(y_true, y_pred)

            during_time = time.time() - start_time

            if test:
                save_test = gl.get_value('save_test')
                df = pd.DataFrame({'label': y_pred})
                df.to_csv(save_test, index=False, sep=',')
            else:
                logging.info('| epoch {:3d} | dev | score {} | f1 {} | time {:.2f}'.format(epoch, score, f1, during_time))
                self._log_report(y_true, y_pred)

        return f1

    def batch2tensor(self, batch_data):
        '''
            Convert one batch into padded tensors.

            batch_data layout:
            [[label, doc_len, [[sent_len, [sent_id0, ...], [sent_id1, ...]], ...]]

            Returns ((token_ids, token_type_ids, masks), labels). The three
            input tensors are zero-padded to (batch, longest doc, longest
            sentence); masks holds 1 at every real token position.
        '''
        batch_size = len(batch_data)
        doc_labels = []
        doc_lens = []
        doc_max_sent_len = []
        for doc_data in batch_data:
            doc_labels.append(doc_data[0])
            doc_lens.append(doc_data[1])
            doc_max_sent_len.append(max(sent_data[0] for sent_data in doc_data[2]))

        max_doc_len = max(doc_lens)
        max_sent_len = max(doc_max_sent_len)

        batch_inputs1 = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.int64)
        batch_inputs2 = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.int64)
        batch_masks = torch.zeros((batch_size, max_doc_len, max_sent_len), dtype=torch.float32)
        batch_labels = torch.LongTensor(doc_labels)

        for b in range(batch_size):
            for sent_idx in range(doc_lens[b]):
                sent_len, token_ids, token_type_ids = batch_data[b][2][sent_idx]
                # Copy a whole sentence with one slice assignment instead of
                # one tensor write per token.
                batch_inputs1[b, sent_idx, :sent_len] = torch.tensor(token_ids[:sent_len], dtype=torch.int64)
                batch_inputs2[b, sent_idx, :sent_len] = torch.tensor(token_type_ids[:sent_len], dtype=torch.int64)
                batch_masks[b, sent_idx, :sent_len] = 1

        if use_cuda:
            batch_inputs1 = batch_inputs1.to(device)
            batch_inputs2 = batch_inputs2.to(device)
            batch_masks = batch_masks.to(device)
            batch_labels = batch_labels.to(device)

        return (batch_inputs1, batch_inputs2, batch_masks), batch_labels

def get_score(y_ture, y_pred):
    """Compute macro-averaged precision, recall and F1 as percentages.

    Returns:
        A pair: the string form of the tuple (precision, recall, f1), and the
        F1 value on its own; every number is rounded to two decimals.
    """
    truth = np.array(y_ture)
    predicted = np.array(y_pred)

    f1, precision, recall = (
        metric(truth, predicted, average='macro') * 100
        for metric in (f1_score, precision_score, recall_score)
    )

    f1_rounded = reformat(f1, 2)
    summary = (reformat(precision, 2), reformat(recall, 2), f1_rounded)
    return str(summary), f1_rounded


def reformat(num, n):
    """Round `num` to `n` decimals by formatting it as fixed-point text and parsing it back."""
    return float('{:0.{}f}'.format(num, n))


## 一次训练全流程

#### K折交叉验证
将初始样本分割成 K 个子样本：一个单独的子样本被保留作为验证模型的数据，其他 K-1 个子样本用来训练。交叉验证重复 K 次，每个子样本验证一次，平均 K 次的结果或者使用其它结合方式，最终得到一个单一估测。这个方法的优势在于，同时重复运用随机产生的子样本进行训练和验证，并且每个子样本恰好被验证一次；10 折交叉验证是最常用的。

In [None]:
"""训练流程"""

from sklearn.model_selection import StratifiedKFold, KFold


def dataset_split(data_file, test_data_file, fold_num):
    """Load the train/test CSVs and prepare stratified K-fold index splits.

    Args:
        data_file: tab-separated training file with ``text`` and ``label`` columns.
        test_data_file: tab-separated test file with a ``text`` column.
        fold_num: number of folds handed to ``StratifiedKFold``.

    Returns:
        ``[traincsv_texts, traincsv_labels, fold_idx, test_data]`` where
        ``fold_idx[i]`` is ``{'train': indices, 'val': indices}`` for fold ``i``
        and ``test_data`` is ``{'label': [0, ...], 'text': [...]}`` (labels are
        placeholder zeros).

    Side effects:
        Stores ``test_data`` under the ``'test_data'`` key via ``gl.set_value``
        and re-seeds NumPy's global RNG with ``seed``.

    NOTE(review): only the first ``train_rows`` / ``test_rows`` rows of each
    file are read, whatever file is passed. If ``data_file`` has extra rows
    appended after the first ``train_rows``, those rows are not loaded -
    confirm this is intended for every caller.
    """
    # Training data: read the capped number of rows and split into columns.
    train_df = pd.read_csv(data_file, sep='\t', encoding='UTF-8', nrows=train_rows)
    traincsv_texts = train_df['text'].tolist()
    traincsv_labels = train_df['label'].tolist()

    # Stratified folds keep the label distribution similar across folds;
    # shuffle=True with a fixed random_state makes the partition reproducible.
    splitter = StratifiedKFold(fold_num, shuffle=True, random_state=seed)

    fold_idx = {}
    for fold_i, (train_idx, val_idx) in enumerate(splitter.split(traincsv_texts, traincsv_labels)):
        logging.info("Fold id: %s, Train lens %s, Val lens %s", str(fold_i), str(len(train_idx)), str(len(val_idx)))

        # Shuffle each index array in place, re-seeding first so the
        # permutation depends only on the array length.
        for indices in (train_idx, val_idx):
            np.random.seed(seed)
            np.random.shuffle(indices)

        # Keep only the indices; the rows themselves are looked up later.
        fold_idx[fold_i] = {'train': train_idx, 'val': val_idx}

    # Test data: texts only, paired with dummy zero labels.
    test_df = pd.read_csv(test_data_file, sep='\t', encoding='UTF-8', nrows=test_rows)
    texts = test_df['text'].tolist()
    test_data = {'label': [0] * len(texts), 'text': texts}
    print("Test lens %s"%str(len(texts)))

    gl.set_value('test_data', test_data)

    return [traincsv_texts, traincsv_labels, fold_idx, test_data]



def train_flow(data_file, test_data_file, fold_num, run_fold, save_name, is_train=True):
    """Run the per-fold pipeline: build data, (optionally) train, then test.

    Args:
        data_file: training CSV forwarded to ``dataset_split``.
        test_data_file: test CSV forwarded to ``dataset_split``.
        fold_num: total number of cross-validation folds.
        run_fold: collection of fold ids to actually run; others are skipped.
        save_name: stem used for the model file (``./model/<stem>_<fold>.bin``)
            and the test-output file (``./user_data/<stem>_<fold>.csv``).
        is_train: when false, ``trainer.train()`` is skipped and only
            ``trainer.test()`` runs.

    For every selected fold the train/dev subsets and both output paths are
    published through ``gl.set_value`` before the ``Trainer`` is created
    (presumably the Trainer reads them from there - verify against its code).
    """
    # Split once; the returned test data is already registered in `gl` by
    # dataset_split, so it is not used further here.
    traincsv_texts, traincsv_labels, fold_idx, _test_data = dataset_split(data_file, test_data_file, fold_num)

    def take(indices):
        """Materialise the label/text subset addressed by `indices`."""
        return {
            'label': [traincsv_labels[i] for i in indices],
            'text': [traincsv_texts[i] for i in indices],
        }

    selected_folds = [fold_i for fold_i in range(fold_num) if fold_i in run_fold]
    for fold_i in selected_folds:
        fold_tag = str(fold_i)
        print("======Fold id: %s, Start Data Loader and Encoder"%fold_tag)

        train_data = take(fold_idx[fold_i]['train'])
        dev_data = take(fold_idx[fold_i]['val'])

        # The vocabulary is built from this fold's training subset only.
        vocab = Vocab(train_data)
        model = Model(vocab)

        save_model = './model/' + save_name + '_' + fold_tag + '.bin'
        save_test = './user_data/' + save_name + '_' + fold_tag + '.csv'

        # Publish the fold's data and output locations as global values.
        for key, value in (
            ('train_data', train_data),
            ('dev_data', dev_data),
            ('save_model', save_model),
            ('save_test', save_test),
        ):
            gl.set_value(key, value)

        trainer = Trainer(model, vocab, is_train)

        if is_train:
            print("======Fold id: %s, Start Training "%fold_tag)
            trainer.train()

        # Testing always runs, whether or not training happened in this call.
        print("======Fold id: %s, Start Testing "%fold_tag)
        trainer.test()

## 伪标签标注

In [None]:
def pseudo_label(save_name, run_fold, data_file, test_file, data_file_pseudo):
    """Create a pseudo-labelled training file from per-fold test predictions.

    Steps:
      1. Read ``./user_data/<save_name>_<fold>.csv`` for every fold in
         ``run_fold`` and merge the ``label`` columns side by side
         (saved as ``<save_name>_merge.csv``).
      2. Majority-vote across folds per row (saved as ``<save_name>_vote.csv``).
      3. Mark rows on which every fold agrees.
      4. Append those unanimous test rows (voted label + text from
         ``test_file``) to the rows of ``data_file`` and write the result,
         tab-separated, to ``data_file_pseudo``.

    Args:
        save_name: stem of the per-fold prediction files.
        run_fold: iterable of fold ids whose predictions are used.
        data_file: tab-separated labelled training file.
        test_file: tab-separated test file with a ``text`` column.
        data_file_pseudo: output path for the augmented training file.

    NOTE(review): ``data_file`` is read with ``nrows=train_rows`` and
    ``test_file`` with ``nrows=test_rows``; any rows beyond those caps are
    not carried into the output - confirm this is intended.
    """
    prefix = './user_data/' + save_name

    # One column per fold: label_<fold> holds that fold's predictions.
    df_merge = pd.DataFrame()
    for fold_i in run_fold:
        fold_pred = pd.read_csv(prefix + '_' + str(fold_i) + '.csv')
        df_merge['label_' + str(fold_i)] = fold_pred['label']
    df_merge.to_csv(prefix + '_merge.csv', index=None)

    # Majority vote: the most frequent label in each row wins.
    df_vote = pd.DataFrame()
    df_vote['label'] = df_merge.apply(lambda row: row.value_counts().idxmax(), axis=1)
    df_vote.to_csv(prefix + '_vote.csv', index=None)

    # Confidence check. The merged frame itself is extended (after it has
    # been written to disk) with the vote and an agreement flag.
    df_merge['vote'] = df_vote['label']

    def is_all_same(ser):
        """Return 1 when every entry of the row equals the first one, else 0."""
        first = ser.iloc[0]
        disagrees = any(first != ser.loc[idx] for idx in ser.index)
        return 0 if disagrees else 1

    df_merge['all_same'] = df_merge.apply(is_all_same, axis=1)
    unanimous = df_merge['all_same'] == 1

    # Build the pseudo-labelled rows and append them to the training rows.
    traincsv_data = pd.read_csv(data_file, sep='\t', encoding='UTF-8', nrows=train_rows)
    testcsv_data = pd.read_csv(test_file, sep='\t', encoding='UTF-8', nrows=test_rows)

    test_weaklabel_same = pd.DataFrame()
    test_weaklabel_same['label'] = df_vote[unanimous]['label']
    test_weaklabel_same['text'] = testcsv_data[unanimous]['text']

    # Renumber rows 0..n-1 so the combined frame has a clean index.
    traincsv_weaklabel_same = pd.concat([traincsv_data, test_weaklabel_same]).reset_index(drop=True)
    traincsv_weaklabel_same.to_csv(data_file_pseudo, index=None, sep='\t')

    logging.info("Pseudo_label_num: %s", str(test_weaklabel_same.shape[0]))
    logging.info("New_train_data_num: %s", str(traincsv_weaklabel_same.shape[0]))

## 各模型集成

In [None]:
"""模型融合集成"""


def vote_weight(save_names, run_folds, weights, pred_name):
    """Ensemble per-fold prediction files by plain and weighted majority vote.

    Args:
        save_names: list of model name stems; model ``i`` contributes the
            files ``./user_data/<save_names[i]>_<fold>.csv``.
        run_folds: list (parallel to ``save_names``) of fold-id lists.
        weights: list (parallel to ``save_names``) of per-model vote weights;
            each prediction of model ``i`` counts ``weights[i]`` votes.
        pred_name: file name of the final prediction written under
            ``./prediction_result/``.

    Outputs (all under ``./user_data/`` unless noted, ``<stem>`` being the
    save names joined with ``-``):
        ``<stem>-merge.csv``       all per-fold labels side by side,
        ``<stem>-vote.csv``        unweighted majority vote,
        ``<stem>-vote_wight.csv``  weighted vote,
        ``./prediction_result/<pred_name>``  copy of the weighted vote.

    Columns are attributed to a model by the exact file paths generated for
    it, not by substring matching on the column name. Substring matching
    would count another model's columns whenever one save name is contained
    in another name (e.g. ``'m'`` and ``'m2'``) or in the path itself.
    """

    def get_file_name(save_name, fold_ids):
        """Return one model's prediction CSV paths, de-duplicated, in fold order."""
        paths = ['./user_data/' + save_name + '_' + str(fold_i) + '.csv' for fold_i in fold_ids]
        # A repeated fold id must not count the same column twice.
        return list(dict.fromkeys(paths))

    # Column groups are fixed for the whole frame, so build them once here
    # instead of re-deriving them for every row inside the vote function.
    group_cols_ls = [get_file_name(save_names[i], run_folds[i]) for i in range(len(save_names))]

    file_name = '-'.join(save_names)

    df_merge = pd.DataFrame()
    for cols_ls in group_cols_ls:
        for save_test in cols_ls:
            df_merge[save_test] = pd.read_csv(save_test)['label']

    df_merge.to_csv('./user_data/' + file_name + '-merge.csv', index=False)

    def vote_w(ser):
        """Return the label with the largest weighted vote count in one row."""
        value_count = None
        for i, cols_ls in enumerate(group_cols_ls):
            weighted = ser[cols_ls].value_counts() * weights[i]
            if value_count is None:
                value_count = weighted
            else:
                # fill_value=0 keeps labels that only some models predicted.
                value_count = value_count.add(weighted, fill_value=0)

        return value_count.idxmax()

    df_vote = pd.DataFrame()

    # Unweighted vote: every prediction file counts once.
    df_vote['label'] = df_merge.apply(lambda x: x.value_counts().idxmax(), axis=1)
    df_vote.to_csv('./user_data/' + file_name + '-vote.csv', index=False)

    # Weighted vote, also exported as the final prediction.
    df_vote['label'] = df_merge.apply(vote_w, axis=1)
    df_vote.to_csv('./user_data/' + file_name + '-vote_wight.csv', index=False)
    df_vote.to_csv('./prediction_result/' + pred_name, index=False)

# 全流程代码入口

In [None]:
def _main():
    """Run the whole pipeline end to end.

    Stages, in order:
      1. Semi-supervised round on test_a: train on the original training
         set, then write ``./user_data/train_set_pseudo_a.csv``.
      2. Semi-supervised round on test_b: train on the pseudo_a file, then
         write ``./user_data/train_set_pseudo_b.csv``.
      3. Final training on the pseudo_b file, tested on test_b.
      4. Test-only pass (``is_train=False``) of the stage-1 configuration on
         test_b (presumably re-using the models saved in stage 1 - verify
         against Trainer).
      5. Weighted vote over all prediction files -> ``predictions.csv``.
    """
    # Output stems for models and predictions, one per stage.
    save_names = ['model_trainset_pseudo0', 'model_trainset_pseudoa', 'model_trainset_pseudob']
    # Number of cross-validation folds per stage.
    fold_nums = [10, 10, 10]
    # Folds actually run per stage.
    # (For a quick small-scale test the original suggested [[0, 1], [0, 1], [0]].)
    run_folds = [list(range(10)), list(range(10)), [0, 3, 6, 9]]
    # Vote weight of each stage's models in the final ensemble.
    weights = [0.9*2, 0.9*2.5, 0.9*3*2.5]

    def semi_supervised_round(stage, data_file, test_data_file, pseudo_name, times=1):
        """Train stage `stage`, then pseudo-label the test set, `times` times.

        The first iteration trains on `data_file`; later iterations train on
        the pseudo-labelled file produced by the previous one. Pseudo labels
        are always appended to `data_file` itself.
        """
        data_file_pseudo = './user_data/' + pseudo_name
        train_data_file = data_file
        for _ in range(times):
            train_flow(train_data_file, test_data_file, fold_nums[stage], run_folds[stage],
                       save_names[stage], is_train=True)
            pseudo_label(save_names[stage], run_folds[stage], data_file, test_data_file, data_file_pseudo)
            train_data_file = data_file_pseudo

    print('============Semi-Supervised Train 1 (for test_a).')
    semi_supervised_round(0, train_set, test_set_a, 'train_set_pseudo_a.csv', times=1)

    print('============Semi-Supervised Train 2 (for test_b).')
    semi_supervised_round(1, './user_data/train_set_pseudo_a.csv', test_set_b,
                          'train_set_pseudo_b.csv', times=1)

    print('============Last Train (for test_b).')
    # Final training pass on the data augmented with both pseudo-label rounds.
    train_flow('./user_data/train_set_pseudo_b.csv', test_set_b, fold_nums[2], run_folds[2],
               save_names[2], is_train=True)

    print('============No Pseudo Test (for test_b).')
    # Predict test_b with the stage-1 configuration, without retraining.
    train_flow(train_set, test_set_b, fold_nums[0], run_folds[0], save_names[0], is_train=False)

    print('============Model Ensemble, Predicted Results Vote.')
    # Combine every stage's per-fold predictions into the final submission.
    vote_weight(save_names, run_folds, weights, 'predictions.csv')

In [None]:
# Entry point: runs both semi-supervised rounds, the final training pass,
# the test-only pass and the weighted ensemble vote defined in _main().
_main()