In [1]:
type_num=9
cuda_num = str(3)
date='12.01'
trainingSet_path0= "../data/sentiment/quan_9/train_"+date+".txt"
valSet_path0= "../data/sentiment/quan_9/test_"+date+".txt"
zeng_path0= "../data/sentiment/quan_9/zeng.txt"
model_save_path='../result/classifier_'+date

In [2]:
"""
手动实现transformer.models.bert.BertForSequenceClassification()函数
根据论文[How to Fine-Tune BERT for Text Classification（2019）](https://www.aclweb.org/anthology/P18-1031.pdf)
在分类问题上，把最后四层进行concat然后maxpooling 输出的结果会比直接输出最后一层的要好
这里进行实现测试

"""
import torch
import torch.nn as nn
from transformers import BertModel,BertTokenizer
import torch.nn.functional as F


class bert_lr_last4layer_Config(nn.Module):
    def __init__(self):
        self.bert_path = "../chinese-bert-wwm"
        self.config_path = "../chinese-bert-wwm/config.json"

        # self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768
        self.num_labels = type_num
        # self.dropout_bertout = 0.2
        self.dropout_bertout = 0.5
        self.mytrainedmodel = "../result/bert_clf_model.bin"
        """
        current loss: 0.4363991916179657 	 current acc: 0.8125
        current loss: 0.1328232882924341 	 current acc: 0.9527363184079602
        current loss: 0.11797185830000853 	 current acc: 0.9585411471321695
        train loss:  0.11880445411248554 	 train acc: 0.9583704495516361
        valid loss:  0.1511497257672476 	 valid acc: 0.9431549028896258
        """

class bert_lr_last4layer(nn.Module):

    def __init__(self,config):
        super(bert_lr_last4layer, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path,config = config.config_path)
        self.dropout_bertout = nn.Dropout(config.dropout_bertout)
        self.num_labels = config.num_labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        for param in self.bert.parameters():
            param.requires_grad = True

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=True,
        return_dict=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # outputs = outputs[2] # [1]是pooled的结果 # [3]是hidden_states 12层
        hidden_states = outputs.hidden_states
        nopooled_output = torch.cat((hidden_states[9],hidden_states[10],hidden_states[11],hidden_states[12]),1)
        batch_size = nopooled_output.shape[0] # 32
        # print(batch_size)
        # print(nopooled_output.shape) # torch.Size([32, 400, 768])
        kernel_hight = nopooled_output.shape[1]
        pooled_output = F.max_pool2d(nopooled_output,kernel_size = (kernel_hight,1))
        # print(pooled_output.shape) # torch.Size([32, 1, 768])

        flatten = pooled_output.view(batch_size,-1)
        # print(flatten.shape) # [32,768]

        flattened_output = self.dropout_bertout(flatten)

        logits = self.classifier(flattened_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return loss,logits

In [3]:
class Config(object):
    def __init__(self):
        self.config_dict = {
            "data_path": {
                # "trainingSet_path": "../data/sentiment/sentiment.train0.data",
                # "valSet_path": "../data/sentiment/sentiment.valid0.data",
                "trainingSet_path": trainingSet_path0,
                "valSet_path": valSet_path0,
                "testingSet_path": "../data/sentiment/sentiment.test0.data",
                "zeng_path": zeng_path0
            },

            "BERT_path": {
                "file_path": '../chinese-bert-wwm/',
                "config_path": '../chinese-bert-wwm/',
                "vocab_path": '../chinese-bert-wwm/',
            },

            "training_rule": {
                "max_length": 300,  # 输入序列长度，别超过512
                "hidden_dropout_prob": 0.3,
                "num_labels": type_num,  # 几分类个数
                "learning_rate": 1e-5,
                "weight_decay": 1e-2,
                "batch_size": 16
            },

            "result": {
                "model_save_path": '../result/bert_clf_model.bin',
                "config_save_path": '../result/bert_clf_config.json',
                "vocab_save_path": '../result/bert_clf_vocab.txt'
            }
        }

    def get(self, section, name):
        return self.config_dict[section][name]

In [4]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class SentimentDataset(Dataset):
    def __init__(self, path_to_file):
#         print(path_to_file)
        self.dataset = pd.read_csv(path_to_file, sep="\t", names=["text", "label"])
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        sample = {"text": text, "label": label}
        # print(sample)
        return sample

def convert_text_to_ids(tokenizer, text, max_len=100):
    if isinstance(text, str):
        tokenized_text = tokenizer.encode_plus(text, max_length=max_len, add_special_tokens=True, truncation=True)
        input_ids = tokenized_text["input_ids"]
        token_type_ids = tokenized_text["token_type_ids"]
    elif isinstance(text, list):
        input_ids = []
        token_type_ids = []
        for t in text:
            tokenized_text = tokenizer.encode_plus(t, max_length=max_len, add_special_tokens=True, truncation=True)
            input_ids.append(tokenized_text["input_ids"])
            token_type_ids.append(tokenized_text["token_type_ids"])
    else:
        print("Unexpected input")
    return input_ids, token_type_ids

def seq_padding(tokenizer, X):
    pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
    if len(X) <= 1:
        return torch.tensor(X)
    L = [len(x) for x in X]
    ML = max(L)
    X = torch.Tensor([x + [pad_id] * (ML - len(x)) if len(x) < ML else x for x in X])
    return X

In [5]:
import random

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn import metrics
import numpy
import transformers
from transformers import AdamW

In [6]:
class transformers_bert_binary_classification(object):
    def __init__(self):
        self.config = Config()
        self.device_setup()

    def device_setup(self):
        """
        设备配置并加载BERT模型
        :return:
        """
        self.freezeSeed()
        # 使用GPU，通过model.to(device)的方式使用
        device_s = "cuda:" + cuda_num
        self.device = torch.device(device_s if torch.cuda.is_available() else "cpu")

        # import os
        # result_dir = "../result"
        # MODEL_PATH = self.config.get("BERT_path", "file_path")
        # config_PATH = self.config.get("BERT_path", "config_path")
        vocab_PATH = self.config.get("BERT_path", "vocab_path")

        # num_labels = self.config.get("training_rule", "num_labels")
        # hidden_dropout_prob = self.config.get("training_rule", "hidden_dropout_prob")

        # 通过词典导入分词器
        self.tokenizer = transformers.BertTokenizer.from_pretrained(vocab_PATH)
        # self.model_config = BertConfig.from_pretrained(config_PATH, num_labels=num_labels,
        #                                                hidden_dropout_prob=hidden_dropout_prob)
        # self.model = BertForSequenceClassification.from_pretrained(MODEL_PATH, config=self.model_config)
        """
        train loss:  0.10704718510208534 	 train acc: 0.9637151849872321
        valid loss:  0.17820182011222863 	 valid acc: 0.9459971577451445
        """
        # 如果想换模型，换成下边这句子
        # bert+lr 跟官方方法差不都
        # self.model = bert_lr(bert_lr_Config())
        # self.model = bert_cnn(bert_cnn_Config())
        self.model = bert_lr_last4layer(bert_lr_last4layer_Config())

        self.model.to(self.device)

    def model_setup(self, zeng=0):
        weight_decay = self.config.get("training_rule", "weight_decay")
        learning_rate = self.config.get("training_rule", "learning_rate")
        print("**model_setup:")
        print("zeng",zeng)
        if zeng == 1:
            learning_rate = learning_rate * 2
        # 定义优化器和损失函数
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([3.0,2.0,5.0,10.0,7.0,2.3,10.0,10.0,10.0])).float())
        self.criterion.to(self.device)

    def get_data(self):
        """
        读取数据
        :return:
        """
        train_set_path = self.config.get("data_path", "trainingSet_path")
        valid_set_path = self.config.get("data_path", "valSet_path")
        batch_size = self.config.get("training_rule", "batch_size")
        zeng_set_path = self.config.get("data_path", "zeng_path")
        print(train_set_path,valid_set_path,batch_size,zeng_set_path)

        # 数据读入
        # 加载数据集
        sentiment_train_set = SentimentDataset(train_set_path)
        sentiment_train_loader = DataLoader(sentiment_train_set, batch_size=batch_size, shuffle=True, num_workers=2)
        sentiment_valid_set = SentimentDataset(valid_set_path)
        sentiment_valid_loader = DataLoader(sentiment_valid_set, batch_size=batch_size, shuffle=False, num_workers=2)

        sentiment_zeng_set = SentimentDataset(zeng_set_path)
        sentiment_zeng_loader = DataLoader(sentiment_zeng_set, batch_size=batch_size, shuffle=True, num_workers=2)

        return sentiment_train_loader, sentiment_valid_loader, sentiment_zeng_loader

    def train_an_epoch(self, iterator, zeng=0):
        print("**train_an_epoch")
        print("zeng",zeng)
        self.model_setup(zeng)
        epoch_loss = 0
        epoch_acc = 0

        for i, batch in enumerate(iterator):
            label = batch["label"]
            text = batch["text"]
            # print(label)
            input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, text)
            input_ids = seq_padding(self.tokenizer, input_ids)
            token_type_ids = seq_padding(self.tokenizer, token_type_ids)
            # 标签形状为 (batch_size, 1)
            label = label.unsqueeze(1)
            # 需要 LongTensor
            input_ids, token_type_ids, label = input_ids.long(), token_type_ids.long(), label.long()
            # 梯度清零
            self.optimizer.zero_grad()
            # 迁移到GPU
            input_ids, token_type_ids, label = input_ids.to(self.device), token_type_ids.to(self.device), label.to(
                self.device)
            output = self.model(input_ids=input_ids, token_type_ids=token_type_ids, labels=label)  # 这里不需要labels
            # BertForSequenceClassification的输出loss和logits
            # BertModel原本的模型输出是last_hidden_state，pooler_output
            # bert_cnn的输出是[batch_size, num_class]
            # print(numpy.array(torch.tensor(output).cpu()).shape)

            y_pred_prob = output[1]
            y_pred_label = y_pred_prob.argmax(dim=1)

            # 计算loss
            # 这个 loss 和 output[0] 是一样的
            loss = self.criterion(y_pred_prob.view(-1, type_num), label.view(-1))  # 多分类改这里
            # loss = output[0]
            # 计算acc
            acc = ((y_pred_label == label.view(-1)).sum()).item()
            # 反向传播
            loss.backward()
            self.optimizer.step()
            # epoch 中的 loss 和 acc 累加
            epoch_loss += loss.item()
            epoch_acc += acc
            if i % 200 == 0:
                print("current loss:", epoch_loss / (i + 1), "\t", "current acc:", epoch_acc / ((i + 1) * len(label)))
        return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset)

    def evaluate(self, iterator):
        self.model.eval()
        epoch_loss = 0
        epoch_acc = 0
        y_pred_label_all = []
        label_all = []
        with torch.no_grad():
            for _, batch in enumerate(iterator):
                label = batch["label"]
                text = batch["text"]

                input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, text)
                input_ids = seq_padding(self.tokenizer, input_ids)
                token_type_ids = seq_padding(self.tokenizer, token_type_ids)
                label = label.unsqueeze(1)
                input_ids, token_type_ids, label = input_ids.long(), token_type_ids.long(), label.long()
                input_ids, token_type_ids, label = input_ids.to(self.device), token_type_ids.to(self.device), label.to(
                    self.device)
                output = self.model(input_ids=input_ids, token_type_ids=token_type_ids, labels=label)
                # 更改了以下部分
                # y_pred_label = output[1].argmax(dim=1)
                y_pred_prob = output[1]
                y_pred_label = y_pred_prob.argmax(dim=1)
                loss = output[0]
                # loss = self.criterion(y_pred_prob.view(-1, 2), label.view(-1))
                acc = ((y_pred_label == label.view(-1)).sum()).item()
                y_pred_label_all += y_pred_label.tolist()
                label_all += label.view(-1).tolist()

                epoch_loss += loss.item()
                epoch_acc += acc

        print(metrics.classification_report(y_pred_label_all, label_all))
        print("准确率:", metrics.accuracy_score(y_pred_label_all, label_all))
        return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset)

    def train(self, epochs, zeng=0):
        sentiment_train_loader, sentiment_valid_loader, sentiment_zeng_loader = self.get_data()

        for i in range(epochs):
            print('____________________________________________________________________________________')
            print('____________________________________________________________________________________')
            print('epochs:', i)
            print('____________________________________________________________________________________')
            print('____________________________________________________________________________________')
            print('____train____')
            if zeng == 0:
                train_loss, train_acc = self.train_an_epoch(sentiment_train_loader)
            else:
                train_loss, train_acc = self.train_an_epoch(sentiment_zeng_loader, 1)
            print("train loss: ", train_loss, "\t", "train acc:", train_acc)
            print('____evaluate____')
            valid_loss, valid_acc = self.evaluate(sentiment_valid_loader)
            print("valid loss: ", valid_loss, "\t", "valid acc:", valid_acc)
        # self.save_model()

    def save_model(self):
        model_save_path = self.config.get("result", "model_save_path")
        config_save_path = self.config.get("result", "config_save_path")
        vocab_save_path = self.config.get("result", "vocab_save_path")

        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        torch.save(model_to_save.state_dict(), model_save_path)
        # model_to_save.config.to_json_file(config_save_path) # !!!'bert_lr' object has no attribute 'config'
        # self.tokenizer.save_vocabulary(vocab_save_path)
        print("model saved...")

    def predict(self, sentence):
        # self.model.setup()
        self.model_setup()
        self.model.eval()
        # 转token后padding
        input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, sentence)
        input_ids = seq_padding(self.tokenizer, [input_ids])
        token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
        # 需要 LongTensor
        input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
        # 梯度清零
        self.optimizer.zero_grad()
        # 迁移到GPU
        input_ids, token_type_ids = input_ids.to(self.device), token_type_ids.to(self.device)
        output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
        # y_pred_prob:各个类别的概率
        y_pred_prob = output[0]
        # 取概率最大的标签
        y_pred_label = y_pred_prob.argmax(dim=1)

        # 将torch.tensor转换回int形式
        return y_pred_prob, y_pred_label.item()

    def freezeSeed(self):
        seed = 1
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)  # Numpy module.
        random.seed(seed)  # Python random module.
        torch.manual_seed(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

In [8]:
classifier = transformers_bert_binary_classification()
classifier.train(3,zeng=0)

../data/sentiment/quan_9/train_12.01.txt ../data/sentiment/quan_9/test_12.01.txt 16 ../data/sentiment/quan_9/zeng.txt
____________________________________________________________________________________
____________________________________________________________________________________
epochs: 0
____________________________________________________________________________________
____________________________________________________________________________________
____train____
**train_an_epoch
zeng 0
**model_setup:
zeng 0
current loss: 3.2899227142333984 	 current acc: 0.0625
current loss: 1.9824948565876899 	 current acc: 0.39552238805970147
train loss:  1.3873074122998796 	 train acc: 0.5885331590983339
____evaluate____
              precision    recall  f1-score   support

           0       0.87      0.72      0.79       443
           1       0.90      0.92      0.91      1025
           2       0.99      0.93      0.96       292
           3       0.85      0.84      0.84       1

In [9]:
print("model saving...")
torch.save(classifier, model_save_path)
print("model saved")

model saving...
model saved


In [10]:
import torch
import jieba
from train99 import transformers_bert_binary_classification

device_s = "cuda:" + cuda_num
device = torch.device(device_s if torch.cuda.is_available() else "cpu")

classifier= torch.load(model_save_path,map_location=device)

ty = ['ICT', '新能源汽车', '生物医药', '医疗器械', '钢铁', '能源', '工业机器人', '先进轨道交通', '其他']
# print(classifier1.predict("『巴西』圣保罗城际铁路听证会延期至10月15日"))  # 0
# print(classifier1.predict("永恒力叉车入驻京东工业品 载重2吨的叉车设备也能线上采购"))  # 0

def read_list(text_path):
    lsit = []
    with open('%s' % text_path, 'r', encoding="utf8") as f:  # 打开一个文件只读模式
        line = f.readlines()  # 读取文件中的每一行，放入line列表中
        for line_list in line:
            lsit.append(line_list.replace('\n', ''))
    return lsit


def test():
    test_list = read_list('test.txt')
    for i in test_list:
        re = classifier.predict(i)  # 0
        result1 = re[1]
        result2 = re[0].tolist()

        if result2[0][result1] < 3.5:
            print(i, '\n', result1, '***** 其他 ***** 原预测:',ty[result1], result2[0][result1], '\n', result2[0], '\n')
        else:
            print(i, '\n', result1, ty[result1], result2[0][result1], '\n', result2[0], '\n')

test()

**model_setup:
zeng 0
e络盟社区发布新一期3D打印电子书 
 6 ***** 其他 ***** 原预测: 工业机器人 1.7983696460723877 
 [1.3854334354400635, 1.6558616161346436, -1.3575588464736938, 0.6565819382667542, -2.8178958892822266, -0.9104854464530945, 1.7983696460723877, -1.6860522031784058, 0.7252152562141418] 

**model_setup:
zeng 0
2021中国国际消费电子博览会和青岛国际软件融合创新博览会开幕 
 0 ICT 3.892798662185669 
 [3.892798662185669, 0.9847543239593506, -1.6201443672180176, 0.16574732959270477, -2.4594860076904297, -1.114639401435852, 0.2824157774448395, -1.5334899425506592, 0.5999731421470642] 

**model_setup:
zeng 0
4D打印软体机器人：打印出来即可工作 
 6 ***** 其他 ***** 原预测: 工业机器人 3.282424211502075 
 [1.4476772546768188, 0.25784653425216675, -1.1309179067611694, 1.3016993999481201, -2.0754218101501465, -1.3914767503738403, 3.282424211502075, -1.3260607719421387, 0.06085821986198425] 

**model_setup:
zeng 0
2021中国国际消费电子博览会和青岛国际软件融合创新博览会盛大开幕 
 0 ICT 3.981077194213867 
 [3.981077194213867, 0.9009576439857483, -1.5962817668914795, 0.1250392645597458, -2.5601296

人事变动 | 原长安总裁周治平任一汽集团党委常委、副总经理 
 1 新能源汽车 4.7309346199035645 
 [1.1381295919418335, 4.7309346199035645, -1.581420660018921, -0.7165270447731018, -1.4886105060577393, -0.503368616104126, -0.811232328414917, -1.1863019466400146, -0.9204074740409851] 

**model_setup:
zeng 0
哈弗大狗上市周年庆 年度宠粉趴半价车豪横大放送 
 1 新能源汽车 5.505380153656006 
 [0.7931630611419678, 5.505380153656006, -1.7287145853042603, -0.8372756838798523, -2.45800518989563, 0.3514062464237213, -0.6154301166534424, -1.3628475666046143, -0.2542288899421692] 

**model_setup:
zeng 0
奋进“十四五”布局新赛道 2021东风汽车品牌秋季发布会即将举办 
 1 新能源汽车 6.126406192779541 
 [1.0796639919281006, 6.126406192779541, -1.760742425918579, -0.9262259006500244, -2.0186493396759033, -0.3545188307762146, -0.8363872766494751, -1.6715068817138672, -0.9059564471244812] 

**model_setup:
zeng 0
2022北京车展将于2022年4月21日开幕 
 1 新能源汽车 4.433758735656738 
 [0.6743661165237427, 4.433758735656738, -1.5155761241912842, -0.9719414114952087, -1.5408525466918945, -0.3479729890823364, -0.875439047813415

汽车电商行业新标准确立  奇瑞iCar生态注入互联网基因 
 1 新能源汽车 4.688958168029785 
 [1.80251145362854, 4.688958168029785, -1.5107767581939697, -0.7911182045936584, -2.151087760925293, -0.6647343039512634, 0.32087796926498413, -1.4818507432937622, -1.118051290512085] 

**model_setup:
zeng 0
传出收购华晨“中华”品牌后！宝马将在沈阳追加投资250亿元 
 1 新能源汽车 5.020426273345947 
 [0.7418861389160156, 5.020426273345947, -1.9221000671386719, -0.7942976951599121, -2.592296600341797, 0.07500428706407547, -0.2933932840824127, -1.2608689069747925, 0.03509420156478882] 

**model_setup:
zeng 0
意法半导体推出具有内置智能的汽车高端驱动器 可节省高达40%的PCB面积 
 1 ***** 其他 ***** 原预测: 新能源汽车 3.496642827987671 
 [3.030519723892212, 3.496642827987671, -2.091475486755371, -1.18655264377594, -2.2697997093200684, -1.0044397115707397, -0.018834155052900314, -1.3575127124786377, 0.24112744629383087] 

**model_setup:
zeng 0
华为徐直军：部分汽车芯片获得了美国许可 
 0 ICT 4.538811683654785 
 [4.538811683654785, 2.8099329471588135, -1.8407950401306152, -0.629777729511261, -2.387277126312256, -0.792543351650238,