In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)、
from tqdm import tqdm#进度条显示
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel, BertConfig
import torch
import torch.nn as nn

In [None]:
# Name of the pretrained checkpoint used for the tokenizer, config and encoder.
MODEL_NAME = 'bert-base-uncased'

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('device=',device)

# do_lower_case=True lower-cases the text before tokenization, which is what
# an "uncased" checkpoint expects.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
# Fetch the model configuration, then the pretrained weights built from it.
config = BertConfig.from_pretrained(MODEL_NAME)
bert_model = BertModel.from_pretrained(MODEL_NAME, config=config)

In [None]:
# Save tokenizer, config and weights locally so they can be reloaded offline.
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was already there (e.g. on a second execution).
os.makedirs('model', exist_ok=True)
tokenizer.save_pretrained('model')
config.save_pretrained('model')
bert_model.save_pretrained('model')

In [None]:
# Load the training table; each row holds one discourse segment and its type.
df = pd.read_csv('../input/feedback-prize-2021/train.csv')
sentences = df['discourse_text'].values

# Mapping from discourse-type name to the integer class id used for training.
digit_dic = {
    'Concluding Statement': 0,
    'Claim': 1,
    'Evidence': 2,
    'Counterclaim': 3,
    'Rebuttal': 4,
    'Position': 5,
    'Lead': 6,
}
# Encode every label as its class id (an unknown type raises KeyError).
labels = [digit_dic[type_name] for type_name in df['discourse_type'].values]

# Peek at the first five encoded labels.
labels[0:5]

In [None]:
# Surround every text with the [CLS] start marker and the [SEP] end marker
# before tokenization.
sentences = [''.join(('[CLS] ', sent, ' [SEP]')) for sent in tqdm(sentences)]
# Show the first five wrapped texts.
sentences[0:5]

In [None]:
# Split every wrapped text into tokens (progress shown by tqdm).
tokenized_sents = list(map(tokenizer.tokenize, tqdm(sentences)))

In [None]:
# Total number of tokens over all texts; divided by the text count when printed
# to report the average tokenized length.
aver_sentence_len = sum(len(tokens) for tokens in tqdm(tokenized_sents))
print('句子平均长度=',aver_sentence_len/len(tokenized_sents))

In [None]:
# Map each token sequence to its vocabulary ids (token -> index).
input_ids = []
for tokens in tqdm(tokenized_sents):
    input_ids.append(tokenizer.convert_tokens_to_ids(tokens))

In [None]:
# Fixed sequence length fed to the model (standard BERT accepts at most 512).
MAX_LEN=100
# Id used for padding. It must stay 0 because the attention-mask step marks
# real tokens with `i > 0`.
PAD_ID = 0


def _pad_or_truncate(ids, max_len=MAX_LEN, pad_id=PAD_ID):
    """Return `ids` as a list of exactly `max_len` token ids.

    Short sequences are right-padded with `pad_id`. Long sequences are cut to
    `max_len - 1` ids and their final id is re-appended, so the closing [SEP]
    added earlier is kept instead of being sliced away by a plain
    `[:max_len]` truncation.
    """
    if len(ids) > max_len:
        return ids[:max_len - 1] + ids[-1:]
    return ids + [pad_id] * (max_len - len(ids))


# Pad / truncate every sequence to MAX_LEN.
input_ids = [_pad_or_truncate(sent) for sent in tqdm(input_ids)]

In [None]:
# Build the attention masks: 1.0 for positions with a positive token id,
# 0.0 for padding positions (id 0).
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in tqdm(input_ids)
]
print("第一个attention mask:",attention_masks[0])

In [None]:
# Split ids, labels and masks into training and validation parts (80% / 20%).
# A fixed random_state makes the split reproducible; splitting all three lists
# in one call applies the same row selection to each of them.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, attention_masks, random_state=2022, test_size=0.2)
print("训练集的一个inputs",train_inputs[0])
print("训练集的一个mask",train_masks[0])

In [None]:
# Convert the training / validation splits to tensors on the selected device.
def _as_device_tensor(values):
    """Build a tensor from `values` and move it to `device`."""
    return torch.tensor(values).to(device)


train_inputs, validation_inputs = (
    _as_device_tensor(train_inputs), _as_device_tensor(validation_inputs))
train_labels, validation_labels = (
    _as_device_tensor(train_labels), _as_device_tensor(validation_labels))
train_masks, validation_masks = (
    _as_device_tensor(train_masks), _as_device_tensor(validation_masks))



In [None]:
class Model(nn.Module):
    """Encoder followed by a single linear classification layer.

    The wrapped encoder is called with the token ids and the attention mask;
    element 1 of its output (the pooled representation for HuggingFace BERT
    models) is projected to `num_label` logits.
    """

    def __init__(self, bert_model, num_label):
        """
        Args:
            bert_model: encoder module; its output must be indexable, with the
                pooled features at position 1.
            num_label: number of target classes (size of the logit vector).
        """
        super().__init__()
        self.bert_model = bert_model
        # Take the feature width from the encoder config when it is exposed,
        # so encoders of other sizes work too; 768 (the previously hard-coded
        # value) remains the fallback.
        encoder_config = getattr(bert_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.line1 = nn.Linear(hidden_size, num_label)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits, one row per input sequence."""
        # Pass the mask by keyword so it cannot be bound to a different
        # positional parameter of the encoder's forward().
        out = self.bert_model(input_ids, attention_mask=attention_mask)
        out = self.line1(out[1])
        return out

In [None]:
# Classifier over the 7 discourse types listed in digit_dic.
model = Model(bert_model, 7)
model.to(device)

# Split the parameters into two groups: regular weights get weight decay,
# biases and LayerNorm weights do not.
param_optimizer = list(model.named_parameters())
# Name fragments of parameters excluded from weight decay. The previous
# 'gamma'/'beta' entries never matched any parameter name (LayerNorm
# parameters are called `LayerNorm.weight` / `LayerNorm.bias`), so LayerNorm
# weights were being decayed.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # The per-group option read by torch.optim.AdamW is 'weight_decay'.
    # The former 'weight_decay_rate' key was ignored, which made both groups
    # fall back to the optimizer's default decay.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
opt = torch.optim.AdamW(optimizer_grouped_parameters,
                        lr=2e-5)
loss = nn.CrossEntropyLoss()
# Mini-batch size and number of passes over the training set.
batch_size = 64
epochs = 5

In [None]:
# Fine-tune for `epochs` passes; after each pass evaluate on the validation set.
for epoch in range(epochs):
    # ---------------- training ----------------
    model.train()  # training mode (dropout active)
    for index in range(0, len(train_inputs), batch_size):
        train_L, train_R = index, index + batch_size
        batch_labels = train_labels[train_L: train_R]
        logits = model(train_inputs[train_L: train_R], train_masks[train_L: train_R])
        train_loss = loss(logits, batch_labels)
        opt.zero_grad()
        train_loss.backward()
        opt.step()
        # Batch accuracy: move logits/labels to the CPU as numpy arrays first.
        logits = logits.detach().cpu().numpy()
        train_accuracy = accuracy_score(batch_labels.detach().cpu().numpy(), np.argmax(logits, 1))
        print('epoch=',epoch,str(index)+'/'+str(len(train_inputs)),' train_loss=', train_loss.item(),' train_accuracy=',train_accuracy)

    # ---------------- validation ----------------
    # Evaluation mode disables dropout so the metrics are deterministic, and
    # no_grad() avoids building the autograd graph (less memory, faster).
    model.eval()
    val_loss_all = 0
    val_accuracy_all = 0
    ans = 0          # number of validation batches processed
    val_samples = 0  # number of validation samples processed
    with torch.no_grad():
        for index in range(0, len(validation_inputs), batch_size):
            val_L, val_R = index, index + batch_size
            batch_labels = validation_labels[val_L: val_R]
            logits = model(validation_inputs[val_L: val_R], validation_masks[val_L: val_R])
            val_loss = loss(logits, batch_labels)
            batch_n = len(batch_labels)
            # Weight each batch by its size so a smaller final batch does not
            # skew the epoch-level averages.
            val_loss_all += val_loss.item() * batch_n
            logits = logits.cpu().numpy()
            val_accuracy = accuracy_score(batch_labels.cpu().numpy(), np.argmax(logits, 1))
            val_accuracy_all += val_accuracy * batch_n
            val_samples += batch_n
            ans += 1
    if val_samples:
        print('val_loss_all=',val_loss_all/val_samples,' val_accuracy_all=',val_accuracy_all/val_samples)
    else:
        # Nothing to average; avoid dividing by zero.
        print('val_loss_all=',None,' val_accuracy_all=',None)
