In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)、
from tqdm import tqdm#进度条显示
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch
import torch.nn as nn
# Fix the random seeds so runs are repeatable. torch.manual_seed covers the CPU
# generator (used when layers are initialised before being moved to the GPU);
# the CUDA call is kept so the GPU generators are seeded explicitly as well.
torch.manual_seed(2022)
torch.cuda.manual_seed_all(2022)

In [None]:
MODEL_NAME = 'allenai/longformer-base-4096'

# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('device=', device)

# Fetch the tokenizer, configuration and pretrained weights for MODEL_NAME.
# (For a plain BERT checkpoint the BertTokenizer / BertConfig / BertModel
# classes can be used with the same from_pretrained calls.)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME, config=config)

In [None]:
# Store the tokenizer, config and weights in ./model.
# exist_ok=True keeps this cell re-runnable: os.mkdir would raise
# FileExistsError the second time the cell is executed.
os.makedirs('model', exist_ok=True)
tokenizer.save_pretrained('model')
config.save_pretrained('model')
bert_model.save_pretrained('model')

In [None]:
# Sample essay excerpt; the cell output is its whitespace-separated word count.
s = (
    "Sometimes on the news there is either an accident or a suicide. "
    "It might involve someone not looking where they're going or tweet that someone sent. "
    "It either injury or death. "
    "If a mysterious number says I'm going to kill you and they know where you live "
    "but you don't know the person's contact,It makes you puzzled and make you start to freak out. "
    "Which can end up really badly. "
)
len(s.split())


In [None]:
# Quick look at what the tokenizer returns for a short sentence when padded /
# truncated to 30 tokens with character offsets requested.
text = 'I love you, my dear friends, all the time!'
tokens = tokenizer.encode_plus(
    text,
    max_length=30,
    padding='max_length',
    truncation=True,
    return_offsets_mapping=True,
)
tokens

In [None]:
def load_data_label(file_path, MAX_LEN, df, label_mapping):
    """Tokenize every essay under ``file_path`` and build per-token class labels.

    Uses the module-level ``tokenizer`` (its ``encode_plus`` method must support
    ``return_offsets_mapping=True``).

    Args:
        file_path: Directory containing one text file per essay. The part of
            each file name before the first '.' is used as the essay id.
        MAX_LEN: ``max_length`` passed to the tokenizer (padding='max_length',
            truncation=True).
        df: DataFrame with 'id', 'discourse_type' and 'predictionstring'
            columns; 'predictionstring' is a space-separated list of word
            indices covered by the annotation.
        label_mapping: Maps each discourse type name to its integer class id.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of three parallel lists
        with one entry per file. ``labels[k][i]`` is the class id of token
        ``i`` of essay ``k``; tokens whose id is 0, 1 or 2 and tokens whose
        word has no annotation receive label 7.
    """
    other_label = 7            # class used for everything outside an annotation
    special_ids = (0, 1, 2)    # token ids treated as non-text (never labelled)

    # Index the annotations by essay id once instead of filtering the whole
    # frame twice for every file.
    spans_by_id = {}
    for essay_id, d_type, pred in zip(df['id'].values,
                                      df['discourse_type'].values,
                                      df['predictionstring'].values):
        spans_by_id.setdefault(essay_id, []).append((d_type, pred))

    input_ids = []
    attention_masks = []
    labels = []
    for file_name in tqdm(os.listdir(file_path)):
        file_id = file_name.split('.')[0]
        # Context manager so the handle is closed even if reading fails.
        with open(os.path.join(file_path, file_name), 'r') as handle:
            text = handle.read()
        # Newlines become spaces, then runs of spaces shrink to a single one.
        text = _collapse_spaces(text.replace('\n', ' '))

        tokens = tokenizer.encode_plus(text, max_length=MAX_LEN, padding='max_length',
                                       truncation=True, return_offsets_mapping=True)
        token_ids = tokens['input_ids']
        input_ids.append(token_ids)
        attention_masks.append(tokens['attention_mask'])

        # Sub-word tokens are mapped back to the index of the word they belong
        # to, because the annotations are expressed in word indices.
        word_of_token = _token_word_positions(tokens['offset_mapping'])

        # Word index -> class id for every annotated word of this essay.
        word_labels = {}
        for d_type, pred in spans_by_id.get(file_id, ()):
            for word_idx in pred.split():
                word_labels[int(word_idx)] = label_mapping[d_type]

        labels.append([
            other_label
            if tok in special_ids or word_of_token[i] not in word_labels
            else word_labels[word_of_token[i]]
            for i, tok in enumerate(token_ids)
        ])
    return input_ids, attention_masks, labels


def _collapse_spaces(text):
    """Replace every run of consecutive spaces in ``text`` with one space."""
    # Safe for empty strings and for text that starts with a space.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text


def _token_word_positions(offset_mapping):
    """Return, for each token, the index of the word it was split from.

    Token 0 gets -1 and token 1 starts word 0. A later token stays in the
    current word when its start offset equals the previous token's end offset;
    otherwise it starts the next word.
    """
    positions = [-1]
    word = 0
    for i in range(1, len(offset_mapping)):
        if i > 1 and offset_mapping[i][0] != offset_mapping[i - 1][1]:
            word += 1
        positions.append(word)
    return positions

In [None]:
# Maximum number of tokens kept per essay (longer essays are truncated).
MAX_LEN = 1024

# Discourse type -> class id; class 7 is reserved by load_data_label for
# tokens that fall outside every annotation.
label_mapping = {
    'Concluding Statement': 0,
    'Claim': 1,
    'Evidence': 2,
    'Counterclaim': 3,
    'Rebuttal': 4,
    'Position': 5,
    'Lead': 6,
}

# Essay texts live in the train folder, their annotations in train.csv.
file_path = '../input/feedback-prize-2021/train'
df = pd.read_csv('../input/feedback-prize-2021/train.csv')

input_ids, attention_masks, labels = load_data_label(file_path, MAX_LEN, df, label_mapping)

In [None]:
# Split into training and validation sets (80% / 20%).
# random_state fixes the shuffle so the split is the same on every run.
# All three parallel lists go through ONE call, which applies the same indices
# to each of them; alignment no longer depends on a second call happening to
# use an identical random_state.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=2022, test_size=0.2)
#print("训练集的一个inputs",train_inputs[0])
#print("训练集的一个mask",train_masks[0])

In [None]:
# Convert the training and validation splits to tensors on the target device:
# token ids as int32, labels as long (as CrossEntropyLoss expects for class
# targets), attention masks as float32.
train_inputs, validation_inputs = (
    torch.tensor(split, dtype=torch.int32).to(device)
    for split in (train_inputs, validation_inputs)
)
train_labels, validation_labels = (
    torch.tensor(split, dtype=torch.long).to(device)
    for split in (train_labels, validation_labels)
)
train_masks, validation_masks = (
    torch.tensor(split, dtype=torch.float32).to(device)
    for split in (train_masks, validation_masks)
)


In [None]:
class Model(nn.Module):
    """Per-token classifier: a pretrained encoder followed by one linear layer.

    Args:
        bert_model: Encoder module. It is called as
            ``bert_model(input_ids, attention_mask=...)`` and the first element
            of its output is fed to the linear layer.
        num_label: Number of output classes per token.
    """

    def __init__(self, bert_model, num_label):
        super().__init__()
        self.bert_model = bert_model
        # Take the feature width from the encoder's config so the head matches
        # any backbone; fall back to 768 when no config is exposed.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.line1 = nn.Linear(hidden_size, num_label)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer applied to the encoder's first output.

        The result has ``num_label`` values along its last dimension.
        """
        # attention_mask is passed by keyword so it cannot be bound to a
        # different positional parameter of the encoder's forward().
        out = self.bert_model(input_ids, attention_mask=attention_mask)
        return self.line1(out[0])

In [None]:
# 8 output classes: the 7 discourse types plus the "other" class (id 7).
model = Model(bert_model, 8)
model.to(device)

# Two parameter groups: biases and LayerNorm weights are excluded from weight
# decay, everything else is decayed.
# The per-group key must be 'weight_decay' (torch.optim.AdamW does not read
# 'weight_decay_rate', so the optimizer default would be applied to every
# group), and LayerNorm weights are named 'LayerNorm.weight' in the model's
# parameter names rather than 'gamma' / 'beta'.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
opt = torch.optim.AdamW(optimizer_grouped_parameters,
                        lr=2e-5)
loss = nn.CrossEntropyLoss()  # multi-class cross-entropy over the 8 classes
batch_size = 4
epochs = 5

# Train for `epochs` passes over the training split and evaluate on the
# validation split after each pass.
for epoch in range(epochs):
    print('epoch=',epoch)
    model.train()
    with tqdm(range(0, len(train_inputs), batch_size), desc='Training...') as tbar:
        for index in tbar:
            train_L, train_R = index, index + batch_size
            batch_labels = train_labels[train_L: train_R]
            logits = model(train_inputs[train_L: train_R], train_masks[train_L: train_R])
            train_loss = loss(logits.reshape(-1, 8), batch_labels.reshape(-1))
            # Gradient step.
            opt.zero_grad()
            train_loss.backward()
            opt.step()
            # Token-level accuracy of this batch (computed on CPU with numpy).
            logits = logits.detach().cpu().numpy()
            train_accuracy = accuracy_score(batch_labels.detach().cpu().numpy().reshape(-1),
                                            np.argmax(logits.reshape(-1, 8), 1))
            # Show the running numbers on the progress bar. Iterating over the
            # bar already advances it, so no explicit tbar.update() is needed
            # (calling it would count every batch twice).
            tbar.set_postfix(train_loss=train_loss.item(),train_acc=train_accuracy)
    # One epoch finished: measure loss and accuracy on the validation split.
    print('开始验证.....')
    model.eval()  # switch dropout etc. to inference behaviour for evaluation
    with torch.no_grad():
        val_loss_all = 0
        val_accuracy_all = 0
        ans = 0  # number of validation batches seen
        for index in tqdm(range(0, len(validation_inputs), batch_size)):
            val_L, val_R = index, index + batch_size
            batch_labels = validation_labels[val_L: val_R]
            logits = model(validation_inputs[val_L: val_R], validation_masks[val_L: val_R])
            val_loss = loss(logits.reshape(-1, 8), batch_labels.reshape(-1))
            val_loss_all += val_loss.item()
            # Token-level accuracy of this batch.
            logits = logits.detach().cpu().numpy()
            val_accuracy = accuracy_score(batch_labels.detach().cpu().numpy().reshape(-1),
                                          np.argmax(logits.reshape(-1, 8), 1))
            val_accuracy_all += val_accuracy
            ans += 1
    # Report the per-batch averages.
    print('val_loss_all=',val_loss_all/ans,' val_accuracy_all=',val_accuracy_all/ans)
