In [None]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
import numpy as np

# Maps each discourse-type label string to the integer class id used as the
# training target (looked up per sample when a dataset item is built).
digit = {
    'Concluding Statement': 0,
    'Claim': 1,
    'Evidence': 2,
    'Counterclaim': 3,
    'Rebuttal': 4,
    'Position': 5,
    'Lead': 6,
}
# Checkpoint name used for both the tokenizer and the classification model.
PRETRAINED_MODEL_NAME = "distilbert-base-uncased"

# Number of output classes. Derived from the label map instead of being
# hard-coded so the two cannot drift apart when a label is added or removed.
NUM_LABELS = len(digit)

# Report whether CUDA is usable, then select the first GPU if so, else the CPU.
print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [None]:

def train_test_val_split(x, y, train_ratio=0.8, validation_ratio=0.1, test_ratio=0.1, random_state=10):
    """Split paired samples and labels into train, test and validation subsets.

    Two chained ``train_test_split`` calls are used: the first sets aside
    ``validation_ratio + test_ratio`` of the data as a hold-out, the second
    divides that hold-out between validation and test.

    Args:
        x: Samples to split.
        y: Labels aligned with ``x``.
        train_ratio: Intended training share. The training subset is simply
            whatever the hold-out does not take, so this value does not affect
            the split; it is only checked for consistency with the other two
            ratios and a warning is emitted when the three do not sum to 1.
        validation_ratio: Share of the data assigned to validation.
        test_ratio: Share of the data assigned to test.
        random_state: Seed passed to both splits so the result is reproducible.

    Returns:
        ``(x_train, y_train, x_test, y_test, x_val, y_val)`` -- note that the
        test pair comes before the validation pair.
    """
    import warnings

    holdout_ratio = validation_ratio + test_ratio

    # Previously an inconsistent train_ratio was ignored without any signal.
    # The split is left unchanged for backward compatibility, but the caller
    # is now told that the effective training share differs from the request.
    if abs(train_ratio + holdout_ratio - 1.0) > 1e-8:
        warnings.warn(
            "train_ratio + validation_ratio + test_ratio = %r, not 1; "
            "train_ratio is ignored and the training share will be %r"
            % (train_ratio + holdout_ratio, 1.0 - holdout_ratio),
            stacklevel=2,
        )

    # Stage 1: carve the combined validation+test hold-out from the full data.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=holdout_ratio, random_state=random_state, shuffle=True)

    # Stage 2: divide the hold-out; test_size is relative to the hold-out size.
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=test_ratio / holdout_ratio, random_state=random_state)

    return x_train, y_train, x_test, y_test, x_val, y_val


# Load the competition CSV and pull out the text and label columns.
df = pd.read_csv('../input/feedback-prize-2021/train.csv')
texts, labels = df['discourse_text'], df['discourse_type']

# Partition into training, validation and test subsets.
print("===划分训练集为训练集和验证集===")
(
    train_texts, train_labels,
    test_texts, test_labels,
    val_texts, val_labels,
) = train_test_val_split(texts, labels)
print("读取训练集文本总数 = ", len(train_texts))
print("读取测试集文本总数 = ", len(test_texts))
print("标记类别种类 = ", set(train_labels))


# Instantiate the tokenizer and encode every subset (train, then validation,
# then test), truncating long texts and padding within each subset.
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_MODEL_NAME)
train_encodings, val_encodings, test_encodings = (
    tokenizer(list(split_texts), truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)
# Token count of the first encoded training sample.
print("默认固定文本中包含词语数为%d个" % len(train_encodings[0].tokens))

In [None]:
# Dataset class
class FeedbackDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with discourse-type labels.

    Each item is a dict holding one tensor per encoding field plus a
    ``'labels'`` tensor containing the integer class id that the module-level
    ``digit`` mapping assigns to the sample's label string.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and a positional copy of the labels.

        Args:
            encodings: Mapping-like object exposing ``.items()`` whose values
                can be indexed by sample position.
            labels: Iterable of label strings, one per sample, in the same
                order as the encodings.
        """
        self.encodings = encodings
        # Materialise as a plain list so that ``self.labels[idx]`` is always a
        # positional lookup. A pandas Series coming out of a shuffled split
        # keeps its original index, and integer indexing on it is label-based,
        # which would raise KeyError or silently pair a sample with another
        # row's label.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return the tensors for the sample at position ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Translate the label string into its integer class id.
        item['labels'] = torch.tensor(digit[self.labels[idx]])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Build one FeedbackDataset per subset, pairing each subset's encodings with
# its labels converted to a plain list.
train_dataset, val_dataset, test_dataset = (
    FeedbackDataset(subset_encodings, list(subset_labels))
    for subset_encodings, subset_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)
print(test_dataset)

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Mini-batch size.
# NOTE(review): the TrainingArguments in this notebook set their own literal
# batch sizes and do not read this constant; it is only referenced by the
# commented-out manual training loop -- confirm whether it is still needed.
BATCH_SIZE = 7


In [None]:
## Fine-tuning
# Hyper-parameters and output locations handed to the Hugging Face Trainer.
# NOTE(review): 200 epochs is unusually high for fine-tuning a pretrained
# checkpoint -- confirm this value is intended.
training_args = TrainingArguments(
    # Where results and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule: total epochs and evaluation cadence.
    num_train_epochs=200,
    evaluation_strategy="epoch",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Optimisation: learning-rate warmup steps and weight-decay strength.
    warmup_steps=500,
    weight_decay=0.01,
)

# Load the pretrained checkpoint with a NUM_LABELS-way classification head
# and place it on the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=NUM_LABELS,
).to(device)

# Train on the training subset and evaluate on the validation subset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()




In [None]:
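# NOTE: legacy manual training loop, kept commented out for reference. It uses
# names (trainloader, optimizer, get_predictions) that are not defined in the
# cells shown in this notebook.
# import time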
# start = time.time()

# train_acc = []

# EPOCHS = 20
# for epoch in range(EPOCHS):
    
#     running_loss = 0.0
#     for data in trainloader:
        
#         tokens_tensors, segments_tensors, \
#         masks_tensors, labels = [t.to(device) for t in data]

#         optimizer.zero_grad()
#         # forward pass
#         outputs = model(input_ids=tokens_tensors, 
#                         token_type_ids=segments_tensors, 

#                         labels=labels)
#         loss = outputs[0]
#         # backward
#         loss.backward()
#         optimizer.step()
        
#         # accumulate the current batch loss
#         running_loss += loss.item()
        
#     # compute classification accuracy
#     _, acc = get_predictions(model, trainloader, compute_acc=True)
#     train_acc.append(acc)

#     print(f"batch size:{BATCH_SIZE}")
#     print(f'[epoch {epoch+1}] loss: {running_loss:3f}, acc: {acc:3f}')

# end = time.time()
# print(f"time:{end-start:.2f}")