In [4]:
from transformers import DataCollatorWithPadding
from datasets import load_dataset

# Load the hotel-review CSV; everything is read into a single "train" split.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

In [5]:
# Clean the data: keep only rows that actually have review text.
def _has_review(row):
    """Return True when the row's 'review' field is not None."""
    return row['review'] is not None

cleared_dataset = dataset.filter(function=_has_review)
cleared_dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [14]:
# Split into train / test (90% / 10%).
# A fixed seed makes the shuffle-and-split reproducible, so every
# Restart & Run All trains and evaluates on the same rows.
# NOTE(review): the variable name `datasets` is also the name of the Hugging Face
# package; avoid a later `import datasets` in this notebook or one will hide the other.
datasets = cleared_dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

In [17]:
from transformers import AutoTokenizer
# Tokenizer matching the Chinese BERT checkpoint used in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google-bert/bert-base-chinese",
)

def process_tokenizer(item, max_length=128):
    """Tokenize the 'review' field and carry the 'label' through.

    Only tokenization (with truncation) happens here. Tensor conversion and
    padding are deliberately NOT done at this stage; they are left to the
    DataCollator at batch time.

    Args:
        item: Mapping with 'review' and 'label' entries, as handed over by
            ``Dataset.map``.
        max_length: Truncation length forwarded to the tokenizer. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        The tokenizer's output with an extra 'label' entry copied from ``item``.
    """
    tokenized = tokenizer(item['review'], max_length=max_length, truncation=True)
    # The raw columns are removed by the caller's map(), so the label has to be
    # re-attached to the tokenized record here.
    tokenized['label'] = item['label']
    return tokenized

# Tokenize both splits. batched=True hands process_tokenizer a batch of rows
# per call instead of one row, which is much faster with a fast tokenizer.
# The raw columns are dropped; process_tokenizer re-adds 'label' itself.
tokenize_dataset = datasets.map(
    function=process_tokenizer,
    batched=True,
    remove_columns=datasets['train'].column_names,
)
tokenize_dataset

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 777
    })
})

In [18]:
# Collator that pads the tokenized examples of a batch and returns
# PyTorch ('pt') tensors.
collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors='pt',
)
collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [None]:
from torch.utils.data import DataLoader
trainset, validset = tokenize_dataset['train'], tokenize_dataset['test']

# Training batches are reshuffled every epoch.
trainloader = DataLoader(trainset, batch_size=32, collate_fn=collator, shuffle=True)
# Validation only aggregates metrics over the whole split, so shuffling adds
# nothing; a fixed order keeps evaluation runs comparable.
validloader = DataLoader(validset, batch_size=64, collate_fn=collator, shuffle=False)

# Peek at one collated training batch.
next(iter(trainloader))

In [None]:
from torch.optim import Adam
from transformers import AutoModelForSequenceClassification
import torch

# Pretrained Chinese BERT wrapped with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese")
# Use the GPU when one is available; otherwise stay on the CPU.
model = model.cuda() if torch.cuda.is_available() else model
optimizer = Adam(params=model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import evaluate

# Combined accuracy + F1 metric, built from local metric scripts.
# NOTE(review): a variant loading the metrics by name, evaluate.combine(['accuracy','f1']),
# was previously commented out here; presumably the local scripts are used to avoid
# a download. Confirm before switching back.
metric_scripts = [
    'evaluate-main/metrics/accuracy/accuracy.py',
    'evaluate-main/metrics/f1/f1.py',
]
clf_metrics = evaluate.combine(metric_scripts)


In [None]:
def evaluate() -> dict:
    """Score the current ``model`` on ``validloader``.

    Switches the model to evaluation mode, runs every validation batch with
    autograd disabled, feeds the argmax predictions and the batch labels to
    ``clf_metrics`` and returns the result of ``clf_metrics.compute()``.

    NOTE(review): defining this function rebinds the name ``evaluate`` and hides
    the ``evaluate`` module imported earlier in the notebook. The name is kept
    because ``train`` calls it; re-run the ``import evaluate`` cell before using
    the module again.

    Returns:
        Whatever ``clf_metrics.compute()`` produces (expected to be a mapping of
        metric name to value, not a single float).
    """
    model.eval()  # put the model in evaluation mode
    use_cuda = torch.cuda.is_available()  # loop-invariant, so checked once
    # No gradients are needed for validation; disabling autograd avoids building
    # a graph for every batch and lowers memory use.
    with torch.no_grad():
        for batch in validloader:  # one validation batch at a time
            if use_cuda:
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            # Predicted class is the index of the largest logit on the last axis.
            pred = torch.argmax(output.logits, dim=-1)
            clf_metrics.add_batch(predictions=pred.long(), references=batch['labels'].long())
    return clf_metrics.compute()
    
def train(epoch=3, log_step=100):
    """Fine-tune ``model`` on ``trainloader`` and evaluate after every epoch.

    Args:
        epoch: Number of passes over the training loader.
        log_step: The loss is printed whenever the running batch counter is a
            multiple of this value (the counter is not reset between epochs).
    """
    on_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            # Each batch is a dict of tensors (input_ids, attention_mask,
            # labels, token_type_ids); move it to the GPU when one is used.
            if on_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}

            optimizer.zero_grad()        # clear gradients from the previous step
            loss = model(**batch).loss   # forward pass; the model returns the loss
            loss.backward()              # backward pass
            optimizer.step()             # update the model parameters

            # Report the current loss every `log_step` batches.
            if global_step % log_step == 0:
                print(f"第{ep+1}趟,執行第{global_step}批次,loss:{loss.item()}")
            global_step += 1

        # After each epoch, measure the metrics on the validation set.
        clf = evaluate()
        print(f"第{ep+1}趟,{clf}")
        
# Start fine-tuning with the default schedule: 3 epochs, loss logged every 100 batches.
train(epoch=3, log_step=100)