## Text Classification 
- 使用模型:    
google-bert/bert-base-chinese

- 使用資料集:  
roberthsu2003/data_for_classification

### 載入套件

In [4]:
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer,DataCollatorWithPadding,AutoModelForSequenceClassification,Trainer,TrainingArguments

### 載入資料集
- roberthsu2003/data_for_classification

In [6]:
# Load the dataset from the Hugging Face Hub, caching the files under ./my_dataset.
datasets = load_dataset(
    "roberthsu2003/data_for_classification",
    cache_dir='./my_dataset',
)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

In [None]:
# Tokenization: load the tokenizer belonging to the pretrained checkpoint.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

def tokenize_function(item):
    """Tokenize a batch of reviews and keep their labels.

    Only tokenization (truncated to at most 512 tokens) happens here. Padding
    and tensor conversion are intentionally left to the data collator.

    Args:
        item: A batch mapping with a 'review' entry (the texts) and a
            'label' entry (the targets).

    Returns:
        The tokenizer output for the reviews with a 'label' entry added.
        For classification the fields of interest are 'input_ids',
        'token_type_ids', 'attention_mask' and 'label'.
    """
    reviews, labels = item['review'], item['label']
    encoded = tokenizer(reviews, truncation=True, max_length=512)
    encoded['label'] = labels
    return encoded

# Tokenize every split in batches and drop the original columns; the label
# column is re-attached inside tokenize_function.
tokenized_datasets = datasets.map(
    function=tokenize_function,
    batched=True,
    remove_columns=datasets['train'].column_names,
)
tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 777
    })
})

In [None]:
# The tokenized sequences have different lengths because no padding has been
# applied yet. Index the row first and the column second: this reads only the
# two examples needed instead of fetching the whole 'input_ids' column twice.
len(tokenized_datasets['train'][0]['input_ids']), len(tokenized_datasets['train'][1]['input_ids'])

(135, 119)

### 建立DataCollator

In [11]:
# DataCollatorWithPadding pads the examples of a batch to a common length
# and is asked to return PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

### 載入預訓練模型

In [12]:

# Load the pretrained checkpoint as a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-chinese")
# The Trainer moves the model to the GPU automatically, so no manual .cuda() call is needed.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 建立評估函數

In [13]:
# Load the evaluation metrics (accuracy and F1).
import evaluate
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')
# Alternative: load the metric scripts from a local copy of the evaluate repository.
#acc_metric = evaluate.load('evaluate-main/metrics/accuracy/accuracy.py')
#f1_metric = evaluate.load('evaluate-main/metrics/f1/f1.py')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [15]:
# Metric callback handed to the Trainer.
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``. ``predictions`` must
            support ``argmax(axis=-1)``, i.e. hold per-class scores along
            its last axis.

    Returns:
        A dict combining the accuracy metric result with the F1 metric result.
    """
    scores, references = eval_predict
    predicted_ids = scores.argmax(axis=-1)  # highest-scoring class per example
    metric_inputs = {"predictions": predicted_ids, "references": references}
    return {
        **acc_metric.compute(**metric_inputs),
        **f1_metric.compute(**metric_inputs),
    }

### TrainingArguments

In [None]:
# Build the TrainingArguments. Every field has a library default, so only the
# values worth calling out are set explicitly here.
train_args = TrainingArguments(
    # --- output / logging ---
    output_dir='./checkpoints',
    report_to="none",  # disable external loggers such as W&B (needed when training on Colab)
    logging_steps=100,  # log every 100 steps (default: 500)
    # --- batching ---
    per_device_train_batch_size=64,  # training batch size per device
    per_device_eval_batch_size=128,  # evaluation batch size per device
    # --- optimization ---
    num_train_epochs=3,  # number of training epochs (default: 3)
    learning_rate=2e-5,  # default: 5e-5
    weight_decay=0.01,  # default: 0.0, i.e. no weight decay
    # --- evaluation / checkpointing ---
    eval_strategy="epoch",  # evaluate once per epoch (default: "no")
    save_strategy="epoch",  # save once per epoch (default: "steps")
    load_best_model_at_end=True,  # reload the best checkpoint when training ends (default: False)
    metric_for_best_model="f1",  # pick the best checkpoint by F1 instead of the loss
    greater_is_better=True,  # a higher F1 means a better model
)


In [None]:
### 建立Trainer

In [17]:
# Wire the model, training configuration, data, collator and metric callback
# together into a Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    compute_metrics=eval_metric,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)


### Train the model
trainer.train()

In [None]:
# Evaluate the model
trainer.evaluate()

### 上傳模型,評估資料和tokenizer

In [None]:
# Log in to the Hugging Face Hub before uploading anything.
from huggingface_hub import login
login()

In [None]:
# Target Hub repository for the fine-tuned model, its model card and the tokenizer.
model_name = "roberthsu2003/models_for_classfication"

# Trainer.push_to_hub() takes a *commit message* as its first positional
# argument, not a repository id. The destination repo comes from
# args.hub_model_id and otherwise falls back to the output_dir name
# ("checkpoints"), which would leave the model and the tokenizer in two
# different repos. Point the Trainer at model_name before pushing so the
# model and the generated model card (with the evaluation results) land there.
trainer.args.hub_model_id = model_name
trainer.push_to_hub(commit_message="End of training")

# Upload the tokenizer to the same repository.
tokenizer.push_to_hub(model_name)