# Fine-tuning BERT model for Binary Text Classification

In [1]:
# https://github.com/PacktPublishing/Mastering-Transformers/blob/main/CH05/CH05a_BERT_fine-tuning.ipynb

In [None]:
!pip install transformers
!pip install datasets

In [2]:
# Distill的意思是蒸餾
# https://medium.com/nlp-tsupei/distilbert-%E6%9B%B4%E5%B0%8F%E6%9B%B4%E5%BF%AB%E7%9A%84bert%E6%A8%A1%E5%9E%8B-eec345d17230
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
model_path = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path,id2label = {0:"NEG",1:"POS"},label2id={"NEG":0,"POS":1})

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# 讀取資料 (imdb)
from datasets import load_dataset # https://huggingface.co/datasets?sort=trending&search=imdb
imdb_train = load_dataset('imdb',split="train[:2000]+train[-2000:]")
imdb_test = load_dataset('imdb',split="test[:500]+test[-500:]")
imdb_val = load_dataset('imdb',split="test[500:1000]+test[-1000:-500]")

In [6]:
imdb_train.shape, imdb_test.shape, imdb_val.shape # ((4000, 2), (1000, 2), (1000, 2)) # val = validation

((4000, 2), (1000, 2), (1000, 2))

In [7]:
enc_train = imdb_train.map(lambda e:tokenizer(e['text'],padding=True,truncation=True),batched=True,batch_size=1000)
enc_test = imdb_test.map(lambda e:tokenizer(e['text'],padding=True,truncation=True),batched=True,batch_size=1000)
enc_val = imdb_val.map(lambda e:tokenizer(e['text'],padding=True,truncation=True),batched=True,batch_size=1000) # val = validation

Map: 100%|██████████| 1000/1000 [00:00<00:00, 4650.12 examples/s]


In [None]:
"""
資料長成這樣

{"text":"I love sci-fi and am willing to put up with a lot. Sci-fi movies\/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clich\u00e9d and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.","label":0}
"""

In [9]:
import pandas as pd
pd.DataFrame(enc_train).iloc[0]

text              I rented I AM CURIOUS-YELLOW from my video sto...
label                                                             0
input_ids         [101, 1045, 12524, 1045, 2572, 8025, 1011, 375...
attention_mask    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
Name: 0, dtype: object

In [9]:
import torch
from torch import cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [10]:
# https://blog.csdn.net/duzm200542901104/article/details/132762582

from transformers import (AutoTokenizer, AutoConfig,
                              AutoModelForSequenceClassification, TrainingArguments, Trainer)

training_args = TrainingArguments(
    output_dir="./MyIMDBModel2",
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64, # 因為這裡設置為64，所以trainer.evaluate()算出來是
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="steps", # epoch,steps
    save_strategy="steps", # epoch,steps
    fp16=cuda.is_available(),
    load_best_model_at_end=True,
)

In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics (pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {
        "Accuracy":acc,
        "F1":f1,
        "Precision":precision,
        "Recall":recall,
    }

In [12]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = enc_train,
    eval_dataset = enc_val, # enc_val
    compute_metrics = compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [13]:
result = trainer.train() # cpu -> [01:32<55:55,  9.19s/it] 

 13%|█▎        | 50/375 [00:10<01:04,  5.02it/s]

{'loss': 0.6151, 'grad_norm': 2.2415316104888916, 'learning_rate': 2.5e-05, 'epoch': 0.4}


                                                
 14%|█▎        | 51/375 [00:12<04:35,  1.18it/s]

{'eval_loss': 0.3916383683681488, 'eval_Accuracy': 0.845, 'eval_F1': 0.8440755237804946, 'eval_Precision': 0.8533807785439482, 'eval_Recall': 0.845, 'eval_runtime': 2.1576, 'eval_samples_per_second': 463.469, 'eval_steps_per_second': 7.416, 'epoch': 0.4}


 27%|██▋       | 100/375 [00:22<00:55,  4.97it/s]

{'loss': 0.3462, 'grad_norm': 7.698233604431152, 'learning_rate': 4.9e-05, 'epoch': 0.8}


                                                 
 27%|██▋       | 101/375 [00:24<03:52,  1.18it/s]

{'eval_loss': 0.4150848686695099, 'eval_Accuracy': 0.821, 'eval_F1': 0.8170465590954945, 'eval_Precision': 0.8513711135727765, 'eval_Recall': 0.821, 'eval_runtime': 2.146, 'eval_samples_per_second': 465.986, 'eval_steps_per_second': 7.456, 'epoch': 0.8}


 40%|████      | 150/375 [00:34<00:46,  4.88it/s]

{'loss': 0.271, 'grad_norm': 5.171902656555176, 'learning_rate': 4.127272727272727e-05, 'epoch': 1.2}


                                                 
 40%|████      | 150/375 [00:36<00:46,  4.88it/s]

{'eval_loss': 0.3563908040523529, 'eval_Accuracy': 0.867, 'eval_F1': 0.8662067397600373, 'eval_Precision': 0.8759152050018233, 'eval_Recall': 0.867, 'eval_runtime': 2.1522, 'eval_samples_per_second': 464.643, 'eval_steps_per_second': 7.434, 'epoch': 1.2}


 53%|█████▎    | 200/375 [00:47<00:35,  4.93it/s]

{'loss': 0.2178, 'grad_norm': 8.012292861938477, 'learning_rate': 3.2181818181818184e-05, 'epoch': 1.6}


                                                 
 54%|█████▎    | 201/375 [00:49<02:28,  1.17it/s]

{'eval_loss': 0.26697880029678345, 'eval_Accuracy': 0.909, 'eval_F1': 0.9089430894308943, 'eval_Precision': 0.9100250626566415, 'eval_Recall': 0.909, 'eval_runtime': 2.1527, 'eval_samples_per_second': 464.525, 'eval_steps_per_second': 7.432, 'epoch': 1.6}


 67%|██████▋   | 250/375 [00:58<00:23,  5.21it/s]

{'loss': 0.2395, 'grad_norm': 5.997943878173828, 'learning_rate': 2.309090909090909e-05, 'epoch': 2.0}


                                                 
 67%|██████▋   | 251/375 [01:01<01:39,  1.25it/s]

{'eval_loss': 0.26837486028671265, 'eval_Accuracy': 0.899, 'eval_F1': 0.8989708025619404, 'eval_Precision': 0.8994617778151544, 'eval_Recall': 0.899, 'eval_runtime': 2.0205, 'eval_samples_per_second': 494.916, 'eval_steps_per_second': 7.919, 'epoch': 2.0}


 80%|████████  | 300/375 [01:10<00:15,  4.94it/s]

{'loss': 0.0917, 'grad_norm': 3.7232213020324707, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.4}


                                                 
 80%|████████  | 301/375 [01:12<01:02,  1.18it/s]

{'eval_loss': 0.4076792001724243, 'eval_Accuracy': 0.882, 'eval_F1': 0.8816017081462039, 'eval_Precision': 0.8872103018213076, 'eval_Recall': 0.8819999999999999, 'eval_runtime': 2.1502, 'eval_samples_per_second': 465.067, 'eval_steps_per_second': 7.441, 'epoch': 2.4}


 93%|█████████▎| 350/375 [01:22<00:05,  4.89it/s]

{'loss': 0.1096, 'grad_norm': 3.2811977863311768, 'learning_rate': 4.909090909090909e-06, 'epoch': 2.8}


                                                 
 94%|█████████▎| 351/375 [01:25<00:20,  1.18it/s]

{'eval_loss': 0.3522510230541229, 'eval_Accuracy': 0.896, 'eval_F1': 0.8959733691825107, 'eval_Precision': 0.8964059196617336, 'eval_Recall': 0.896, 'eval_runtime': 2.143, 'eval_samples_per_second': 466.625, 'eval_steps_per_second': 7.466, 'epoch': 2.8}


100%|██████████| 375/375 [01:30<00:00,  4.13it/s]

{'train_runtime': 90.852, 'train_samples_per_second': 132.083, 'train_steps_per_second': 4.128, 'train_loss': 0.2571618960698446, 'epoch': 3.0}





Trainer model keeps the best model at the end. Lets evaluate the model on the train/test/validation

In [None]:
!pip install tensorboard

In [11]:
%reload_ext tensorboard
%tensorboard --logdir logs

ModuleNotFoundError: No module named 'tensorboard'

* 因為enc_train 有 4000 筆資料，批次數量將會是：
             $$\frac{4000}{64} \simeq  63$$

In [14]:
import pandas as pd
q = []
for data in [enc_train, enc_val, enc_test]: # val = validation
  trainer.evaluate_result = trainer.evaluate(eval_dataset=data)
  q.append(trainer.evaluate_result)

pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]

100%|██████████| 63/63 [00:08<00:00,  7.54it/s]
100%|██████████| 16/16 [00:01<00:00,  8.08it/s]
100%|██████████| 16/16 [00:02<00:00,  7.46it/s]


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,0.045514,0.9905,0.9905,0.990504,0.9905
val,0.353074,0.898,0.897993,0.898102,0.898
test,0.275785,0.907,0.906984,0.907275,0.907


In [15]:
model_save_path = "MyBestIMDBModel2"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('MyBestIMDBModel2\\tokenizer_config.json',
 'MyBestIMDBModel2\\special_tokens_map.json',
 'MyBestIMDBModel2\\vocab.txt',
 'MyBestIMDBModel2\\added_tokens.json',
 'MyBestIMDBModel2\\tokenizer.json')

In [16]:
def get_prediction(text):
    inputs = tokenizer(text, padding = True, truncation = True, max_length = 250, return_tensors = "pt").to(device)
    outputs = model(inputs["input_ids"].to(device),inputs["attention_mask"].to(device))
    probs = outputs[0].softmax(1)
    return probs, probs.argmax()

In [15]:
model.to(device)
text = "I didn't like the movie since it bored me"
get_prediction(text)[1].item()

0

Use the model with pipeline



In [17]:
from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast
model = DistilBertForSequenceClassification.from_pretrained("./MyBestIMDBModel2/")
tokenizer = DistilBertTokenizerFast.from_pretrained("./MyBestIMDBModel2/")
nlp = pipeline("sentiment-analysis",model=model,tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [18]:
nlp("the movie was very impressive")

[{'label': 'POS', 'score': 0.988654613494873}]