**Fine-tuning example using huggingface's Trainer class:**
https://huggingface.co/docs/transformers/en/training

In [1]:
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:

In [43]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
print(len(dataset['train']))
print(len(dataset['test']))
dataset["train"][100]

650000
50000


{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [45]:
dataset['train'] = dataset['train'].shuffle(seed=42).select(range(1024))
dataset['test'] = dataset['test'].shuffle(seed=42).select(range(128))

# dataset['train'] = dataset['train'].shuffle(seed=42).select(range(5))
# dataset['test'] = dataset['test'].shuffle(seed=42).select(range(5))

In [46]:
# Check class distribution

import pandas as pd

df = pd.DataFrame(dataset['train'])
print(df['label'].value_counts())

df = pd.DataFrame(dataset['test'])
print(df['label'].value_counts())

label
0    214
1    209
3    206
4    203
2    192
Name: count, dtype: int64
label
0    31
4    26
1    25
3    24
2    22
Name: count, dtype: int64


In [47]:
from transformers import AutoTokenizer

# model_name = 'google-bert/bert-base-cased'
model_name = 'google-bert/bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

In [48]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1024
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 128
    })
})

In [49]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [50]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Freeze all layers except the classifier (final) layer
for name, param in model.named_parameters():
    if 'classifier' not in name: # classifier layer
        param.requires_grad = False

# for name, param in model.named_parameters():
#     print(name, param.requires_grad)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [52]:
metric.compute(predictions=np.random.randint(0, 5, size=(10,)), references=np.random.randint(0, 5, size=(10,)))

{'accuracy': 0.1}

In [53]:
# Evaluation function

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") # report evaluation metrics every epoch

In [None]:
# # Trainer object

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_eval_dataset,
#     compute_metrics=compute_metrics, # evaluation metric
# )

In [55]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    # disable_tqdm=True,  # disable progress bar
    output_dir='./train_results',
    save_strategy='epoch', # save model checkpoints at the end of each epoch
    eval_strategy='epoch',  # run validation at the end of each epoch
    # eval_strategy='steps', # experimentation
    # eval_steps=1, # experimentation

    load_best_model_at_end=True,  # load the best model found during training
    # metric_for_best_model="eval_loss",  # Metric to monitor for the best model
    metric_for_best_model='accuracy', # this needs to match a key in the returned dictionary from compute_metrics()
    greater_is_better=True, # higher value for metric_for_best_model is better

    # Logs report training and validation metrics, which may be visualized (including in real time) using TensorBoard.
    logging_dir='./train_logs',
    logging_strategy='epoch',
    # logging_strategy='steps',
    # logging_steps=0.25, # 4 times per epoch
    # logging_steps=1,

    per_device_train_batch_size=128, # batch size per device (CPU core/GPU/etc.)
    per_device_eval_batch_size=128, # batch size per device (CPU core/GPU/etc.)

    # Training hyperparameters
    num_train_epochs=100, # max epochs
    # learning_rate=5e-8, # default is 5e-5
    learning_rate=3e-5,
    # warmup_ratio=0.2, # 20% of training steps
    # max_grad_norm=1.0,
    # weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],  # This is the validation dataset
    # tokenizer=tokenizer, # only used for generative tasks like translation, summarization, question-answering (to convert decoder predictions/logits back to words) or to work with metrics that require post-processing (BLEU, ROUGE)
    data_collator=data_collator, # dynamic padding
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop after 3 epochs without improvement
    compute_metrics=compute_metrics,  # custom function to compute validation metrics
)

In [56]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [57]:
# %load_ext tensorboard
# %tensorboard --logdir ./train_logs  # Make sure logging_dir is set in TrainingArguments

"""

After I enabled eval per step, this is what the output logging looks like:

{'loss': 1.6268, 'grad_norm': 8.133543014526367, 'learning_rate': 4.986979166666667e-05, 'epoch': 0.0078125}
{'eval_loss': 1.6220676898956299, 'eval_accuracy': 0.2109375, 'eval_runtime': 232.0645, 'eval_samples_per_second': 0.552, 'eval_steps_per_second': 0.069, 'epoch': 0.0078125}
{'loss': 1.6553, 'grad_norm': 6.605373859405518, 'learning_rate': 4.973958333333333e-05, 'epoch': 0.015625}

grad_norm is the norm of all the gradients (before clipping). Use it to track vanishing (small norm) or exploding gradients (large norm).
eval_runtime/eval_steps_per_second indicates that this was very slow.

"""

# Training loss is not decreasing. Neither is grad_norm. Figure out the training hyperparameters.
# Check if anyone else has trained on this dataset; check their hyperparameters.
# Double-check that you're using pipeline correctly.

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.696493,0.164062
2,No log,1.672378,0.210938
3,No log,1.65389,0.203125
4,No log,1.638792,0.242188
5,No log,1.62798,0.265625
6,No log,1.620757,0.289062
7,No log,1.614723,0.296875
8,No log,1.609946,0.28125
9,No log,1.606512,0.242188
10,No log,1.603912,0.257812


TrainOutput(global_step=80, training_loss=1.6312793731689452, metrics={'train_runtime': 368.5549, 'train_samples_per_second': 138.921, 'train_steps_per_second': 1.085, 'total_flos': 2694329778831360.0, 'train_loss': 1.6312793731689452, 'epoch': 10.0})

In [28]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="karimbkh/BERT_fineTuned_Sentiment_Classification_Yelp", truncation=True) # binary output (positive and negative)
# pipe = pipeline(task="text-classification", model="karimbkh/BERT_fineTuned_Sentiment_Classification_Yelp")
# pipe = pipeline(task="sentiment-analysis", model="karimbkh/BERT_fineTuned_Sentiment_Classification_Yelp")
# pipe = pipeline("text-classification", model="overman123/bert-base-cased-finetune-yelp") # not fine-tuned

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [29]:
import numpy as np

iii = np.random.randint(0, len(dataset['test']))
print(iii)
print(dataset['test'][iii]['text'])
print('label:', dataset['test'][iii]['label'])
pipe(dataset['test'][iii]['text'])

65
First time at a cicis, not impressed. We paid the extra $8 for their garlic parmasan and buffalo wings...they were not very good, for the quality, they should, have been included in the buffet.  Good place for kids to eat a slice but not much of a selection.
label: 1


[{'label': 'LABEL_0', 'score': 0.8090751767158508}]

In [42]:
import re
import pandas as pd

n_samples = len(dataset['test'])
out = {'label': np.zeros(n_samples, dtype=float), 'pred': np.zeros(n_samples, dtype=int), 'accurate?': np.zeros(n_samples, dtype=bool), 'score': np.zeros(n_samples)}

for idx in range(n_samples):
    pipe_out = pipe(dataset['test'][idx]['text'])
    if dataset['test'][idx]['label'] == 2:
        out['label'][idx] = 2/4
    else:
        out['label'][idx] = round(dataset['test'][idx]['label']/4,0)

    pred = int(re.sub('LABEL_', '', pipe_out[0]['label']))
    out['pred'][idx] = pred
    out['accurate?'][idx] = (out['label'][idx] == out['pred'][idx])
    out['score'][idx] = pipe_out[0]['score']

out_df = pd.DataFrame(out)
out_df

Unnamed: 0,label,pred,accurate?,score
0,0.5,1,False,0.993511
1,1.0,1,True,0.995754
2,0.0,0,True,0.995852
3,1.0,1,True,0.994323
4,1.0,1,True,0.982678
...,...,...,...,...
123,0.0,0,True,0.991975
124,0.0,0,True,0.995069
125,0.0,0,True,0.988094
126,1.0,1,True,0.991683


In [11]:
stop_event.set()

NameError: name 'stop_event' is not defined

In [None]:
# From ChatGPT
# Not elegant
# Use this just to quickly check GPU usage

import threading
import time
from IPython.display import clear_output, display

# Function to monitor GPU usage
def monitor_gpu():
    while not stop_event.is_set():  # Continue until stopped
        clear_output(wait=True)
        display("Monitoring GPU...")
        !nvidia-smi
        time.sleep(2)  # Update every 2 seconds

# Create an event to signal stopping the monitoring
stop_event = threading.Event()

# Start the GPU monitoring in a separate thread
gpu_monitor_thread = threading.Thread(target=monitor_gpu)
gpu_monitor_thread.start()

# Run your training code
trainer.train()

# Stop the GPU monitoring after training
stop_event.set()
gpu_monitor_thread.join()  # Wait for the thread to finish

'Monitoring GPU...'

Wed Oct  2 16:10:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0              68W /  70W |   1519MiB / 15360MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

KeyboardInterrupt: 

In [None]:
# From ChatGPT - if you want to print the logs but set logging_strategy='no' in TrainingArguments to disable saving to file

class PrintMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        print(logs)  # Print metrics when they are logged

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[PrintMetricsCallback()],  # Custom callback to print metrics
)