# Huggingface: Fine-Tune a Pretrained Model

Ref: https://huggingface.co/docs/transformers/v4.37.2/training

Pipeline: https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/pipelines#transformers.pipeline

In [1]:
! pip install transformers[torch] comet-ml comet-llm datasets evaluate rouge-score --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.4/599.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.0/55.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.5/257.5 kB[0m [31m16.6 M

In [2]:
from datasets import load_dataset

In [None]:
import os
import comet_ml
import comet_llm

# initialize comet_ml
comet_ml.init(project_name="clickbait-classification-ft-model-2")

# set this to log HF results and assets to Comet
os.environ["COMET_LOG_ASSETS"] = "True"

In [3]:
hf_dataset = "SotirisLegkas/clickbait"

ds = load_dataset(hf_dataset)

print(f"Train dataset size: {len(ds['train'])}")
print(f"Validation dataset size: {len(ds['validation'])}")
print(f"Test dataset size: {len(ds['test'])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/3.70M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/742k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train dataset size: 43802
Validation dataset size: 2191
Test dataset size: 8760


In [4]:
ds['train'][10]

{'text': 'CanadaVOTES: CHP candidate Vicki Gunn in York—Simcoe', 'label': 0}

In [5]:
from transformers import AutoTokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
def tokenize_function(example):
  return tokenizer(example['text'], padding='max_length', truncation=True)


In [8]:
tokenized_datasets = ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/43802 [00:00<?, ? examples/s]

Map:   0%|          | 0/2191 [00:00<?, ? examples/s]

Map:   0%|          | 0/8760 [00:00<?, ? examples/s]

In [9]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_val_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [10]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="./test_trainer")

comet_ml is installed but `COMET_API_KEY` is not set.


In [12]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(pred):

    #get global experiments
    experiment = comet_ml.get_global_experiment()

    #get y_true and y_preds for eval_dataset
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    #compute precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro')

    #compute accuracy score
    acc = accuracy_score(labels, preds)

    #log confusion matrix
    if experiment:
        epoch = int(experiment.curr_epoch) if experiment.curr_epoch is not None else 0
        experiment.set_epoch(epoch)
        experiment.log_confusion_matrix(
            y_true=labels,
            y_predicted=preds,
            labels=["clickbait", "non-clickbait"]
        )

    return {"accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall
            }

In [15]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="./test_trainer", evaluation_strategy="epoch")

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.308933,0.901,0.891563,0.913982,0.879458
2,No log,0.369345,0.907,0.898652,0.917543,0.887711
3,No log,0.434743,0.9,0.89367,0.897211,0.890669


TrainOutput(global_step=375, training_loss=0.23935992431640624, metrics={'train_runtime': 356.7793, 'train_samples_per_second': 8.409, 'train_steps_per_second': 1.051, 'total_flos': 789333166080000.0, 'train_loss': 0.23935992431640624, 'epoch': 3.0})

In [18]:
trainer.evaluate()

{'eval_loss': 0.43474310636520386,
 'eval_accuracy': 0.9,
 'eval_f1': 0.8936695078174177,
 'eval_precision': 0.8972114972114973,
 'eval_recall': 0.8906685119238493,
 'eval_runtime': 31.2718,
 'eval_samples_per_second': 31.978,
 'eval_steps_per_second': 3.997,
 'epoch': 3.0}

In [19]:
tokenizer.save_pretrained('./test_trainer')

('./test_trainer/tokenizer_config.json',
 './test_trainer/special_tokens_map.json',
 './test_trainer/vocab.txt',
 './test_trainer/added_tokens.json',
 './test_trainer/tokenizer.json')

In [20]:
# trainer.save_model('./test_trainer')

In [21]:
model.save_pretrained("clickbait-classifier-model-90")

# Load the finetuned model to test the accuracy of  the test dataset

In [22]:
model = AutoModelForSequenceClassification.from_pretrained("clickbait-classifier-model-90")

In [23]:
tester = Trainer(
    model=model,
    eval_dataset=small_test_dataset,
    compute_metrics=compute_metrics,
)

In [24]:
tester.evaluate()

{'eval_loss': 0.4823170602321625,
 'eval_accuracy': 0.884,
 'eval_f1': 0.8751673403325715,
 'eval_precision': 0.8769542146165523,
 'eval_recall': 0.8735167702981244,
 'eval_runtime': 31.6212,
 'eval_samples_per_second': 31.624,
 'eval_steps_per_second': 3.953}

# Using "Pipeline" and "text-classification" to test on our own data

In [25]:
from transformers import pipeline

In [26]:
cls = pipeline("text-classification", model="clickbait-classifier-model-90", tokenizer=tokenizer)

In [37]:
cls("Doctors are stunned by this one weird trick to lose weight!")

[{'label': 'LABEL_1', 'score': 0.5508171916007996}]

# Deploy to Comet

In [29]:
# set existing experiment
import os
from comet_ml import Experiment

COMET_API_KEY = "COMET_API_KEY"

experiment = Experiment(api_key=COMET_API_KEY)
experiment.log_model("clickbait-classifier-model-90", "/content/clickbait-classifier-model-90")
experiment.register_model("clickbait-classifier-model-90")

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/sachs7/clickbait-classification-ft-model-2/06487ed9140d490fac685ad47e03f18c



In [38]:
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/sachs7/clickbait-classification-ft-model-2/06487ed9140d490fac685ad47e03f18c
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     environment details : 1
[1;38;5;39mCOMET INFO:[0m     filename            : 1
[1;38;5;39mCOMET INFO:[0m     installed packages  : 1
[1;38;5;39mCOMET INFO:[0m     model-element       : 2 (413.20 MB)
[1;38;5;39mCOMET INFO:[0m     notebook            : 2
[1;38;5;39mCOMET INFO:[0m     os packages         : 1
[1;38;5;39mCOMET INFO:[0m     source_code         : 1
[1;38;5;39mCOMET INF