<a href="https://colab.research.google.com/github/seojeongyun/finetune_with_bert/blob/main/finetune_with_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install "torch>=2.0" --extra-index-url https://download.pytorch.org/whl/cu117 --upgrade --quiet

In [13]:
!pip install "transformers==4.30.1" "datasets==2.9.0" "accelerate==0.20.1" "evaluate==0.4.0" tensorboard scikit-learn --upgrade --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.12 requires torch<2.1,>=1.7, but you have torch 2.1.0 which is incompatible.
tensorflow 2.13.0 requires tensorboard<2.14,>=2.13, but you have tensorboard 2.14.1 which is incompatible.[0m[31m
[0m

In [86]:
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [1]:
### Data_Loader

from datasets import load_dataset
from transformers import AutoTokenizer

class banking:
    def __init__(self, task, tokenizer):
        self.task = task
        self.tokenizer = tokenizer
        # Dataset id from huggingface.co/dataset
        self.dataset_id = "banking77"
        #
        self.raw_dataset = self.get_dataset()
        #
        self.tokenized_dataset, self.labels, self.num_labels, self.label2id, self.id2label = self.transform_id_label()


    def get_dataset(self):
        raw_dataset = load_dataset(self.dataset_id)
        if self.task == 'train':
            raw_dataset = raw_dataset['train']
        else:
            raw_dataset = raw_dataset['test']
        return raw_dataset

    # Tokenize helper function
    def tokenize(self,batch):
        return self.tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors="pt")

    def transform_id_label(self):
        raw_dataset = self.raw_dataset.rename_column("label", "labels")  # to match Trainer
        tokenized_dataset = raw_dataset.map(self.tokenize, batched=True, remove_columns=["text"])

        labels = tokenized_dataset.features["labels"].names

        num_labels = len(labels)
        label2id, id2label = dict(), dict()
        for i, label in enumerate(labels):
            label2id[label] = str(i)
            id2label[str(i)] = label

        return tokenized_dataset, labels, num_labels, label2id, id2label

In [2]:
### Engine

import evaluate
import numpy as np
import torch

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments


class engine:
    def __init__(self):
        self.device = torch.device("cuda:1")
        # Model id to load the tokenizer
        self.model_id = "bert-base-uncased"
        # Load Tokenizer
        self.tokenizer = self.get_tokenizer()
        #
        self.train_dataload = self.get_loader('train', self.tokenizer)
        self.test_dataload = self.get_loader('test', self.tokenizer)
        #
        self.model = self.get_model()
        #
        self.metric = evaluate.load("f1")
        #
        self.repo_id, self.train_args, self.trainer = self.get_args()

    def get_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        return tokenizer

    def get_loader(self, task, tokenizer):
        if task == 'train':
            loader = banking(task, tokenizer)
        else:
            loader = banking(task, tokenizer)

        return loader

    def get_model(self):
        model = AutoModelForSequenceClassification.from_pretrained(self.model_id,
                                                                   num_labels=self.train_dataload.num_labels,
                                                                   label2id=self.train_dataload.label2id,
                                                                   id2label=self.train_dataload.id2label)
        return model

        # Metric helper method

    def compute_metrics(self, eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return self.metric.compute(predictions=predictions, references=labels, average="weighted")

    def get_args(self):
        # Id for remote repository
        repository_id = "bert-base-banking77-pt2-jy"

        # Define training args
        training_args = TrainingArguments(
            output_dir=repository_id,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=8,
            learning_rate=5e-5,
            num_train_epochs=3,
            # PyTorch 2.0 specifics
            fp16=True,  # bfloat16 training
            torch_compile=True,  # optimizations
            optim="adamw_torch_fused",  # improved optimizer
            # logging & evaluation strategies
            logging_dir=f"{repository_id}/logs",
            logging_strategy="steps",
            logging_steps=200,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            # push to hub parameters
            report_to="tensorboard",
            push_to_hub=True,
            hub_strategy="every_save",
            hub_model_id=repository_id,
            hub_token=HfFolder.get_token(),

        )

        # Create a Trainer instance
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataload.tokenized_dataset,
            eval_dataset=self.test_dataload.tokenized_dataset,
            compute_metrics=self.compute_metrics,
        )

        return repository_id, training_args, trainer


In [14]:
!pip install --quiet --upgrade --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m452.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 GB[0m [31m421.1 kB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.12 requires torch<2.1,>=1.7, but you have torch 2.2.0.dev20231009+cu121 which is incompatible.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 2.2.0.dev20231009+cu121 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 2.2.0.dev20231009+cu121 which is incompatible.
torchvision 0.15.

In [17]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [18]:
### Train

from huggingface_hub import login

if __name__ == '__main__':
    login(
      token="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", # ADD YOUR TOKEN HERE
      add_to_git_credential=True
    )

    # Start training
    Engine = engine()
    Engine.trainer.train()

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful




  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,F1
1,1.1268,0.804414,0.84351
2,0.4033,0.369727,0.917249


Epoch,Training Loss,Validation Loss,F1
1,1.1268,0.804414,0.84351
2,0.4033,0.369727,0.917249
3,0.1989,0.312246,0.925758


In [19]:

    # Save processor and create model card
    Engine.tokenizer.save_pretrained(Engine.repo_id)
    Engine.trainer.create_model_card()
    Engine.trainer.push_to_hub()

Upload file logs/events.out.tfevents.1696938526.0478456f1f90.25697.3: 100%|##########| 11.8k/11.8k [00:00<?, ?…

To https://huggingface.co/SeoJeongYun/bert-base-banking77-pt2-jy
   f9da4a5..f7540b3  main -> main

   f9da4a5..f7540b3  main -> main



'https://huggingface.co/SeoJeongYun/bert-base-banking77-pt2-jy/commit/f7540b3b0ef69a297508cae5f16d3b64e3bad869'

In [21]:
### Test

from transformers import pipeline

if __name__ == '__main__':
    trainer = engine()
    # load model from huggingface.co/models using our repository id
    classifier = pipeline("sentiment-analysis", model=trainer.repo_id, tokenizer=trainer.repo_id, device=0)

    sample = "I have been waiting longer than expected for my bank card, could you provide information on when it will arrive?"

    pred = classifier(sample)
    print(pred)
    # [{'label': 'card_arrival', 'score': 0.9903606176376343}]



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

[{'label': 'card_arrival', 'score': 0.6612871885299683}]
