# Finetune Hugging Face BERT with PyTorch Lightning

Running the following cells will train the model using settings that are shown.

In [None]:
import os

import numpy as np
import torch

import lightning.pytorch as pl
from dvclive.lightning import DVCLiveLogger
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger, CometLogger, TensorBoardLogger
from lightning.pytorch.profilers import PyTorchProfiler

from config import Config, DataModuleConfig, ModuleConfig
from datamodule import AutoTokenizerDataModule
from module import CustomModel, LinearBAModel, LinearBEModel
from utils import create_dirs

ModuleNotFoundError: No module named 'torch'

In [2]:
# Seed the Python, NumPy and PyTorch random number generators in one call so
# that repeated runs of the notebook start from the same random state.
pl.seed_everything(59631546)

Global seed set to 59631546


59631546

In [3]:
import os

from huggingface_hub import login

# Take the Hugging Face access token from the HUG_FACE_TOKEN environment
# variable instead of hard-coding it; the lookup yields None when it is unset.
token = os.environ.get("HUG_FACE_TOKEN")
login(token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

First, let's configure some basic settings

In [4]:
# --- model hyper-parameters (ModuleConfig) ---
model_name, max_length, lr = (
    ModuleConfig.model_name,
    ModuleConfig.max_length,
    ModuleConfig.learning_rate,
)

# --- dataset settings (DataModuleConfig) ---
dataset_name, batch_size = (
    DataModuleConfig.dataset_name,
    DataModuleConfig.batch_size,
)

# --- working directories (Config) ---
cache_dir, log_dir, ckpt_dir, prof_dir, perf_dir = (
    Config.cache_dir,
    Config.log_dir,
    Config.ckpt_dir,
    Config.prof_dir,
    Config.perf_dir,
)
# Re-create every directory up front so later cells do not fail when an
# empty directory has been deleted.
create_dirs([cache_dir, log_dir, ckpt_dir, prof_dir, perf_dir])

# Select the float32 matmul precision mode; see
# https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html
torch.set_float32_matmul_precision("medium")

In [5]:
class LoRACheckpoint(ModelCheckpoint):
    """ModelCheckpoint variant that exports only the wrapped inner encoder.

    Instead of writing a full Lightning checkpoint, the object found at
    ``trainer.lightning_module.encoder.encoder`` is exported through its
    ``save_pretrained`` method.
    """

    def _save_checkpoint(self, trainer: pl.Trainer, filepath: str) -> None:
        """Export the inner encoder to ``filepath`` and update the bookkeeping.

        Args:
            trainer: Trainer whose LightningModule holds the encoder to export.
            filepath: Destination handed to ``save_pretrained``.
        """
        # Stands in for the stock call
        # `trainer.save_checkpoint(filepath, self.save_weights_only)`.
        inner_encoder = trainer.lightning_module.encoder.encoder
        inner_encoder.save_pretrained(filepath)  # path where the model is saved

        # Remember what was written and at which global step.
        self._last_checkpoint_saved = filepath
        self._last_global_step_saved = trainer.global_step

        # The logger notification (`logger.after_save_checkpoint(proxy(self))`
        # on the global-zero rank) is left disabled in this override.

Now, we can define our LightningDataModule, which will be used by Trainer for its DataLoaders

In [5]:
# Build the LightningDataModule from the values chosen in the configuration cell.
lit_datamodule = AutoTokenizerDataModule(
    model_name=model_name, dataset_name=dataset_name,
    cache_dir=cache_dir, batch_size=batch_size, max_length=max_length,
)

In [7]:
lit_datamodule.clear_custom_cache()

In [6]:
lit_datamodule.prepare_data()

Global seed set to 59631546
[2025-03-03 16:31:35.157962] Data cache exists. Loading from cache.


In [7]:
lit_datamodule.setup("fit")

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'non_disaster', 'disaster', 'flood', 'extreme_rain', 'earthquake', 'typhoon', 'landslide', 'tsunami', 'volcano', 'wildfire', 'informative', 'non_informative'],
        num_rows: 3660
    })
    test: Dataset({
        features: ['id', 'text', 'non_disaster', 'disaster', 'flood', 'extreme_rain', 'earthquake', 'typhoon', 'landslide', 'tsunami', 'volcano', 'wildfire', 'informative', 'non_informative'],
        num_rows: 914
    })
})


and our custom LightningModule, which wraps a Hugging Face BERT encoder

In [10]:
#lit_datamodule.setup("test")

In [6]:
lit_model = CustomModel(learning_rate=lr)

In [None]:
ba_linear_model = LinearBAModel()

Next, we define some common callbacks and our most basic logger, CSVLogger.

The EarlyStopping callback ends training early if a convergence criterion is met before the max-iteration setting is reached.

ModelCheckpoint saves the model periodically. After training finishes, its best_model_path attribute gives the path to the best checkpoint file and best_model_score gives that checkpoint's score.

In [12]:
'''
    ModelCheckpoint(
        dirpath=ckpt_dir,
        monitor="val_f1",
        filename="model",
        save_top_k=3,
        mode="max",
        save_weights_only=True,
    ),
'''

'\n    ModelCheckpoint(\n        dirpath=ckpt_dir,\n        monitor="val_f1",\n        filename="model",\n        save_top_k=3,\n        mode="max",\n        save_weights_only=True,\n    ),\n'

In [None]:
# Stop once val_f1 (higher is better) has gone 3 checks without improving.
early_stopping = EarlyStopping(monitor="val_f1", mode="max", patience=3)

# Keep the 3 best weight-only checkpoints, ranked by val_f1, in ckpt_dir.
checkpointing = ModelCheckpoint(
    dirpath=ckpt_dir,
    filename="model",
    monitor="val_f1",
    mode="max",
    save_top_k=3,
    save_weights_only=True,
)

# Log the learning rate at every optimisation step.
lr_monitor = LearningRateMonitor(logging_interval="step")

callbacks = [early_stopping, checkpointing, lr_monitor]

In [14]:
# Plain CSV metrics logger; its files go under log_dir in the "csv-logs" run folder.
logger = CSVLogger(save_dir=log_dir, name="csv-logs")

Finally, we create our Trainer and pass in our flags (settings), callbacks, and loggers. Then we call fit!

In [16]:
def print_seed():
    """Print the seeds currently in effect for PyTorch, CUDA and NumPy.

    The CUDA seed is only queried when a CUDA device is available:
    ``torch.cuda.initial_seed()`` initialises CUDA eagerly and raises on
    CPU-only machines, which this notebook otherwise supports through its
    ``"cuda" if torch.cuda.is_available() else "cpu"`` device fallback.
    """
    torch_init_seed = torch.initial_seed()

    if torch.cuda.is_available():
        torch_cuda_seed = torch.cuda.initial_seed()
    else:
        # Report the absence instead of crashing the cell on CPU-only hosts.
        torch_cuda_seed = "unavailable (no CUDA device)"

    # First word of NumPy's legacy global generator state; for the integer
    # seeds used in this notebook it equals the value that was seeded.
    numpy_seed = np.random.get_state()[1][0]

    print(f"pytorch seed: {torch_init_seed}")
    print(f"cuda seed: {torch_cuda_seed}")
    print(f"numpy seed: {numpy_seed}")

In [17]:
pl.seed_everything(59631546)

Global seed set to 59631546


59631546

In [18]:
print_seed()

pytorch seed: 59631546
cuda seed: 59631546
numpy seed: 59631546


In [None]:
# Build the Trainer: automatic device/strategy selection, fp16 mixed precision,
# at most 8 epochs, deterministic algorithms, three loggers and the callbacks
# defined earlier.
# The Comet API key is read from the COMET_API_KEY environment variable rather
# than being written into the notebook; os.getenv returns None when it is unset.
lit_trainer = pl.Trainer(
    accelerator="auto",
    devices="auto",
    strategy="auto",
    precision="16-mixed",
    max_epochs=8,
    deterministic=True,
    logger=[
        logger,
        CometLogger(api_key=os.getenv("COMET_API_KEY")),
        DVCLiveLogger(save_dvc_exp=True),
    ],
    callbacks=callbacks,
)

CometLogger will be initialized in online mode


NameError: name 'callbacks' is not defined

In [20]:
lit_model

CustomModel(
  (encoder): BERTEmbeeding(
    (encoder): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (distance_embedding): Embedding(1023, 64)
              )
              (output): BertSelfOutput(
                (

In [21]:
torch.cuda.empty_cache()

In [22]:
#lit_trainer.fit(model=lit_model, datamodule=lit_datamodule)

In [None]:
# load_from_checkpoint is a classmethod that returns a new model, so call it on
# the class; recent Lightning releases reject calls made through an instance.
# NOTE(review): the recorded run of this cell failed with missing
# `classifier.lstm.*` keys, which suggests this checkpoint comes from a
# different CustomModel architecture — verify before relying on it.
model = CustomModel.load_from_checkpoint("./checkpoints/best.ckpt")

RuntimeError: Error(s) in loading state_dict for CustomModel:
	Missing key(s) in state_dict: "classifier.lstm.weight_ih_l0", "classifier.lstm.weight_hh_l0", "classifier.lstm.bias_ih_l0", "classifier.lstm.bias_hh_l0". 

In [None]:
# load_from_checkpoint is a classmethod that returns a new model, so call it on
# the class; recent Lightning releases reject calls made through an instance.
# NOTE(review): machine-specific absolute path — consider moving it to Config.
model = LinearBAModel.load_from_checkpoint(r"e:\BEST-bart-linear\model-v35.ckpt")

In [10]:
model

LinerBAModel(
  (encoder): BARTEmbeddings(
    (encoder): BartModel(
      (shared): Embedding(50265, 1024, padding_idx=1)
      (encoder): BartEncoder(
        (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-11): 12 x BartEncoderLayer(
            (self_attn): BartSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096,

In [23]:
# load_from_checkpoint is a classmethod that returns a new model, so call it on
# the class; recent Lightning releases reject calls made through an instance.
# NOTE(review): machine-specific absolute path — consider moving it to Config.
model = CustomModel.load_from_checkpoint(r"C:\Users\syc\Downloads\best-0.8833.ckpt")

Some weights of BertModel were not initialized from the model checkpoint at Twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at Twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# load_from_checkpoint is a classmethod that returns a new model, so call it on
# the class; recent Lightning releases reject calls made through an instance.
# NOTE(review): machine-specific absolute path — consider moving it to Config.
model = CustomModel.load_from_checkpoint(r"C:\Users\syc\Downloads\model-v235-0.9606.ckpt")

In [16]:
from sklearn.metrics import classification_report

# Run the validation loop on the loaded checkpoint; `results` receives the
# list of metric dictionaries returned by the Trainer.
results = lit_trainer.validate(model=model, datamodule=lit_datamodule)

NameError: name 'lit_trainer' is not defined

In [27]:
results

[{'val_loss': 0.1585824191570282,
  'val_accuracy': 0.9458422064781189,
  'val_precision': 0.8862585425376892,
  'val_recall': 0.88088458776474,
  'val_f1': 0.8830068707466125,
  'val_macro_f1': 0.5132007598876953}]

In [28]:
# Walk the validation loader once and flatten each batch's "label" tensor into
# a single list holding one label array per sample.
y_true = []
for batch in lit_datamodule.val_dataloader():
    y_true.extend(batch["label"].numpy())

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
model = model.to(device)

In [31]:
# tqdm.auto renders the notebook widget when a widget frontend is available and
# falls back to a plain console bar otherwise (e.g. headless execution).
from tqdm.auto import tqdm

# Put the model in evaluation mode before running inference.
model.eval()

# Per-sample binary prediction vectors, accumulated across validation batches.
y_pred = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in tqdm(lit_datamodule.val_dataloader()):
        # Move the inputs to whichever device the model currently lives on.
        input_ids = batch[model.input_key].to(model.device)
        attention_mask = batch[model.mask_key].to(model.device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Independent sigmoid per label, thresholded at 0.5 (multi-label decision).
        preds = (torch.sigmoid(logits) > 0.5).int()

        y_pred.extend(preds.cpu().numpy())

  0%|          | 0/115 [00:00<?, ?it/s]

In [32]:
# Display names for the classification report, in the same order as the label
# columns listed in the dataset features (everything after "id" and "text").
# NOTE(review): assumes the columns of batch["label"] follow this order — confirm
# against the DataModule.
label_order = [
    "non_disaster", "disaster",
    "flood", "extreme_rain", "earthquake", "typhoon",
    "landslide", "tsunami", "volcano", "wildfire",
    "informative", "non_informative",
]


In [33]:
report = classification_report(y_true, y_pred, target_names=label_order, digits=4)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
print(report)

                 precision    recall  f1-score   support

   non_disaster     0.8045    0.7701    0.7869       187
       disaster     0.9374    0.9490    0.9432       726
          flood     0.8611    0.8212    0.8407       151
   extreme_rain     0.8429    0.9516    0.8939        62
     earthquake     0.9167    0.8627    0.8889        51
        typhoon     0.9091    0.8889    0.8989        90
      landslide     0.9333    0.7368    0.8235        57
        tsunami     0.8125    0.9286    0.8667        28
        volcano     0.8571    0.7895    0.8219        38
       wildfire     0.9599    0.9359    0.9477       281
    informative     0.8559    0.8996    0.8772       548
non_informative     0.8333    0.7756    0.8034       361

      micro avg     0.8876    0.8814    0.8845      2580
      macro avg     0.8770    0.8591    0.8661      2580
   weighted avg     0.8876    0.8814    0.8837      2580
    samples avg     0.8826    0.8765    0.8771      2580

